def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
    """
    Set up a behavioral-cloning trainer: its policy, statistics and buffers.

    :param brain: Brain description, forwarded to the base trainer and BCPolicy.
    :param trainer_parameters: The parameters for the trainer (dictionary).
        Must contain the key "batches_per_epoch".
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with.
    :param run_id: The identifier of the current run.
    """
    super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
    self.policy = BCPolicy(seed, brain, trainer_parameters, load)
    self.n_sequences = 1

    # Reward / episode-length tracking starts out empty.
    self.cumulative_rewards = {}
    self.episode_steps = {}

    # Every reported statistic starts as an empty list.
    stat_names = (
        "Losses/Cloning Loss",
        "Environment/Episode Length",
        "Environment/Cumulative Reward",
    )
    self.stats = {stat_name: [] for stat_name in stat_names}

    self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
    self.demonstration_buffer = AgentBuffer()
    self.evaluation_buffer = ProcessingBuffer()
def test_buffer():
    """Exercise get_batch, reset_agent, resequence_and_append and make_mini_batch."""
    first_buffer = construct_fake_buffer(1)
    second_buffer = construct_fake_buffer(2)
    third_buffer = construct_fake_buffer(3)
    obs_key = ObsUtil.get_name_at(0)

    # Two single-step entries.
    batch = first_buffer[obs_key].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    expected = np.array([[171, 172, 173], [181, 182, 183]])
    assert_array(np.array(batch), expected)

    # Two sequential sequences of length three.
    batch = second_buffer[obs_key].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    expected = np.array(
        [
            [231, 232, 233],
            [241, 242, 243],
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]
    )
    assert_array(np.array(batch), expected)

    # Two non-sequential sequences of length three.
    batch = second_buffer[obs_key].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    expected = np.array(
        [
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]
    )
    assert_array(np.array(batch), expected)

    first_buffer.reset_agent()
    assert first_buffer.num_experiences == 0

    update_buffer = AgentBuffer()
    for source in (second_buffer, third_buffer):
        source.resequence_and_append(
            update_buffer, batch_size=None, training_length=2
        )
    action_key = BufferKey.CONTINUOUS_ACTION
    assert len(update_buffer[action_key]) == 20
    assert np.array(update_buffer[action_key]).shape == (20, 2)

    mini_batch = update_buffer.make_mini_batch(start=0, end=1)
    assert mini_batch.keys() == update_buffer.keys()
    assert np.array(mini_batch[action_key]).shape == (1, 2)
def test_buffer():
    """Exercise batching, resetting and update-buffer appends on a ProcessingBuffer."""
    processing_buffer = construct_fake_processing_buffer()
    obs_key = "vector_observation"

    # Two single-step entries for agent 1.
    batch = processing_buffer[1][obs_key].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    expected = np.array([[171, 172, 173], [181, 182, 183]])
    assert_array(np.array(batch), expected)

    # Two sequential sequences of length three for agent 2.
    batch = processing_buffer[2][obs_key].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    expected = np.array(
        [
            [231, 232, 233],
            [241, 242, 243],
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]
    )
    assert_array(np.array(batch), expected)

    # Two non-sequential sequences of length three for agent 2.
    batch = processing_buffer[2][obs_key].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    expected = np.array(
        [
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]
    )
    assert_array(np.array(batch), expected)

    processing_buffer[4].reset_agent()
    assert len(processing_buffer[4]) == 0

    update_buffer = AgentBuffer()
    for agent_id in (3, 2):
        processing_buffer.append_to_update_buffer(
            update_buffer, agent_id, batch_size=None, training_length=2
        )
    assert len(update_buffer["action"]) == 20
    assert np.array(update_buffer["action"]).shape == (20, 2)

    mini_batch = update_buffer.make_mini_batch(start=0, end=1)
    assert mini_batch.keys() == update_buffer.keys()
    assert np.array(mini_batch["action"]).shape == (1, 2)
def _append_to_update_buffer(self, agentbuffer_trajectory: AgentBuffer) -> None:
    """
    Append an AgentBuffer to the update buffer.

    Nothing is appended once ``should_still_train`` is false; the original
    author notes this avoids a memory leak.

    :param agentbuffer_trajectory: Buffer to resequence into the update buffer.
    """
    if not self.should_still_train:
        return

    # Use the configured memory sequence length, or 1 when no memory is set.
    memory_settings = self.trainer_settings.network_settings.memory
    if memory_settings is not None:
        seq_len = memory_settings.sequence_length
    else:
        seq_len = 1

    agentbuffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=seq_len
    )
def test_buffer_save_load():
    """Round-trip an AgentBuffer through an in-memory file and compare contents."""
    import io

    original = construct_fake_buffer(3)
    stream = io.BytesIO()
    original.save_to_file(stream)

    loaded = AgentBuffer()
    loaded.load_from_file(stream)

    assert len(original) == len(loaded)
    for key in original.keys():
        assert np.allclose(original[key], loaded[key])
def __init__(self, *args, **kwargs):
    """
    Initialise the RL trainer state on top of the base trainer.

    :raises UnityTrainerException: if the "reward_signals" entry of the
        trainer parameters is empty.
    """
    super(RLTrainer, self).__init__(*args, **kwargs)

    # Make sure we have at least one reward_signal
    reward_signals = self.trainer_parameters["reward_signals"]
    if not reward_signals:
        trainer_name = self.__class__.__name__
        raise UnityTrainerException(
            "No reward signals were defined. At least one must be used with {}."
            .format(trainer_name)
        )

    # collected_rewards maps a reward-signal name to a dictionary of
    # agent_id -> cumulative reward. It is used for reporting only; the
    # environment reward is always reported to Tensorboard, regardless of
    # which reward signals are actually present.
    self.collected_rewards = {"environment": {}}
    self.processing_buffer = ProcessingBuffer()
    self.update_buffer = AgentBuffer()
    self.episode_steps = {}
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    """
    Build a resequenced AgentBuffer from consecutive demonstration pairs.

    Each pair is combined with its successor, so the last pair only supplies
    the "next" reward/done information.

    :param pair_infos: Recorded agent-info/action pairs, in order.
    :param behavior_spec: Spec passed to steps_from_proto to decode each pair.
    :param sequence_length: training_length used when resequencing.
    :returns: The processed demonstration buffer.
    """
    raw_buffer = AgentBuffer()
    processed_buffer = AgentBuffer()
    last_usable_idx = len(pair_infos) - 2

    for idx, current_pair_info in enumerate(pair_infos):
        # The final pair has no successor to take reward/done from.
        if idx > last_usable_idx:
            break
        next_pair_info = pair_infos[idx + 1]

        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )

        # The first step has no predecessor: use a zeroed copy of its action.
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        else:
            previous_action = (
                np.array(
                    current_pair_info.action_info.vector_actions, dtype=np.float32
                )
                * 0
            )

        next_done = len(next_terminal_step) == 1
        next_reward = (
            next_terminal_step.reward[0]
            if next_done
            else next_decision_step.reward[0]
        )

        current_steps = (
            current_terminal_step
            if len(current_terminal_step) == 1
            else current_decision_step
        )
        current_obs = list(current_steps.values())[0].obs

        raw_buffer["done"].append(next_done)
        raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, visual_obs in enumerate(split_obs.visual_observations):
            raw_buffer["visual_obs%d" % i].append(visual_obs)
        raw_buffer["vector_obs"].append(split_obs.vector_observations)
        raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        raw_buffer["prev_action"].append(previous_action)

        # Flush the raw buffer whenever the next step is terminal.
        if next_done:
            raw_buffer.resequence_and_append(
                processed_buffer, batch_size=None, training_length=sequence_length
            )
            raw_buffer.reset_agent()

    # Flush whatever is left after the last pair.
    raw_buffer.resequence_and_append(
        processed_buffer, batch_size=None, training_length=sequence_length
    )
    return processed_buffer
def construct_fake_buffer(fake_agent_id):
    """
    Build an AgentBuffer with nine steps of deterministic fake data.

    Values are ``100 * fake_agent_id + 10 * step + k``: k = 1..3 for the
    observation and k = 4..5 for the continuous and group actions.

    :param fake_agent_id: Number used to offset the generated values.
    :returns: The populated AgentBuffer.
    """
    buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step

        observation = np.array([base + 1, base + 2, base + 3], dtype=np.float32)
        buffer[ObsUtil.get_name_at(0)].append(observation)

        action = np.array([base + 4, base + 5], dtype=np.float32)
        buffer[BufferKey.CONTINUOUS_ACTION].append(action)

        # The group entry is a list holding the same array object three times.
        group_action = np.array([base + 4, base + 5], dtype=np.float32)
        buffer[BufferKey.GROUP_CONTINUOUS_ACTION].append([group_action] * 3)
    return buffer
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    """
    Create an AgentBuffer holding ``number`` copies of one random experience.

    :param behavior_spec: Supplies the observation shapes and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored with every experience.
    :returns: The populated AgentBuffer.
    """
    result = AgentBuffer()
    shapes = behavior_spec.observation_shapes
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]

    # One random action, split into its continuous / discrete parts.
    action_spec = behavior_spec.action_spec
    sampled = action_spec.random_action(1)
    action = {}
    if action_spec.continuous_size > 0:
        action["continuous_action"] = sampled.continuous
    if action_spec.discrete_size > 0:
        action["discrete_action"] = sampled.discrete

    for _ in range(number):
        curr_split = SplitObservations.from_observations(curr_observations)
        next_split = SplitObservations.from_observations(next_observations)
        for i, visual in enumerate(curr_split.visual_observations):
            result["visual_obs%d" % i].append(visual)
            result["next_visual_obs%d" % i].append(
                next_split.visual_observations[i]
            )
        result["vector_obs"].append(curr_split.vector_observations)
        result["next_vector_in"].append(next_split.vector_observations)
        for act_type, act_value in action.items():
            result[act_type].append(act_value[0, :])
        result["reward"].append(np.ones(1, dtype=np.float32) * reward)
        result["masks"].append(np.ones(1, dtype=np.float32))

    # "done" is assigned as a whole array of zeros rather than appended.
    result["done"] = np.zeros(number, dtype=np.float32)
    return result
def test_sac_rnn_policy(dummy_config):
    """Run evaluate and update on a recurrent, discrete-action SAC policy mock."""
    # Test evaluate
    tf.reset_default_graph()
    policy = create_sac_policy_mock(
        dummy_config, use_rnn=True, use_discrete=True, use_visual=False
    )
    step = mb.create_batchedstep_from_brainparams(
        policy.brain, num_agents=NUM_AGENTS
    )
    agent_ids = list(step.agent_id)
    run_out = policy.evaluate(step, agent_ids)
    expected_shape = (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    assert run_out["action"].shape == expected_shape

    # Test update
    rollout = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
    # Mock out reward signal eval
    rollout["extrinsic_rewards"] = rollout["environment_rewards"]
    update_buffer = AgentBuffer()
    rollout.resequence_and_append(
        update_buffer, training_length=policy.sequence_length
    )
    sequence_count = update_buffer.num_experiences // policy.sequence_length
    run_out = policy.update(update_buffer, num_sequences=sequence_count)
def test_obsutil_group_from_buffer():
    """Check shapes and NaN padding of GroupObsUtil.from_buffer output."""
    buff = AgentBuffer()
    group_key = GroupObsUtil.get_name_at(0)

    # Three steps in which three group members are present.
    for _ in range(3):
        buff[group_key].append(3 * [np.ones((5,), dtype=np.float32)])
    # Two steps in which only one member is left (the others have died).
    for _ in range(2):
        buff[group_key].append(1 * [np.ones((5,), dtype=np.float32)])

    # The group obs is a List of Lists of np.ndarray, where each element has
    # the same length as the AgentBuffer but contains only one agent's obs.
    # Dead agents are padded by NaNs.
    gobs = GroupObsUtil.from_buffer(buff, 1)
    expected_shape = (buff.num_experiences, 5)

    # Agent 0 is present in every step, so nothing is padded.
    for agent_obs in gobs[0]:
        assert agent_obs.shape == expected_shape
        assert not np.isnan(agent_obs).any()

    # Agent 1 is only present for the first three steps.
    for agent_obs in gobs[1]:
        assert agent_obs.shape == expected_shape
        for i, exp_obs in enumerate(agent_obs):
            if i < 3:
                assert not np.isnan(exp_obs).any()
            else:
                assert np.isnan(exp_obs).all()
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    """
    Create an AgentBuffer holding ``number`` copies of one random experience.

    :param behavior_spec: Supplies the sensor specs and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored with every experience.
    :returns: The populated AgentBuffer.
    """
    result = AgentBuffer()
    sensor_specs = behavior_spec.sensor_specs
    curr_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in sensor_specs
    ]
    next_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in sensor_specs
    ]

    # One random action, split into its continuous / discrete parts.
    action_spec = behavior_spec.action_spec
    sampled = action_spec.random_action(1)
    action = {}
    if action_spec.continuous_size > 0:
        action["continuous_action"] = sampled.continuous
    if action_spec.discrete_size > 0:
        action["discrete_action"] = sampled.discrete

    for _ in range(number):
        for i, observation in enumerate(curr_obs):
            result[ObsUtil.get_name_at(i)].append(observation)
        for i, observation in enumerate(next_obs):
            result[ObsUtil.get_name_at_next(i)].append(observation)
        # NOTE(review): this stores the whole action dict under "actions" in
        # addition to the per-type entries below -- confirm it is still needed.
        result["actions"].append(action)
        for act_type, act_value in action.items():
            result[act_type].append(act_value[0, :])
        result["reward"].append(np.ones(1, dtype=np.float32) * reward)
        result["masks"].append(np.ones(1, dtype=np.float32))

    # "done" is assigned as a whole array of zeros rather than appended.
    result["done"] = np.zeros(number, dtype=np.float32)
    return result
def to_agentbuffer(self) -> AgentBuffer:
    """
    Converts this Trajectory to an AgentBuffer.

    One entry is written per element of ``self.steps``. The "next" observation
    of a step is the observation of the following step; for the final step it
    is taken from ``self.next_obs``.

    :returns: AgentBuffer holding the trajectory's experiences.
    """
    trajectory_buffer = AgentBuffer()
    final_index = len(self.steps) - 1
    current_split = SplitObservations.from_observations(self.steps[0].obs)

    for step, exp in enumerate(self.steps):
        if step < final_index:
            following_obs = self.steps[step + 1].obs
        else:
            following_obs = self.next_obs
        next_split = SplitObservations.from_observations(following_obs)

        for i, visual in enumerate(current_split.visual_observations):
            trajectory_buffer["visual_obs%d" % i].append(visual)
            trajectory_buffer["next_visual_obs%d" % i].append(
                next_split.visual_observations[i]
            )
        trajectory_buffer["vector_obs"].append(current_split.vector_observations)
        trajectory_buffer["next_vector_in"].append(next_split.vector_observations)

        if exp.memory is not None:
            trajectory_buffer["memory"].append(exp.memory)

        trajectory_buffer["masks"].append(1.0)
        trajectory_buffer["done"].append(exp.done)

        # Add the outputs of the last eval
        if exp.action_pre is not None:
            trajectory_buffer["actions_pre"].append(exp.action_pre)
        trajectory_buffer["actions"].append(exp.action)
        trajectory_buffer["action_probs"].append(exp.action_probs)

        # Store action masks. Note that 1 means active here, while in
        # AgentExperience False means active, hence the inversion.
        if exp.action_mask is None:
            # This should never be needed unless the environment somehow
            # doesn't supply the action mask in a discrete space.
            mask = np.ones(exp.action_probs.shape, dtype=np.float32)
        else:
            mask = 1 - np.concatenate(exp.action_mask)
        trajectory_buffer["action_mask"].append(mask, padding_value=1)

        trajectory_buffer["prev_action"].append(exp.prev_action)
        trajectory_buffer["environment_rewards"].append(exp.reward)

        # The next observation becomes the current one for the next step.
        current_split = next_split
    return trajectory_buffer
def test_agent_action_group_from_buffer():
    """Check shapes and padding of AgentAction.group_from_buffer output."""
    buff = AgentBuffer()
    continuous_key = BufferKey.GROUP_CONTINUOUS_ACTION
    discrete_key = BufferKey.GROUP_DISCRETE_ACTION

    # Three steps in which three group members act.
    for _ in range(3):
        buff[continuous_key].append(3 * [np.ones((5,), dtype=np.float32)])
        buff[discrete_key].append(3 * [np.ones((4,), dtype=np.float32)])
    # Two steps in which only one member is left (the others have died).
    for _ in range(2):
        buff[continuous_key].append(1 * [np.ones((5,), dtype=np.float32)])
        buff[discrete_key].append(1 * [np.ones((4,), dtype=np.float32)])

    # One AgentAction per group member, each as long as the AgentBuffer. The
    # assertions below expect zeros for the steps where an agent is gone.
    gact = AgentAction.group_from_buffer(buff)
    continuous_shape = (buff.num_experiences, 5)
    discrete_shape = (buff.num_experiences, 4)

    # Agent 0 is present in every step.
    first_agent = gact[0]
    assert first_agent.continuous_tensor.shape == continuous_shape
    assert first_agent.discrete_tensor.shape == discrete_shape

    # Agent 1 is only present for the first three steps.
    second_agent = gact[1]
    assert second_agent.continuous_tensor.shape == continuous_shape
    assert second_agent.discrete_tensor.shape == discrete_shape
    assert (second_agent.continuous_tensor[0:3] > 0).all()
    assert (second_agent.continuous_tensor[3:] == 0).all()
    assert (second_agent.discrete_tensor[0:3] > 0).all()
    assert (second_agent.discrete_tensor[3:] == 0).all()
def __init__(self, *args, **kwargs):
    """
    Initialise reward tracking, the update buffer and the model saver.

    :raises UnityTrainerException: if the PyTorch framework is selected but
        ``torch_utils.is_available()`` reports it is not available.
    """
    super().__init__(*args, **kwargs)

    # collected_rewards maps a reward-signal name to a dictionary of
    # agent_id -> cumulative reward. It is used for reporting only; the
    # environment reward is always reported to Tensorboard, regardless of
    # which reward signals are actually present.
    self.cumulative_returns_since_policy_update: List[float] = []
    self.collected_rewards: Dict[str, Dict[str, int]] = {
        "environment": defaultdict(lambda: 0)
    }
    self.update_buffer: AgentBuffer = AgentBuffer()

    hyperparameters = self.trainer_settings.as_dict()
    self._stats_reporter.add_property(
        StatsPropertyType.HYPERPARAMETERS, hyperparameters
    )

    self.framework = self.trainer_settings.framework
    # Availability is only checked when PyTorch was actually requested.
    if self.framework == FrameworkType.PYTORCH:
        if not torch_utils.is_available():
            raise UnityTrainerException(
                "To use the experimental PyTorch backend, install the PyTorch Python package first."
            )
    logger.debug(f"Using framework {self.framework.value}")

    self._next_save_step = 0
    self._next_summary_step = 0
    self.model_saver = self.create_model_saver(
        self.framework, self.trainer_settings, self.artifact_path, self.load
    )
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    """
    Create an AgentBuffer holding ``number`` copies of one random experience.

    :param behavior_spec: Supplies the observation shapes and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored with every experience.
    :returns: The populated AgentBuffer.
    """
    result = AgentBuffer()
    shapes = behavior_spec.observation_shapes
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32) for shape in shapes
    ]
    # First row of a single random action; reused for every experience.
    action = behavior_spec.action_spec.random_action(1)[0, :]

    for _ in range(number):
        curr_split = SplitObservations.from_observations(curr_observations)
        next_split = SplitObservations.from_observations(next_observations)
        for i, visual in enumerate(curr_split.visual_observations):
            result["visual_obs%d" % i].append(visual)
            result["next_visual_obs%d" % i].append(
                next_split.visual_observations[i]
            )
        result["vector_obs"].append(curr_split.vector_observations)
        result["next_vector_in"].append(next_split.vector_observations)
        result["actions"].append(action)
        result["reward"].append(np.ones(1, dtype=np.float32) * reward)
        result["masks"].append(np.ones(1, dtype=np.float32))

    # "done" is assigned as a whole array of zeros rather than appended.
    result["done"] = np.zeros(number, dtype=np.float32)
    return result
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    """
    Create an AgentBuffer holding ``number`` copies of one random experience.

    :param behavior_spec: Supplies the observation specs and the action spec.
    :param number: How many experiences to append.
    :param reward: Reward stored with every experience.
    :returns: The populated AgentBuffer.
    """
    result = AgentBuffer()
    observation_specs = behavior_spec.observation_specs
    curr_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in observation_specs
    ]
    next_obs = [
        np.random.normal(size=spec.shape).astype(np.float32)
        for spec in observation_specs
    ]

    # One random action, split into its continuous / discrete parts.
    action_spec = behavior_spec.action_spec
    sampled = action_spec.random_action(1)
    action = {}
    if action_spec.continuous_size > 0:
        action[BufferKey.CONTINUOUS_ACTION] = sampled.continuous
    if action_spec.discrete_size > 0:
        action[BufferKey.DISCRETE_ACTION] = sampled.discrete

    for _ in range(number):
        for i, observation in enumerate(curr_obs):
            result[ObsUtil.get_name_at(i)].append(observation)
        for i, observation in enumerate(next_obs):
            result[ObsUtil.get_name_at_next(i)].append(observation)
        # TODO
        # buffer[AgentBufferKey.ACTIONS].append(action)
        for act_type, act_value in action.items():
            result[act_type].append(act_value[0, :])
        # TODO was "rewards"
        result[BufferKey.ENVIRONMENT_REWARDS].append(
            np.ones(1, dtype=np.float32) * reward
        )
        result[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))

    # DONE is assigned as a whole array of zeros rather than appended.
    result[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
    return result
def to_agentbuffer(self) -> AgentBuffer:
    """
    Converts this Trajectory to an AgentBuffer.

    One entry is written per element of ``self.steps``. The "next" observation
    of a step is the observation of the following step; for the final step it
    is taken from ``self.next_obs``.

    :returns: AgentBuffer holding the trajectory's experiences.
    """
    trajectory_buffer = AgentBuffer()
    final_index = len(self.steps) - 1
    current_obs = self.steps[0].obs

    for step, exp in enumerate(self.steps):
        if step < final_index:
            following_obs = self.steps[step + 1].obs
        else:
            following_obs = self.next_obs

        for i, observation in enumerate(current_obs):
            trajectory_buffer[ObsUtil.get_name_at(i)].append(observation)
            trajectory_buffer[ObsUtil.get_name_at_next(i)].append(
                following_obs[i]
            )

        if exp.memory is not None:
            trajectory_buffer["memory"].append(exp.memory)

        trajectory_buffer["masks"].append(1.0)
        trajectory_buffer["done"].append(exp.done)

        # Adds the log prob and action of continuous/discrete separately
        trajectory_buffer["continuous_action"].append(exp.action.continuous)
        trajectory_buffer["discrete_action"].append(exp.action.discrete)
        trajectory_buffer["continuous_log_probs"].append(
            exp.action_probs.continuous
        )
        trajectory_buffer["discrete_log_probs"].append(exp.action_probs.discrete)

        # Store action masks. Note that 1 means active here, while in
        # AgentExperience False means active, hence the inversion.
        if exp.action_mask is None:
            # This should never be needed unless the environment somehow
            # doesn't supply the action mask in a discrete space.
            mask = np.ones(exp.action.discrete.shape, dtype=np.float32)
        else:
            mask = 1 - np.concatenate(exp.action_mask)
        trajectory_buffer["action_mask"].append(mask, padding_value=1)

        trajectory_buffer["prev_action"].append(exp.prev_action)
        trajectory_buffer["environment_rewards"].append(exp.reward)

        # The next observation becomes the current one for the next step.
        current_obs = following_obs
    return trajectory_buffer
def test_buffer_truncate():
    """Check AgentBuffer.truncate with and without a sequence length."""
    sources = (construct_fake_buffer(1), construct_fake_buffer(2))
    update_buffer = AgentBuffer()

    for source in sources:
        source.resequence_and_append(
            update_buffer, batch_size=None, training_length=2
        )
    # Test non-LSTM
    update_buffer.truncate(2)
    assert update_buffer.num_experiences == 2

    for source in sources:
        source.resequence_and_append(
            update_buffer, batch_size=None, training_length=2
        )
    # Test LSTM, truncate should be some multiple of sequence_length
    update_buffer.truncate(4, sequence_length=3)
    assert update_buffer.num_experiences == 3

    # Truncation must leave every field an AgentBufferField.
    for field in update_buffer.values():
        assert isinstance(field, AgentBufferField)
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    group_spec: AgentGroupSpec,
    sequence_length: int,
) -> AgentBuffer:
    """
    Build a resequenced AgentBuffer from consecutive demonstration pairs.

    Each pair is combined with its successor, so the last pair only supplies
    the "next" reward/done information.

    :param pair_infos: Recorded agent-info/action pairs, in order.
    :param group_spec: Spec passed to batched_step_result_from_proto.
    :param sequence_length: training_length used when resequencing.
    :returns: The processed demonstration buffer.
    """
    raw_buffer = AgentBuffer()
    processed_buffer = AgentBuffer()
    last_usable_idx = len(pair_infos) - 2

    for idx, current_pair_info in enumerate(pair_infos):
        # The final pair has no successor to take reward/done from.
        if idx > last_usable_idx:
            break
        next_pair_info = pair_infos[idx + 1]

        current_step_info = batched_step_result_from_proto(
            [current_pair_info.agent_info], group_spec
        )
        next_step_info = batched_step_result_from_proto(
            [next_pair_info.agent_info], group_spec
        )

        # The first step has no predecessor: use a zeroed copy of its action.
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        else:
            previous_action = (
                np.array(
                    current_pair_info.action_info.vector_actions, dtype=np.float32
                )
                * 0
            )

        # Each batched result is reduced to the step of its first agent.
        current_agent_step = current_step_info.get_agent_step_result(
            current_step_info.agent_id[0]
        )
        next_agent_step = next_step_info.get_agent_step_result(
            next_step_info.agent_id[0]
        )

        raw_buffer["done"].append(next_agent_step.done)
        raw_buffer["rewards"].append(next_agent_step.reward)
        split_obs = SplitObservations.from_observations(current_agent_step.obs)
        for i, visual_obs in enumerate(split_obs.visual_observations):
            raw_buffer["visual_obs%d" % i].append(visual_obs)
        raw_buffer["vector_obs"].append(split_obs.vector_observations)
        raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        raw_buffer["prev_action"].append(previous_action)

        # NOTE(review): this tests the batched `done`, while the per-agent
        # `done` is what gets stored above -- confirm they always agree.
        if next_step_info.done:
            raw_buffer.resequence_and_append(
                processed_buffer, batch_size=None, training_length=sequence_length
            )
            raw_buffer.reset_agent()

    # Flush whatever is left after the last pair.
    raw_buffer.resequence_and_append(
        processed_buffer, batch_size=None, training_length=sequence_length
    )
    return processed_buffer
def test_clear_update_buffer():
    """After clear_update_buffer, every field of the update buffer is empty."""
    trainer = create_rl_trainer()
    trainer.processing_buffer = construct_fake_processing_buffer()
    trainer.update_buffer = AgentBuffer()

    # Fill the update buffer so that clearing has something to remove.
    trainer.processing_buffer.append_to_update_buffer(
        trainer.update_buffer, 2, batch_size=None, training_length=2
    )
    trainer.clear_update_buffer()

    for field in trainer.update_buffer.values():
        assert len(field) == 0
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    """
    Build a resequenced AgentBuffer from consecutive demonstration pairs.

    Each pair is combined with its successor, so the last pair only supplies
    the "next" reward/done information.

    :param pair_infos: Recorded agent-info/action pairs, in order.
    :param brain_params: Brain parameters used to decode each pair.
    :param sequence_length: training_length used when resequencing.
    :returns: The processed demonstration buffer.
    """
    raw_buffer = AgentBuffer()
    processed_buffer = AgentBuffer()
    last_usable_idx = len(pair_infos) - 2

    for idx, current_pair_info in enumerate(pair_infos):
        # The final pair has no successor to take reward/done from.
        if idx > last_usable_idx:
            break
        next_pair_info = pair_infos[idx + 1]

        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params
        )
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params
        )

        # The first step has no predecessor: use a zeroed copy of its action.
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        else:
            previous_action = (
                np.array(
                    current_pair_info.action_info.vector_actions, dtype=np.float32
                )
                * 0
            )

        raw_buffer["done"].append(next_brain_info.local_done[0])
        raw_buffer["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            raw_buffer["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            raw_buffer["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        raw_buffer["prev_action"].append(previous_action)

        # Flush the raw buffer whenever the next step is marked done.
        if next_brain_info.local_done[0]:
            raw_buffer.resequence_and_append(
                processed_buffer, batch_size=None, training_length=sequence_length
            )
            raw_buffer.reset_agent()

    # Flush whatever is left after the last pair.
    raw_buffer.resequence_and_append(
        processed_buffer, batch_size=None, training_length=sequence_length
    )
    return processed_buffer
def __init__(self, *args, **kwargs):
    """Initialise reward tracking and the update buffer, and report hyperparameters."""
    super(RLTrainer, self).__init__(*args, **kwargs)

    # collected_rewards maps a reward-signal name to a dictionary of
    # agent_id -> cumulative reward. It is used for reporting only; the
    # environment reward is always reported to Tensorboard, regardless of
    # which reward signals are actually present.
    self.cumulative_returns_since_policy_update: List[float] = []
    self.collected_rewards: Dict[str, Dict[str, int]] = {
        "environment": defaultdict(lambda: 0)
    }
    self.update_buffer: AgentBuffer = AgentBuffer()

    hyperparameters = self.trainer_settings.as_dict()
    self._stats_reporter.add_property(
        StatsPropertyType.HYPERPARAMETERS, hyperparameters
    )
def to_agentbuffer(self) -> AgentBuffer:
    """
    Converts this Trajectory to an AgentBuffer.

    One entry is written per element of ``self.steps``. The "next" observation
    of a step is the observation of the following step; for the final step it
    is taken from ``self.next_obs``.

    :returns: AgentBuffer holding the trajectory's experiences.
    """
    trajectory_buffer = AgentBuffer()
    final_index = len(self.steps) - 1
    current_split = SplitObservations.from_observations(self.steps[0].obs)

    for step, exp in enumerate(self.steps):
        if step < final_index:
            following_obs = self.steps[step + 1].obs
        else:
            following_obs = self.next_obs
        next_split = SplitObservations.from_observations(following_obs)

        for i, visual in enumerate(current_split.visual_observations):
            trajectory_buffer["visual_obs%d" % i].append(visual)
            trajectory_buffer["next_visual_obs%d" % i].append(
                next_split.visual_observations[i]
            )
        trajectory_buffer["vector_obs"].append(current_split.vector_observations)
        trajectory_buffer["next_vector_in"].append(next_split.vector_observations)

        if exp.memory is not None:
            trajectory_buffer["memory"].append(exp.memory)

        trajectory_buffer["masks"].append(1.0)
        trajectory_buffer["done"].append(exp.done)

        # Add the outputs of the last eval
        if exp.action_pre is not None:
            trajectory_buffer["actions_pre"].append(exp.action_pre)
        trajectory_buffer["actions"].append(exp.action)
        trajectory_buffer["action_probs"].append(exp.action_probs)

        # Store action masks if necessary. Eventually these will be
        # None for continuous actions
        if exp.action_mask is not None:
            trajectory_buffer["action_mask"].append(
                exp.action_mask, padding_value=1
            )

        trajectory_buffer["prev_action"].append(exp.prev_action)
        trajectory_buffer["environment_rewards"].append(exp.reward)

        # The next observation becomes the current one for the next step.
        current_split = next_split
    return trajectory_buffer
def test_buffer_truncate():
    """Check AgentBuffer.truncate with and without a sequence length."""
    processing_buffer = construct_fake_processing_buffer()
    update_buffer = AgentBuffer()
    agent_ids = (3, 2)

    for agent_id in agent_ids:
        processing_buffer.append_to_update_buffer(
            update_buffer, agent_id, batch_size=None, training_length=2
        )
    # Test non-LSTM
    update_buffer.truncate(2)
    assert update_buffer.num_experiences == 2

    for agent_id in agent_ids:
        processing_buffer.append_to_update_buffer(
            update_buffer, agent_id, batch_size=None, training_length=2
        )
    # Test LSTM, truncate should be some multiple of sequence_length
    update_buffer.truncate(4, sequence_length=3)
    assert update_buffer.num_experiences == 3
def construct_fake_buffer(fake_agent_id):
    """
    Build an AgentBuffer with nine steps of deterministic fake data.

    Values are ``100 * fake_agent_id + 10 * step + k``: k = 1..3 for the
    observation and k = 4..5 for the action.

    :param fake_agent_id: Number used to offset the generated values.
    :returns: The populated AgentBuffer.
    """
    buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        buffer["vector_observation"].append([base + 1, base + 2, base + 3])
        buffer["action"].append([base + 4, base + 5])
    return buffer
def construct_fake_buffer(fake_agent_id):
    """
    Build an AgentBuffer with nine steps of deterministic fake data.

    Values are ``100 * fake_agent_id + 10 * step + k``: k = 1..3 for the
    observation and k = 4..5 for the continuous action.

    :param fake_agent_id: Number used to offset the generated values.
    :returns: The populated AgentBuffer.
    """
    buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        buffer[ObsUtil.get_name_at(0)].append([base + 1, base + 2, base + 3])
        buffer[BufferKey.CONTINUOUS_ACTION].append([base + 4, base + 5])
    return buffer
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
    """
    Evaluates the reward for the data present in mini_batch.

    This implementation yields all-zero rewards: one zero per entry of the
    first field found in mini_batch.

    :param mini_batch: Buffer (mapping of field name to data) drawn from the
        update buffer. Must contain at least one field.
    :return: a RewardSignalResult of (scaled intrinsic reward, unscaled
        intrinsic reward), the scaled one multiplied by ``self.strength``.
    """
    # Every field is taken to have the batch length, so the first one is used.
    first_field = next(iter(mini_batch.values()))
    batch_len = len(first_field)

    scaled_reward = self.strength * np.zeros(batch_len, dtype=np.float32)
    unscaled_reward = np.zeros(batch_len, dtype=np.float32)
    return RewardSignalResult(scaled_reward, unscaled_reward)
def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    """
    Build a fake update buffer from a sequence of brain infos.

    All experiences are stored under agent id 0. Each brain info is combined
    with its successor, so the last one only supplies reward/done.

    :param brain_infos: Brain infos, in order.
    :param brain_params: Describes the observation and action spaces.
    :param sequence_length: training_length used when filling the update buffer.
    :param memory_size: Length of the all-ones fake memory vector.
    :returns: The populated update buffer.
    """
    buffer = ProcessingBuffer()
    update_buffer = AgentBuffer()
    last_usable_idx = len(brain_infos) - 2

    # Make a buffer
    for idx, current_brain_info in enumerate(brain_infos):
        # The final brain info has no successor to take reward/done from.
        if idx > last_usable_idx:
            break
        next_brain_info = brain_infos[idx + 1]

        buffer[0].last_brain_info = current_brain_info
        buffer[0]["done"].append(next_brain_info.local_done[0])
        buffer[0]["rewards"].append(next_brain_info.rewards[0])

        # The "next" observations reuse the current ones in this fake data.
        for i in range(brain_params.number_visual_observations):
            visual = current_brain_info.visual_observations[i][0]
            buffer[0]["visual_obs%d" % i].append(visual)
            buffer[0]["next_visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            buffer[0]["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
            buffer[0]["next_vector_in"].append(
                current_brain_info.vector_observations[0]
            )

        # Continuous: size is the first entry; otherwise the number of entries.
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = brain_params.vector_action_space_size[0]
        else:
            fake_action_size = len(brain_params.vector_action_space_size)
        buffer[0]["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer[0]["prev_action"].append(
            np.zeros(fake_action_size, dtype=np.float32)
        )
        buffer[0]["masks"].append(1.0)
        buffer[0]["advantages"].append(1.0)

        if brain_params.vector_action_space_type == "discrete":
            action_probs = np.ones(
                sum(brain_params.vector_action_space_size), dtype=np.float32
            )
        else:
            action_probs = np.ones(
                buffer[0]["actions"][0].shape, dtype=np.float32
            )
        buffer[0]["action_probs"].append(action_probs)

        buffer[0]["actions_pre"].append(
            np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
        )
        buffer[0]["action_mask"].append(
            np.ones(
                np.sum(brain_params.vector_action_space_size), dtype=np.float32
            )
        )
        buffer[0]["memory"].append(np.ones(memory_size, dtype=np.float32))

    buffer.append_to_update_buffer(
        update_buffer, 0, batch_size=None, training_length=sequence_length
    )
    return update_buffer
def test_num_experiences():
    """num_experiences tracks the field length before and after appends."""
    sources = (construct_fake_buffer(1), construct_fake_buffer(2))
    update_buffer = AgentBuffer()
    action_key = BufferKey.CONTINUOUS_ACTION

    # A fresh buffer is empty.
    assert len(update_buffer[action_key]) == 0
    assert update_buffer.num_experiences == 0

    for source in sources:
        source.resequence_and_append(
            update_buffer, batch_size=None, training_length=2
        )

    assert len(update_buffer[action_key]) == 20
    assert update_buffer.num_experiences == 20