def add_experiences(
    self,
    decision_steps: DecisionSteps,
    terminal_steps: TerminalSteps,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.
    :param decision_steps: current DecisionSteps.
    :param terminal_steps: current TerminalSteps.
    :param worker_id: the worker id of the environment the steps came from.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self._stats_reporter.add_stat("Policy/Entropy", _entropy)

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        if global_id in self._last_step_result:  # Don't store if agent just reset
            self._last_take_action_outputs[global_id] = take_action_outputs

    # Iterate over all the terminal steps, first gather all the group obs
    # and then create the AgentExperiences/Trajectories. _add_group_status_and_obs
    # stores group statuses in a common data structure, self._group_status.
    for terminal_step in terminal_steps.values():
        self._add_group_status_and_obs(terminal_step, worker_id)
    for terminal_step in terminal_steps.values():
        local_id = terminal_step.agent_id
        global_id = get_global_agent_id(worker_id, local_id)
        self._process_step(
            terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id]
        )
        # Clear the last seen group obs when agents die.
        self._clear_group_status_and_obs(global_id)

    # Iterate over all the decision steps, first gather all the group obs
    # and then create the trajectories. _add_group_status_and_obs stores
    # group statuses in a common data structure, self._group_status.
    for ongoing_step in decision_steps.values():
        self._add_group_status_and_obs(ongoing_step, worker_id)
    for ongoing_step in decision_steps.values():
        local_id = ongoing_step.agent_id
        self._process_step(
            ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id]
        )

    for _gid in action_global_agent_ids:
        # If the ID doesn't have a last step result, the agent just reset,
        # don't store the action.
        if _gid in self._last_step_result:
            if "action" in take_action_outputs:
                self.policy.save_previous_action(
                    [_gid], take_action_outputs["action"]
                )
def add_experiences(
    self,
    decision_steps: DecisionSteps,
    terminal_steps: TerminalSteps,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.
    :param decision_steps: current DecisionSteps.
    :param terminal_steps: current TerminalSteps.
    :param worker_id: the worker id of the environment the steps came from.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        if global_id in self.last_step_result:  # Don't store if agent just reset
            self.last_take_action_outputs[global_id] = take_action_outputs

    # Iterate over all the terminal steps
    for terminal_step in terminal_steps.values():
        local_id = terminal_step.agent_id
        global_id = get_global_agent_id(worker_id, local_id)
        self._process_step(
            terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
        )
    # Iterate over all the decision steps
    for ongoing_step in decision_steps.values():
        local_id = ongoing_step.agent_id
        global_id = get_global_agent_id(worker_id, local_id)
        self._process_step(
            ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
        )

    for _gid in action_global_agent_ids:
        # If the ID doesn't have a last step result, the agent just reset,
        # don't store the action.
        if _gid in self.last_step_result:
            if "action" in take_action_outputs:
                self.policy.save_previous_action(
                    [_gid], take_action_outputs["action"]
                )
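# Both versions of add_experiences above key their per-agent dicts by a globally
# unique id built from (worker_id, local agent_id), so that agents sharing a
# local id in different parallel workers never collide. A minimal sketch of such
# a helper; the exact string format of the real get_global_agent_id may differ:
def get_global_agent_id_sketch(worker_id: int, agent_id: int) -> str:
    # Combine the worker id and the local agent id into a single dict key.
    return f"{worker_id}-{agent_id}"


# e.g. agent 0 in worker 2 -> "2-0", distinct from agent 0 in worker 3 -> "3-0"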
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of behavior names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the DecisionSteps came from. Used to construct a
        globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    return ActionInfo(
        action=run_out.get("action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of behavior names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the DecisionSteps came from.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    self.check_nan_action(run_out.get("action"))
    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=list(decision_requests.agent_id),
    )
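# This newer get_action additionally validates the sampled action before
# returning it. A minimal sketch of what such a NaN guard could look like;
# this is an assumption, not necessarily the repo's check_nan_action:
import numpy as np


def check_nan_action_sketch(action) -> None:
    # An ActionTuple carries a continuous and a discrete array; a NaN in
    # either would silently corrupt the environment step.
    if action is None:
        return
    if np.isnan(np.sum(action.continuous)):
        raise RuntimeError("Continuous NaN action detected.")
    if np.isnan(np.sum(action.discrete)):
        raise RuntimeError("Discrete NaN action detected.")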
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )

    # Call end_episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
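# The assertions above pin down end_episode's contract: every tracked agent has
# its previous action removed from the policy, and all per-agent state is
# flushed. A minimal sketch of an implementation satisfying this test; the
# attribute and helper names mirror the test, the body itself is an assumption:
def end_episode_sketch(processor) -> None:
    # Remove each tracked agent's saved previous action from the policy...
    for global_id in list(processor.experience_buffers.keys()):
        processor.policy.remove_previous_action([global_id])
    # ...then drop all per-agent bookkeeping.
    processor.experience_buffers.clear()
    processor.last_take_action_outputs.clear()
    processor.episode_steps.clear()
    processor.episode_rewards.clear()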
def get_action(
    self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observations information, and takes them in environment.
    :param decision_requests: A dictionary of behavior names and DecisionSteps from environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the DecisionSteps came from. Used to construct a
        globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if len(decision_requests) == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in decision_requests.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        decision_requests, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))

    # For compatibility with buffer changes for hybrid action support
    if "log_probs" in run_out:
        log_probs_tuple = LogProbsTuple()
        if self.behavior_spec.action_spec.is_continuous():
            log_probs_tuple.add_continuous(run_out["log_probs"])
        else:
            log_probs_tuple.add_discrete(run_out["log_probs"])
        run_out["log_probs"] = log_probs_tuple
    if "action" in run_out:
        action_tuple = ActionTuple()
        env_action_tuple = ActionTuple()
        if self.behavior_spec.action_spec.is_continuous():
            action_tuple.add_continuous(run_out["pre_action"])
            env_action_tuple.add_continuous(run_out["action"])
        else:
            action_tuple.add_discrete(run_out["action"])
            env_action_tuple.add_discrete(run_out["action"])
        run_out["action"] = action_tuple
        run_out["env_action"] = env_action_tuple

    self.check_nan_action(run_out.get("action"))
    return ActionInfo(
        action=run_out.get("action"),
        env_action=run_out.get("env_action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=decision_requests.agent_id,
    )
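# How get_action and add_experiences fit together in a stepping loop. This is a
# sketch under assumptions: env is an mlagents_envs-style environment exposing
# get_steps/set_actions/step, and processor is the AgentProcessor shown above.
# The ActionInfo returned for step t is fed back as previous_action at t+1.
def stepping_loop_sketch(env, behavior_name, policy, processor, worker_id=0, num_steps=100):
    previous_action = ActionInfo.empty()
    for _ in range(num_steps):
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        # Record the consequences of the previous action first...
        processor.add_experiences(decision_steps, terminal_steps, worker_id, previous_action)
        # ...then decide the next action for the agents requesting one.
        previous_action = policy.get_action(decision_steps, worker_id)
        if len(decision_steps) > 0:
            env.set_actions(behavior_name, previous_action.env_action)
        env.step()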
def _add_group_status_and_obs(
    self, step: Union[TerminalStep, DecisionStep], worker_id: int
) -> None:
    """
    Takes a TerminalStep or DecisionStep and adds the information in it to
    self._group_status. This information can then be retrieved when constructing
    trajectories to get the status of group mates. Also stores the current
    observation into self._current_group_obs, to be used to get the next group
    observations for bootstrapping.
    :param step: TerminalStep or DecisionStep
    :param worker_id: Worker ID of this particular environment. Used to generate a
        global group id.
    """
    global_agent_id = get_global_agent_id(worker_id, step.agent_id)
    stored_decision_step, idx = self._last_step_result.get(
        global_agent_id, (None, None)
    )
    stored_take_action_outputs = self._last_take_action_outputs.get(
        global_agent_id, None
    )
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        # 0, the default group_id, means that the agent doesn't belong to an agent
        # group. If 0, don't add any groupmate information.
        if step.group_id > 0:
            global_group_id = get_global_group_id(worker_id, step.group_id)
            stored_actions = stored_take_action_outputs["action"]
            action_tuple = ActionTuple(
                continuous=stored_actions.continuous[idx],
                discrete=stored_actions.discrete[idx],
            )
            group_status = AgentStatus(
                obs=stored_decision_step.obs,
                reward=step.reward,
                action=action_tuple,
                done=isinstance(step, TerminalStep),
            )
            self._group_status[global_group_id][global_agent_id] = group_status
            self._current_group_obs[global_group_id][global_agent_id] = step.obs
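# _add_group_status_and_obs writes into two nested mappings keyed first by
# global group id, then by global agent id. A plausible initialization,
# inferred from how the method indexes them (an assumption, not the repo's
# constructor code):
from collections import defaultdict

_group_status = defaultdict(dict)       # global_group_id -> {global_agent_id: AgentStatus}
_current_group_obs = defaultdict(dict)  # global_group_id -> {global_agent_id: obs list}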
def _process_step(
    self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
) -> None:
    terminated = isinstance(step, TerminalStep)
    global_agent_id = get_global_agent_id(worker_id, step.agent_id)
    global_group_id = get_global_group_id(worker_id, step.group_id)
    stored_decision_step, idx = self._last_step_result.get(
        global_agent_id, (None, None)
    )
    stored_take_action_outputs = self._last_take_action_outputs.get(
        global_agent_id, None
    )
    if not terminated:
        # Index is needed to grab from last_take_action_outputs
        self._last_step_result[global_agent_id] = (step, index)

    # This state is the consequence of a past action
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        obs = stored_decision_step.obs
        if self.policy.use_recurrent:
            memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
        else:
            memory = None
        done = terminated  # Since this is an ongoing step
        interrupted = step.interrupted if terminated else False
        # Add the outputs of the last eval
        stored_actions = stored_take_action_outputs["action"]
        action_tuple = ActionTuple(
            continuous=stored_actions.continuous[idx],
            discrete=stored_actions.discrete[idx],
        )
        stored_action_probs = stored_take_action_outputs["log_probs"]
        log_probs_tuple = LogProbsTuple(
            continuous=stored_action_probs.continuous[idx],
            discrete=stored_action_probs.discrete[idx],
        )
        action_mask = stored_decision_step.action_mask
        prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

        # Assemble teammate_obs. If none saved, then it will be an empty list.
        group_statuses = []
        for _id, _mate_status in self._group_status[global_group_id].items():
            if _id != global_agent_id:
                group_statuses.append(_mate_status)

        experience = AgentExperience(
            obs=obs,
            reward=step.reward,
            done=done,
            action=action_tuple,
            action_probs=log_probs_tuple,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=interrupted,
            memory=memory,
            group_status=group_statuses,
            group_reward=step.group_reward,
        )
        # Add the value outputs if needed
        self._experience_buffers[global_agent_id].append(experience)
        self._episode_rewards[global_agent_id] += step.reward
        if not terminated:
            self._episode_steps[global_agent_id] += 1

        # Add a trajectory segment to the buffer if terminal or the length has
        # reached the time horizon
        if (
            len(self._experience_buffers[global_agent_id])
            >= self._max_trajectory_length
            or terminated
        ):
            next_obs = step.obs
            next_group_obs = []
            for _id, _obs in self._current_group_obs[global_group_id].items():
                if _id != global_agent_id:
                    next_group_obs.append(_obs)

            trajectory = Trajectory(
                steps=self._experience_buffers[global_agent_id],
                agent_id=global_agent_id,
                next_obs=next_obs,
                next_group_obs=next_group_obs,
                behavior_id=self._behavior_id,
            )
            for traj_queue in self._trajectory_queues:
                traj_queue.put(trajectory)
            self._experience_buffers[global_agent_id] = []
    if terminated:
        # Record episode length.
        self._stats_reporter.add_stat(
            "Environment/Episode Length",
            self._episode_steps.get(global_agent_id, 0),
        )
        self._clean_agent_data(global_agent_id)
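# _process_step emits a Trajectory onto every published queue either when the
# episode terminates or when the buffer reaches max_trajectory_length, so long
# episodes are split into fixed-length segments. A sketch of a consumer on the
# trainer side, assuming the queue exposes a non-blocking get; the exact queue
# API and trainer hook are assumptions:
def drain_trajectory_queue_sketch(trajectory_queue, process_trajectory) -> None:
    while True:
        try:
            trajectory = trajectory_queue.get_nowait()
        except Exception:  # the real queue raises its own Empty-style exception
            return
        process_trajectory(trajectory)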
def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1, observation_shapes=[(8,)], action_shape=2
    )
    mock_done_decision_step, mock_done_terminal_step = mb.create_mock_steps(
        num_agents=1, observation_shapes=[(8,)], action_shape=2, done=True
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(
            mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
        )
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0

    # Check that steps with immediate dones don't add to dicts
    processor.add_experiences(
        mock_done_decision_step, mock_done_terminal_step, 0, ActionInfo.empty()
    )
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0