def evaluate(
    self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, Any]:
    """
    Evaluates policy for the agent experiences provided.
    :param batched_step_result: BatchedStepResult object containing inputs.
    :param global_agent_ids: The global (with worker ID) agent ids of the data in
        the batched_step_result.
    :return: Outputs from network as defined by self.inference_dict.
    """
    feed_dict = {
        self.model.batch_size: batched_step_result.n_agents(),
        self.model.sequence_length: 1,
    }
    epsilon = None
    if self.use_recurrent:
        if not self.use_continuous_act:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                global_agent_ids
            )
        feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
    if self.use_continuous_act:
        epsilon = np.random.normal(
            size=(batched_step_result.n_agents(), self.model.act_size[0])
        )
        feed_dict[self.model.epsilon] = epsilon
    feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
    run_out = self._execute_model(feed_dict, self.inference_dict)
    return run_out
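# Illustrative-only numpy sketch of the role of `epsilon` above: for continuous
# actions, one Gaussian noise sample per agent and per action dimension is drawn
# outside the graph and fed in through the feed dict. The model graph is assumed to
# combine it with the policy's mean and standard deviation (roughly
# action = mu + sigma * epsilon); mu and sigma below are hypothetical placeholders,
# not the real network outputs. The shapes mirror the feed: (n_agents, act_size[0]).
import numpy as np

n_agents, act_size = 3, 2
epsilon = np.random.normal(size=(n_agents, act_size))
mu = np.zeros((n_agents, act_size))     # hypothetical policy mean
sigma = np.ones((n_agents, act_size))   # hypothetical policy standard deviation
sampled_action = mu + sigma * epsilon   # reparameterized Gaussian sample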
def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
    n_extra_agents = step_result.n_agents() - self._n_agents
    if n_extra_agents < 0:
        # In this case, some Agents did not request a decision when expected
        raise UnityGymException(
            "The number of agents in the scene does not match the expected number."
        )
    if step_result.n_agents() - sum(step_result.done) != self._n_agents:
        raise UnityGymException(
            "The number of agents in the scene does not match the expected number."
        )
    for index, agent_id in enumerate(step_result.agent_id):
        if step_result.done[index]:
            self.agent_mapper.mark_agent_done(agent_id, step_result.reward[index])

    # Set the new AgentDone flags to True
    # Note that the corresponding agent_id that gets marked done will be different
    # than the original agent that was done, but this is OK since the gym interface
    # only cares about the ordering.
    for index, agent_id in enumerate(step_result.agent_id):
        if not self._previous_step_result.contains_agent(agent_id):
            if step_result.done[index]:
                # If the Agent is already done (e.g. it ended its episode twice in one step)
                # don't try to register it here.
                continue
            # Register this agent, and get the reward of the previous agent that
            # was in its index, so that we can return it to the gym.
            last_reward = self.agent_mapper.register_new_agent_id(agent_id)
            step_result.done[index] = True
            step_result.reward[index] = last_reward

    self._previous_step_result = step_result  # store the new original

    # Get a permutation of the agent IDs so that a given ID stays in the same
    # index as where it was first seen.
    new_id_order = self.agent_mapper.get_id_permutation(list(step_result.agent_id))

    _mask: Optional[List[np.array]] = None
    if step_result.action_mask is not None:
        _mask = []
        for mask_index in range(len(step_result.action_mask)):
            _mask.append(step_result.action_mask[mask_index][new_id_order])
    new_obs: List[np.array] = []
    for obs_index in range(len(step_result.obs)):
        new_obs.append(step_result.obs[obs_index][new_id_order])
    return BatchedStepResult(
        obs=new_obs,
        reward=step_result.reward[new_id_order],
        done=step_result.done[new_id_order],
        max_step=step_result.max_step[new_id_order],
        agent_id=step_result.agent_id[new_id_order],
        action_mask=_mask,
    )
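# Minimal numpy sketch (made-up values) of the reindexing step above: once
# `new_id_order` is known, every per-agent array is indexed with the same order, so
# rewards, done flags, observations and ids stay aligned and each agent id keeps the
# index where it was first seen. The permutation itself comes from
# `agent_mapper.get_id_permutation`; the order below is a hypothetical example.
import numpy as np

agent_id = np.array([11, 7, 42])
reward = np.array([0.5, -1.0, 2.0])
new_id_order = [1, 0, 2]  # hypothetical: agent 7 was first seen at index 0
print(agent_id[new_id_order])  # [ 7 11 42]
print(reward[new_id_order])    # [-1.   0.5  2. ]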
def get_action(
    self, batched_step_result: BatchedStepResult, worker_id: int = 0
) -> ActionInfo:
    """
    Decides actions given observation information, and takes them in the environment.
    :param batched_step_result: A BatchedStepResult object from the environment.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the BatchedStepResult came from. Used to construct a
        globally unique id for each agent.
    :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
    """
    if batched_step_result.n_agents() == 0:
        return ActionInfo.empty()

    global_agent_ids = [
        get_global_agent_id(worker_id, int(agent_id))
        for agent_id in batched_step_result.agent_id
    ]  # For 1-D array, the iterator order is correct.

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        batched_step_result, global_agent_ids
    )

    self.save_memories(global_agent_ids, run_out.get("memory_out"))
    return ActionInfo(
        action=run_out.get("action"),
        value=run_out.get("value"),
        outputs=run_out,
        agent_ids=batched_step_result.agent_id,
    )
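# For illustration only: a hedged sketch of how `get_global_agent_id` might compose a
# globally unique id from the worker id and the per-environment agent id. The exact
# string format used by the real helper is not shown in this snippet, so treat the
# layout below as an assumption.
def example_global_agent_id(worker_id: int, agent_id: int) -> str:
    # e.g. worker 2, agent 7 -> "$2-7"
    return f"${worker_id}-{agent_id}"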
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image"
            )
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate(
            [step_result.obs[i] for i in vec_obs_indices], axis=1
        )
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)), dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones(
            (n_agents, np.sum(group_spec.discrete_action_branches)), dtype=np.float32
        )
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
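# Minimal numpy sketch (illustrative values) of the discrete-action mask conversion
# above. The BatchedStepResult masks are assumed to use True for "action not
# available", one array per action branch, while BrainInfo expects 1.0 for "action
# available"; concatenating the branches along axis 1 and taking `1 - ...` performs
# that conversion.
import numpy as np

branch_masks = [
    np.array([[False, True], [False, False]]),               # branch 0: 2 agents x 2 actions
    np.array([[True, False, False], [False, False, True]]),  # branch 1: 2 agents x 3 actions
]
brain_info_mask = 1 - np.concatenate(branch_masks, axis=1).astype(np.float32)
# -> shape (2, 5); 1.0 where the action is available, 0.0 where it was masked out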
def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
    n_extra_agents = step_result.n_agents() - self._n_agents
    if n_extra_agents < 0 or n_extra_agents > self._n_agents:
        # In this case, some Agents did not request a decision when expected
        # or too many requested a decision
        raise UnityGymException(
            "The number of agents in the scene does not match the expected number."
        )

    # remove the done Agents
    indices_to_keep: List[int] = []
    for index, is_done in enumerate(step_result.done):
        if not is_done:
            indices_to_keep.append(index)

    # Set the new AgentDone flags to True
    # Note that the corresponding agent_id that gets marked done will be different
    # than the original agent that was done, but this is OK since the gym interface
    # only cares about the ordering.
    for index, agent_id in enumerate(step_result.agent_id):
        if not self._previous_step_result.contains_agent(agent_id):
            step_result.done[index] = True
        if agent_id in self._done_agents:
            step_result.done[index] = True
    self._done_agents = set()

    self._previous_step_result = step_result  # store the new original

    _mask: Optional[List[np.array]] = None
    if step_result.action_mask is not None:
        _mask = []
        for mask_index in range(len(step_result.action_mask)):
            _mask.append(step_result.action_mask[mask_index][indices_to_keep])
    new_obs: List[np.array] = []
    for obs_index in range(len(step_result.obs)):
        new_obs.append(step_result.obs[obs_index][indices_to_keep])
    return BatchedStepResult(
        obs=new_obs,
        reward=step_result.reward[indices_to_keep],
        done=step_result.done[indices_to_keep],
        max_step=step_result.max_step[indices_to_keep],
        agent_id=step_result.agent_id[indices_to_keep],
        action_mask=_mask,
    )
def evaluate(
    self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, np.ndarray]:
    """
    Evaluates policy for the agent experiences provided.
    :param batched_step_result: BatchedStepResult object containing inputs.
    :param global_agent_ids: The global (with worker ID) agent ids of the data in
        the batched_step_result.
    :return: Outputs from network as defined by self.inference_dict.
    """
    feed_dict = {
        self.model.batch_size: batched_step_result.n_agents(),
        self.model.sequence_length: 1,
    }
    if self.use_recurrent:
        if not self.use_continuous_act:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                global_agent_ids
            )
        feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
    feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
    run_out = self._execute_model(feed_dict, self.inference_dict)
    return run_out