def evaluate(
    self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, Any]:
    """
    Runs a single inference pass of the policy over a batch of agents.
    :param batched_step_result: BatchedStepResult object containing inputs.
    :param global_agent_ids: The global (with worker ID) agent ids of the data
        in the batched_step_result.
    :return: Outputs from network as defined by self.inference_dict.
    """
    agent_count = batched_step_result.n_agents()
    model = self.model
    feed_dict = {model.batch_size: agent_count, model.sequence_length: 1}

    if self.use_recurrent:
        # Only discrete policies feed the previous action back in.
        if not self.use_continuous_act:
            feed_dict[model.prev_action] = self.retrieve_previous_action(
                global_agent_ids
            )
        feed_dict[model.memory_in] = self.retrieve_memories(global_agent_ids)

    if self.use_continuous_act:
        # One standard-normal noise row per agent, act_size[0] columns wide.
        feed_dict[model.epsilon] = np.random.normal(
            size=(agent_count, model.act_size[0])
        )

    feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
    return self._execute_model(feed_dict, self.inference_dict)
def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
    """
    Validate the agent count, propagate done flags to replacement agents and
    return the step re-ordered by the agent mapper's permutation.
    :param step_result: The raw step; its done and reward arrays are modified
        in place.
    :return: A new BatchedStepResult indexed by the permutation from
        self.agent_mapper.get_id_permutation.
    :raises UnityGymException: If fewer agents than expected are present, or
        if the number of not-done agents differs from self._n_agents.
    """
    agent_total = step_result.n_agents()
    if agent_total < self._n_agents:
        # Some Agents did not request a decision when expected.
        raise UnityGymException(
            "The number of agents in the scene does not match the expected number."
        )
    if agent_total - sum(step_result.done) != self._n_agents:
        raise UnityGymException(
            "The number of agents in the scene does not match the expected number."
        )

    # First pass: report every finished agent (with its reward) to the mapper.
    for slot, agent_id in enumerate(step_result.agent_id):
        if not step_result.done[slot]:
            continue
        self.agent_mapper.mark_agent_done(agent_id, step_result.reward[slot])

    # Second pass: an id unseen in the previous step is a replacement agent.
    # It is flagged done and given the reward returned by the mapper. The id
    # flagged done differs from the one that actually finished, which is fine
    # because the gym interface only cares about ordering.
    for slot, agent_id in enumerate(step_result.agent_id):
        if self._previous_step_result.contains_agent(agent_id):
            continue
        if step_result.done[slot]:
            # Already done (e.g. ended its episode twice in one step):
            # do not register it.
            continue
        last_reward = self.agent_mapper.register_new_agent_id(agent_id)
        step_result.done[slot] = True
        step_result.reward[slot] = last_reward

    self._previous_step_result = step_result  # store the new original

    # Permutation that keeps each id at the index where it was first seen.
    new_id_order = self.agent_mapper.get_id_permutation(list(step_result.agent_id))

    _mask: Optional[List[np.array]] = None
    if step_result.action_mask is not None:
        _mask = [branch[new_id_order] for branch in step_result.action_mask]
    new_obs: List[np.array] = [obs[new_id_order] for obs in step_result.obs]

    return BatchedStepResult(
        obs=new_obs,
        reward=step_result.reward[new_id_order],
        done=step_result.done[new_id_order],
        max_step=step_result.max_step[new_id_order],
        agent_id=step_result.agent_id[new_id_order],
        action_mask=_mask,
    )
def step(self) -> None:
    """
    Advance every named agent by one step using the actions in self.action.

    For each name the position is moved by the clamped action delta, the
    reward is computed, and a one-agent BatchedStepResult is stored in
    self.step_result[name]. An agent whose position reaches -1 or 1 is done
    and is reset after its observation has been built.
    """
    assert all(action is not None for action in self.action.values())

    for name in self.names:
        if self.discrete:
            act = self.action[name][0][0]
            delta = 1 if act else -1
        else:
            delta = self.action[name][0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position[name] += delta
        self.position[name] = clamp(self.position[name], -1, 1)
        self.step_count[name] += 1
        done = self.position[name] >= 1.0 or self.position[name] <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position[name] * self.goal[name]
        else:
            reward = -TIME_PENALTY
        self.rewards[name] += reward

        # Built before any reset so the observation reflects the current goal.
        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal[name]]
        m_reward = np.array([reward], dtype=np.float32)
        # Builtin bool: the np.bool alias was removed in NumPy 1.24.
        m_done = np.array([done], dtype=bool)
        m_agent_id = np.array([0], dtype=np.int32)
        action_mask = self._generate_mask()

        if done:
            self._reset_agent(name)

        # NOTE(review): the done array is also passed as max_step - confirm
        # that reporting max_step == done is intended.
        self.step_result[name] = BatchedStepResult(
            m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
        )
def step(self) -> None:
    """
    Advance the single agent by one step using self.action.

    The position is moved by the clamped action delta, the reward is computed
    and a one-agent BatchedStepResult is stored in self.step_result. When the
    position reaches -1 or 1 the agent is done and is reset after its
    observation has been built.
    """
    assert self.action is not None

    if self.discrete:
        act = self.action[0][0]
        delta = 1 if act else -1
    else:
        delta = self.action[0][0]
    delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
    self.position += delta
    self.position = clamp(self.position, -1, 1)
    self.step_count += 1
    done = self.position >= 1.0 or self.position <= -1.0
    if done:
        reward = SUCCESS_REWARD * self.position * self.goal
    else:
        reward = -TIME_PENALTY

    # Built before any reset so the observation reflects the current goal.
    m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
    m_reward = np.array([reward], dtype=np.float32)
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    m_done = np.array([done], dtype=bool)
    m_agent_id = np.array([0], dtype=np.int32)

    if done:
        self._reset_agent()

    # NOTE(review): the done array is also passed as max_step - confirm that
    # reporting max_step == done is intended.
    self.step_result = BatchedStepResult(
        m_vector_obs, m_reward, m_done, m_done, m_agent_id, None
    )
def get_action(
    self, batched_step_result: BatchedStepResult, worker_id: int = 0
) -> ActionInfo:
    """
    Evaluates the policy for the agents in a step and packages the result.
    :param batched_step_result: The BatchedStepResult to decide actions for.
    :param worker_id: In parallel environment training, the unique id of the
        environment worker that the BatchedStepResult came from. Used to
        construct a globally unique id for each agent.
    :return: an ActionInfo containing action, value, the raw network outputs
        and the local agent ids; ActionInfo.empty() when there are no agents.
    """
    if not batched_step_result.n_agents():
        return ActionInfo.empty()

    local_ids = batched_step_result.agent_id
    # Iterating a 1-D array yields the ids in their stored order.
    global_agent_ids = []
    for agent_id in local_ids:
        global_agent_ids.append(get_global_agent_id(worker_id, int(agent_id)))

    run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
        batched_step_result, global_agent_ids
    )
    self.save_memories(global_agent_ids, run_out.get("memory_out"))

    chosen_action = run_out.get("action")
    value_estimate = run_out.get("value")
    return ActionInfo(
        action=chosen_action,
        value=value_estimate,
        outputs=run_out,
        agent_ids=local_ids,
    )
def _make_batched_step(self, name: str, done: bool, reward: float) -> BatchedStepResult:
    """
    Build the one-agent BatchedStepResult for the agent called name.

    When done is true the agent is reset and the step arrays are replaced by
    whatever self._construct_reset_step returns for the old and the post-reset
    observation.
    :param name: Key of the agent in self.goal / self.agent_id.
    :param done: Whether the agent finished its episode on this step.
    :param reward: Reward obtained on this step.
    :return: The step result; max_step is always all-False.
    """
    m_vector_obs = self._make_obs(self.goal[name])
    m_reward = np.array([reward], dtype=np.float32)
    # Builtin bool, matching the max_step array below; the np.bool alias was
    # removed in NumPy 1.24.
    m_done = np.array([done], dtype=bool)
    m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
    action_mask = self._generate_mask()

    if done:
        self._reset_agent(name)
        # Observation of the freshly reset agent (the goal may have changed).
        new_vector_obs = self._make_obs(self.goal[name])
        (
            m_vector_obs,
            m_reward,
            m_done,
            m_agent_id,
            action_mask,
        ) = self._construct_reset_step(
            m_vector_obs,
            new_vector_obs,
            m_reward,
            m_done,
            m_agent_id,
            action_mask,
            name,
        )

    return BatchedStepResult(
        m_vector_obs,
        m_reward,
        m_done,
        np.zeros(m_done.shape, dtype=bool),
        m_agent_id,
        action_mask,
    )
def test_take_action_returns_empty_with_no_agents():
    """get_action on a step without agents must return the empty ActionInfo."""
    policy = FakePolicy(3, basic_mock_brain(), basic_params())
    # The spec contents are irrelevant; it is only needed to build an empty step.
    placeholder_spec = AgentGroupSpec([(1,)], "continuous", 1)
    empty_step = BatchedStepResult.empty(placeholder_spec)
    assert policy.get_action(empty_step) == ActionInfo.empty()
def _make_batched_step(self, name: str, done: bool, reward: float) -> BatchedStepResult:
    """
    Build the one-agent BatchedStepResult for the agent called name.
    :param name: Key of the agent in self.goal.
    :param done: Whether the agent finished its episode on this step.
    :param reward: Reward obtained on this step.
    :return: A step result with a single agent whose id is 0.
    """
    m_vector_obs = self._make_obs(self.goal[name])
    m_reward = np.array([reward], dtype=np.float32)
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    m_done = np.array([done], dtype=bool)
    m_agent_id = np.array([0], dtype=np.int32)
    action_mask = self._generate_mask()
    # NOTE(review): the done array is also passed as max_step - confirm that
    # reporting max_step == done is intended.
    return BatchedStepResult(
        m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
    )
def batched_step_result_from_proto(
    agent_info_list: Collection[AgentInfoProto],  # pylint: disable=unsubscriptable-object
    group_spec: AgentGroupSpec,
) -> BatchedStepResult:
    """
    Convert a collection of AgentInfoProto into one BatchedStepResult.

    NaN / infinite rewards are replaced with finite values through
    np.nan_to_num, and a warning is logged when a NaN was present.
    :param agent_info_list: One AgentInfoProto per agent.
    :param group_spec: Spec giving the observation shapes and the action type.
    :return: The batched step. action_mask is None for continuous actions or
        when no agent carries a mask; otherwise it is one boolean array per
        discrete branch, holding the truthiness of each proto mask entry.
    """
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            obs_list.append(
                _process_visual_observation(obs_index, obs_shape, agent_info_list)
            )
        else:
            obs_list.append(
                _process_vector_observation(obs_index, obs_shape, agent_info_list)
            )

    rewards = np.array(
        [agent_info.reward for agent_info in agent_info_list], dtype=np.float32
    )
    # The dot product is NaN or infinite if any single reward is, so one
    # scalar check covers the whole array.
    d = np.dot(rewards, rewards)
    has_nan = np.isnan(d)
    has_inf = not np.isfinite(d)
    # If we have any NaN or Infs, use np.nan_to_num to replace these with
    # finite values.
    if has_nan or has_inf:
        rewards = np.nan_to_num(rewards)
    if has_nan:
        logger.warning("An agent had a NaN reward in the environment")

    # Builtin bool throughout: the np.bool alias was removed in NumPy 1.24.
    done = np.array([agent_info.done for agent_info in agent_info_list], dtype=bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list], dtype=bool
    )
    agent_id = np.array(
        [agent_info.id for agent_info in agent_info_list], dtype=np.int32
    )

    action_mask = None
    if group_spec.is_action_discrete():
        # Test the flags themselves. Wrapping each flag in a list, as before,
        # made every element truthy, so the check passed for any non-empty input.
        if any(agent_info.action_mask is not None for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            # mask_matrix holds the inverse of the proto mask; rows without a
            # correctly sized mask stay all-True.
            mask_matrix = np.ones((n_agents, a_size), dtype=bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if (
                    agent_info.action_mask is not None
                    and len(agent_info.action_mask) == a_size
                ):
                    mask_matrix[agent_index, :] = [
                        not agent_info.action_mask[k] for k in range(a_size)
                    ]
            action_mask = np.logical_not(mask_matrix)
            indices = _generate_split_indices(group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)

    return BatchedStepResult(obs_list, rewards, done, max_step, agent_id, action_mask)
def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
    """
    Validate the agent count, drop agents that arrived done and flag
    replacement agents as done.
    :param step_result: The raw step; its done array is modified in place.
    :return: A new BatchedStepResult restricted to the agents that were not
        done when the step arrived.
    :raises UnityGymException: If the step has fewer agents than
        self._n_agents or more than twice as many.
    """
    surplus = step_result.n_agents() - self._n_agents
    if not 0 <= surplus <= self._n_agents:
        # Either some Agents did not request a decision when expected,
        # or too many requested a decision.
        raise UnityGymException(
            "The number of agents in the scene does not match the expected number."
        )

    # Positions of the agents to keep; captured before any flag is rewritten.
    indices_to_keep: List[int] = [
        position for position, finished in enumerate(step_result.done) if not finished
    ]

    # An id that is new this step, or that was recorded in _done_agents, gets
    # its done flag set. The id flagged done differs from the agent that
    # actually finished, which is fine because the gym interface only cares
    # about ordering.
    for position, agent_id in enumerate(step_result.agent_id):
        is_new = not self._previous_step_result.contains_agent(agent_id)
        if is_new or agent_id in self._done_agents:
            step_result.done[position] = True
    self._done_agents = set()
    self._previous_step_result = step_result  # store the new original

    _mask: Optional[List[np.array]] = None
    if step_result.action_mask is not None:
        _mask = [branch[indices_to_keep] for branch in step_result.action_mask]
    new_obs: List[np.array] = [obs[indices_to_keep] for obs in step_result.obs]

    return BatchedStepResult(
        obs=new_obs,
        reward=step_result.reward[indices_to_keep],
        done=step_result.done[indices_to_keep],
        max_step=step_result.max_step[indices_to_keep],
        agent_id=step_result.agent_id[indices_to_keep],
        action_mask=_mask,
    )
def reset(self) -> None:  # type: ignore
    """
    Reset the single agent and store its initial step in self.step_result.

    The initial step has zero reward, done False and agent id 0.
    """
    self._reset_agent()

    m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
    m_reward = np.array([0], dtype=np.float32)
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    m_done = np.array([False], dtype=bool)
    m_agent_id = np.array([0], dtype=np.int32)
    # The all-False done array doubles as the max_step array.
    self.step_result = BatchedStepResult(
        m_vector_obs, m_reward, m_done, m_done, m_agent_id, None
    )
def batched_step_result_from_proto(
    agent_info_list: Collection[AgentInfoProto],  # pylint: disable=unsubscriptable-object
    envStat: EnvironmentStatisticsProto,
    group_spec: AgentGroupSpec,
) -> BatchedStepResult:
    """
    Convert a collection of AgentInfoProto, plus environment statistics, into
    one BatchedStepResult.
    :param agent_info_list: One AgentInfoProto per agent.
    :param envStat: Proto whose double_stat and string_stat maps are copied
        into plain dicts.
    :param group_spec: Spec giving the observation shapes and the action type.
    :return: The batched step. action_mask is None for continuous actions or
        when no agent carries a mask; otherwise it is one boolean array per
        discrete branch, holding the truthiness of each proto mask entry.
    """
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            obs_list.append(
                _process_visual_observation(obs_index, obs_shape, agent_info_list)
            )
        else:
            obs_list.append(
                _process_vector_observation(obs_index, obs_shape, agent_info_list)
            )

    rewards = np.array(
        [agent_info.reward for agent_info in agent_info_list], dtype=np.float32
    )
    _raise_on_nan_and_inf(rewards, "rewards")

    # Builtin bool throughout: the np.bool alias was removed in NumPy 1.24.
    done = np.array([agent_info.done for agent_info in agent_info_list], dtype=bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list], dtype=bool
    )
    agent_id = np.array(
        [agent_info.id for agent_info in agent_info_list], dtype=np.int32
    )

    action_mask = None
    if group_spec.is_action_discrete():
        # Test the flags themselves. Wrapping each flag in a list, as before,
        # made every element truthy, so the check passed for any non-empty input.
        if any(agent_info.action_mask is not None for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            # mask_matrix holds the inverse of the proto mask; rows without a
            # correctly sized mask stay all-True.
            mask_matrix = np.ones((n_agents, a_size), dtype=bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if (
                    agent_info.action_mask is not None
                    and len(agent_info.action_mask) == a_size
                ):
                    mask_matrix[agent_index, :] = [
                        not agent_info.action_mask[k] for k in range(a_size)
                    ]
            action_mask = np.logical_not(mask_matrix)
            indices = _generate_split_indices(group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)

    # convert protobuf maps to dicts
    double_stat = {key: envStat.double_stat[key] for key in envStat.double_stat}
    string_stat = {key: envStat.string_stat[key] for key in envStat.string_stat}

    return BatchedStepResult(
        obs_list,
        rewards,
        done,
        max_step,
        agent_id,
        action_mask,
        double_stat,
        string_stat,
    )
def _make_batched_step(self, name: str, done: bool, reward: float) -> BatchedStepResult:
    """
    Build the one-agent BatchedStepResult for the agent called name.

    The goal is only exposed in the observation while the agent's step count
    is at most self.num_show_steps; afterwards the observation value is 0.
    :param name: Key of the agent in self.goal / self.step_count.
    :param done: Whether the agent finished its episode on this step.
    :param reward: Reward obtained on this step.
    :return: A step result with a single agent whose id is 0.
    """
    if self.step_count[name] <= self.num_show_steps:
        recurrent_obs_val = self.goal[name]
    else:
        recurrent_obs_val = 0
    m_vector_obs = self._make_obs(recurrent_obs_val)
    m_reward = np.array([reward], dtype=np.float32)
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    m_done = np.array([done], dtype=bool)
    m_agent_id = np.array([0], dtype=np.int32)
    action_mask = self._generate_mask()
    # NOTE(review): the done array is also passed as max_step - confirm that
    # reporting max_step == done is intended.
    return BatchedStepResult(
        m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
    )
def create_mock_vector_step_result(num_agents=1, number_visual_observations=0):
    """
    Builds a constant BatchedStepResult for tests.

    Every agent observes the vector [1, 2, 3], receives reward 1.0 and is not
    done; agent ids run from 0 to num_agents - 1 and there is no action mask.
    :int num_agents: Number of "agents" to imitate in your BatchedStepResult values.
    :int number_visual_observations: When non-zero, a single all-zero
        (num_agents, 8, 8, 3) visual observation is appended.
    """
    vector_obs = np.array([num_agents * [1, 2, 3]]).reshape(num_agents, 3)
    observations = [vector_obs]
    if number_visual_observations:
        observations.append(np.zeros(shape=(num_agents, 8, 8, 3), dtype=np.float32))

    reward_array = np.array(num_agents * [1.0])
    not_done = np.array(num_agents * [False])
    agent_ids = np.array(range(0, num_agents))
    # The same all-False array serves as both done and max_step.
    return BatchedStepResult(
        observations, reward_array, not_done, not_done, agent_ids, None
    )
def _update_state(self, output: UnityRLOutputProto) -> None:
    """
    Refreshes self._env_state from an environment output message.

    Each brain in self._env_specs gets a BatchedStepResult built from its
    agent infos, or an empty one when the output carries none for it. The
    side channel payload is then handed to the side channel parser.
    """
    for brain_name, group_spec in self._env_specs.items():
        if brain_name not in output.agentInfos:
            self._env_state[brain_name] = BatchedStepResult.empty(group_spec)
            continue
        agent_info_list = output.agentInfos[brain_name].value
        self._env_state[brain_name] = batched_step_result_from_proto(
            agent_info_list, group_spec
        )
    self._parse_side_channel_message(self.side_channels, output.side_channel)
def reset(self) -> None:  # type: ignore
    """
    Reset every named agent and store its initial step in self.step_result.

    Each initial step has zero reward, done False, agent id 0 and the mask
    produced by self._generate_mask().
    """
    for name in self.names:
        self._reset_agent(name)

        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal[name]]
        m_reward = np.array([0], dtype=np.float32)
        # Builtin bool: the np.bool alias was removed in NumPy 1.24.
        m_done = np.array([False], dtype=bool)
        m_agent_id = np.array([0], dtype=np.int32)
        action_mask = self._generate_mask()
        # The all-False done array doubles as the max_step array.
        self.step_result[name] = BatchedStepResult(
            m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
        )
def test_take_action_returns_nones_on_missing_values():
    """
    When evaluate returns an empty dict, get_action must report None for both
    action and value while still passing through the agent ids.
    """
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=bool),
        np.array([], dtype=bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents, worker_id=0)
    assert result == ActionInfo(None, None, {}, [0])
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    """
    Converts a BatchedStepResult into a BrainInfo.

    Rank-2 observations are concatenated along axis 1 into one vector
    observation; rank-4 observations are passed through as visual
    observations.
    :param step_result: The step to convert.
    :param group_spec: Spec used to size the action mask.
    :param agent_id_prefix: When given, agent ids are rendered as
        "$<prefix>-<id>"; otherwise as str(id).
    :return: The BrainInfo built from the step.
    :raises UnityEnvironmentException: If an observation is neither rank 2
        nor rank 4.
    """
    n_agents = step_result.n_agents()

    # Sort the observation indices by array rank.
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        rank = len(observation.shape)
        if rank == 2:
            vec_obs_indices.append(index)
        elif rank == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image")

    if vec_obs_indices:
        vec_obs = np.concatenate(
            [step_result.obs[i] for i in vec_obs_indices], axis=1
        )
    else:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]

    # Default mask is all ones; the step's mask is inverted (1 - mask) when
    # one is present for a discrete action space.
    mask = np.ones((n_agents, np.sum(group_spec.action_size)), dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones(
            (n_agents, np.sum(group_spec.discrete_action_branches)), dtype=np.float32
        )
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)

    if agent_id_prefix is not None:
        agent_ids = [
            f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id
        ]
    else:
        agent_ids = list(map(str, step_result.agent_id))

    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
def create_mock_batchedstep(
    num_agents: int = 1,
    num_vector_observations: int = 0,
    num_vis_observations: int = 0,
    action_shape: List[int] = None,
    discrete: bool = False,
    done: bool = False,
) -> BatchedStepResult:
    """
    Creates a mock BatchedStepResult with observations. Imitates constant
    vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :int num_vector_observations: Size of the all-ones vector observation;
        no vector observation is added when it is 0.
    :int num_vis_observations: Number of all-ones (84, 84, 3) visual
        observations to add.
    :List[int] action_shape: Size of each action branch, defaults to [2].
        Only used to size the masks of a discrete action space.
    :bool discrete: Whether or not action space is discrete.
    :bool done: Done flag applied to every agent.
    """
    if action_shape is None:
        action_shape = [2]

    obs_list = []
    for _ in range(num_vis_observations):
        obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32))
    # "> 0" rather than "> 1": a request for exactly one vector observation
    # used to be dropped silently.
    if num_vector_observations > 0:
        obs_list.append(
            np.array(num_agents * [num_vector_observations * [1]], dtype=np.float32)
        )

    action_mask = None
    if discrete:
        # One all-False mask array per branch.
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape
        ]

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    # Builtin bool: the np.bool alias was removed in NumPy 1.24. A separate
    # name avoids rebinding the `done` parameter to an array.
    done_array = np.array(num_agents * [done], dtype=bool)
    max_step = np.array(num_agents * [False], dtype=bool)
    agent_id = np.arange(num_agents, dtype=np.int32)

    return BatchedStepResult(
        obs_list, reward, done_array, max_step, agent_id, action_mask
    )
def test_take_action_returns_action_info_when_available():
    """
    get_action must wrap the action and value returned by evaluate, the raw
    output dict and the agent ids into an ActionInfo.
    """
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=bool),
        np.array([], dtype=bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents)
    expected = ActionInfo(
        policy_eval_out["action"], policy_eval_out["value"], policy_eval_out, [0]
    )
    assert result == expected
def evaluate(
    self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, np.ndarray]:
    """
    Runs a single inference pass of the policy over a batch of agents.
    :param batched_step_result: BatchedStepResult object containing inputs.
    :param global_agent_ids: The global agent ids used to look up previous
        actions and memories for recurrent policies.
    :return: Outputs from network as defined by self.inference_dict.
    """
    model = self.model
    feed_dict = {
        model.batch_size: batched_step_result.n_agents(),
        model.sequence_length: 1,
    }

    if self.use_recurrent:
        # Only discrete policies feed the previous action back in.
        if not self.use_continuous_act:
            feed_dict[model.prev_action] = self.retrieve_previous_action(
                global_agent_ids
            )
        feed_dict[model.memory_in] = self.retrieve_memories(global_agent_ids)

    feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
    return self._execute_model(feed_dict, self.inference_dict)
def _sanitize_step_result(self, step_result): """ Takes as input a BatchedStepResult returned from mlagents_envs and cleans it in order to send back informations about agents in always the same order. This order is given by self._agents_id. 2 possible cases : 1) No agents terminated on the new timestep 2) One or more agents aterminated on the new timestep If 1), the step_result doesnt need to be modified. If 2), modifications need to be made on the step_result. For some reasons, when an agent is done, mlagents_envs returns in step_result informations about the done agent as well as informations about a new agent, added because the agent terminated. We want to treat these two agents as the same agent. Furthermore, the information about the new agent is located at a specific position in the step_result. To illustrate this, let's say we receive this step_result at timestep t: [0, 1, 2] and agent 1 terminated at t+1. We will receive : [1, 0, 3, 2]. Few things happen here: -the done agent (1) is put at the first place of the step_result at t+1. -the new agent (3) is put at the index that agent 1 was on the last timestep, + 1. In fact, we can generalize this in the case of n agents being done at timestep t+1: the index of a new agent corresponding to a certain agent which just terminated is the index of the agent that terminated on the last timestep + n - m, n being the number of done agents at timestep t+1, and m the number of done agents at timestep t. Why n ? Because n agents were "pushed" at the beginning of step_result thus we need to include them to access the new agent. Why m ? If agents were done at timestep t, they have been removed from the step_result of timestep t+1. We thus need to substract them to access the new agent (it is easier to see this if you take a pencil and a paper and simulate the process) So, in order to return a step_result which is "sanitized" i.e. 
return a step_result with the same order as self._agents_id, we need to do a few things : -create new_id_order: list of index corresponding to locations of self._agents_id (if new_id_order = [2, 0, 1], then id of index 0 in self._agents_id is located at index 2 in step_result, id of index 1 at index 0, and id of index 2 at index 1) -create index_gym_id_done: list of index of agent ids in self._agents_id that terminated at current timestep (done=True) -replace agents which are done by their successor agents in self._agents_id and create agents_new_id, a list of the new agents. To do that, we use the previous step result to locate the position of each done agent. We then deduce the position of their successor (index + n, as said above). Once we have the position of their successoir, we access their id. -create new step_result, which is composed of: -obs: observations of all agents. NOTE: mlagents_envs doesnt provide the last observation (S_T) of a done agent, so we return instead the first observation of its successor. -rewards: rewards obtained by all agents. We return step_result.reward[new_id_order] in order to rank them in the right order. -dones: whether or not agent termianted on the timestep. We return done=step_result.done[new_id_order] in order to rank them in the right order. -max_step: whether or not the agent terminated by running out of timesteps. We return step_result.max_step[new_id_order] in order to rank them in the right order. -agent_id: list of agent ids. -action_mask: not implemented, so None. 
""" #Case 1): simply return step_result # in this case: no done agents, the order of step_result is thus the same as the order of self._agents_id # so we can set new_id_order to be range(n) ([0, 1, 2, ..., n]) if len(self._agents_id) == step_result.n_agents(): self._previous_step_result = step_result self._previous_new_id_order = list(range(len(self._agents_id))) self._previous_done_agents = 0 return step_result #Case 2): modify step_result new_id_order = [] for agent_id in self._agents_id: agent_id_index_step_result = list( step_result.agent_id).index(agent_id) new_id_order.append(agent_id_index_step_result) index_gym_id_done = [] for index, agent_id in enumerate(step_result.agent_id): if step_result.done[index]: index_gym_id_done.append(self._agents_id.index(agent_id)) agents_new_id = [] #2 things here : -replace in self._agents_id the ids of dones agents by ids of their successor. # -create agents_new_id, a list of the successors' ids. for index_id_done in index_gym_id_done: index_new_agent = self._previous_new_id_order[index_id_done] + len( index_gym_id_done) - self._previous_done_agents self._agents_id[index_id_done] = list( step_result.agent_id)[index_new_agent] agents_new_id.append(list(step_result.agent_id)[index_new_agent]) new_obs = [] for index, agent_id in enumerate(self._agents_id): if agent_id in agents_new_id: new_obs.append(step_result.obs[0][self._previous_new_id_order[ index_gym_id_done[agents_new_id.index(agent_id)]] + len(index_gym_id_done) - self._previous_done_agents]) else: new_obs.append(step_result.obs[0][new_id_order[index]]) new_obs = [np.array(new_obs)] self._previous_step_result = step_result self._previous_new_id_order = new_id_order self._previous_done_agents = len(index_gym_id_done) new_step_result = BatchedStepResult( obs=new_obs, reward=step_result.reward[new_id_order], done=step_result.done[new_id_order], max_step=step_result.max_step[new_id_order], agent_id=step_result.agent_id[new_id_order], action_mask=None) return 
new_step_result
def add_experiences(
    self,
    batched_step_result: BatchedStepResult,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.

    For every agent in the step, the previously stored step and action
    outputs are combined with the current reward/done into an
    AgentExperience. A Trajectory is put on every trajectory queue when the
    agent is done or its buffer reaches self.max_trajectory_length.
    :param batched_step_result: current BatchedStepResult.
    :param worker_id: id of the environment worker the step came from; used
        to build global agent ids.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id)
        for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        if global_id in self.last_step_result:  # Don't store if agent just reset
            self.last_take_action_outputs[global_id] = take_action_outputs

    for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
        local_id = int(
            _id
        )  # Needed for mypy to pass since ndarray has no content type
        curr_agent_step = batched_step_result.get_agent_step_result(
            local_id)
        global_id = get_global_agent_id(worker_id, local_id)
        # Stored value is a (agent step, index into the action outputs) pair.
        stored_agent_step, idx = self.last_step_result.get(
            global_id, (None, None))
        stored_take_action_outputs = self.last_take_action_outputs.get(
            global_id, None)
        if stored_agent_step is not None and stored_take_action_outputs is not None:
            # We know the step is from the same worker, so use the local agent id.
            obs = stored_agent_step.obs
            if not stored_agent_step.done:
                if self.policy.use_recurrent:
                    memory = self.policy.retrieve_memories([global_id
                                                            ])[0, :]
                else:
                    memory = None

                done = curr_agent_step.done
                max_step = curr_agent_step.max_step

                # Add the outputs of the last eval
                action = stored_take_action_outputs["action"][idx]
                if self.policy.use_continuous_act:
                    action_pre = stored_take_action_outputs["pre_action"][
                        idx]
                else:
                    action_pre = None
                action_probs = stored_take_action_outputs["log_probs"][idx]
                action_mask = stored_agent_step.action_mask
                prev_action = self.policy.retrieve_previous_action(
                    [global_id])[0, :]

                # Observation and action come from the stored step; reward,
                # done and max_step come from the current step.
                experience = AgentExperience(
                    obs=obs,
                    reward=curr_agent_step.reward,
                    done=done,
                    action=action,
                    action_probs=action_probs,
                    action_pre=action_pre,
                    action_mask=action_mask,
                    prev_action=prev_action,
                    max_step=max_step,
                    memory=memory,
                )
                # Add the value outputs if needed
                self.experience_buffers[global_id].append(experience)
                self.episode_rewards[global_id] += curr_agent_step.reward
            # Flush a non-empty buffer when the episode ended or the buffer
            # reached the maximum trajectory length.
            if (curr_agent_step.done or
                (len(self.experience_buffers[global_id]) >=
                 self.max_trajectory_length)) and len(
                     self.experience_buffers[global_id]) > 0:
                # Make next AgentExperience
                next_obs = curr_agent_step.obs
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
                if curr_agent_step.done:
                    # Record episode length for agents which have had at least
                    # 1 step. Done after reset ignored.
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(global_id, 0),
                    )
            elif not curr_agent_step.done:
                self.episode_steps[global_id] += 1

        # Index is needed to grab from last_take_action_outputs
        self.last_step_result[global_id] = (
            curr_agent_step,
            batched_step_result.agent_id_to_index[_id],
        )
        # Delete all done agents, regardless of if they had a 0-length episode.
        if curr_agent_step.done:
            self._clean_agent_data(global_id)

    for _gid in action_global_agent_ids:
        # If the ID doesn't have a last step result, the agent just reset,
        # don't store the action.
        if _gid in self.last_step_result:
            if "action" in take_action_outputs:
                self.policy.save_previous_action(
                    [_gid], take_action_outputs["action"])
def add_experiences(
    self,
    batched_step_result: BatchedStepResult,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.

    For every agent in the step, the previously stored batched step and
    action outputs are combined with the current reward/done into an
    AgentExperience. A Trajectory is put on every trajectory queue when the
    agent is done or its buffer reaches self.max_trajectory_length; episode
    statistics are reported and cleared when the agent is done.
    :param batched_step_result: current BatchedStepResult.
    :param worker_id: id of the environment worker the step came from; used
        to build global agent ids.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)
        self.stats_reporter.add_stat("Policy/Learning Rate",
                                     take_action_outputs["learning_rate"])

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id)
        for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        self.last_take_action_outputs[global_id] = take_action_outputs

    for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
        local_id = int(
            _id
        )  # Needed for mypy to pass since ndarray has no content type
        curr_agent_step = batched_step_result.get_agent_step_result(
            local_id)
        global_id = get_global_agent_id(worker_id, local_id)
        # The whole previous batched step is stored per agent.
        stored_step = self.last_step_result.get(global_id, None)
        stored_take_action_outputs = self.last_take_action_outputs.get(
            global_id, None)
        if stored_step is not None and stored_take_action_outputs is not None:
            # We know the step is from the same worker, so use the local agent id.
            stored_agent_step = stored_step.get_agent_step_result(local_id)
            # Row of this agent inside the stored action outputs.
            idx = stored_step.agent_id_to_index[local_id]
            obs = stored_agent_step.obs
            if not stored_agent_step.done:
                if self.policy.use_recurrent:
                    memory = self.policy.retrieve_memories([global_id
                                                            ])[0, :]
                else:
                    memory = None

                done = curr_agent_step.done
                max_step = curr_agent_step.max_step

                # Add the outputs of the last eval
                action = stored_take_action_outputs["action"][idx]
                if self.policy.use_continuous_act:
                    action_pre = stored_take_action_outputs["pre_action"][
                        idx]
                else:
                    action_pre = None
                action_probs = stored_take_action_outputs["log_probs"][idx]
                action_mask = stored_agent_step.action_mask
                prev_action = self.policy.retrieve_previous_action(
                    [global_id])[0, :]

                # Observation and action come from the stored step; reward,
                # done and max_step come from the current step.
                experience = AgentExperience(
                    obs=obs,
                    reward=curr_agent_step.reward,
                    done=done,
                    action=action,
                    action_probs=action_probs,
                    action_pre=action_pre,
                    action_mask=action_mask,
                    prev_action=prev_action,
                    max_step=max_step,
                    memory=memory,
                )
                # Add the value outputs if needed
                self.experience_buffers[global_id].append(experience)
                self.episode_rewards[global_id] += curr_agent_step.reward
            # Flush a non-empty buffer when the episode ended or the buffer
            # reached the maximum trajectory length.
            if (curr_agent_step.done or
                (len(self.experience_buffers[global_id]) >=
                 self.max_trajectory_length)) and len(
                     self.experience_buffers[global_id]) > 0:
                # Make next AgentExperience
                next_obs = curr_agent_step.obs
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
                if curr_agent_step.done:
                    # Report, then drop, the per-episode counters.
                    self.stats_reporter.add_stat(
                        "Environment/Cumulative Reward",
                        self.episode_rewards.get(global_id, 0),
                    )
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(global_id, 0),
                    )
                    del self.episode_steps[global_id]
                    del self.episode_rewards[global_id]
            elif not curr_agent_step.done:
                self.episode_steps[global_id] += 1

        self.last_step_result[global_id] = batched_step_result

    # NOTE(review): this lookup assumes take_action_outputs is a container
    # even when it is falsy - confirm against ActionInfo.empty().
    if "action" in take_action_outputs:
        self.policy.save_previous_action(previous_action.agent_ids,
                                         take_action_outputs["action"])