Example #1
 def evaluate(
     self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
 ) -> Dict[str, Any]:
     """
     Evaluates policy for the agent experiences provided.
     :param batched_step_result: BatchedStepResult object containing inputs.
     :param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
     :return: Outputs from network as defined by self.inference_dict.
     """
     feed_dict = {
         self.model.batch_size: batched_step_result.n_agents(),
         self.model.sequence_length: 1,
     }
     epsilon = None
     if self.use_recurrent:
         if not self.use_continuous_act:
             feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                 global_agent_ids
             )
         feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
     if self.use_continuous_act:
         epsilon = np.random.normal(
             size=(batched_step_result.n_agents(), self.model.act_size[0])
         )
         feed_dict[self.model.epsilon] = epsilon
     feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
     run_out = self._execute_model(feed_dict, self.inference_dict)
     return run_out
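The epsilon fed into the graph above is per-agent Gaussian exploration noise for continuous actions; a minimal sketch of its shape, using hypothetical agent and action counts:

import numpy as np

n_agents, act_size = 4, 2  # hypothetical values for illustration
epsilon = np.random.normal(size=(n_agents, act_size))
assert epsilon.shape == (4, 2)  # one noise sample per agent and per continuous action dimension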
Example #2
    def _sanitize_info(self,
                       step_result: BatchedStepResult) -> BatchedStepResult:
        n_extra_agents = step_result.n_agents() - self._n_agents
        if n_extra_agents < 0:
            # In this case, some Agents did not request a decision when expected
            raise UnityGymException(
                "The number of agents in the scene does not match the expected number."
            )

        if step_result.n_agents() - sum(step_result.done) != self._n_agents:
            raise UnityGymException(
                "The number of agents in the scene does not match the expected number."
            )

        for index, agent_id in enumerate(step_result.agent_id):
            if step_result.done[index]:
                self.agent_mapper.mark_agent_done(agent_id,
                                                  step_result.reward[index])

        # Set the new AgentDone flags to True
        # Note that the corresponding agent_id that gets marked done will be different
        # than the original agent that was done, but this is OK since the gym interface
        # only cares about the ordering.
        for index, agent_id in enumerate(step_result.agent_id):
            if not self._previous_step_result.contains_agent(agent_id):
                if step_result.done[index]:
                    # If the Agent is already done (e.g. it ended its episode twice in one step),
                    # don't try to register it here.
                    continue
                # Register this agent, and get the reward of the previous agent that
                # was in its index, so that we can return it to the gym.
                last_reward = self.agent_mapper.register_new_agent_id(agent_id)
                step_result.done[index] = True
                step_result.reward[index] = last_reward

        self._previous_step_result = step_result  # store the new original

        # Get a permutation of the agent IDs so that a given ID stays in the same
        # index as where it was first seen.
        new_id_order = self.agent_mapper.get_id_permutation(
            list(step_result.agent_id))

        _mask: Optional[List[np.array]] = None
        if step_result.action_mask is not None:
            _mask = []
            for mask_index in range(len(step_result.action_mask)):
                _mask.append(step_result.action_mask[mask_index][new_id_order])
        new_obs: List[np.array] = []
        for obs_index in range(len(step_result.obs)):
            new_obs.append(step_result.obs[obs_index][new_id_order])
        return BatchedStepResult(
            obs=new_obs,
            reward=step_result.reward[new_id_order],
            done=step_result.done[new_id_order],
            max_step=step_result.max_step[new_id_order],
            agent_id=step_result.agent_id[new_id_order],
            action_mask=_mask,
        )
Example #3
    def step(self) -> None:
        assert all(action is not None for action in self.action.values())

        for name in self.names:
            if self.discrete:
                act = self.action[name][0][0]
                delta = 1 if act else -1
            else:
                delta = self.action[name][0][0]
            delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
            self.position[name] += delta
            self.position[name] = clamp(self.position[name], -1, 1)
            self.step_count[name] += 1
            done = self.position[name] >= 1.0 or self.position[name] <= -1.0
            if done:
                reward = SUCCESS_REWARD * self.position[name] * self.goal[name]
            else:
                reward = -TIME_PENALTY
            self.rewards[name] += reward

            m_vector_obs = [
                np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal[name]
            ]
            m_reward = np.array([reward], dtype=np.float32)
            m_done = np.array([done], dtype=np.bool)
            m_agent_id = np.array([0], dtype=np.int32)
            action_mask = self._generate_mask()

            if done:
                self._reset_agent(name)

            self.step_result[name] = BatchedStepResult(m_vector_obs, m_reward,
                                                       m_done, m_done,
                                                       m_agent_id, action_mask)
Example #4
    def step(self) -> None:
        assert self.action is not None

        if self.discrete:
            act = self.action[0][0]
            delta = 1 if act else -1
        else:
            delta = self.action[0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position += delta
        self.position = clamp(self.position, -1, 1)
        self.step_count += 1
        done = self.position >= 1.0 or self.position <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position * self.goal
        else:
            reward = -TIME_PENALTY

        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
        m_reward = np.array([reward], dtype=np.float32)
        m_done = np.array([done], dtype=np.bool)
        m_agent_id = np.array([0], dtype=np.int32)

        if done:
            self._reset_agent()

        self.step_result = BatchedStepResult(m_vector_obs, m_reward, m_done,
                                             m_done, m_agent_id, None)
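The clamp helper used by the two toy environments above is the usual bounding function; a minimal stand-in with the assumed behavior, in case its definition is not shown here:

def clamp(x, lo, hi):
    # Bound x to the closed interval [lo, hi].
    return max(lo, min(x, hi))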
Example #5
    def get_action(
        self, batched_step_result: BatchedStepResult, worker_id: int = 0
    ) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param batched_step_result: The BatchedStepResult from the environment containing the agents' observations.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the BatchedStepResult came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
        """
        if batched_step_result.n_agents() == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in batched_step_result.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            batched_step_result, global_agent_ids
        )

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=batched_step_result.agent_id,
        )
Example #6
    def _make_batched_step(self, name: str, done: bool,
                           reward: float) -> BatchedStepResult:
        m_vector_obs = self._make_obs(self.goal[name])
        m_reward = np.array([reward], dtype=np.float32)
        m_done = np.array([done], dtype=np.bool)
        m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
        action_mask = self._generate_mask()

        if done:
            self._reset_agent(name)
            new_vector_obs = self._make_obs(self.goal[name])
            (
                m_vector_obs,
                m_reward,
                m_done,
                m_agent_id,
                action_mask,
            ) = self._construct_reset_step(
                m_vector_obs,
                new_vector_obs,
                m_reward,
                m_done,
                m_agent_id,
                action_mask,
                name,
            )
        return BatchedStepResult(
            m_vector_obs,
            m_reward,
            m_done,
            np.zeros(m_done.shape, dtype=bool),
            m_agent_id,
            action_mask,
        )
Example #7
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = AgentGroupSpec([(1, )], "continuous", 1)
    no_agent_step = BatchedStepResult.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
Example #8
 def _make_batched_step(self, name: str, done: bool,
                        reward: float) -> BatchedStepResult:
     m_vector_obs = self._make_obs(self.goal[name])
     m_reward = np.array([reward], dtype=np.float32)
     m_done = np.array([done], dtype=np.bool)
     m_agent_id = np.array([0], dtype=np.int32)
     action_mask = self._generate_mask()
     return BatchedStepResult(m_vector_obs, m_reward, m_done, m_done,
                              m_agent_id, action_mask)
Example #9
def batched_step_result_from_proto(
    agent_info_list: Collection[AgentInfoProto],  # pylint: disable=unsubscriptable-object
    group_spec: AgentGroupSpec,
) -> BatchedStepResult:
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            obs_list += [
                _process_visual_observation(obs_index, obs_shape,
                                            agent_info_list)
            ]
        else:
            obs_list += [
                _process_vector_observation(obs_index, obs_shape,
                                            agent_info_list)
            ]
    rewards = np.array([agent_info.reward for agent_info in agent_info_list],
                       dtype=np.float32)

    d = np.dot(rewards, rewards)
    has_nan = np.isnan(d)
    has_inf = not np.isfinite(d)
    # If we have any NaNs or Infs, use np.nan_to_num to replace them with finite values
    if has_nan or has_inf:
        rewards = np.nan_to_num(rewards)
    if has_nan:
        logger.warning(f"An agent had a NaN reward in the environment")

    done = np.array([agent_info.done for agent_info in agent_info_list],
                    dtype=np.bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list],
        dtype=np.bool)
    agent_id = np.array([agent_info.id for agent_info in agent_info_list],
                        dtype=np.int32)
    action_mask = None
    if group_spec.is_action_discrete():
        if any(agent_info.action_mask is not None
               for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=np.bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            False if agent_info.action_mask[k] else True
                            for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(np.bool)
            indices = _generate_split_indices(
                group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    return BatchedStepResult(obs_list, rewards, done, max_step, agent_id,
                             action_mask)
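For reference, a small standalone NumPy sketch of the final mask-splitting step above, using hypothetical branch sizes and assuming _generate_split_indices returns the cumulative branch offsets:

import numpy as np

discrete_action_branches = (3, 2)  # hypothetical: two branches of sizes 3 and 2
n_agents = 2
a_size = int(np.sum(discrete_action_branches))
flat_mask = np.zeros((n_agents, a_size), dtype=bool)  # True means the action is masked
flat_mask[0, 4] = True  # mask the last action of the second branch for agent 0
split_indices = np.cumsum(discrete_action_branches)[:-1]  # assumed equivalent of _generate_split_indices
per_branch_mask = np.split(flat_mask, split_indices, axis=1)
assert [m.shape for m in per_branch_mask] == [(2, 3), (2, 2)]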
Example #10
    def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
        n_extra_agents = step_result.n_agents() - self._n_agents
        if n_extra_agents < 0 or n_extra_agents > self._n_agents:
            # In this case, some Agents did not request a decision when expected
            # or too many requested a decision
            raise UnityGymException(
                "The number of agents in the scene does not match the expected number."
            )

        # remove the done Agents
        indices_to_keep: List[int] = []
        for index, is_done in enumerate(step_result.done):
            if not is_done:
                indices_to_keep.append(index)

        # Set the new AgentDone flags to True
        # Note that the corresponding agent_id that gets marked done will be different
        # than the original agent that was done, but this is OK since the gym interface
        # only cares about the ordering.
        for index, agent_id in enumerate(step_result.agent_id):
            if not self._previous_step_result.contains_agent(agent_id):
                step_result.done[index] = True
            if agent_id in self._done_agents:
                step_result.done[index] = True
        self._done_agents = set()
        self._previous_step_result = step_result  # store the new original

        _mask: Optional[List[np.array]] = None
        if step_result.action_mask is not None:
            _mask = []
            for mask_index in range(len(step_result.action_mask)):
                _mask.append(step_result.action_mask[mask_index][indices_to_keep])
        new_obs: List[np.array] = []
        for obs_index in range(len(step_result.obs)):
            new_obs.append(step_result.obs[obs_index][indices_to_keep])
        return BatchedStepResult(
            obs=new_obs,
            reward=step_result.reward[indices_to_keep],
            done=step_result.done[indices_to_keep],
            max_step=step_result.max_step[indices_to_keep],
            agent_id=step_result.agent_id[indices_to_keep],
            action_mask=_mask,
        )
Example #11
    def reset(self) -> None:  # type: ignore
        self._reset_agent()

        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
        m_reward = np.array([0], dtype=np.float32)
        m_done = np.array([False], dtype=np.bool)
        m_agent_id = np.array([0], dtype=np.int32)

        self.step_result = BatchedStepResult(m_vector_obs, m_reward, m_done,
                                             m_done, m_agent_id, None)
Example #12
def batched_step_result_from_proto(
    agent_info_list: Collection[AgentInfoProto],  # pylint: disable=unsubscriptable-object
    envStat: EnvironmentStatisticsProto,
    group_spec: AgentGroupSpec,
) -> BatchedStepResult:
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            obs_list.append(
                _process_visual_observation(obs_index, obs_shape,
                                            agent_info_list))
        else:
            obs_list.append(
                _process_vector_observation(obs_index, obs_shape,
                                            agent_info_list))
    rewards = np.array([agent_info.reward for agent_info in agent_info_list],
                       dtype=np.float32)

    _raise_on_nan_and_inf(rewards, "rewards")

    done = np.array([agent_info.done for agent_info in agent_info_list],
                    dtype=np.bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list],
        dtype=np.bool)
    agent_id = np.array([agent_info.id for agent_info in agent_info_list],
                        dtype=np.int32)
    action_mask = None
    if group_spec.is_action_discrete():
        if any(agent_info.action_mask is not None
               for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=np.bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            False if agent_info.action_mask[k] else True
                            for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(np.bool)
            indices = _generate_split_indices(
                group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    # convert protobuf maps to dicts (done unconditionally so the return below
    # always has these names defined, even for continuous action spaces)
    double_stat = dict(
        (key, envStat.double_stat[key]) for key in envStat.double_stat)
    string_stat = dict(
        (key, envStat.string_stat[key]) for key in envStat.string_stat)
    return BatchedStepResult(obs_list, rewards, done, max_step, agent_id,
                             action_mask, double_stat, string_stat)
Example #13
 def _make_batched_step(self, name: str, done: bool,
                        reward: float) -> BatchedStepResult:
     recurrent_obs_val = (self.goal[name]
                          if self.step_count[name] <= self.num_show_steps
                          else 0)
     m_vector_obs = self._make_obs(recurrent_obs_val)
     m_reward = np.array([reward], dtype=np.float32)
     m_done = np.array([done], dtype=np.bool)
     m_agent_id = np.array([0], dtype=np.int32)
     action_mask = self._generate_mask()
     return BatchedStepResult(m_vector_obs, m_reward, m_done, m_done,
                              m_agent_id, action_mask)
Example #14
def create_mock_vector_step_result(num_agents=1, number_visual_observations=0):
    """
    Creates a mock BatchedStepResult with vector observations. Imitates constant
    vector observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate in your BatchedStepResult values.
    """
    obs = [np.array([num_agents * [1, 2, 3]]).reshape(num_agents, 3)]
    if number_visual_observations:
        obs += [np.zeros(shape=(num_agents, 8, 8, 3), dtype=np.float32)]
    rewards = np.array(num_agents * [1.0])
    done = np.array(num_agents * [False])
    agents = np.array(range(0, num_agents))
    return BatchedStepResult(obs, rewards, done, done, agents, None)
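A usage sketch for the mock above (the shapes follow directly from the construction; assumes the surrounding test module's imports):

step = create_mock_vector_step_result(num_agents=2, number_visual_observations=1)
assert step.obs[0].shape == (2, 3)        # constant vector observations
assert step.obs[1].shape == (2, 8, 8, 3)  # zeroed visual observations
assert step.n_agents() == 2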
Example #15
 def _update_state(self, output: UnityRLOutputProto) -> None:
     """
     Collects experience information from all external brains in the environment at the current step.
     """
     for brain_name in self._env_specs.keys():
         if brain_name in output.agentInfos:
             agent_info_list = output.agentInfos[brain_name].value
             self._env_state[brain_name] = batched_step_result_from_proto(
                 agent_info_list, self._env_specs[brain_name])
         else:
             self._env_state[brain_name] = BatchedStepResult.empty(
                 self._env_specs[brain_name])
     self._parse_side_channel_message(self.side_channels,
                                      output.side_channel)
Example #16
    def reset(self) -> None:  # type: ignore
        for name in self.names:
            self._reset_agent(name)

            m_vector_obs = [
                np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal[name]
            ]
            m_reward = np.array([0], dtype=np.float32)
            m_done = np.array([False], dtype=np.bool)
            m_agent_id = np.array([0], dtype=np.int32)
            action_mask = self._generate_mask()

            self.step_result[name] = BatchedStepResult(m_vector_obs, m_reward,
                                                       m_done, m_done,
                                                       m_agent_id, action_mask)
Example #17
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=np.bool),
        np.array([], dtype=np.bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents, worker_id=0)
    assert result == ActionInfo(None, None, {}, [0])
Example #18
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image")
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices],
                                 axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)),
                   dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones((n_agents, np.sum(group_spec.discrete_action_branches)),
                       dtype=np.float32)
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [
            f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id
        ]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
Example #19
def create_mock_batchedstep(
    num_agents: int = 1,
    num_vector_observations: int = 0,
    num_vis_observations: int = 0,
    action_shape: List[int] = None,
    discrete: bool = False,
    done: bool = False,
) -> BatchedStepResult:
    """
    Creates a mock BatchedStepResult with observations. Imitates constant
    vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :int num_vector_observations: Length of the vector observation to imitate.
    :int num_vis_observations: Number of visual observations to imitate.
    :List[int] action_shape: Shape of the action space (sizes of the discrete branches, or a single continuous size).
    :bool discrete: Whether or not the action space is discrete.
    :bool done: Whether every agent in the result is marked done.
    """
    if action_shape is None:
        action_shape = [2]

    obs_list = []
    for _ in range(num_vis_observations):
        obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32))
    if num_vector_observations > 1:
        obs_list.append(
            np.array(num_agents * [num_vector_observations * [1]],
                     dtype=np.float32))
    action_mask = None
    if discrete:
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape
        ]

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    done = np.array(num_agents * [done], dtype=np.bool)
    max_step = np.array(num_agents * [False], dtype=np.bool)
    agent_id = np.arange(num_agents, dtype=np.int32)

    return BatchedStepResult(obs_list, reward, done, max_step, agent_id,
                             action_mask)
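A usage sketch for the discrete case (hypothetical sizes; assumes the surrounding test module's imports):

step = create_mock_batchedstep(num_agents=3, num_vector_observations=4,
                               action_shape=[2, 3], discrete=True)
assert step.obs[0].shape == (3, 4)                              # constant vector observations
assert [m.shape for m in step.action_mask] == [(3, 2), (3, 3)]  # one all-False mask per branch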
Example #20
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=np.bool),
        np.array([], dtype=np.bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents)
    expected = ActionInfo(policy_eval_out["action"], policy_eval_out["value"],
                          policy_eval_out, [0])
    assert result == expected
Example #21
    def evaluate(
        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
    ) -> Dict[str, np.ndarray]:
        """
        Evaluates policy for the agent experiences provided.
        :param batched_step_result: BatchedStepResult object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        feed_dict = {
            self.model.batch_size: batched_step_result.n_agents(),
            self.model.sequence_length: 1,
        }
        if self.use_recurrent:
            if not self.use_continuous_act:
                feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                    global_agent_ids
                )
            feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)

        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out
Example #22
    def _sanitize_step_result(self, step_result):
        """
        Takes as input a BatchedStepResult returned from mlagents_envs and cleans it in order to send back informations about agents in always the same order.
        This order is given by self._agents_id. 2 possible cases :
        1) No agents terminated on the new timestep
        2) One or more agents aterminated on the new timestep
        
        If 1), the step_result doesnt need to be modified.
        If 2), modifications need to be made on the step_result.
        
        For some reasons, when an agent is done, mlagents_envs returns in step_result informations about the done agent as well as informations about a new agent,
        added because the agent terminated. We want to treat these two agents as the same agent. Furthermore, the information about the new agent is located at a
        specific position in the step_result.
        To illustrate this, let's say we receive this step_result at timestep t: [0, 1, 2] and agent 1 terminated at t+1. We will receive : [1, 0, 3, 2].
        Few things happen here:
        -the done agent (1) is put at the first place of the step_result at t+1.
        -the new agent (3) is put at the index that agent 1 was on the last timestep, + 1.
        
        In fact, we can generalize this in the case of n agents being done at timestep t+1: the index of a new agent corresponding to a certain agent which just
        terminated is the index of the agent that terminated on the last timestep + n - m, n being the number of done agents at timestep t+1, and m the number of done agents at timestep t.
        Why n ? Because n agents were "pushed" at the beginning of step_result thus we need to include them to access the new agent.
        Why m ? If agents were done at timestep t, they have been removed from the step_result of timestep t+1. We thus need to substract them to access the new agent
        (it is easier to see this if you take a pencil and a paper and simulate the process)
        
        So, in order to return a step_result which is "sanitized" i.e. return a step_result with the same order as self._agents_id, we need to do a few things :
        -create new_id_order: list of index corresponding to locations of self._agents_id 
                              (if new_id_order = [2, 0, 1], then id of index 0 in self._agents_id is located at index 2 in step_result, id of index 1 at index 0, and id of index 2 at index 1)
        -create index_gym_id_done: list of index of agent ids in self._agents_id that terminated at current timestep (done=True)
        -replace agents which are done by their successor agents in self._agents_id and create agents_new_id, a list of the new agents.
         To do that, we use the previous step result to locate the position of each done agent. We then deduce the position of their successor (index + n, as said above).
         Once we have the position of their successoir, we access their id.
        -create new step_result, which is composed of:
            -obs: observations of all agents. NOTE: mlagents_envs doesnt provide the last observation (S_T) of a done agent, so we return instead the first observation
                  of its successor.
            -rewards: rewards obtained by all agents. We return step_result.reward[new_id_order] in order to rank them in the right order.
            -dones: whether or not agent termianted on the timestep. We return done=step_result.done[new_id_order] in order to rank them in the right order.
            -max_step: whether or not the agent terminated by running out of timesteps. We return step_result.max_step[new_id_order] in order to rank them in the right order.
            -agent_id: list of agent ids.
            -action_mask: not implemented, so None.
            
        """

        # Case 1): simply return step_result.
        # In this case there are no done agents, so the order of step_result is the same
        # as the order of self._agents_id and we can set new_id_order to range(n)
        # ([0, 1, 2, ..., n - 1]).
        if len(self._agents_id) == step_result.n_agents():
            self._previous_step_result = step_result
            self._previous_new_id_order = list(range(len(self._agents_id)))
            self._previous_done_agents = 0

            return step_result

        # Case 2): modify step_result.

        new_id_order = []
        for agent_id in self._agents_id:
            agent_id_index_step_result = list(
                step_result.agent_id).index(agent_id)
            new_id_order.append(agent_id_index_step_result)

        index_gym_id_done = []
        for index, agent_id in enumerate(step_result.agent_id):
            if step_result.done[index]:
                index_gym_id_done.append(self._agents_id.index(agent_id))

        agents_new_id = []
        # Two things happen here: - replace in self._agents_id the ids of done agents with the ids of their successors.
        #                         - create agents_new_id, a list of the successors' ids.
        for index_id_done in index_gym_id_done:
            index_new_agent = self._previous_new_id_order[index_id_done] + len(
                index_gym_id_done) - self._previous_done_agents
            self._agents_id[index_id_done] = list(
                step_result.agent_id)[index_new_agent]
            agents_new_id.append(list(step_result.agent_id)[index_new_agent])

        new_obs = []
        for index, agent_id in enumerate(self._agents_id):
            if agent_id in agents_new_id:
                new_obs.append(step_result.obs[0][self._previous_new_id_order[
                    index_gym_id_done[agents_new_id.index(agent_id)]] +
                                                  len(index_gym_id_done) -
                                                  self._previous_done_agents])
            else:
                new_obs.append(step_result.obs[0][new_id_order[index]])
        new_obs = [np.array(new_obs)]

        self._previous_step_result = step_result
        self._previous_new_id_order = new_id_order
        self._previous_done_agents = len(index_gym_id_done)

        new_step_result = BatchedStepResult(
            obs=new_obs,
            reward=step_result.reward[new_id_order],
            done=step_result.done[new_id_order],
            max_step=step_result.max_step[new_id_order],
            agent_id=step_result.agent_id[new_id_order],
            action_mask=None)

        return new_step_result
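A standalone sketch of the new_id_order permutation described in the docstring, using the same illustrative ids ([0, 1, 2] tracked at timestep t, [1, 0, 3, 2] received at t+1 after agent 1 terminated):

import numpy as np

agents_id = [0, 1, 2]                    # gym-side ordering tracked so far
step_agent_id = np.array([1, 0, 3, 2])   # ordering received from mlagents_envs at t+1
new_id_order = [list(step_agent_id).index(agent_id) for agent_id in agents_id]
assert new_id_order == [1, 0, 3]
rewards = np.array([0.5, 0.25, 0.75, 1.0], dtype=np.float32)
# Indexing any per-agent array with new_id_order brings it back into the gym ordering.
assert rewards[new_id_order].tolist() == [0.25, 0.5, 1.0]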
Example #23
    def add_experiences(
        self,
        batched_step_result: BatchedStepResult,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param batched_step_result: current BatchedStepResult.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self.last_step_result:  # Don't store if agent just reset
                self.last_take_action_outputs[global_id] = take_action_outputs

        for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
            local_id = int(
                _id
            )  # Needed for mypy to pass since ndarray has no content type
            curr_agent_step = batched_step_result.get_agent_step_result(
                local_id)
            global_id = get_global_agent_id(worker_id, local_id)
            stored_agent_step, idx = self.last_step_result.get(
                global_id, (None, None))
            stored_take_action_outputs = self.last_take_action_outputs.get(
                global_id, None)

            if stored_agent_step is not None and stored_take_action_outputs is not None:
                # We know the step is from the same worker, so use the local agent id.
                obs = stored_agent_step.obs
                if not stored_agent_step.done:
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([global_id
                                                                ])[0, :]
                    else:
                        memory = None

                    done = curr_agent_step.done
                    max_step = curr_agent_step.max_step

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][
                            idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_mask = stored_agent_step.action_mask
                    prev_action = self.policy.retrieve_previous_action(
                        [global_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=curr_agent_step.reward,
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_mask,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[global_id].append(experience)
                    self.episode_rewards[global_id] += curr_agent_step.reward
                if (curr_agent_step.done or
                    (len(self.experience_buffers[global_id]) >=
                     self.max_trajectory_length)) and len(
                         self.experience_buffers[global_id]) > 0:
                    # Make next AgentExperience
                    next_obs = curr_agent_step.obs
                    trajectory = Trajectory(
                        steps=self.experience_buffers[global_id],
                        agent_id=global_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[global_id] = []
                    if curr_agent_step.done:
                        # Record episode length for agents which have had at least
                        # 1 step. Done after reset ignored.
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(global_id, 0),
                        )
                elif not curr_agent_step.done:
                    self.episode_steps[global_id] += 1

            # Index is needed to grab from last_take_action_outputs
            self.last_step_result[global_id] = (
                curr_agent_step,
                batched_step_result.agent_id_to_index[_id],
            )
            # Delete all done agents, regardless of whether they had a 0-length episode.
            if curr_agent_step.done:
                self._clean_agent_data(global_id)

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self.last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"])
Example #24
    def add_experiences(
        self,
        batched_step_result: BatchedStepResult,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param batched_step_result: current BatchedStepResult.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)
            self.stats_reporter.add_stat("Policy/Learning Rate",
                                         take_action_outputs["learning_rate"])

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            self.last_take_action_outputs[global_id] = take_action_outputs

        for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
            local_id = int(
                _id
            )  # Needed for mypy to pass since ndarray has no content type
            curr_agent_step = batched_step_result.get_agent_step_result(
                local_id)
            global_id = get_global_agent_id(worker_id, local_id)
            stored_step = self.last_step_result.get(global_id, None)
            stored_take_action_outputs = self.last_take_action_outputs.get(
                global_id, None)
            if stored_step is not None and stored_take_action_outputs is not None:
                # We know the step is from the same worker, so use the local agent id.
                stored_agent_step = stored_step.get_agent_step_result(local_id)
                idx = stored_step.agent_id_to_index[local_id]
                obs = stored_agent_step.obs
                if not stored_agent_step.done:
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([global_id
                                                                ])[0, :]
                    else:
                        memory = None

                    done = curr_agent_step.done
                    max_step = curr_agent_step.max_step

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][
                            idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_mask = stored_agent_step.action_mask
                    prev_action = self.policy.retrieve_previous_action(
                        [global_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=curr_agent_step.reward,
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_mask,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[global_id].append(experience)
                    self.episode_rewards[global_id] += curr_agent_step.reward
                if (curr_agent_step.done or
                    (len(self.experience_buffers[global_id]) >=
                     self.max_trajectory_length)) and len(
                         self.experience_buffers[global_id]) > 0:
                    # Make next AgentExperience
                    next_obs = curr_agent_step.obs
                    trajectory = Trajectory(
                        steps=self.experience_buffers[global_id],
                        agent_id=global_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[global_id] = []
                    if curr_agent_step.done:
                        self.stats_reporter.add_stat(
                            "Environment/Cumulative Reward",
                            self.episode_rewards.get(global_id, 0),
                        )
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(global_id, 0),
                        )
                        del self.episode_steps[global_id]
                        del self.episode_rewards[global_id]
                elif not curr_agent_step.done:
                    self.episode_steps[global_id] += 1

            self.last_step_result[global_id] = batched_step_result

        if "action" in take_action_outputs:
            self.policy.save_previous_action(previous_action.agent_ids,
                                             take_action_outputs["action"])