Example #1
    def evaluate(
        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
    ) -> Dict[str, Any]:
        """
        Evaluates policy for the agent experiences provided.
        :param batched_step_result: BatchedStepResult object containing inputs.
        :param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
        :return: Outputs from network as defined by self.inference_dict.
        """
        feed_dict = {
            self.model.batch_size: batched_step_result.n_agents(),
            self.model.sequence_length: 1,
        }
        epsilon = None
        if self.use_recurrent:
            if not self.use_continuous_act:
                feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                    global_agent_ids
                )
            feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
        if self.use_continuous_act:
            epsilon = np.random.normal(
                size=(batched_step_result.n_agents(), self.model.act_size[0])
            )
            feed_dict[self.model.epsilon] = epsilon
        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out
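
A minimal usage sketch for Example #1. It assumes BatchedStepResult can be imported from mlagents_envs.base_env (the 0.14-era API) and that policy is an instance of the class these methods belong to; the shapes, ids, and the run_inference helper are hypothetical.

import numpy as np
from mlagents_envs.base_env import BatchedStepResult  # assumed import path

# Two agents, one 8-dimensional vector observation each (shapes are illustrative).
step_result = BatchedStepResult(
    obs=[np.zeros((2, 8), dtype=np.float32)],
    reward=np.zeros(2, dtype=np.float32),
    done=np.array([False, False]),
    max_step=np.array([False, False]),
    agent_id=np.array([0, 1]),
    action_mask=None,
)

# Global ids prefix the worker id, mirroring the f"${prefix}-{id}" format seen in Example #4.
global_agent_ids = [f"$0-{int(a)}" for a in step_result.agent_id]

def run_inference(policy):
    # `policy` is assumed to expose the evaluate() method from Example #1.
    return policy.evaluate(step_result, global_agent_ids)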
Example #2
    def _sanitize_info(self,
                       step_result: BatchedStepResult) -> BatchedStepResult:
        n_extra_agents = step_result.n_agents() - self._n_agents
        if n_extra_agents < 0:
            # In this case, some Agents did not request a decision when expected
            raise UnityGymException(
                "The number of agents in the scene does not match the expected number."
            )

        if step_result.n_agents() - sum(step_result.done) != self._n_agents:
            raise UnityGymException(
                "The number of agents in the scene does not match the expected number."
            )

        for index, agent_id in enumerate(step_result.agent_id):
            if step_result.done[index]:
                self.agent_mapper.mark_agent_done(agent_id,
                                                  step_result.reward[index])

        # Set the new AgentDone flags to True
        # Note that the corresponding agent_id that gets marked done will be different
        # than the original agent that was done, but this is OK since the gym interface
        # only cares about the ordering.
        for index, agent_id in enumerate(step_result.agent_id):
            if not self._previous_step_result.contains_agent(agent_id):
                if step_result.done[index]:
                    # If the Agent is already done (e.g. it ended its episode twice in one step),
                    # don't try to register it here.
                    continue
                # Register this agent, and get the reward of the previous agent that
                # was in its index, so that we can return it to the gym.
                last_reward = self.agent_mapper.register_new_agent_id(agent_id)
                step_result.done[index] = True
                step_result.reward[index] = last_reward

        self._previous_step_result = step_result  # store the new original

        # Get a permutation of the agent IDs so that a given ID stays in the same
        # index as where it was first seen.
        new_id_order = self.agent_mapper.get_id_permutation(
            list(step_result.agent_id))

        _mask: Optional[List[np.array]] = None
        if step_result.action_mask is not None:
            _mask = []
            for mask_index in range(len(step_result.action_mask)):
                _mask.append(step_result.action_mask[mask_index][new_id_order])
        new_obs: List[np.array] = []
        for obs_index in range(len(step_result.obs)):
            new_obs.append(step_result.obs[obs_index][new_id_order])
        return BatchedStepResult(
            obs=new_obs,
            reward=step_result.reward[new_id_order],
            done=step_result.done[new_id_order],
            max_step=step_result.max_step[new_id_order],
            agent_id=step_result.agent_id[new_id_order],
            action_mask=_mask,
        )
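
The final reordering in Example #2 is plain numpy fancy indexing: the permutation from agent_mapper.get_id_permutation() is applied to every per-agent array so that each agent id stays at the index where it was first seen. A toy illustration of that indexing step (the permutation is hard-coded here instead of coming from the mapper):

import numpy as np

# The batch arrived ordered [agent 2, agent 0, agent 1], but the wrapper first
# saw the agents in the order [0, 1, 2].
agent_id = np.array([2, 0, 1])
reward = np.array([0.3, 0.1, 0.2], dtype=np.float32)

# Hard-coded stand-in for what get_id_permutation() would return for this batch.
new_id_order = [1, 2, 0]

print(agent_id[new_id_order])  # [0 1 2]
print(reward[new_id_order])    # [0.1 0.2 0.3]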
Example #3
    def get_action(
        self, batched_step_result: BatchedStepResult, worker_id: int = 0
    ) -> ActionInfo:
        """
        Decides actions given observations information, and takes them in environment.
        :param batched_step_result: A dictionary of brain names and BatchedStepResult from environment.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the BatchedStepResult came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
        """
        if batched_step_result.n_agents() == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in batched_step_result.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            batched_step_result, global_agent_ids
        )

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=batched_step_result.agent_id,
        )
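
A short sketch of how a caller might drive Example #3, assuming policy implements get_action() as above and step_result is a BatchedStepResult like the one built after Example #1; the act() helper itself is hypothetical.

def act(policy, step_result, worker_id=0):
    # Empty batches are handled inside get_action() via ActionInfo.empty().
    action_info = policy.get_action(step_result, worker_id=worker_id)
    # The returned actions are aligned with step_result.agent_id, and the full
    # network outputs remain available under action_info.outputs.
    return action_info.action, action_info.agent_ids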
Example #4
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment: each observation should "
                "be either a vector of floats or a PNG image")
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices],
                                 axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)),
                   dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones((n_agents, np.sum(group_spec.discrete_action_branches)),
                       dtype=np.float32)
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [
            f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id
        ]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
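
Example #4 classifies observations purely by rank: rank-2 arrays (n_agents, vector_size) are treated as vector observations and rank-4 arrays (n_agents, height, width, channels) as visual observations, with everything else rejected. A small self-contained illustration of that split (the shapes are made up):

import numpy as np

n_agents = 3
obs = [
    np.zeros((n_agents, 6), dtype=np.float32),          # vector observation (rank 2)
    np.zeros((n_agents, 84, 84, 3), dtype=np.float32),  # visual observation (rank 4)
    np.zeros((n_agents, 10), dtype=np.float32),         # another vector observation
]

vec_obs_indices = [i for i, o in enumerate(obs) if o.ndim == 2]
vis_obs_indices = [i for i, o in enumerate(obs) if o.ndim == 4]

# Vector observations are concatenated along the feature axis, as in Example #4.
vec_obs = np.concatenate([obs[i] for i in vec_obs_indices], axis=1)
print(vec_obs.shape)    # (3, 16)
print(vis_obs_indices)  # [1]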
Example #5
    def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
        n_extra_agents = step_result.n_agents() - self._n_agents
        if n_extra_agents < 0 or n_extra_agents > self._n_agents:
            # In this case, some Agents did not request a decision when expected
            # or too many requested a decision
            raise UnityGymException(
                "The number of agents in the scene does not match the expected number."
            )

        # remove the done Agents
        indices_to_keep: List[int] = []
        for index, is_done in enumerate(step_result.done):
            if not is_done:
                indices_to_keep.append(index)

        # Set the new AgentDone flags to True
        # Note that the corresponding agent_id that gets marked done will be different
        # than the original agent that was done, but this is OK since the gym interface
        # only cares about the ordering.
        for index, agent_id in enumerate(step_result.agent_id):
            if not self._previous_step_result.contains_agent(agent_id):
                step_result.done[index] = True
            if agent_id in self._done_agents:
                step_result.done[index] = True
        self._done_agents = set()
        self._previous_step_result = step_result  # store the new original

        _mask: Optional[List[np.array]] = None
        if step_result.action_mask is not None:
            _mask = []
            for mask_index in range(len(step_result.action_mask)):
                _mask.append(step_result.action_mask[mask_index][indices_to_keep])
        new_obs: List[np.array] = []
        for obs_index in range(len(step_result.obs)):
            new_obs.append(step_result.obs[obs_index][indices_to_keep])
        return BatchedStepResult(
            obs=new_obs,
            reward=step_result.reward[indices_to_keep],
            done=step_result.done[indices_to_keep],
            max_step=step_result.max_step[indices_to_keep],
            agent_id=step_result.agent_id[indices_to_keep],
            action_mask=_mask,
        )
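
Unlike Example #2, which permutes the batch, this variant simply drops the agents that finished during the step by filtering every per-agent array with indices_to_keep. A toy illustration of that filtering:

import numpy as np

done = np.array([False, True, False])
reward = np.array([1.0, 5.0, 2.0], dtype=np.float32)
agent_id = np.array([10, 11, 12])

# Keep only the agents that did not finish this step, as Example #5 does.
indices_to_keep = [i for i, is_done in enumerate(done) if not is_done]

print(agent_id[indices_to_keep])  # [10 12]
print(reward[indices_to_keep])    # [1. 2.]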
Example #6
    def evaluate(
        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
    ) -> Dict[str, np.ndarray]:
        """
        Evaluates policy for the agent experiences provided.
        :param batched_step_result: BatchedStepResult object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        feed_dict = {
            self.model.batch_size: batched_step_result.n_agents(),
            self.model.sequence_length: 1,
        }
        if self.use_recurrent:
            if not self.use_continuous_act:
                feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                    global_agent_ids
                )
            feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)

        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out
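
Example #6 is the same evaluation path as Example #1 without the continuous-action epsilon. When use_recurrent is set, memories round-trip across steps: retrieve_memories() fills memory_in before the session run, and the caller writes the resulting "memory_out" back with save_memories(), as Example #3 does. A hedged sketch of that loop, assuming a policy object exposing the methods shown in these examples:

def recurrent_step(policy, step_result, global_agent_ids):
    # evaluate() pulls the stored memories for these agents into memory_in (Example #6) ...
    run_out = policy.evaluate(step_result, global_agent_ids)
    # ... and the new recurrent state is persisted for the next step (Example #3).
    policy.save_memories(global_agent_ids, run_out.get("memory_out"))
    return run_out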