Example #1
    def add_experiences(
        self,
        decision_steps: DecisionSteps,
        terminal_steps: TerminalSteps,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param decision_steps: current DecisionSteps.
        :param terminal_steps: current TerminalSteps.
        :param worker_id: the unique id of the environment worker these steps
            came from; used to construct globally unique agent ids.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self._stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self._last_step_result:  # Don't store if agent just reset
                self._last_take_action_outputs[global_id] = take_action_outputs

        # Iterate over all the terminal steps; first gather all the group obs
        # and then create the AgentExperiences/Trajectories. _add_group_status_and_obs
        # stores group statuses in a common data structure, self._group_status.
        for terminal_step in terminal_steps.values():
            self._add_group_status_and_obs(terminal_step, worker_id)
        for terminal_step in terminal_steps.values():
            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id]
            )
            # Clear the last seen group obs when agents die.
            self._clear_group_status_and_obs(global_id)

        # Iterate over all the decision steps; first gather all the group obs
        # and then create the trajectories. _add_group_status_and_obs
        # stores group statuses in a common data structure, self._group_status.
        for ongoing_step in decision_steps.values():
            self._add_group_status_and_obs(ongoing_step, worker_id)
        for ongoing_step in decision_steps.values():
            local_id = ongoing_step.agent_id
            self._process_step(
                ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id]
            )

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self._last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"]
                    )
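For context, add_experiences is called once per environment step, paired with the action that produced those steps. A minimal driver loop might look like the sketch below; env, policy, processor, behavior_name, worker_id, and num_steps are hypothetical stand-ins, while the env calls follow the mlagents_envs UnityEnvironment API.

# Illustrative sketch of a per-step driver loop (hypothetical names, not the
# actual ML-Agents trainer loop).
last_action_info = ActionInfo.empty()
for _ in range(num_steps):  # num_steps: assumed step budget
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # Pair the newly observed steps with the action that produced them.
    processor.add_experiences(decision_steps, terminal_steps, worker_id, last_action_info)
    last_action_info = policy.get_action(decision_steps, worker_id)
    if len(decision_steps) > 0:
        env.set_actions(behavior_name, last_action_info.env_action)
    env.step()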
Example #2
    def add_experiences(
        self,
        decision_steps: DecisionSteps,
        terminal_steps: TerminalSteps,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param decision_steps: current DecisionSteps.
        :param terminal_steps: current TerminalSteps.
        :param worker_id: the unique id of the environment worker these steps
            came from; used to construct globally unique agent ids.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self.last_step_result:  # Don't store if agent just reset
                self.last_take_action_outputs[global_id] = take_action_outputs

        # Iterate over all the terminal steps
        for terminal_step in terminal_steps.values():
            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
            )
        # Iterate over all the decision steps
        for ongoing_step in decision_steps.values():
            local_id = ongoing_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
            )

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self.last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"]
                    )
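Both variants rely on get_global_agent_id to keep agent ids unique across parallel environment workers. The real helper lives in mlagents.trainers.behavior_id_utils; a minimal sketch of the idea (the exact string format is an assumption):

def get_global_agent_id(worker_id: int, agent_id: int) -> str:
    # Namespace the per-environment agent id by the worker id so that ids
    # stay unique when several environment workers run in parallel.
    return f"${worker_id}-{agent_id}"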
Example #3
    def get_action(self,
                   decision_requests: DecisionSteps,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param decision_requests: The DecisionSteps from the environment for agents requesting a decision.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add_experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids
        )

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
Example #4
    def get_action(self,
                   decision_requests: DecisionSteps,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param decision_requests: The DecisionSteps from the environment for agents requesting a decision.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add_experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(decision_requests, global_agent_ids)  # pylint: disable=assignment-from-no-return
        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        self.check_nan_action(run_out.get("action"))
        return ActionInfo(
            action=run_out.get("action"),
            env_action=run_out.get("env_action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=list(decision_requests.agent_id),
        )
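This variant adds env_action and check_nan_action on top of Example #3. ActionInfo itself is a small NamedTuple; the sketch below is inferred from the call sites in these examples and is an approximation, not the canonical definition.

from typing import Any, Dict, List, NamedTuple

class ActionInfo(NamedTuple):
    # Field set inferred from the surrounding examples (an approximation).
    action: Any            # ActionTuple fed back into the trainer
    env_action: Any        # ActionTuple actually sent to the environment
    value: Any
    outputs: Dict[str, Any]
    agent_ids: List[int]

    @staticmethod
    def empty() -> "ActionInfo":
        return ActionInfo(None, None, None, {}, [])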
Example #5
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }

    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
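The test assumes a create_mock_policy helper. A plausible sketch, with return shapes guessed from how AgentProcessor consumes them (both the helper body and the shapes are assumptions):

import numpy as np
from unittest import mock

def create_mock_policy():
    # Hypothetical helper: a Mock standing in for a Policy with just enough
    # behavior for AgentProcessor to run.
    mock_policy = mock.Mock()
    mock_policy.use_recurrent = False
    mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
    mock_policy.retrieve_previous_memories.return_value = np.zeros((1, 1), dtype=np.float32)
    return mock_policy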
Example #6
    def get_action(
        self, decision_requests: DecisionSteps, worker_id: int = 0
    ) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param decision_requests: The DecisionSteps from the environment for agents requesting a decision.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add_experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids
        )

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        # For compatibility with buffer changes for hybrid action support
        if "log_probs" in run_out:
            log_probs_tuple = LogProbsTuple()
            if self.behavior_spec.action_spec.is_continuous():
                log_probs_tuple.add_continuous(run_out["log_probs"])
            else:
                log_probs_tuple.add_discrete(run_out["log_probs"])
            run_out["log_probs"] = log_probs_tuple
        if "action" in run_out:
            action_tuple = ActionTuple()
            env_action_tuple = ActionTuple()
            if self.behavior_spec.action_spec.is_continuous():
                action_tuple.add_continuous(run_out["pre_action"])
                env_action_tuple.add_continuous(run_out["action"])
            else:
                action_tuple.add_discrete(run_out["action"])
                env_action_tuple.add_discrete(run_out["action"])
            run_out["action"] = action_tuple
            run_out["env_action"] = env_action_tuple
        self.check_nan_action(run_out.get("action"))
        return ActionInfo(
            action=run_out.get("action"),
            env_action=run_out.get("env_action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
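The compatibility block above wraps raw arrays into ActionTuple/LogProbsTuple so downstream buffer code can treat continuous and discrete actions uniformly. As a quick illustration of the ActionTuple semantics assumed here (behavior as in recent mlagents_envs versions):

import numpy as np
from mlagents_envs.base_env import ActionTuple

# One agent, two continuous action dimensions.
at = ActionTuple()
at.add_continuous(np.array([[0.1, -0.3]], dtype=np.float32))
assert at.continuous.shape == (1, 2)
assert at.discrete.shape == (1, 0)  # the discrete branch is padded to empty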
Example #7
    def _add_group_status_and_obs(
        self, step: Union[TerminalStep, DecisionStep], worker_id: int
    ) -> None:
        """
        Takes a TerminalStep or DecisionStep and adds the information in it
        to self.group_status. This information can then be retrieved
        when constructing trajectories to get the status of group mates. Also stores the current
        observation into current_group_obs, to be used to get the next group observations
        for bootstrapping.
        :param step: TerminalStep or DecisionStep
        :param worker_id: Worker ID of this particular environment. Used to generate a
            global group id.
        """
        global_agent_id = get_global_agent_id(worker_id, step.agent_id)
        stored_decision_step, idx = self._last_step_result.get(
            global_agent_id, (None, None)
        )
        stored_take_action_outputs = self._last_take_action_outputs.get(
            global_agent_id, None
        )
        if stored_decision_step is not None and stored_take_action_outputs is not None:
            # 0, the default group_id, means that the agent doesn't belong to an agent group.
            # If 0, don't add any groupmate information.
            if step.group_id > 0:
                global_group_id = get_global_group_id(worker_id, step.group_id)
                stored_actions = stored_take_action_outputs["action"]
                action_tuple = ActionTuple(
                    continuous=stored_actions.continuous[idx],
                    discrete=stored_actions.discrete[idx],
                )
                group_status = AgentStatus(
                    obs=stored_decision_step.obs,
                    reward=step.reward,
                    action=action_tuple,
                    done=isinstance(step, TerminalStep),
                )
                self._group_status[global_group_id][global_agent_id] = group_status
                self._current_group_obs[global_group_id][global_agent_id] = step.obs
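Analogous to get_global_agent_id, get_global_group_id namespaces group ids by worker so group state can be shared safely across parallel environments. A sketch of the idea (the format is an assumption):

def get_global_group_id(worker_id: int, group_id: int) -> str:
    # Same pattern as get_global_agent_id, applied to the group id.
    return f"${worker_id}-{group_id}"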
Example #8
    def _process_step(
        self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
    ) -> None:
        terminated = isinstance(step, TerminalStep)
        global_agent_id = get_global_agent_id(worker_id, step.agent_id)
        global_group_id = get_global_group_id(worker_id, step.group_id)
        stored_decision_step, idx = self._last_step_result.get(
            global_agent_id, (None, None)
        )
        stored_take_action_outputs = self._last_take_action_outputs.get(
            global_agent_id, None
        )
        if not terminated:
            # Index is needed to grab from last_take_action_outputs
            self._last_step_result[global_agent_id] = (step, index)

        # This state is the consequence of a past action
        if stored_decision_step is not None and stored_take_action_outputs is not None:
            obs = stored_decision_step.obs
            if self.policy.use_recurrent:
                memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
            else:
                memory = None
            done = terminated  # Whether this step ends the episode
            interrupted = step.interrupted if terminated else False
            # Add the outputs of the last eval
            stored_actions = stored_take_action_outputs["action"]
            action_tuple = ActionTuple(
                continuous=stored_actions.continuous[idx],
                discrete=stored_actions.discrete[idx],
            )
            stored_action_probs = stored_take_action_outputs["log_probs"]
            log_probs_tuple = LogProbsTuple(
                continuous=stored_action_probs.continuous[idx],
                discrete=stored_action_probs.discrete[idx],
            )
            action_mask = stored_decision_step.action_mask
            prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

            # Assemble teammate_obs. If none saved, then it will be an empty list.
            group_statuses = []
            for _id, _mate_status in self._group_status[global_group_id].items():
                if _id != global_agent_id:
                    group_statuses.append(_mate_status)

            experience = AgentExperience(
                obs=obs,
                reward=step.reward,
                done=done,
                action=action_tuple,
                action_probs=log_probs_tuple,
                action_mask=action_mask,
                prev_action=prev_action,
                interrupted=interrupted,
                memory=memory,
                group_status=group_statuses,
                group_reward=step.group_reward,
            )
            # Add the value outputs if needed
            self._experience_buffers[global_agent_id].append(experience)
            self._episode_rewards[global_agent_id] += step.reward
            if not terminated:
                self._episode_steps[global_agent_id] += 1

            # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
            if (
                len(self._experience_buffers[global_agent_id])
                >= self._max_trajectory_length
                or terminated
            ):
                next_obs = step.obs
                next_group_obs = []
                for _id, _obs in self._current_group_obs[global_group_id].items():
                    if _id != global_agent_id:
                        next_group_obs.append(_obs)

                trajectory = Trajectory(
                    steps=self._experience_buffers[global_agent_id],
                    agent_id=global_agent_id,
                    next_obs=next_obs,
                    next_group_obs=next_group_obs,
                    behavior_id=self._behavior_id,
                )
                for traj_queue in self._trajectory_queues:
                    traj_queue.put(trajectory)
                self._experience_buffers[global_agent_id] = []
            if terminated:
                # Record episode length.
                self._stats_reporter.add_stat(
                    "Environment/Episode Length",
                    self._episode_steps.get(global_agent_id, 0),
                )
                self._clean_agent_data(global_agent_id)
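When an agent terminates, _clean_agent_data drops all per-agent bookkeeping so the id can be reused. A simplified sketch based on the dictionaries referenced above (the real implementation also clears policy-side memories):

    def _clean_agent_data(self, global_id: str) -> None:
        # Simplified sketch: remove every per-agent entry created during the episode.
        for store in (
            self._experience_buffers,
            self._last_take_action_outputs,
            self._last_step_result,
            self._episode_steps,
            self._episode_rewards,
        ):
            store.pop(global_id, None)
        # The policy also forgets the agent's previous action.
        self.policy.remove_previous_action([global_id])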
def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1, observation_shapes=[(8,)], action_shape=2
    )
    mock_done_decision_step, mock_done_terminal_step = mb.create_mock_steps(
        num_agents=1, observation_shapes=[(8,)], action_shape=2, done=True
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_decision_step, mock_terminal_step, 0,
                              ActionInfo.empty())

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(mock_decision_step, mock_terminal_step,
                                      _ep, fake_action_info)
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(mock_done_decision_step,
                                  mock_done_terminal_step, _ep,
                                  fake_action_info)
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0

    # check that steps with immediate dones don't add to dicts
    processor.add_experiences(mock_done_decision_step, mock_done_terminal_step,
                              0, ActionInfo.empty())
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0
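Both tests follow the same pattern: drive AgentProcessor with mocked steps, then assert that its per-agent dictionaries are empty. They can be run with pytest, e.g. pytest -k "test_end_episode or test_agent_deletion" from the test directory.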