def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
    )
    mock_done_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
        done=True,
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(mock_step, _ep, fake_action_info)
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(mock_done_step, _ep, fake_action_info)
        # After the done step, the agent should be deleted and its previous action removed
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
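
The expected calls above are keyed by get_global_agent_id(worker_id, agent_id), whose implementation is not shown on this page. Any injective mapping of the pair to a single string key would satisfy these examples; a minimal stand-in (an assumption, not necessarily the library's code) could be:

def get_global_agent_id(worker_id: int, agent_id: int) -> str:
    # Hypothetical stand-in: combine the worker id and the local agent id
    # into one hashable key that is unique across environment workers.
    return f"${worker_id}-{agent_id}"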
Example #2
    def add_experiences(
        self,
        decision_steps: DecisionSteps,
        terminal_steps: TerminalSteps,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param decision_steps: current DecisionSteps.
        :param terminal_steps: current TerminalSteps.
        :param worker_id: the worker id of the environment the steps came from; used to
            build globally unique agent ids.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self.last_step_result:  # Don't store if agent just reset
                self.last_take_action_outputs[global_id] = take_action_outputs

        # Iterate over all the terminal steps
        for terminal_step in terminal_steps.values():
            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
            )
        # Iterate over all the decision steps
        for ongoing_step in decision_steps.values():
            local_id = ongoing_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
            )

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self.last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"]
                    )
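
The add_experiences above takes DecisionSteps and TerminalSteps separately. Below is a minimal sketch of how it might be driven each environment step; run_one_step, behavior_name, processor, and policy are assumed stand-ins, with env implementing the mlagents_envs BaseEnv API.

from mlagents_envs.base_env import BaseEnv


def run_one_step(env: BaseEnv, behavior_name: str, policy, processor, worker_id: int = 0) -> None:
    """Assumed wiring for one step: act on DecisionSteps, record both kinds of steps."""
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # The policy only acts for agents that requested a decision this step.
    action_info = policy.get_action(decision_steps, worker_id=worker_id)
    # The processor sees both ongoing and terminal agents so it can close out episodes.
    processor.add_experiences(decision_steps, terminal_steps, worker_id, action_info)
    if len(decision_steps) > 0:
        env.set_actions(behavior_name, action_info.action)
    env.step()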
Example #3
    def get_action(self,
                   batched_step_result: BatchedStepResult,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions for the agents in the given BatchedStepResult based on their observations.
        :param batched_step_result: The current BatchedStepResult from the environment.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the BatchedStepResult came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing the action, memories, and values, plus an outputs object
            to be passed to add_experiences.
        """
        if batched_step_result.n_agents() == 0:
            return ActionInfo.empty()
        agents_done = [
            agent for agent, done in zip(batched_step_result.agent_id,
                                         batched_step_result.done) if done
        ]
        self.remove_memories(agents_done)
        self.remove_previous_action(agents_done)

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in batched_step_result.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            batched_step_result, global_agent_ids)
        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        action_info = ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=batched_step_result.agent_id,
        )
        return action_info
Example #4
    def get_action(self,
                   decision_requests: DecisionSteps,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions for the agents in the given DecisionSteps based on their observations.
        :param decision_requests: The current DecisionSteps from the environment.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing the action, memories, and values, plus an outputs object
            to be passed to add_experiences.
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids)

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
Example #5
    def add_experiences(
        self,
        batched_step_result: BatchedStepResult,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param batched_step_result: current BatchedStepResult.
        :param worker_id: the worker id of the environment the BatchedStepResult came from; used to
            build globally unique agent ids.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)
            self.stats_reporter.add_stat("Policy/Learning Rate",
                                         take_action_outputs["learning_rate"])

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            self.last_take_action_outputs[global_id] = take_action_outputs

        for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
            local_id = int(_id)  # int() needed for mypy; ndarray has no element type
            curr_agent_step = batched_step_result.get_agent_step_result(local_id)
            global_id = get_global_agent_id(worker_id, local_id)
            stored_step = self.last_step_result.get(global_id, None)
            stored_take_action_outputs = self.last_take_action_outputs.get(
                global_id, None)
            if stored_step is not None and stored_take_action_outputs is not None:
                # We know the step is from the same worker, so use the local agent id.
                stored_agent_step = stored_step.get_agent_step_result(local_id)
                idx = stored_step.agent_id_to_index[local_id]
                obs = stored_agent_step.obs
                if not stored_agent_step.done:
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([global_id])[0, :]
                    else:
                        memory = None

                    done = curr_agent_step.done
                    max_step = curr_agent_step.max_step

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_mask = stored_agent_step.action_mask
                    prev_action = self.policy.retrieve_previous_action(
                        [global_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=curr_agent_step.reward,
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_mask,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[global_id].append(experience)
                    self.episode_rewards[global_id] += curr_agent_step.reward
                if (
                    curr_agent_step.done
                    or len(self.experience_buffers[global_id]) >= self.max_trajectory_length
                ) and len(self.experience_buffers[global_id]) > 0:
                    # Make next AgentExperience
                    next_obs = curr_agent_step.obs
                    trajectory = Trajectory(
                        steps=self.experience_buffers[global_id],
                        agent_id=global_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[global_id] = []
                    if curr_agent_step.done:
                        self.stats_reporter.add_stat(
                            "Environment/Cumulative Reward",
                            self.episode_rewards.get(global_id, 0),
                        )
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(global_id, 0),
                        )
                        del self.episode_steps[global_id]
                        del self.episode_rewards[global_id]
                elif not curr_agent_step.done:
                    self.episode_steps[global_id] += 1

            self.last_step_result[global_id] = batched_step_result

        if "action" in take_action_outputs:
            self.policy.save_previous_action(previous_action.agent_ids,
                                             take_action_outputs["action"])
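
Finished trajectories are pushed onto self.trajectory_queues, which the test at the top of this page populates via publish_trajectory_queue. A plausible sketch of that method (an assumption, not necessarily the library's exact code):

    def publish_trajectory_queue(self, trajectory_queue) -> None:
        # Assumed implementation: remember the queue so that every finished
        # Trajectory can be put on all registered queues (see the loops above).
        self.trajectory_queues.append(trajectory_queue)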
Example #6
    def add_experiences(
        self,
        batched_step_result: BatchedStepResult,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param batched_step_result: current BatchedStepResult.
        :param worker_id: the worker id of the environment the BatchedStepResult came from; used to
            build globally unique agent ids.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self.last_step_result:  # Don't store if agent just reset
                self.last_take_action_outputs[global_id] = take_action_outputs

        for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
            local_id = int(_id)  # int() needed for mypy; ndarray has no element type
            curr_agent_step = batched_step_result.get_agent_step_result(local_id)
            global_id = get_global_agent_id(worker_id, local_id)
            stored_agent_step, idx = self.last_step_result.get(
                global_id, (None, None))
            stored_take_action_outputs = self.last_take_action_outputs.get(
                global_id, None)

            if stored_agent_step is not None and stored_take_action_outputs is not None:
                # We know the step is from the same worker, so use the local agent id.
                obs = stored_agent_step.obs
                if not stored_agent_step.done:
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([global_id])[0, :]
                    else:
                        memory = None

                    done = curr_agent_step.done
                    max_step = curr_agent_step.max_step

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_mask = stored_agent_step.action_mask
                    prev_action = self.policy.retrieve_previous_action(
                        [global_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=curr_agent_step.reward,
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_mask,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[global_id].append(experience)
                    self.episode_rewards[global_id] += curr_agent_step.reward
                if (
                    curr_agent_step.done
                    or len(self.experience_buffers[global_id]) >= self.max_trajectory_length
                ) and len(self.experience_buffers[global_id]) > 0:
                    # Make next AgentExperience
                    next_obs = curr_agent_step.obs
                    trajectory = Trajectory(
                        steps=self.experience_buffers[global_id],
                        agent_id=global_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[global_id] = []
                    if curr_agent_step.done:
                        # Record episode length for agents which have had at least
                        # 1 step. Done after reset ignored.
                        self.stats_reporter.add_stat(
                            "Environment/Episode Length",
                            self.episode_steps.get(global_id, 0),
                        )
                elif not curr_agent_step.done:
                    self.episode_steps[global_id] += 1

            # Index is needed to grab from last_take_action_outputs
            self.last_step_result[global_id] = (
                curr_agent_step,
                batched_step_result.agent_id_to_index[_id],
            )
            # Delete all done agents, regardless of whether they had a 0-length episode.
            if curr_agent_step.done:
                self._clean_agent_data(global_id)

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self.last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"])
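
The add_experiences above delegates the removal of a done agent's bookkeeping to _clean_agent_data, which is not shown here. The test at the top of this page asserts that every per-agent dictionary ends up empty and that the policy forgets the agent's previous action, so a plausible sketch (an assumption, not necessarily the library's exact code) is:

    def _clean_agent_data(self, global_id: str) -> None:
        # Assumed cleanup consistent with the assertions in the test above:
        # drop every per-agent entry and make the policy forget the agent.
        self.experience_buffers.pop(global_id, None)
        self.last_take_action_outputs.pop(global_id, None)
        self.last_step_result.pop(global_id, None)
        self.episode_steps.pop(global_id, None)
        self.episode_rewards.pop(global_id, None)
        self.policy.remove_previous_action([global_id])
        self.policy.remove_memories([global_id])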