def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

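    # Minimal stand-in for the dict a policy's evaluate() would return;
    # only the keys the processor reads are populated.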
    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
    )
    mock_done_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
        done=True,
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(mock_step, _ep, fake_action_info)
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(mock_done_step, _ep, fake_action_info)
        # Make sure we don't add experiences from prior agents after they are done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

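    # assert_has_calls passes if these calls appear, in this order, among the
    # mock's recorded calls (other calls may come before or after them).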
    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
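
A note on the helpers used above: create_mock_policy is not shown in these examples. Below is a plausible minimal version, inferred from how the tests use the mock; the exact helper in the ML-Agents test suite may differ.

import numpy as np
from unittest import mock

def create_mock_policy():
    mock_policy = mock.Mock()
    # Attributes the AgentProcessor reads from the policy in these tests.
    mock_policy.reward_signals = {}
    mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
    mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.float32)
    # save_previous_action / remove_previous_action are auto-created Mock
    # attributes, which is what makes the assert_has_calls checks above work.
    return mock_policy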
Example #2
    def get_action(self,
                   decision_requests: DecisionSteps,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param worker_id: In parallel environment training, the unique id of the environment
            worker that the DecisionSteps came from. Used to construct a globally unique id
            for each agent.
        :param decision_requests: The DecisionSteps from the environment, containing the
            observations of the agents requesting a decision.
        :return: an ActionInfo containing the action, memories, and values, plus an
            outputs object to be passed to add_experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(decision_requests, global_agent_ids)  # pylint: disable=assignment-from-no-return
        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        self.check_nan_action(run_out.get("action"))
        return ActionInfo(
            action=run_out.get("action"),
            env_action=run_out.get("env_action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=list(decision_requests.agent_id),
        )
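
get_global_agent_id combines the worker id and the per-worker agent id into a single key, so that agents coming from different parallel environment workers never collide. A minimal sketch of the idea follows; the exact string format is an assumption, and the real helper lives in mlagents.trainers.

def get_global_agent_id(worker_id: int, agent_id: int) -> str:
    # Prefix the per-environment agent id with the worker id to make it
    # unique across parallel workers.
    return f"${worker_id}-{agent_id}"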
Example #3
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
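    # put() receives the Trajectory as its only positional argument;
    # call_args_list[0][0][0] unpacks (first call)(positional args)(first arg).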
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
Example #4
    def get_action(self,
                   batched_step_result: BatchedStepResult,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param batched_step_result: The BatchedStepResult from the environment, containing
            the observations of the agents.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the BatchedStepResult came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing the action, memories, and values, plus an
            outputs object to be passed to add_experiences
        """
        if batched_step_result.n_agents() == 0:
            return ActionInfo.empty()
        agents_done = [
            agent for agent, done in zip(batched_step_result.agent_id,
                                         batched_step_result.done) if done
        ]
        self.remove_memories(agents_done)
        self.remove_previous_action(agents_done)

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in batched_step_result.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            batched_step_result, global_agent_ids)
        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        action_info = ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=batched_step_result.agent_id,
        )
        return action_info
Example #5
    def get_action(self,
                   decision_requests: DecisionSteps,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param decision_requests: The DecisionSteps from the environment, containing the
            observations of the agents requesting a decision.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing the action, memories, and values, plus an
            outputs object to be passed to add_experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids)

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
Example #6
    def get_action(self, brain_info: BrainInfo) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param brain_info: The BrainInfo from the environment, containing the observations
            of the agents.
        :return: an ActionInfo containing the action, memories, and values, plus an
            outputs object to be passed to add_experiences
        """
        if len(brain_info.agents) == 0:
            return ActionInfo([], [], {}, [])

        agents_done = [
            agent
            for agent, done in zip(brain_info.agents, brain_info.local_done)
            if done
        ]

        self.remove_memories(agents_done)
        self.remove_previous_action(agents_done)

        run_out = self.evaluate(brain_info)  # pylint: disable=assignment-from-no-return
        self.save_memories(brain_info.agents, run_out.get("memory_out"))
        return ActionInfo(
            action=run_out.get("action"),
            value=run_out.get("value"),
            outputs=run_out,
            agents=brain_info.agents,
        )
Example #7
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=2,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    fake_action_info = ActionInfo(
        action=[0.1, 0.1],
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())
    for _ in range(5):
        processor.add_experiences(mock_step, 0, fake_action_info)

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty BatchedStepResult
    mock_step = mb.create_mock_batchedstep(
        num_agents=0,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    processor.add_experiences(mock_step, 0, ActionInfo([], [], {}, []))
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
Example #8
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = _create_action_info(2, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5
    # Make sure ungrouped agents don't have team obs
    for step in trajectory.steps:
        assert len(step.group_status) == 0

    # Assert that the AgentProcessor is empty
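    # (this newer version of AgentProcessor exposes the buffers as the
    # private attribute _experience_buffers)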
    assert len(processor._experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor._experience_buffers[0]) == 0
Example #9
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }

    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
Example #10
    def get_action(
        self, decision_requests: DecisionSteps, worker_id: int = 0
    ) -> ActionInfo:
        """
        Decides actions given observation information, and takes them in the environment.
        :param decision_requests: The DecisionSteps from the environment, containing the
            observations of the agents requesting a decision.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing the action, memories, and values, plus an
            outputs object to be passed to add_experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids
        )

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        # For compatibility with buffer changes for hybrid action support
        if "log_probs" in run_out:
            log_probs_tuple = LogProbsTuple()
            if self.behavior_spec.action_spec.is_continuous():
                log_probs_tuple.add_continuous(run_out["log_probs"])
            else:
                log_probs_tuple.add_discrete(run_out["log_probs"])
            run_out["log_probs"] = log_probs_tuple
        if "action" in run_out:
            action_tuple = ActionTuple()
            env_action_tuple = ActionTuple()
            if self.behavior_spec.action_spec.is_continuous():
                action_tuple.add_continuous(run_out["pre_action"])
                env_action_tuple.add_continuous(run_out["action"])
            else:
                action_tuple.add_discrete(run_out["action"])
                env_action_tuple.add_discrete(run_out["action"])
            run_out["action"] = action_tuple
            run_out["env_action"] = env_action_tuple
        self.check_nan_action(run_out.get("action"))
        return ActionInfo(
            action=run_out.get("action"),
            env_action=run_out.get("env_action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
Example #11
    def _process_step_infos(self, step_infos: List[EnvironmentStep]) -> int:
        for step_info in step_infos:
            for name_behavior_id in step_info.name_behavior_ids:
                if name_behavior_id not in self.agent_managers:
                    logger.warning(
                        "Agent manager was not created for behavior id {}.".format(
                            name_behavior_id
                        )
                    )
                    continue
                decision_steps, terminal_steps = step_info.current_all_step_result[
                    name_behavior_id
                ]
                self.agent_managers[name_behavior_id].add_experiences(
                    decision_steps,
                    terminal_steps,
                    step_info.worker_id,
                    step_info.brain_name_to_action_info.get(
                        name_behavior_id, ActionInfo.empty()
                    ),
                )

                self.agent_managers[name_behavior_id].record_environment_stats(
                    step_info.environment_stats, step_info.worker_id
                )
        return len(step_infos)
Example #12
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    behavior_spec = basic_behavior_spec()
    policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
    no_agent_step = DecisionSteps.empty(behavior_spec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
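
ActionInfo.empty() is used throughout these examples but never defined here. A minimal sketch of what it might look like, assuming the five-field variant seen in the examples above; the real definition lives in mlagents.trainers, and the number of fields varies across the versions shown on this page.

from typing import Any, Dict, List, NamedTuple

class ActionInfo(NamedTuple):
    action: Any
    env_action: Any
    value: Any
    outputs: Dict[str, Any]
    agent_ids: List[Any]

    @staticmethod
    def empty() -> "ActionInfo":
        # No agents requested a decision, so every field is empty.
        return ActionInfo([], [], [], {}, [])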
Example #13
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings(), "output")
    # Doesn't really matter what this is
    dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
    no_agent_step = DecisionSteps.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
Example #14
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = AgentGroupSpec([(1, )], "continuous", 1)
    no_agent_step = BatchedStepResult.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
Example #15
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
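    # DecisionSteps takes (obs, reward, agent_id, action_mask) positionally:
    # a single agent with id 0, no observations, no action mask.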
    step_with_agents = DecisionSteps([], np.array([], dtype=np.float32),
                                     np.array([0]), None)
    result = policy.get_action(step_with_agents, worker_id=0)
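    # evaluate() was stubbed to return {}, so run_out.get("action") and
    # run_out.get("value") inside get_action() both come back as None.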
    assert result == ActionInfo(None, None, {}, [0])
Example #16
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
    )

    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4
    # Last trajectory should be the longest
    trajectory = tqueue.put.call_args_list[-1][0][-1]

    # Make sure trajectory has the right Groupmate Experiences
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1
Example #17
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    behavior_spec = basic_behavior_spec()
    policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    step_with_agents = DecisionSteps([], np.array([], dtype=np.float32),
                                     np.array([0]), None)
    result = policy.get_action(step_with_agents, worker_id=0)
    assert result == ActionInfo(None, None, {}, [0])
Example #18
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    brain_info_with_agents = BrainInfo([], [], [],
                                       agents=["an-agent-id"],
                                       local_done=[False])
    result = policy.get_action(brain_info_with_agents)
    assert result == ActionInfo(None, None, {})
Example #19
    def _process_step_infos(self, step_infos: List[EnvironmentStep]) -> int:
        for step_info in step_infos:
            for name_behavior_id in step_info.name_behavior_ids:
                if name_behavior_id not in self.managers:
                    self.logger.warning(
                        "Agent manager was not created for behavior id {}.".format(
                            name_behavior_id
                        )
                    )
                    continue
                self.managers[name_behavior_id].add_experiences(
                    step_info.current_all_brain_info[name_behavior_id],
                    step_info.brain_name_to_action_info.get(
                        name_behavior_id, ActionInfo([], [], {}, [])
                    ),
                )
        return len(step_infos)
Example #20
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    step_with_agents = DecisionSteps([], np.array([], dtype=np.float32),
                                     np.array([0]), None)
    result = policy.get_action(step_with_agents)
    expected = ActionInfo(policy_eval_out["action"], policy_eval_out["value"],
                          policy_eval_out, [0])
    assert result == expected
Example #21
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    brain_info_with_agents = BrainInfo([], [], [],
                                       agents=["an-agent-id"],
                                       local_done=[False])
    result = policy.get_action(brain_info_with_agents)
    expected = ActionInfo(policy_eval_out["action"], policy_eval_out["value"],
                          policy_eval_out)
    assert result == expected
Example #22
def _create_action_info(num_agents: int, agent_ids: List[str]) -> ActionInfo:
    fake_action_outputs = {
        "action": ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
    }
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]] * num_agents, dtype=np.float32)),
        env_action=ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )
    return fake_action_info
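
Note that this newer helper passes no value field to ActionInfo: in the later versions shown on this page, ActionInfo appears to carry only action, env_action, outputs, and agent_ids.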
Example #23
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    behavior_spec = basic_behavior_spec()
    policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
    policy_eval_out = {
        "action": np.array([[1.0]], dtype=np.float32),
        "pre_action": np.array([[1.0]], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    step_with_agents = DecisionSteps(
        [], np.array([], dtype=np.float32), np.array([0]), None
    )
    result = policy.get_action(step_with_agents)
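    # get_action() mutates run_out (the same dict as policy_eval_out) in place,
    # adding the "env_action" entry read below; see Example #10.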
    expected = ActionInfo(
        policy_eval_out["action"],
        policy_eval_out["env_action"],
        policy_eval_out["value"],
        policy_eval_out,
        [0],
    )
    assert result == expected
Example #24
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    no_agent_brain_info = BrainInfo([], [], [], agents=[])
    result = policy.get_action(no_agent_brain_info)
    assert result == ActionInfo([], [], None)
Example #25
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    _, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
        agent_ids=[2, 3],
    )
    # Make decision steps continue for other agents
    mock_decision_steps_2, _ = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=False,
        grouped=True,
        agent_ids=[0, 1],
    )

    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    # Continue to add for remaining live agents
    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4

    # Get the first trajectory, which should have been agent 2 (one of the killed agents)
    trajectory = tqueue.put.call_args_list[0][0][-1]
    assert len(trajectory.steps) == 3
    # Make sure trajectory has the right Groupmate Experiences.
    # All three steps should contain all agents
    for step in trajectory.steps:
        assert len(step.group_status) == 3

    # Last trajectory should be the longest. It should be that of agent 1, one of the surviving agents.
    trajectory = tqueue.put.call_args_list[-1][0][-1]
    assert len(trajectory.steps) == 5

    # Make sure trajectory has the right Groupmate Experiences.
    # The first 3 steps should contain all of the obs (the 3rd step is also the terminal step for 2 of the agents)
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died, there should only be 1 group status.
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1