Example #1
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
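
Note: every example on this page calls a create_mock_policy() helper that is not shown. A minimal sketch of what it might look like is below; the stubbed attributes are assumptions, and the real helper in the ML-Agents test suite may set up more.

from unittest import mock

import numpy as np


def create_mock_policy():
    # Hypothetical sketch of the create_mock_policy() helper used above.
    # The tests on this page show AgentProcessor calling
    # save_previous_action / remove_previous_action on the policy; the
    # extra stubs below are assumptions about what else it may read.
    mock_policy = mock.Mock()
    mock_policy.reward_signals = {}
    mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
    mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.float32)
    return mock_policy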
Example #2
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    trainer = mock.Mock()
    processor = AgentProcessor(
        trainer,
        policy,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=2,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=num_vis_obs,
    )
    for i in range(5):
        processor.add_experiences(mock_braininfo, mock_braininfo,
                                  fake_action_outputs)

    # Assert that two trajectories have been added to the Trainer
    assert len(trainer.process_trajectory.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = trainer.process_trajectory.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0
Example #3
def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": [0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1],
        "log_probs": [0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
    )
    mock_done_step = mb.create_mock_batchedstep(
        num_agents=1,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=0,
        done=True,
    )
    fake_action_info = ActionInfo(
        action=[0.1],
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(mock_step, _ep, fake_action_info)
            add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
        processor.add_experiences(mock_done_step, _ep, fake_action_info)
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
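
Note: the expected mock calls above are built with get_global_agent_id(worker_id, agent_id), which is also not shown on this page. A plausible sketch is below; the exact string format is an assumption and may not match the real helper in mlagents.trainers.

def get_global_agent_id(worker_id: int, agent_id: int) -> str:
    # Assumed shape of the helper: combine the worker id and the
    # per-environment agent id into one identifier that stays unique
    # across workers. The "$" prefix and the separator are guesses.
    return f"${worker_id}-{agent_id}"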
Example #4
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_step = mb.create_mock_batchedstep(
        num_agents=2,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    fake_action_info = ActionInfo(
        action=[0.1, 0.1],
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_step, 0, ActionInfo.empty())
    for _ in range(5):
        processor.add_experiences(mock_step, 0, fake_action_info)

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty BatchedStepResult
    mock_step = mb.create_mock_batchedstep(
        num_agents=0,
        num_vector_observations=8,
        action_shape=[2],
        num_vis_observations=num_vis_obs,
    )
    processor.add_experiences(mock_step, 0, ActionInfo([], [], {}, []))
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
Example #5
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = _create_action_info(2, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5
    # Make sure ungrouped agents don't have team obs
    for step in trajectory.steps:
        assert len(step.group_status) == 0

    # Assert that the AgentProcessor is empty
    assert len(processor._experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor._experience_buffers[0]) == 0
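
Note: Examples #5, #9 and #10 replace the hand-built fake_action_info from Example #1 with a _create_action_info(num_agents, agent_ids) helper. A sketch reconstructed from the ActionInfo that Example #1 builds by hand is below; it reuses the same ActionTuple, LogProbsTuple, ActionInfo and np names as the examples above, and the real test helper may differ in detail.

def _create_action_info(num_agents: int, agent_ids) -> ActionInfo:
    # One row of continuous action / log-prob per agent, mirroring the
    # placeholder 0.1 values used by Example #1.
    continuous = np.array([[0.1]] * num_agents, dtype=np.float32)
    action_tuple = ActionTuple(continuous=continuous)
    fake_action_outputs = {
        "action": action_tuple,
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=continuous),
    }
    return ActionInfo(
        action=action_tuple,
        env_action=action_tuple,
        value=[0.1] * num_agents,
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )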
Example #6
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }

    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
Example #7
    def start_learning(self, env_manager: EnvManager) -> None:
        self._create_model_path(self.model_path)
        tf.reset_default_graph()
        global_step = 0
        last_brain_names: Set[str] = set()
        try:
            self._reset_env(env_manager)
            while self._not_done_training():
                external_brains = set(env_manager.external_brains.keys())
                new_brains = external_brains - last_brain_names
                if last_brain_names != env_manager.external_brains.keys():
                    for name in new_brains:
                        trainer = self.trainer_factory.generate(
                            env_manager.external_brains[name])
                        self.start_trainer(trainer, env_manager)
                        agent_manager = AgentManager(processor=AgentProcessor(
                            trainer,
                            trainer.policy,
                            trainer.stats_reporter,
                            trainer.parameters.get("time_horizon", sys.maxsize),
                        ))
                        self.managers[name] = agent_manager
                    last_brain_names = external_brains
                n_steps = self.advance(env_manager)
                for i in range(n_steps):
                    global_step += 1
                    self.reset_env_if_ready(env_manager, global_step)
                    if self._should_save_model(global_step):
                        # Save Tensorflow model
                        self._save_model()
                    self.write_to_tensorboard(global_step)
            # Final save Tensorflow model
            if global_step != 0 and self.train_model:
                self._save_model()
        except (KeyboardInterrupt, UnityCommunicationException):
            if self.train_model:
                self._save_model_when_interrupted()
            pass
        if self.train_model:
            self._write_training_metrics()
            self._export_graph()
        self._write_timing_tree()
Example #8
    def start_learning(self, env_manager: EnvManager) -> None:
        self._create_model_path(self.model_path)
        tf.reset_default_graph()
        global_step = 0
        last_brain_behavior_ids: Set[str] = set()
        try:
            self._reset_env(env_manager)
            while self._not_done_training():
                external_brain_behavior_ids = set(
                    env_manager.external_brains.keys())
                new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
                for name_behavior_id in new_behavior_ids:
                    try:
                        brain_name, _ = name_behavior_id.split("?")
                    except ValueError:
                        brain_name = name_behavior_id

                    try:
                        trainer = self.trainers[brain_name]
                    except KeyError:
                        trainer = self.trainer_factory.generate(brain_name)
                        self.trainers[brain_name] = trainer
                        self.logger.info(trainer)
                        if self.train_model:
                            trainer.write_tensorboard_text(
                                "Hyperparameters", trainer.parameters)

                    policy = trainer.create_policy(
                        env_manager.external_brains[name_behavior_id])
                    trainer.add_policy(name_behavior_id, policy)

                    env_manager.set_policy(name_behavior_id, policy)

                    self.brain_name_to_identifier[brain_name].add(
                        name_behavior_id)

                    agent_manager = AgentManager(processor=AgentProcessor(
                        trainer,
                        policy,
                        name_behavior_id,
                        trainer.stats_reporter,
                        trainer.parameters.get("time_horizon", sys.maxsize),
                    ))
                    self.managers[name_behavior_id] = agent_manager

                last_brain_behavior_ids = external_brain_behavior_ids

                n_steps = self.advance(env_manager)
                for _ in range(n_steps):
                    global_step += 1
                    self.reset_env_if_ready(env_manager, global_step)
                    if self._should_save_model(global_step):
                        # Save Tensorflow model
                        self._save_model()
                    self.write_to_tensorboard(global_step)
            # Final save Tensorflow model
            if global_step != 0 and self.train_model:
                self._save_model()
        except (KeyboardInterrupt, UnityCommunicationException):
            if self.train_model:
                self._save_model_when_interrupted()
            pass
        if self.train_model:
            self._export_graph()
        self._write_timing_tree()
Example #9
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
    )

    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4
    # Last trajectory should be the longest
    trajectory = tqueue.put.call_args_list[0][0][-1]

    # Make sure trajectory has the right Groupmate Experiences
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1
Example #10
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    _, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
        agent_ids=[2, 3],
    )
    # Make decision steps continue for other agents
    mock_decision_steps_2, _ = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=False,
        grouped=True,
        agent_ids=[0, 1],
    )

    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    # Continue to add for remaining live agents
    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4

    # Get the first trajectory, which should have been agent 2 (one of the killed agents)
    trajectory = tqueue.put.call_args_list[0][0][-1]
    assert len(trajectory.steps) == 3
    # Make sure trajectory has the right Groupmate Experiences.
    # All three steps should contain all agents
    for step in trajectory.steps:
        assert len(step.group_status) == 3

    # Last trajectory should be the longest. It should be that of agent 1, one of the surviving agents.
    trajectory = tqueue.put.call_args_list[-1][0][-1]
    assert len(trajectory.steps) == 5

    # Make sure trajectory has the right Groupmate Experiences.
    # The first 3 steps should contain all of the obs (the 3rd step is also the terminal step for 2 of the agents)
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died, there should only be 1 group status.
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1