Example #1
    def test_warning_group_reward(self):
        with self.assertLogs("mlagents.trainers", level="WARN") as cm:
            rl_trainer = create_rl_trainer()
            # This one should warn
            trajectory = mb.make_fake_trajectory(
                length=10,
                observation_specs=create_observation_specs_with_shapes([(1,)]),
                max_step_complete=True,
                action_spec=ActionSpec.create_discrete((2,)),
                group_reward=1.0,
            )
            buff = trajectory.to_agentbuffer()
            rl_trainer._warn_if_group_reward(buff)
            assert len(cm.output) > 0
            len_of_first_warning = len(cm.output)

            rl_trainer = create_rl_trainer()
            # This one shouldn't
            trajectory = mb.make_fake_trajectory(
                length=10,
                observation_specs=create_observation_specs_with_shapes([(1,)]),
                max_step_complete=True,
                action_spec=ActionSpec.create_discrete((2,)),
            )
            buff = trajectory.to_agentbuffer()
            rl_trainer._warn_if_group_reward(buff)
            # Make sure the number of warnings does not grow
            assert len(cm.output) == len_of_first_warning
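
This test pins down a warn-once contract: a second buffer carrying a group reward must not add another log line. A minimal sketch of that pattern, assuming a hypothetical boolean flag (the real trainer's attribute name may differ) and treating the buffer as a plain dict:

import logging

logger = logging.getLogger("mlagents.trainers")

class WarnOnceSketch:
    def __init__(self):
        # Hypothetical flag name; the real trainer attribute may differ.
        self._has_warned_group_reward = False

    def _warn_if_group_reward(self, buff: dict) -> None:
        # Warn only once per trainer instance, on the first buffer that
        # carries a nonzero group reward.
        if not self._has_warned_group_reward and any(buff.get("group_reward", [])):
            logger.warning("Group rewards were found, but this trainer ignores them.")
            self._has_warned_group_reward = True
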
Example #2
def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        "next_visual_obs0",
        "visual_obs0",
        "vector_obs",
        "next_vector_in",
        "memory",
        "masks",
        "done",
        "actions_pre",
        "actions",
        "action_probs",
        "action_mask",
        "prev_action",
        "environment_rewards",
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_shapes=[(VEC_OBS_SIZE,), (84, 84, 3)],
        action_space=[ACTION_SIZE],
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)

    assert seen_keys == wanted_keys
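
The assertion above compares exact key sets, so any drift between Trajectory.to_agentbuffer() and the expected list fails the test. When it does fail, a small helper like the following (illustrative, not part of the test suite) makes the mismatch readable:

def explain_key_mismatch(seen_keys: set, wanted_keys: set) -> str:
    # Show which keys appear on only one side of the exact-set comparison.
    missing = sorted(map(str, wanted_keys - seen_keys))
    unexpected = sorted(map(str, seen_keys - wanted_keys))
    return f"missing={missing} unexpected={unexpected}"
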
Example #3
def test_advance(mocked_clear_update_buffer):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
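
The mocked_clear_update_buffer parameter implies a mock.patch decorator that this excerpt omits. A sketch of the usual wiring; the patch target path is an assumption and varies across ml-agents versions:

from unittest import mock

# The target string below is an assumption: the real module path for
# _clear_update_buffer depends on the ml-agents version under test.
@mock.patch("mlagents.trainers.trainer.rl_trainer.RLTrainer._clear_update_buffer")
def test_advance(mocked_clear_update_buffer):
    ...
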
Example #4
def test_update_buffer_append():
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    agentbuffer_trajectory = trajectory.to_agentbuffer()
    assert trainer.update_buffer.num_experiences == 0

    # Check that if we append, our update buffer gets longer.
    # max_steps = 100
    for i in range(10):
        trainer._process_trajectory(trajectory)
        trainer._append_to_update_buffer(agentbuffer_trajectory)
        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

    # Check that if we append after stopping training, nothing happens.
    # We process enough trajectories to hit max steps
    trainer.set_is_policy_updating(False)
    trainer._process_trajectory(trajectory)
    trainer._append_to_update_buffer(agentbuffer_trajectory)
    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
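
The final assertion holds because appending becomes a no-op once policy updating is switched off. A minimal sketch of that gate, with illustrative attribute names rather than the trainer's real internals:

class AppendGateSketch:
    # Illustrative stand-in for the RLTrainer append path.
    def __init__(self):
        self._is_policy_updating = True
        self.update_buffer = []

    def set_is_policy_updating(self, value: bool) -> None:
        self._is_policy_updating = value

    def _append_to_update_buffer(self, agentbuffer_trajectory) -> None:
        # Once policy updates are switched off, drop trajectories so the
        # update buffer cannot keep growing.
        if self._is_policy_updating:
            self.update_buffer.append(agentbuffer_trajectory)
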
Example #5
def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        "next_obs_0",
        "next_obs_1",
        "obs_0",
        "obs_1",
        "memory",
        "masks",
        "done",
        "continuous_action",
        "discrete_action",
        "continuous_log_probs",
        "discrete_log_probs",
        "action_mask",
        "prev_action",
        "environment_rewards",
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        sensor_specs=create_sensor_specs_with_shapes([(VEC_OBS_SIZE,), (84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)

    assert seen_keys == wanted_keys
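
Compared with Example #2, the trajectory is now built from spec objects instead of raw shapes, and the single action/prob keys are split into continuous and discrete variants. The two call styles side by side, both lifted from the snippets in this section (imports as in the original test modules):

# Older API (Example #2): raw shapes and a raw action-size list.
trajectory = make_fake_trajectory(
    length=length,
    observation_shapes=[(VEC_OBS_SIZE,), (84, 84, 3)],
    action_space=[ACTION_SIZE],
)

# Newer API (this example): spec objects carry the same information.
trajectory = make_fake_trajectory(
    length=length,
    sensor_specs=create_sensor_specs_with_shapes([(VEC_OBS_SIZE,), (84, 84, 3)]),
    action_spec=ActionSpec.create_continuous(ACTION_SIZE),
)
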
Example #6
def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        (ObservationKeyPrefix.OBSERVATION, 0),
        (ObservationKeyPrefix.OBSERVATION, 1),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 1),
        BufferKey.MEMORY,
        BufferKey.MASKS,
        BufferKey.DONE,
        BufferKey.CONTINUOUS_ACTION,
        BufferKey.DISCRETE_ACTION,
        BufferKey.CONTINUOUS_LOG_PROBS,
        BufferKey.DISCRETE_LOG_PROBS,
        BufferKey.ACTION_MASK,
        BufferKey.PREV_ACTION,
        BufferKey.ENVIRONMENT_REWARDS,
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_specs=create_observation_specs_with_shapes(
            [(VEC_OBS_SIZE,), (84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)

    assert seen_keys == wanted_keys
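
Here the string keys of Example #5 become typed keys: enum members for scalar fields and (prefix, index) tuples for observations. A simplified stand-in for those enums, runnable on its own:

import enum

# A simplification of the real BufferKey / ObservationKeyPrefix enums.
class BufferKey(enum.Enum):
    MEMORY = "memory"
    DONE = "done"

class ObservationKeyPrefix(enum.Enum):
    OBSERVATION = "obs"
    NEXT_OBSERVATION = "next_obs"

# Observation fields are addressed by (prefix, sensor index) tuples,
# which is why wanted_keys mixes tuples and plain enum members.
key = (ObservationKeyPrefix.OBSERVATION, 0)
assert key[0] is ObservationKeyPrefix.OBSERVATION and key[1] == 0
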
Example #7
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(checkpoint_interval,
                             num_trajectories * time_horizon,
                             checkpoint_interval)
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]

    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
    export_ext = "onnx"

    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
                [
                    f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.pt"
                ],
            ),
            trainer.trainer_settings.keep_checkpoints,
        ) for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
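
The positional arguments inside ModelCheckpoint(...) suggest a record of step, export path, reward, creation time, and auxiliary files. A hedged dataclass sketch; the field names are inferred from the call and are assumptions, not the library's definition:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ModelCheckpointSketch:
    steps: int                     # training step at checkpoint time
    file_path: str                 # exported .onnx path
    reward: Optional[float]        # None while no reward has been measured
    creation_time: float           # matched loosely via mock.ANY in the test
    auxiliary_file_paths: List[str] = field(default_factory=list)  # e.g. the .pt file
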
Example #8
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    mock_policy.model_path = "mock_model_path"
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
        action_space=[2],
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(checkpoint_interval,
                             num_trajectories * time_horizon,
                             checkpoint_interval)
    calls = [
        mock.call(f"{mock_policy.model_path}/{trainer.brain_name}-{step}", mock.ANY)
        for step in checkpoint_range
    ]
    mock_policy.checkpoint.assert_has_calls(calls, any_order=True)

    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            NNCheckpoint(
                step,
                f"{mock_policy.model_path}/{trainer.brain_name}-{step}.nn",
                None,
                mock.ANY,
            ),
            trainer.trainer_settings.keep_checkpoints,
        ) for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
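
In this older API the policy checkpoints itself, and the .nn path is derived from a shared prefix. The path convention the assertions encode, with illustrative values:

# Illustrative values matching the assertions above.
model_path = "mock_model_path"
brain_name = "testbrain"
step = 10

checkpoint_prefix = f"{model_path}/{brain_name}-{step}"  # argument to policy.checkpoint
exported_nn_path = f"{checkpoint_prefix}.nn"             # path recorded in NNCheckpoint
assert exported_nn_path == "mock_model_path/testbrain-10.nn"
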
Example #9
def test_trajectory_to_agentbuffer():
    length = 15
    # These keys should be of type np.ndarray
    wanted_keys = [
        (ObservationKeyPrefix.OBSERVATION, 0),
        (ObservationKeyPrefix.OBSERVATION, 1),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 1),
        BufferKey.MEMORY,
        BufferKey.MASKS,
        BufferKey.DONE,
        BufferKey.CONTINUOUS_ACTION,
        BufferKey.DISCRETE_ACTION,
        BufferKey.CONTINUOUS_LOG_PROBS,
        BufferKey.DISCRETE_LOG_PROBS,
        BufferKey.ACTION_MASK,
        BufferKey.PREV_ACTION,
        BufferKey.ENVIRONMENT_REWARDS,
        BufferKey.GROUP_REWARD,
    ]
    # These keys should be of type List
    wanted_group_keys = [
        BufferKey.GROUPMATE_REWARDS,
        BufferKey.GROUP_CONTINUOUS_ACTION,
        BufferKey.GROUP_DISCRETE_ACTION,
        BufferKey.GROUP_DONES,
        BufferKey.GROUP_NEXT_CONT_ACTION,
        BufferKey.GROUP_NEXT_DISC_ACTION,
    ]
    wanted_keys = set(wanted_keys + wanted_group_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_specs=create_observation_specs_with_shapes(
            [(VEC_OBS_SIZE,), (84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
        num_other_agents_in_group=4,
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)

    assert seen_keys.issuperset(wanted_keys)

    for _key in wanted_group_keys:
        for step in agentbuffer[_key]:
            assert len(step) == 4
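
The group-keyed fields are lists per step, one element per groupmate, which is what the final nested loop verifies. The same shape with illustrative data:

# Illustrative data: a group-keyed field is a list of steps, and each
# step holds one entry per groupmate.
length = 15
num_other_agents_in_group = 4
groupmate_rewards = [[0.0] * num_other_agents_in_group for _ in range(length)]

assert len(groupmate_rewards) == length
assert all(len(step) == num_other_agents_in_group for step in groupmate_rewards)
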
Example #10
def _compare_two_optimizers(opt1: TorchOptimizer,
                            opt2: TorchOptimizer) -> None:
    trajectory = mb.make_fake_trajectory(
        length=10,
        observation_specs=opt1.policy.behavior_spec.observation_specs,
        action_spec=opt1.policy.behavior_spec.action_spec,
        max_step_complete=True,
    )
    with torch.no_grad():
        _, opt1_val_out, _ = opt1.get_trajectory_value_estimates(
            trajectory.to_agentbuffer(), trajectory.next_obs, done=False)
        _, opt2_val_out, _ = opt2.get_trajectory_value_estimates(
            trajectory.to_agentbuffer(), trajectory.next_obs, done=False)

    for opt1_val, opt2_val in zip(opt1_val_out.values(),
                                  opt2_val_out.values()):
        np.testing.assert_array_equal(opt1_val, opt2_val)
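
Both forward passes run under torch.no_grad(), so no autograd graph is built for the comparison. The pattern in isolation, with a throwaway layer:

import torch

# Inference wrapped in no_grad skips autograd bookkeeping, so the two
# optimizers' value heads can be evaluated and compared cheaply.
layer = torch.nn.Linear(4, 1)
with torch.no_grad():
    value = layer(torch.ones(1, 4))
assert not value.requires_grad
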
Example #11
def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    sac_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = sac_trainer.create_policy(parsed_behavior_id0, mock_specs)
    sac_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    sac_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
            "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
            "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        sac_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()
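
The same double patch can be written as stacked decorators; decorators apply bottom-up, so the innermost patch arrives as the first mock argument. A sketch reusing the targets quoted above:

from unittest import mock

# Equivalent decorator form of the double context manager above.
# The bottom decorator (ValueNetwork) is applied first and therefore
# arrives as the first mock argument.
@mock.patch("mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization")
@mock.patch("mlagents.trainers.torch.networks.ValueNetwork.update_normalization")
def run_advance_check(optimizer_update_mock, policy_update_mock):
    ...
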
Example #12
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    mock_policy.model_path = "mock_model_path"
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is nothing in the policy queue
        with pytest.raises(AgentManagerQueue.Empty):
            policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
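
The pytest.raises(AgentManagerQueue.Empty) check relies on non-blocking get semantics. The same pattern on a stdlib queue, as an illustrative stand-in for AgentManagerQueue:

import queue

import pytest

def demo_empty_policy_queue():
    # Illustrative stand-in: AgentManagerQueue wraps a queue with the
    # same non-blocking semantics.
    q = queue.Queue()
    # get_nowait raises immediately instead of blocking when nothing is queued.
    with pytest.raises(queue.Empty):
        q.get_nowait()
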
Example #13
def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10,
                        summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes([(1,)]),
        ActionSpec.create_discrete((2,)),
    )
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0
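
end_episode must drop the per-agent group-reward totals accumulated during the episode. A simplification of that bookkeeping, modeling only the attribute the test inspects:

from collections import defaultdict

class GroupRewardBookkeepingSketch:
    # A simplification of the POCA trainer's per-agent accumulation;
    # only the attribute exercised by the test is modeled.
    def __init__(self):
        self.collected_group_rewards = defaultdict(float)  # agent_id -> cumulative reward

    def observe_step(self, agent_id: str, group_reward: float) -> None:
        self.collected_group_rewards[agent_id] += group_reward

    def end_episode(self) -> None:
        # Matches the final assertion: no keys remain after the episode ends.
        self.collected_group_rewards.clear()
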
Example #14
def test_summary_checkpoint(mock_write_summary, mock_save_model):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    calls = [
        mock.call(trainer.brain_name)
        for _ in range(
            checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
        )
    ]
    mock_save_model.assert_has_calls(calls, any_order=True)
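
The expected write_summary steps form an arithmetic range over the trajectory steps processed. Spelled out with this test's numbers and an illustrative summary_freq (the real value comes from trainer_settings):

# Values used by this test; summary_freq normally comes from trainer_settings.
time_horizon = 10
num_trajectories = 5
summary_freq = 20  # illustrative setting

expected_steps = list(range(summary_freq, num_trajectories * time_horizon, summary_freq))
assert expected_steps == [20, 40]  # two write_summary calls expected, at steps 20 and 40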