Example #1
    def test_warning_group_reward(self):
        with self.assertLogs("mlagents.trainers", level="WARN") as cm:
            rl_trainer = create_rl_trainer()
            # This one should warn
            trajectory = mb.make_fake_trajectory(
                length=10,
                observation_specs=create_observation_specs_with_shapes([(1,)]),
                max_step_complete=True,
                action_spec=ActionSpec.create_discrete((2, )),
                group_reward=1.0,
            )
            buff = trajectory.to_agentbuffer()
            rl_trainer._warn_if_group_reward(buff)
            assert len(cm.output) > 0
            len_of_first_warning = len(cm.output)

            rl_trainer = create_rl_trainer()
            # This one shouldn't
            trajectory = mb.make_fake_trajectory(
                length=10,
                observation_specs=create_observation_specs_with_shapes([(1,)]),
                max_step_complete=True,
                action_spec=ActionSpec.create_discrete((2, )),
            )
            buff = trajectory.to_agentbuffer()
            rl_trainer._warn_if_group_reward(buff)
            # Make sure warnings don't get bigger
            assert len(cm.output) == len_of_first_warning
Example #2
def create_mock_group_spec(
    number_visual_observations=0,
    vector_action_space_type="continuous",
    vector_observation_space_size=3,
    vector_action_space_size=None,
):
    """
    Creates a mock BehaviorSpec object with the given parameters.
    """
    # Avoid using mutable object as default param
    if vector_action_space_type == "continuous":
        if vector_action_space_size is None:
            vector_action_space_size = 2
        else:
            vector_action_space_size = vector_action_space_size[0]
        action_spec = ActionSpec.create_continuous(vector_action_space_size)
    else:
        if vector_action_space_size is None:
            vector_action_space_size = (2, )
        else:
            vector_action_space_size = tuple(vector_action_space_size)
        action_spec = ActionSpec.create_discrete(vector_action_space_size)
    obs_shapes = [(vector_observation_space_size, )]
    for _ in range(number_visual_observations):
        obs_shapes += [(8, 8, 3)]
    obs_spec = create_observation_specs_with_shapes(obs_shapes)
    return BehaviorSpec(obs_spec, action_spec)
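As a hedged illustration of the helper above (argument values are assumptions, not taken from the original tests; the BehaviorSpec fields used in the asserts match those referenced in the other examples):

# Hypothetical usage sketch of create_mock_group_spec.
mock_spec = create_mock_group_spec(
    number_visual_observations=1,
    vector_action_space_type="discrete",
    vector_action_space_size=[3, 2],  # converted to the discrete branches (3, 2)
)
assert mock_spec.action_spec.discrete_branches == (3, 2)
assert len(mock_spec.observation_specs) == 2  # one vector obs plus one visual obs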
Example #3
def test_update_buffer_append():
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1, )]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2, )),
    )
    agentbuffer_trajectory = trajectory.to_agentbuffer()
    assert trainer.update_buffer.num_experiences == 0

    # Check that if we append, our update buffer gets longer.
    # max_steps = 100
    for i in range(10):
        trainer._process_trajectory(trajectory)
        trainer._append_to_update_buffer(agentbuffer_trajectory)
        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

    # Check that if we append after stopping training, nothing happens.
    # We process enough trajectories to hit max steps
    trainer.set_is_policy_updating(False)
    trainer._process_trajectory(trajectory)
    trainer._append_to_update_buffer(agentbuffer_trajectory)
    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
Example #4
def test_action_generator():
    # Continuous
    action_len = 30
    specs = ActionSpec.create_continuous(action_len)
    zero_action = specs.empty_action(4).continuous
    assert np.array_equal(zero_action,
                          np.zeros((4, action_len), dtype=np.float32))
    print(specs.random_action(4))
    random_action = specs.random_action(4).continuous
    print(random_action)
    assert random_action.dtype == np.float32
    assert random_action.shape == (4, action_len)
    assert np.min(random_action) >= -1
    assert np.max(random_action) <= 1

    # Discrete
    action_shape = (10, 20, 30)
    specs = ActionSpec.create_discrete(action_shape)
    zero_action = specs.empty_action(4).discrete
    assert np.array_equal(zero_action,
                          np.zeros((4, len(action_shape)), dtype=np.int32))

    random_action = specs.random_action(4).discrete
    assert random_action.dtype == np.int32
    assert random_action.shape == (4, len(action_shape))
    assert np.min(random_action) >= 0
    for index, branch_size in enumerate(action_shape):
        assert np.max(random_action[:, index]) < branch_size
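For completeness, a hedged sketch of a hybrid (continuous plus discrete) spec; constructing ActionSpec directly from a continuous size and a tuple of discrete branches is an assumption based on its create_continuous/create_discrete factories, not something taken from the original test:

# Hedged sketch: hybrid action spec with 3 continuous dimensions and 2 discrete branches.
hybrid_spec = ActionSpec(3, (2, 5))  # assumed (continuous_size, discrete_branches) layout
empty = hybrid_spec.empty_action(4)
assert empty.continuous.shape == (4, 3)
assert empty.discrete.shape == (4, 2)
random_hybrid = hybrid_spec.random_action(4)
assert random_hybrid.continuous.shape == (4, 3)
assert random_hybrid.discrete.shape == (4, 2)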
Example #5
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1, )]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2, )),
    )
    # Feed in several trajectories and make sure the policy queue gets populated
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step) for step in range(summary_freq, num_trajectories *
                                          time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(checkpoint_interval,
                             num_trajectories * time_horizon,
                             checkpoint_interval)
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]

    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
    export_ext = "onnx"

    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
                [
                    f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.pt"
                ],
            ),
            trainer.trainer_settings.keep_checkpoints,
        ) for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
Example #6
def test_action_masking_discrete_1():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                                 ActionSpec.create_discrete((10, )))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    assert masks[0].shape == (n_agents / 2, 10)
    assert masks[0][0, 0]
Example #7
def setup_test_behavior_specs(
    use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
    if use_discrete:
        action_spec = ActionSpec.create_discrete(tuple(vector_action_space))
    else:
        action_spec = ActionSpec.create_continuous(vector_action_space)
    observation_shapes = [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)]
    obs_spec = create_observation_specs_with_shapes(observation_shapes)
    behavior_spec = BehaviorSpec(obs_spec, action_spec)
    return behavior_spec
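A hedged usage note: with use_discrete=True the helper calls tuple(vector_action_space), so a sequence of branch sizes must be passed; the default integer only fits the continuous case. Illustrative calls (argument values are assumptions):

# Hypothetical calls to setup_test_behavior_specs.
discrete_spec = setup_test_behavior_specs(use_discrete=True, vector_action_space=[3, 3])
assert discrete_spec.action_spec.discrete_branches == (3, 3)

continuous_spec = setup_test_behavior_specs(use_discrete=False, vector_action_space=4)
assert continuous_spec.action_spec.continuous_size == 4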
Example #8
def test_specs():
    specs = ActionSpec.create_continuous(3)
    assert specs.discrete_branches == ()
    assert specs.discrete_size == 0
    assert specs.continuous_size == 3
    assert specs.empty_action(5).shape == (5, 3)
    assert specs.empty_action(5).dtype == np.float32

    specs = ActionSpec.create_discrete((3,))
    assert specs.discrete_branches == (3,)
    assert specs.discrete_size == 1
    assert specs.continuous_size == 0
    assert specs.empty_action(5).shape == (5, 1)
    assert specs.empty_action(5).dtype == np.int32
Example #9
def test_action_masking_discrete_2():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                                 ActionSpec.create_discrete((2, 2, 6)))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 3
    assert masks[0].shape == (n_agents / 2, 2)
    assert masks[1].shape == (n_agents / 2, 2)
    assert masks[2].shape == (n_agents / 2, 6)
    assert masks[0][0, 0]
Example #10
def test_action_masking_discrete():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                                 ActionSpec.create_discrete((7, 3)))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 2
    assert masks[0].shape == (n_agents / 2, 7)  # half agents are done
    assert masks[1].shape == (n_agents / 2, 3)  # half agents are done
    assert masks[0][0, 0]
    assert not masks[1][0, 0]
    assert masks[1][0, 1]
Example #11
    def __init__(
        self,
        brain_names,
        use_discrete,
        step_size=STEP_SIZE,
        num_visual=0,
        num_vector=1,
        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        action_size=1,
    ):
        super().__init__()
        self.discrete = use_discrete
        self.num_visual = num_visual
        self.num_vector = num_vector
        self.vis_obs_size = vis_obs_size
        self.vec_obs_size = vec_obs_size
        if use_discrete:
            action_spec = ActionSpec.create_discrete(
                tuple(2 for _ in range(action_size))
            )
        else:
            action_spec = ActionSpec.create_continuous(action_size)
        self.behavior_spec = BehaviorSpec(self._make_obs_spec(), action_spec)
        self.action_size = action_size
        self.names = brain_names
        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}
        self.random = random.Random(str(self.behavior_spec))
        self.goal: Dict[str, int] = {}
        self.action = {}
        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}
        self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
        self.agent_id: Dict[str, int] = {}
        self.step_size = step_size  # defines the difficulty of the test

        for name in self.names:
            self.agent_id[name] = 0
            self.goal[name] = self.random.choice([-1, 1])
            self.rewards[name] = 0
            self.final_rewards[name] = []
            self._reset_agent(name)
            self.action[name] = None
            self.step_result[name] = None
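The constructor above calls a _make_obs_spec helper that is not shown. A minimal sketch of what it is assumed to do, built only from the fields set in __init__ and the create_observation_specs_with_shapes helper used elsewhere in these examples (vis_obs_size is assumed to be a shape tuple):

    # Hedged sketch of the _make_obs_spec helper referenced by __init__ above.
    def _make_obs_spec(self):
        obs_shapes = [(self.vec_obs_size,) for _ in range(self.num_vector)]
        obs_shapes += [self.vis_obs_size for _ in range(self.num_visual)]
        return create_observation_specs_with_shapes(obs_shapes)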
Example #12
def test_simple_actor(use_discrete):
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size, )]
    act_size = [2]
    if use_discrete:
        masks = torch.ones((1, 1))
        action_spec = ActionSpec.create_discrete(tuple(act_size))
    else:
        masks = None
        action_spec = ActionSpec.create_continuous(act_size[0])
    actor = SimpleActor(obs_shapes, network_settings, action_spec)
    # Test get_dist
    sample_obs = torch.ones((1, obs_size))
    dists, _ = actor.get_dists([sample_obs], [], masks=masks)
    for dist in dists:
        if use_discrete:
            assert isinstance(dist, CategoricalDistInstance)
        else:
            assert isinstance(dist, GaussianDistInstance)

    # Test sample_actions
    actions = actor.sample_action(dists)
    for act in actions:
        if use_discrete:
            assert act.shape == (1, 1)
        else:
            assert act.shape == (1, act_size[0])

    # Test forward
    actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward(
        [sample_obs], [], masks=masks)
    for act in actions:
        # This is different from above for ONNX export
        if use_discrete:
            assert act.shape == tuple(act_size)
        else:
            assert act.shape == (act_size[0], 1)

    assert mem_size == 0
    assert is_cont == int(not use_discrete)
    assert act_size_vec == torch.tensor(act_size)
Example #13
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is nothing in the policy queue
        with pytest.raises(AgentManagerQueue.Empty):
            policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
Example #14
def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10,
                        summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes([(1, )]),
                                 ActionSpec.create_discrete((2, )))
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0
Example #15
from mlagents.trainers.buffer import BufferKey
import pytest
import numpy as np
from mlagents.trainers.torch.components.reward_providers import (
    ExtrinsicRewardProvider,
    create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer, )
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))


@pytest.mark.parametrize(
    "behavior_spec",
    [
        BehaviorSpec(create_observation_specs_with_shapes([(10, )]),
                     ACTIONSPEC_CONTINUOUS),
        BehaviorSpec(create_observation_specs_with_shapes([(10, )]),
                     ACTIONSPEC_TWODISCRETE),
    ],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
    settings = RewardSignalSettings()
    settings.gamma = 0.2
    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
    assert extrinsic_rp.gamma == 0.2
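A hedged follow-on sketch mirroring the parametrized test above: it checks that the provider returns the environment reward stored in a buffer built with the imported create_agent_buffer utility (buffer length and reward value are illustrative assumptions):

@pytest.mark.parametrize(
    "behavior_spec",
    [
        BehaviorSpec(create_observation_specs_with_shapes([(10, )]),
                     ACTIONSPEC_CONTINUOUS),
        BehaviorSpec(create_observation_specs_with_shapes([(10, )]),
                     ACTIONSPEC_TWODISCRETE),
    ],
)
def test_reward_sketch(behavior_spec: BehaviorSpec) -> None:
    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, RewardSignalSettings())
    # create_agent_buffer is assumed to store `reward` as the environment reward
    # for each of the generated experiences.
    buffer = create_agent_buffer(behavior_spec, 5, reward=1.0)
    assert (extrinsic_rp.evaluate(buffer) == 1.0).all()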
Example #16
from mlagents_envs.base_env import ActionSpec


@pytest.fixture
def dummy_config():
    return ppo_dummy_config()


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12

CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))


def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
Example #17
def create_behavior_spec(num_visual, num_vector, vector_size):
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector),
        ActionSpec.create_discrete((1,)),
    )
    return behavior_spec
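The factory above passes raw shape tuples to BehaviorSpec, matching the older API; a hedged variant for versions where BehaviorSpec expects observation specs (using the create_observation_specs_with_shapes helper seen in the other examples) could look like:

def create_behavior_spec_from_specs(num_visual, num_vector, vector_size):
    # Hedged sketch: same shapes as above, wrapped into observation specs first.
    shapes = [(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector)
    return BehaviorSpec(
        create_observation_specs_with_shapes(shapes),
        ActionSpec.create_discrete((1,)),
    )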
Example #18
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer, )
from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
    DiscriminatorNetwork, )

CONTINUOUS_PATH = (os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                os.pardir, os.pardir) + "/test.demo")
DISCRETE_PATH = (os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              os.pardir, os.pardir) + "/testdcvis.demo")
SEED = [42]

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(2)
ACTIONSPEC_FOURDISCRETE = ActionSpec.create_discrete((2, 3, 3, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20, ))


@pytest.mark.parametrize("behavior_spec",
                         [BehaviorSpec([(8, )], ACTIONSPEC_CONTINUOUS)])
def test_construction(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
    assert gail_rp.name == "GAIL"


@pytest.mark.parametrize("behavior_spec",
                         [BehaviorSpec([(8, )], ACTIONSPEC_CONTINUOUS)])
def test_factory(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)