def test_warning_group_reward(self):
    with self.assertLogs("mlagents.trainers", level="WARN") as cm:
        rl_trainer = create_rl_trainer()
        # This one should warn
        trajectory = mb.make_fake_trajectory(
            length=10,
            observation_specs=create_observation_specs_with_shapes([(1,)]),
            max_step_complete=True,
            action_spec=ActionSpec.create_discrete((2,)),
            group_reward=1.0,
        )
        buff = trajectory.to_agentbuffer()
        rl_trainer._warn_if_group_reward(buff)
        assert len(cm.output) > 0
        len_of_first_warning = len(cm.output)

        rl_trainer = create_rl_trainer()
        # This one shouldn't
        trajectory = mb.make_fake_trajectory(
            length=10,
            observation_specs=create_observation_specs_with_shapes([(1,)]),
            max_step_complete=True,
            action_spec=ActionSpec.create_discrete((2,)),
        )
        buff = trajectory.to_agentbuffer()
        rl_trainer._warn_if_group_reward(buff)
        # Make sure no additional warnings were logged
        assert len(cm.output) == len_of_first_warning
def create_mock_group_spec(
    number_visual_observations=0,
    vector_action_space_type="continuous",
    vector_observation_space_size=3,
    vector_action_space_size=None,
):
    """
    Creates a mock BehaviorSpec object with the given parameters.
    """
    # Avoid using a mutable object as a default param
    if vector_action_space_type == "continuous":
        if vector_action_space_size is None:
            vector_action_space_size = 2
        else:
            vector_action_space_size = vector_action_space_size[0]
        action_spec = ActionSpec.create_continuous(vector_action_space_size)
    else:
        if vector_action_space_size is None:
            vector_action_space_size = (2,)
        else:
            vector_action_space_size = tuple(vector_action_space_size)
        action_spec = ActionSpec.create_discrete(vector_action_space_size)
    obs_shapes = [(vector_observation_space_size,)]
    for _ in range(number_visual_observations):
        obs_shapes += [(8, 8, 3)]
    obs_spec = create_observation_specs_with_shapes(obs_shapes)
    return BehaviorSpec(obs_spec, action_spec)
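# Illustrative usage of the helper above (hypothetical values, not part of the
# original suite): one 8x8x3 visual observation plus the default size-3 vector
# observation, and a two-branch discrete action space.
def test_create_mock_group_spec_example():
    group_spec = create_mock_group_spec(
        number_visual_observations=1,
        vector_action_space_type="discrete",
        vector_action_space_size=[3, 2],
    )
    assert group_spec.action_spec.discrete_branches == (3, 2)
    assert len(group_spec.observation_specs) == 2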
def test_update_buffer_append():
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    agentbuffer_trajectory = trajectory.to_agentbuffer()
    assert trainer.update_buffer.num_experiences == 0

    # Check that if we append, our update buffer gets longer.
    # max_steps = 100
    for i in range(10):
        trainer._process_trajectory(trajectory)
        trainer._append_to_update_buffer(agentbuffer_trajectory)
        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

    # Check that if we append after stopping training, nothing happens.
    # We process enough trajectories to hit max steps
    trainer.set_is_policy_updating(False)
    trainer._process_trajectory(trajectory)
    trainer._append_to_update_buffer(agentbuffer_trajectory)
    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
def test_action_generator():
    # Continuous
    action_len = 30
    specs = ActionSpec.create_continuous(action_len)
    zero_action = specs.empty_action(4).continuous
    assert np.array_equal(zero_action, np.zeros((4, action_len), dtype=np.float32))

    random_action = specs.random_action(4).continuous
    assert random_action.dtype == np.float32
    assert random_action.shape == (4, action_len)
    assert np.min(random_action) >= -1
    assert np.max(random_action) <= 1

    # Discrete
    action_shape = (10, 20, 30)
    specs = ActionSpec.create_discrete(action_shape)
    zero_action = specs.empty_action(4).discrete
    assert np.array_equal(
        zero_action, np.zeros((4, len(action_shape)), dtype=np.int32)
    )

    random_action = specs.random_action(4).discrete
    assert random_action.dtype == np.int32
    assert random_action.shape == (4, len(action_shape))
    assert np.min(random_action) >= 0
    for index, branch_size in enumerate(action_shape):
        assert np.max(random_action[:, index]) < branch_size
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    # Feed in enough trajectories to trigger both summaries and checkpoints
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(
        checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
    )
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]
    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)

    export_ext = "onnx"
    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
                [
                    f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.pt"
                ],
            ),
            trainer.trainer_settings.keep_checkpoints,
        )
        for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
def test_action_masking_discrete_1():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_discrete((10,))
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    assert masks[0].shape == (n_agents / 2, 10)
    assert masks[0][0, 0]
def setup_test_behavior_specs(
    use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
    if use_discrete:
        action_spec = ActionSpec.create_discrete(tuple(vector_action_space))
    else:
        action_spec = ActionSpec.create_continuous(vector_action_space)
    observation_shapes = [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)]
    obs_spec = create_observation_specs_with_shapes(observation_shapes)
    behavior_spec = BehaviorSpec(obs_spec, action_spec)
    return behavior_spec
def test_specs():
    specs = ActionSpec.create_continuous(3)
    assert specs.discrete_branches == ()
    assert specs.discrete_size == 0
    assert specs.continuous_size == 3
    assert specs.empty_action(5).continuous.shape == (5, 3)
    assert specs.empty_action(5).continuous.dtype == np.float32

    specs = ActionSpec.create_discrete((3,))
    assert specs.discrete_branches == (3,)
    assert specs.discrete_size == 1
    assert specs.continuous_size == 0
    assert specs.empty_action(5).discrete.shape == (5, 1)
    assert specs.empty_action(5).discrete.dtype == np.int32
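# For reference, a hybrid spec can be built directly from the ActionSpec
# NamedTuple fields (continuous_size, discrete_branches); empty_action then
# fills both halves of the returned ActionTuple. A minimal sketch assuming the
# same mlagents_envs API as the tests above; not part of the original suite.
def test_specs_hybrid_sketch():
    specs = ActionSpec(continuous_size=2, discrete_branches=(3,))
    assert specs.continuous_size == 2
    assert specs.discrete_size == 1
    empty = specs.empty_action(5)
    assert empty.continuous.shape == (5, 2)
    assert empty.discrete.shape == (5, 1)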
def test_action_masking_discrete_2():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes),
        ActionSpec.create_discrete((2, 2, 6)),
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 3
    assert masks[0].shape == (n_agents / 2, 2)
    assert masks[1].shape == (n_agents / 2, 2)
    assert masks[2].shape == (n_agents / 2, 6)
    assert masks[0][0, 0]
def test_action_masking_discrete():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_discrete((7, 3))
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 2
    assert masks[0].shape == (n_agents / 2, 7)  # half agents are done
    assert masks[1].shape == (n_agents / 2, 3)  # half agents are done
    assert masks[0][0, 0]
    assert not masks[1][0, 0]
    assert masks[1][0, 1]
def __init__(
    self,
    brain_names,
    use_discrete,
    step_size=STEP_SIZE,
    num_visual=0,
    num_vector=1,
    vis_obs_size=VIS_OBS_SIZE,
    vec_obs_size=OBS_SIZE,
    action_size=1,
):
    super().__init__()
    self.discrete = use_discrete
    self.num_visual = num_visual
    self.num_vector = num_vector
    self.vis_obs_size = vis_obs_size
    self.vec_obs_size = vec_obs_size
    if use_discrete:
        action_spec = ActionSpec.create_discrete(
            tuple(2 for _ in range(action_size))
        )
    else:
        action_spec = ActionSpec.create_continuous(action_size)
    self.behavior_spec = BehaviorSpec(self._make_obs_spec(), action_spec)
    self.action_size = action_size
    self.names = brain_names
    self.positions: Dict[str, List[float]] = {}
    self.step_count: Dict[str, float] = {}
    self.random = random.Random(str(self.behavior_spec))
    self.goal: Dict[str, int] = {}
    self.action = {}
    self.rewards: Dict[str, float] = {}
    self.final_rewards: Dict[str, List[float]] = {}
    self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
    self.agent_id: Dict[str, int] = {}
    self.step_size = step_size  # defines the difficulty of the test
    for name in self.names:
        self.agent_id[name] = 0
        self.goal[name] = self.random.choice([-1, 1])
        self.rewards[name] = 0
        self.final_rewards[name] = []
        self._reset_agent(name)
        self.action[name] = None
        self.step_result[name] = None
def test_simple_actor(use_discrete):
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]
    act_size = [2]
    if use_discrete:
        masks = torch.ones((1, 1))
        action_spec = ActionSpec.create_discrete(tuple(act_size))
    else:
        masks = None
        action_spec = ActionSpec.create_continuous(act_size[0])
    actor = SimpleActor(obs_shapes, network_settings, action_spec)
    # Test get_dist
    sample_obs = torch.ones((1, obs_size))
    dists, _ = actor.get_dists([sample_obs], [], masks=masks)
    for dist in dists:
        if use_discrete:
            assert isinstance(dist, CategoricalDistInstance)
        else:
            assert isinstance(dist, GaussianDistInstance)
    # Test sample_actions
    actions = actor.sample_action(dists)
    for act in actions:
        if use_discrete:
            assert act.shape == (1, 1)
        else:
            assert act.shape == (1, act_size[0])
    # Test forward
    actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward(
        [sample_obs], [], masks=masks
    )
    for act in actions:
        # This is different from above for ONNX export
        if use_discrete:
            assert act.shape == tuple(act_size)
        else:
            assert act.shape == (act_size[0], 1)
    assert mem_size == 0
    assert is_cont == int(not use_discrete)
    assert act_size_vec == torch.tensor(act_size)
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is nothing in the policy queue
        with pytest.raises(AgentManagerQueue.Empty):
            policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes([(1,)]), ActionSpec.create_discrete((2,))
    )
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0
import pytest
import numpy as np
from mlagents.trainers.buffer import BufferKey
from mlagents.trainers.torch.components.reward_providers import (
    ExtrinsicRewardProvider,
    create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer,
)
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))


@pytest.mark.parametrize(
    "behavior_spec",
    [
        BehaviorSpec(
            create_observation_specs_with_shapes([(10,)]), ACTIONSPEC_CONTINUOUS
        ),
        BehaviorSpec(
            create_observation_specs_with_shapes([(10,)]), ACTIONSPEC_TWODISCRETE
        ),
    ],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
    settings = RewardSignalSettings()
    settings.gamma = 0.2
    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
    assert extrinsic_rp.gamma == 0.2
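# A sketch of how the parametrized specs above typically get exercised,
# assuming the `create_agent_buffer(behavior_spec, number, reward)` helper
# imported above: evaluate() should return one reward per buffered experience,
# equal to the reward written into the buffer. Illustrative only, not part of
# the original suite.
@pytest.mark.parametrize(
    "behavior_spec",
    [BehaviorSpec(create_observation_specs_with_shapes([(10,)]), ACTIONSPEC_CONTINUOUS)],
)
def test_reward_sketch(behavior_spec: BehaviorSpec) -> None:
    buffer = create_agent_buffer(behavior_spec, 5, 1.0)
    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, RewardSignalSettings())
    generated_rewards = extrinsic_rp.evaluate(buffer)
    assert (generated_rewards == 1.0).all()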
from mlagents_envs.base_env import ActionSpec


@pytest.fixture
def dummy_config():
    return ppo_dummy_config()


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12

CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))


def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    # Wrap the policy in a PPO optimizer and return it; TorchPPOOptimizer is
    # assumed here, from mlagents.trainers.ppo.optimizer_torch.
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
def create_behavior_spec(num_visual, num_vector, vector_size):
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(
            [(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector)
        ),
        ActionSpec.create_discrete((1,)),
    )
    return behavior_spec
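# Illustrative usage of the helper above (hypothetical values, not part of the
# original suite): two visual observations plus one vector observation of size
# 5 yields three observation specs in total.
def test_create_behavior_spec_example():
    spec = create_behavior_spec(num_visual=2, num_vector=1, vector_size=5)
    assert len(spec.observation_specs) == 3
    assert spec.action_spec.discrete_branches == (1,)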
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer,
)
from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
    GAILRewardProvider,
    DiscriminatorNetwork,
)
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

CONTINUOUS_PATH = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
    + "/test.demo"
)
DISCRETE_PATH = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
    + "/testdcvis.demo"
)
SEED = [42]
ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(2)
ACTIONSPEC_FOURDISCRETE = ActionSpec.create_discrete((2, 3, 3, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20,))


@pytest.mark.parametrize(
    "behavior_spec",
    [BehaviorSpec(create_observation_specs_with_shapes([(8,)]), ACTIONSPEC_CONTINUOUS)],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
    assert gail_rp.name == "GAIL"


@pytest.mark.parametrize(
    "behavior_spec",
    [BehaviorSpec(create_observation_specs_with_shapes([(8,)]), ACTIONSPEC_CONTINUOUS)],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)