def test_warning_group_reward(self):
    # Runs as a unittest.TestCase method (hence `self` and assertLogs).
    with self.assertLogs("mlagents.trainers", level="WARN") as cm:
        rl_trainer = create_rl_trainer()
        # This one should warn
        trajectory = mb.make_fake_trajectory(
            length=10,
            observation_specs=create_observation_specs_with_shapes([(1,)]),
            max_step_complete=True,
            action_spec=ActionSpec.create_discrete((2,)),
            group_reward=1.0,
        )
        buff = trajectory.to_agentbuffer()
        rl_trainer._warn_if_group_reward(buff)
        assert len(cm.output) > 0
        len_of_first_warning = len(cm.output)

        rl_trainer = create_rl_trainer()
        # This one shouldn't warn
        trajectory = mb.make_fake_trajectory(
            length=10,
            observation_specs=create_observation_specs_with_shapes([(1,)]),
            max_step_complete=True,
            action_spec=ActionSpec.create_discrete((2,)),
        )
        buff = trajectory.to_agentbuffer()
        rl_trainer._warn_if_group_reward(buff)
        # Make sure the number of warnings doesn't grow
        assert len(cm.output) == len_of_first_warning
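# Almost every test in this section builds its ObservationSpecs through
# create_observation_specs_with_shapes. The helper itself is not shown above,
# so here is a minimal sketch of what it presumably does, assuming the
# ObservationSpec / DimensionProperty / ObservationType API from
# mlagents_envs.base_env; the per-spec name strings are illustrative only.
from typing import List, Tuple

from mlagents_envs.base_env import (
    DimensionProperty,
    ObservationSpec,
    ObservationType,
)


def create_observation_specs_with_shapes(
    shapes: List[Tuple[int, ...]]
) -> List[ObservationSpec]:
    obs_specs: List[ObservationSpec] = []
    for i, shape in enumerate(shapes):
        obs_specs.append(
            ObservationSpec(
                name=f"observation {i} with shape {shape}",
                shape=shape,
                # One UNSPECIFIED property per observation dimension
                dimension_property=(DimensionProperty.UNSPECIFIED,) * len(shape),
                observation_type=ObservationType.DEFAULT,
            )
        )
    return obs_specs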
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
    )
    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )

    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4

    # Last trajectory should be the longest
    trajectory = tqueue.put.call_args_list[0][0][-1]

    # Make sure the trajectory has the right groupmate experiences
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = _create_action_info(2, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5
    # Make sure ungrouped agents don't have team obs
    for step in trajectory.steps:
        assert len(step.group_status) == 0

    # Assert that the AgentProcessor is empty
    assert len(processor._experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor._experience_buffers[0]) == 0
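# test_group_statuses and test_agentprocessor above route fake policy output
# through _create_action_info. A minimal sketch of that fixture, assuming the
# ActionInfo / ActionTuple / LogProbsTuple types used in test_end_episode
# below; the 0.1 filler values are illustrative.
import numpy as np


def _create_action_info(num_agents: int, agent_ids) -> ActionInfo:
    continuous = ActionTuple(
        continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
    )
    fake_action_outputs = {
        "action": continuous,
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
    }
    return ActionInfo(
        action=continuous,
        env_action=continuous,
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )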
def test_networkbody_visual():
    torch.manual_seed(0)
    vec_obs_size = 4
    obs_size = (84, 84, 3)
    network_settings = NetworkSettings()
    obs_shapes = [(vec_obs_size,), obs_size]

    networkbody = NetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = 0.1 * torch.ones((1, 84, 84, 3))
    sample_vec_obs = torch.ones((1, vec_obs_size))
    obs = [sample_vec_obs] + [sample_obs]

    for _ in range(150):
        encoded, _ = networkbody(obs)
        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_update_buffer_append():
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    agentbuffer_trajectory = trajectory.to_agentbuffer()
    assert trainer.update_buffer.num_experiences == 0

    # Check that if we append, our update buffer gets longer.
    # max_steps = 100
    for i in range(10):
        trainer._process_trajectory(trajectory)
        trainer._append_to_update_buffer(agentbuffer_trajectory)
        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

    # Check that if we append after stopping training, nothing happens.
    # We process enough trajectories to hit max steps
    trainer.set_is_policy_updating(False)
    trainer._process_trajectory(trajectory)
    trainer._append_to_update_buffer(agentbuffer_trajectory)
    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
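# test_update_buffer_append above and test_summary_checkpoint below both rely
# on a create_rl_trainer fixture that is not shown. A minimal sketch under
# assumptions: FakeTrainer is a concrete RLTrainer subclass defined alongside
# the tests, and the TrainerSettings values are chosen so the thresholds the
# tests exercise are hit quickly (max_steps=100 is implied by the comment in
# test_update_buffer_append; summary_freq and checkpoint_interval are guesses).
def create_rl_trainer():
    trainer = FakeTrainer(
        "test_trainer",
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        "0",
    )
    trainer.set_is_policy_updating(True)
    return trainer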
def test_networkbody_lstm():
    torch.manual_seed(0)
    obs_size = 4
    seq_len = 6
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )
    obs_shapes = [(obs_size,)]

    networkbody = NetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = torch.ones((seq_len, obs_size))

    for _ in range(300):
        encoded, _ = networkbody(
            [sample_obs], memories=torch.ones(1, 1, 12), sequence_length=seq_len
        )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_batched_step_result_from_proto():
    n_agents = 10
    shapes = [(3,), (4,)]
    spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, spec)
    for agent_id in range(n_agents):
        if agent_id in decision_steps:
            # we set the reward equal to the agent id in generate_list_agent_proto
            assert decision_steps[agent_id].reward == agent_id
        elif agent_id in terminal_steps:
            assert terminal_steps[agent_id].reward == agent_id
        else:
            raise Exception("Missing agent from the steps")
    # We sort the AgentIds since they are split between DecisionSteps and TerminalSteps
    combined_agent_id = list(decision_steps.agent_id) + list(terminal_steps.agent_id)
    combined_agent_id.sort()
    assert combined_agent_id == list(range(n_agents))
    for agent_id in range(n_agents):
        assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
        if agent_id in terminal_steps:
            assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0)
    assert decision_steps.obs[0].shape[1] == shapes[0][0]
    assert decision_steps.obs[1].shape[1] == shapes[1][0]
    assert terminal_steps.obs[0].shape[1] == shapes[0][0]
    assert terminal_steps.obs[1].shape[1] == shapes[1][0]
def test_multinetworkbody_num_agents(with_actions):
    torch.manual_seed(0)
    act_size = 2
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    sample_obs = [[0.1 * torch.ones((1, obs_size))]]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((1, 2)), [0.1 * torch.ones(1) for _ in range(act_size)]
        )
    ]

    for n_agent, max_so_far in [(1, 1), (5, 5), (4, 5), (10, 10), (5, 10), (1, 10)]:
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs * (n_agent - 1), obs=sample_obs, actions=sample_act
            )
        else:
            encoded, _ = networkbody(obs_only=sample_obs * n_agent, obs=[], actions=[])
        # The last hidden unit encodes the number of agents, normalized by the
        # running max seen so far and rescaled to [-1, 1]
        target = (n_agent * 1.0 / max_so_far) * 2 - 1
        assert abs(encoded[0, -1].item() - target) < 1e-6
        assert encoded[0, -1].item() <= 1
        assert encoded[0, -1].item() >= -1
def test_create_inputs(encoder_type, normalize, num_vector, num_visual):
    vec_obs_shape = (5,)
    vis_obs_shape = (84, 84, 3)
    obs_shapes = []
    for _ in range(num_vector):
        obs_shapes.append(vec_obs_shape)
    for _ in range(num_visual):
        obs_shapes.append(vis_obs_shape)

    h_size = 128
    obs_spec = create_observation_specs_with_shapes(obs_shapes)
    encoders, embedding_sizes = ModelUtils.create_input_processors(
        obs_spec, h_size, encoder_type, h_size, normalize
    )
    total_output = sum(embedding_sizes)
    vec_enc = []
    vis_enc = []
    for i, enc in enumerate(encoders):
        if len(obs_shapes[i]) == 1:
            vec_enc.append(enc)
        else:
            vis_enc.append(enc)

    assert len(vec_enc) == num_vector
    assert len(vis_enc) == num_visual
    assert total_output == int(num_visual * h_size + vec_obs_shape[0] * num_vector)
    if num_vector > 0:
        assert isinstance(vec_enc[0], VectorInput)
    for enc in vis_enc:
        assert isinstance(enc, ModelUtils.get_encoder_for_type(encoder_type))
def create_mock_group_spec(
    number_visual_observations=0,
    vector_action_space_type="continuous",
    vector_observation_space_size=3,
    vector_action_space_size=None,
):
    """
    Creates a mock BehaviorSpec object with the given parameters.
    """
    # Avoid using a mutable object as a default param
    if vector_action_space_type == "continuous":
        if vector_action_space_size is None:
            vector_action_space_size = 2
        else:
            vector_action_space_size = vector_action_space_size[0]
        action_spec = ActionSpec.create_continuous(vector_action_space_size)
    else:
        if vector_action_space_size is None:
            vector_action_space_size = (2,)
        else:
            vector_action_space_size = tuple(vector_action_space_size)
        action_spec = ActionSpec.create_discrete(vector_action_space_size)
    obs_shapes = [(vector_observation_space_size,)]
    for _ in range(number_visual_observations):
        obs_shapes += [(8, 8, 3)]
    obs_spec = create_observation_specs_with_shapes(obs_shapes)
    return BehaviorSpec(obs_spec, action_spec)
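# Example usage of create_mock_group_spec covering both action branches; the
# argument values here are illustrative.
continuous_spec = create_mock_group_spec(
    vector_action_space_type="continuous",
    vector_action_space_size=[3],  # list is unwrapped to 3 continuous dims
)
discrete_spec = create_mock_group_spec(
    vector_action_space_type="discrete",
    vector_action_space_size=[3, 2],  # two branches with 3 and 2 choices
    number_visual_observations=1,  # appends one (8, 8, 3) visual observation
)
assert continuous_spec.action_spec.continuous_size == 3
assert discrete_spec.action_spec.discrete_branches == (3, 2)
assert len(discrete_spec.observation_specs) == 2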
def test_networkbody_vector():
    torch.manual_seed(0)
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]

    networkbody = NetworkBody(
        create_observation_specs_with_shapes(obs_shapes),
        network_settings,
        encoded_act_size=2,
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = 0.1 * torch.ones((1, obs_size))
    sample_act = 0.1 * torch.ones((1, 2))

    for _ in range(300):
        encoded, _ = networkbody([sample_obs], sample_act)
        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        (ObservationKeyPrefix.OBSERVATION, 0),
        (ObservationKeyPrefix.OBSERVATION, 1),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 1),
        BufferKey.MEMORY,
        BufferKey.MASKS,
        BufferKey.DONE,
        BufferKey.CONTINUOUS_ACTION,
        BufferKey.DISCRETE_ACTION,
        BufferKey.CONTINUOUS_LOG_PROBS,
        BufferKey.DISCRETE_LOG_PROBS,
        BufferKey.ACTION_MASK,
        BufferKey.PREV_ACTION,
        BufferKey.ENVIRONMENT_REWARDS,
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_specs=create_observation_specs_with_shapes(
            [(VEC_OBS_SIZE,), (84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)

    assert seen_keys == wanted_keys
def test_valuenetwork():
    torch.manual_seed(0)
    obs_size = 4
    num_outputs = 2
    network_settings = NetworkSettings()
    obs_spec = create_observation_specs_with_shapes([(obs_size,)])

    stream_names = [f"stream_name{n}" for n in range(4)]
    value_net = ValueNetwork(
        stream_names, obs_spec, network_settings, outputs_per_stream=num_outputs
    )
    optimizer = torch.optim.Adam(value_net.parameters(), lr=3e-3)

    for _ in range(50):
        sample_obs = torch.ones((1, obs_size))
        values, _ = value_net([sample_obs])
        loss = 0
        for s_name in stream_names:
            assert values[s_name].shape == (1, num_outputs)
            # Try to force output to 1
            loss += torch.nn.functional.mse_loss(
                values[s_name], torch.ones((1, num_outputs))
            )
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for value in values.values():
        for _out in value.tolist():
            assert _out[0] == pytest.approx(1.0, abs=0.1)
def _make_observation_specs(self) -> List[ObservationSpec]:
    obs_shape: List[Any] = []
    for _ in range(self.num_vector):
        obs_shape.append((self.vec_obs_size,))
    for _ in range(self.num_visual):
        obs_shape.append(self.vis_obs_size)
    obs_spec = create_observation_specs_with_shapes(obs_shape)
    return obs_spec
def test_empty_decision_steps():
    specs = BehaviorSpec(
        observation_specs=create_observation_specs_with_shapes([(3, 2), (5,)]),
        action_spec=ActionSpec.create_continuous(3),
    )
    ds = DecisionSteps.empty(specs)
    assert len(ds.obs) == 2
    assert ds.obs[0].shape == (0, 3, 2)
    assert ds.obs[1].shape == (0, 5)
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True
    )
    obs_spec = create_observation_specs_with_shapes([(obs_size,)])
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # action_spec = ActionSpec.create_continuous(act_size[0])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    actor = ac_type(obs_spec, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size)
        )
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None, the network should be able to
        # deal with that.

    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Test get_action_stats_and_value
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], memories=memories, masks=mask
    )
    if lstm:
        # 64 is the default MemorySettings.sequence_length
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)

    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)

    if mem_out is not None:
        assert mem_out.shape == memories.shape

    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
        else:
            assert value_out[stream].shape == (1,)
def test_empty_terminal_steps():
    specs = BehaviorSpec(
        observation_specs=create_observation_specs_with_shapes([(3, 2), (5,)]),
        action_spec=ActionSpec.create_continuous(3),
    )
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
def test_action_masking_continuous():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(10)
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert masks is None
def test_invalid_visual_input_size(encoder_type):
    with pytest.raises(UnityTrainerException):
        obs_spec = create_observation_specs_with_shapes(
            [
                (
                    ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1,
                    ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type],
                    1,
                )
            ]
        )
        ModelUtils.create_input_processors(obs_spec, 20, encoder_type, 20, False)
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    # Feed enough trajectories through the trainer to cross the summary and
    # checkpoint thresholds
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(
        checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
    )
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]
    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)

    export_ext = "onnx"
    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
                [
                    f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.pt"
                ],
            ),
            trainer.trainer_settings.keep_checkpoints,
        )
        for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
def test_batched_step_result_from_proto_raises_on_nan():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)
    )
    ap_list = generate_list_agent_proto(n_agents, shapes, nan_observations=True)
    with pytest.raises(RuntimeError):
        steps_from_proto(ap_list, behavior_spec)
def test_vector_observation():
    n_agents = 10
    shapes = [(3,), (4,)]
    obs_specs = create_observation_specs_with_shapes(shapes)
    list_proto = generate_list_agent_proto(n_agents, shapes)
    for obs_index, shape in enumerate(shapes):
        arr = _process_rank_one_or_two_observation(
            obs_index, obs_specs[obs_index], list_proto
        )
        assert list(arr.shape) == ([n_agents] + list(shape))
        assert np.allclose(arr, 0.1, atol=0.01)
def setup_test_behavior_specs(
    use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
    if use_discrete:
        # Discrete callers are expected to pass a sequence of branch sizes
        action_spec = ActionSpec.create_discrete(tuple(vector_action_space))
    else:
        action_spec = ActionSpec.create_continuous(vector_action_space)
    observation_shapes = [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)]
    obs_spec = create_observation_specs_with_shapes(observation_shapes)
    behavior_spec = BehaviorSpec(obs_spec, action_spec)
    return behavior_spec
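# Example usage of setup_test_behavior_specs. Note that the discrete branch
# calls tuple(...) on vector_action_space, so discrete callers must pass a
# sequence of branch sizes, as test_process_trajectory below does with [2];
# the continuous branch takes a plain int.
discrete_specs = setup_test_behavior_specs(
    use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
)
continuous_specs = setup_test_behavior_specs(
    use_discrete=False, use_visual=True, vector_action_space=2
)
assert discrete_specs.action_spec.discrete_branches == (2,)
assert continuous_specs.action_spec.continuous_size == 2
assert len(continuous_specs.observation_specs) == 2  # one visual + one vector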
def test_mismatch_observations_raise_in_step_result_from_proto():
    n_agents = 10
    shapes = [(3,), (4,)]
    spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_continuous(3)
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    # Hack an observation to be larger, we should get an exception
    ap_list[0].observations[0].shape[0] += 1
    ap_list[0].observations[0].float_data.data.append(0.42)
    with pytest.raises(UnityObservationException):
        steps_from_proto(ap_list, spec)
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]], dtype=np.float32)),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
        env_action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor._experience_buffers.keys()) == 0
    assert len(processor._last_take_action_outputs.keys()) == 0
    assert len(processor._episode_steps.keys()) == 0
    assert len(processor._episode_rewards.keys()) == 0
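# test_end_episode builds its expected remove_previous_action calls from
# get_global_agent_id, which uniquifies agent ids across env workers. A
# minimal sketch; the exact "$worker-agent" string format is an assumption.
def get_global_agent_id(worker_id: int, agent_id: int) -> str:
    return f"${worker_id}-{agent_id}"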
def test_process_visual_observation_bad_shape():
    in_array_1 = np.random.rand(128, 64, 3)
    proto_obs_1 = generate_compressed_proto_obs(in_array_1)

    ap1 = AgentInfoProto()
    ap1.observations.extend([proto_obs_1])
    ap_list = [ap1]

    # The spec expects a different width (42) than the incoming observation (64)
    shape = (128, 42, 3)
    obs_spec = create_observation_specs_with_shapes([shape])[0]

    with pytest.raises(UnityObservationException):
        _process_maybe_compressed_observation(0, obs_spec, ap_list)
def test_action_masking_discrete_1():
    n_agents = 10
    shapes = [(3,), (4,)]
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes(shapes), ActionSpec.create_discrete((10,))
    )
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    # Half the agents land in terminal steps, so the mask only covers n_agents / 2
    assert masks[0].shape == (n_agents / 2, 10)
    assert masks[0][0, 0]
def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # first policy encountered becomes the policy trained by the wrapped PPO trainer
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because it is off-policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored the off-policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
def test_multinetworkbody_lstm(with_actions):
    torch.manual_seed(0)
    obs_size = 4
    act_size = 2
    seq_len = 16
    n_agents = 3
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )

    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = [[0.1 * torch.ones((seq_len, obs_size))] for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((seq_len, 2)),
            [0.1 * torch.ones(seq_len) for _ in range(act_size)],
        )
        for _ in range(n_agents - 1)
    ]

    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1],
                obs=sample_obs[1:],
                actions=sample_act,
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        else:
            encoded, _ = networkbody(
                obs_only=sample_obs,
                obs=[],
                actions=[],
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_trajectory_to_agentbuffer():
    length = 15
    # These keys should be of type np.ndarray
    wanted_keys = [
        (ObservationKeyPrefix.OBSERVATION, 0),
        (ObservationKeyPrefix.OBSERVATION, 1),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 1),
        BufferKey.MEMORY,
        BufferKey.MASKS,
        BufferKey.DONE,
        BufferKey.CONTINUOUS_ACTION,
        BufferKey.DISCRETE_ACTION,
        BufferKey.CONTINUOUS_LOG_PROBS,
        BufferKey.DISCRETE_LOG_PROBS,
        BufferKey.ACTION_MASK,
        BufferKey.PREV_ACTION,
        BufferKey.ENVIRONMENT_REWARDS,
        BufferKey.GROUP_REWARD,
    ]
    # These keys should be of type List
    wanted_group_keys = [
        BufferKey.GROUPMATE_REWARDS,
        BufferKey.GROUP_CONTINUOUS_ACTION,
        BufferKey.GROUP_DISCRETE_ACTION,
        BufferKey.GROUP_DONES,
        BufferKey.GROUP_NEXT_CONT_ACTION,
        BufferKey.GROUP_NEXT_DISC_ACTION,
    ]
    wanted_keys = set(wanted_keys + wanted_group_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_specs=create_observation_specs_with_shapes(
            [(VEC_OBS_SIZE,), (84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
        num_other_agents_in_group=4,
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)

    assert seen_keys.issuperset(wanted_keys)

    for _key in wanted_group_keys:
        for step in agentbuffer[_key]:
            # One entry per groupmate (num_other_agents_in_group=4)
            assert len(step) == 4