def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None
    )
    obs_shapes = [(obs_size,)]
    act_size = [2]
    stream_names = [f"stream_name{n}" for n in range(4)]
    action_spec = ActionSpec.create_continuous(act_size[0])
    actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size)
        )
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Test get_dist_and_value
    dists, value_out, mem_out = actor.get_dist_and_value(
        [sample_obs], [], memories=memories
    )
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for dist in dists:
        assert isinstance(dist, GaussianDistInstance)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
        else:
            assert value_out[stream].shape == (1,)
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
    # Check that there is nothing in the policy queue
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
def test_multinetworkbody_visual(with_actions):
    torch.manual_seed(0)
    act_size = 2
    n_agents = 3
    obs_size = 4
    vis_obs_size = (84, 84, 3)
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,), vis_obs_size]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = [
        [0.1 * torch.ones((1, obs_size))] + [0.1 * torch.ones((1, 84, 84, 3))]
        for _ in range(n_agents)
    ]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((1, 2)), [0.1 * torch.ones(1) for _ in range(act_size)]
        )
        for _ in range(n_agents - 1)
    ]
    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1], obs=sample_obs[1:], actions=sample_act
            )
        else:
            encoded, _ = networkbody(obs_only=sample_obs, obs=[], actions=[])
        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes([(1,)]), ActionSpec.create_discrete((2,))
    )
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0
def create_mock_steps(
    num_agents: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :List observation_shapes: A List of the observation spaces in your steps
    :ActionSpec action_spec: The action spec of the behavior
    :bool done: Whether all the agents in the batch are done
    """
    obs_list = []
    for _shape in observation_shapes:
        obs_list.append(np.ones((num_agents,) + _shape, dtype=np.float32))
    action_mask = None
    if action_spec.is_discrete():
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_spec.discrete_branches  # type: ignore
        ]  # type: ignore
    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(observation_shapes, action_spec)
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    else:
        return (
            DecisionSteps(obs_list, reward, agent_id, action_mask),
            TerminalSteps.empty(behavior_spec),
        )
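
# Hedged usage sketch: with done=False the helper fills the DecisionSteps side
# and leaves the TerminalSteps side empty, so a test can assert on batch sizes
# directly. Helper and spec names are the ones defined above; len() on the step
# objects is assumed to count agents, as in mlagents_envs.base_env.
decision_steps, terminal_steps = create_mock_steps(
    num_agents=3,
    observation_shapes=[(8,)],
    action_spec=ActionSpec.create_continuous(2),
)
assert len(decision_steps) == 3  # one entry per mocked agent
assert len(terminal_steps) == 0  # empty side of the episode boundary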
def test_specs():
    specs = ActionSpec.create_continuous(3)
    assert specs.discrete_branches == ()
    assert specs.discrete_size == 0
    assert specs.continuous_size == 3
    assert specs.empty_action(5).continuous.shape == (5, 3)
    assert specs.empty_action(5).continuous.dtype == np.float32

    specs = ActionSpec.create_discrete((3,))
    assert specs.discrete_branches == (3,)
    assert specs.discrete_size == 1
    assert specs.continuous_size == 0
    assert specs.empty_action(5).discrete.shape == (5, 1)
    assert specs.empty_action(5).discrete.dtype == np.int32

    specs = ActionSpec(3, (3,))
    assert specs.continuous_size == 3
    assert specs.discrete_branches == (3,)
    assert specs.discrete_size == 1
    assert specs.empty_action(5).continuous.shape == (5, 3)
    assert specs.empty_action(5).continuous.dtype == np.float32
    assert specs.empty_action(5).discrete.shape == (5, 1)
    assert specs.empty_action(5).discrete.dtype == np.int32
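
# Companion sketch (hedged): ActionSpec in mlagents_envs.base_env also exposes
# random_action, which should produce batched actions with the same shapes and
# dtypes that empty_action is asserted on above.
specs = ActionSpec(3, (3,))
rand = specs.random_action(5)
assert rand.continuous.shape == (5, 3)
assert rand.continuous.dtype == np.float32
assert rand.discrete.shape == (5, 1)
assert rand.discrete.dtype == np.int32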
from mlagents_envs.base_env import ActionSpec


@pytest.fixture
def dummy_config():
    return ppo_dummy_config()


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12
CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))


def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    mock_done_decision_step, mock_done_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
        done=True,
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            add_calls.append(
                mock.call([get_global_agent_id(_ep, 0)], fake_action_outputs["action"])
            )
        processor.add_experiences(
            mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
        )
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)

    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0

    # check that steps with immediate dones don't add to dicts
    processor.add_experiences(
        mock_done_decision_step, mock_done_terminal_step, 0, ActionInfo.empty()
    )
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0
def __init__(
    self,
    brain_names,
    step_size=STEP_SIZE,
    num_visual=0,
    num_vector=1,
    num_var_len=0,
    vis_obs_size=VIS_OBS_SIZE,
    vec_obs_size=OBS_SIZE,
    var_len_obs_size=VAR_LEN_SIZE,
    action_sizes=(1, 0),
):
    super().__init__()
    self.num_visual = num_visual
    self.num_vector = num_vector
    self.num_var_len = num_var_len
    self.vis_obs_size = vis_obs_size
    self.vec_obs_size = vec_obs_size
    self.var_len_obs_size = var_len_obs_size
    continuous_action_size, discrete_action_size = action_sizes
    discrete_tuple = tuple(2 for _ in range(discrete_action_size))
    action_spec = ActionSpec(continuous_action_size, discrete_tuple)
    self.total_action_size = (
        continuous_action_size + discrete_action_size
    )  # to set the goals/positions
    self.action_spec = action_spec
    self.behavior_spec = BehaviorSpec(self._make_observation_specs(), action_spec)
    self.names = brain_names
    self.positions: Dict[str, List[float]] = {}
    self.step_count: Dict[str, float] = {}
    # Concatenate the arguments for a consistent random seed
    seed = (
        brain_names,
        step_size,
        num_visual,
        num_vector,
        num_var_len,
        vis_obs_size,
        vec_obs_size,
        var_len_obs_size,
        action_sizes,
    )
    self.random = random.Random(str(seed))
    self.goal: Dict[str, int] = {}
    self.action = {}
    self.rewards: Dict[str, float] = {}
    self.final_rewards: Dict[str, List[float]] = {}
    self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
    self.agent_id: Dict[str, int] = {}
    self.step_size = step_size  # defines the difficulty of the test
    # Allow to be used as a UnityEnvironment during tests
    self.academy_capabilities = None
    for name in self.names:
        self.agent_id[name] = 0
        self.goal[name] = self.random.choice([-1, 1])
        self.rewards[name] = 0
        self.final_rewards[name] = []
        self._reset_agent(name)
        self.action[name] = None
        self.step_result[name] = None
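
# Hedged driving sketch: as a BaseEnv subclass the mock environment should
# support the standard mlagents_envs loop. The class name SimpleEnvironment is
# assumed from the surrounding test module; the constructor arguments are the
# ones documented above.
env = SimpleEnvironment(["test_brain"], action_sizes=(1, 0))
env.reset()
decision_steps, terminal_steps = env.get_steps("test_brain")
action = env.action_spec.random_action(len(decision_steps))
env.set_actions("test_brain", action)
env.step()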
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes(
            [(8,)] + num_vis_obs * [(84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
from mlagents.trainers.torch.components.reward_providers import (
    GAILRewardProvider,
    create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer,
)
from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
    DiscriminatorNetwork,
)

CONTINUOUS_PATH = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
    + "/test.demo"
)
DISCRETE_PATH = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
    + "/testdcvis.demo"
)
SEED = [42]
ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(2)
ACTIONSPEC_FOURDISCRETE = ActionSpec.create_discrete((2, 3, 3, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20,))


@pytest.mark.parametrize("behavior_spec", [BehaviorSpec([(8,)], ACTIONSPEC_CONTINUOUS)])
def test_construction(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
    assert gail_rp.name == "GAIL"


@pytest.mark.parametrize("behavior_spec", [BehaviorSpec([(8,)], ACTIONSPEC_CONTINUOUS)])
def test_factory(behavior_spec: BehaviorSpec) -> None:
def make_fake_trajectory(
    length: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    action_size = action_spec.discrete_size + action_spec.continuous_size
    action_probs = np.ones(
        int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
        dtype=np.float32,
    )
    for _i in range(length - 1):
        obs = []
        for _shape in observation_shapes:
            obs.append(np.ones(_shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    obs = []
    for _shape in observation_shapes:
        obs.append(np.ones(_shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
    )
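
# Hedged usage sketch: Trajectory in mlagents.trainers.trajectory exposes
# to_agentbuffer(), so a fake trajectory built by the helper above can feed
# buffer-level tests directly. The num_experiences property name is assumed
# from AgentBuffer.
traj = make_fake_trajectory(
    length=10,
    observation_shapes=[(8,)],
    action_spec=ActionSpec.create_continuous(2),
    max_step_complete=True,
)
buffer = traj.to_agentbuffer()
assert buffer.num_experiences == 10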
torch.onnx.export(
    network,
    dummy_input,
    EXPORT_FILE,
    opset_version=9,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
)


if __name__ == '__main__':
    obs_spec = [
        ObservationSpec(
            shape=(16,),
            dimension_property=(DimensionProperty.UNSPECIFIED,),
            observation_type=ObservationType.DEFAULT,
        )
    ]
    act_spec = ActionSpec(continuous_size=4, discrete_branches=())
    net_settings = NetworkSettings(
        normalize=False,
        hidden_units=256,
        num_layers=2,
        vis_encode_type=EncoderType.SIMPLE,
        memory=NetworkSettings.MemorySettings(sequence_length=64, memory_size=256),
    )
    network = SerializableSimpleActor(obs_spec, net_settings, act_spec)
    state_dict = torch.load(MODEL_FILE, map_location=torch.device('cpu'))
    filtered_sd = {
        i: j for i, j in state_dict['Policy'].items() if 'critic' not in i
    }
    network.load_state_dict(filtered_sd)
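
# Optional sanity check (hedged): if the onnx package is installed, the
# exported graph can be validated structurally after the export above.
# EXPORT_FILE is the path used in torch.onnx.export; check_model raises if the
# graph is malformed.
import onnx

onnx_model = onnx.load(EXPORT_FILE)
onnx.checker.check_model(onnx_model)
print([inp.name for inp in onnx_model.graph.input])  # should match input_names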
from mlagents.trainers.buffer import BufferKey
import pytest
import numpy as np
from mlagents.trainers.torch.components.reward_providers import (
    ExtrinsicRewardProvider,
    create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer,
)
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))


@pytest.mark.parametrize(
    "behavior_spec",
    [
        BehaviorSpec(
            create_observation_specs_with_shapes([(10,)]), ACTIONSPEC_CONTINUOUS
        ),
        BehaviorSpec(
            create_observation_specs_with_shapes([(10,)]), ACTIONSPEC_TWODISCRETE
        ),
    ],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
    settings = RewardSignalSettings()
    settings.gamma = 0.2
    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
    assert extrinsic_rp.gamma == 0.2
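
# Hedged follow-up sketch: the extrinsic provider should pass environment
# rewards through unchanged, so evaluating a constant-reward buffer built with
# the create_agent_buffer helper (signature assumed: behavior_spec, number,
# reward) ought to return that constant.
behavior_spec = BehaviorSpec(
    create_observation_specs_with_shapes([(10,)]), ACTIONSPEC_CONTINUOUS
)
buffer = create_agent_buffer(behavior_spec, 5, reward=1.5)
extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, RewardSignalSettings())
np.testing.assert_almost_equal(extrinsic_rp.evaluate(buffer), 1.5 * np.ones(5))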
def create_action_model(inp_size, act_size, deterministic=False):
    mask = torch.ones([1, act_size ** 2])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    action_model = ActionModel(inp_size, action_spec, deterministic=deterministic)
    return action_model, mask
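
# Hedged shape-check sketch: the all-ones mask masks nothing, and its width is
# meant to cover the flattened discrete branches (act_size branches of act_size
# options each). ActionModel is assumed to keep its spec on self.action_spec.
action_model, mask = create_action_model(inp_size=4, act_size=2)
assert mask.shape == (1, sum(action_model.action_spec.discrete_branches))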
def create_action_model(inp_size, act_size):
    mask = torch.ones([1, act_size * 2])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    action_model = ActionModel(inp_size, action_spec)
    return action_model, mask
def basic_behavior_spec():
    dummy_actionspec = ActionSpec.create_continuous(1)
    dummy_groupspec = BehaviorSpec([(1,)], dummy_actionspec)
    return dummy_groupspec
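
# Hedged usage sketch: assuming the helper above is registered as a pytest
# fixture (the decorator is not shown in this excerpt), a test consumes it by
# parameter name. The test body is illustrative only.
def test_uses_basic_behavior_spec(basic_behavior_spec):
    assert basic_behavior_spec.action_spec.continuous_size == 1
    assert basic_behavior_spec.action_spec.discrete_size == 0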
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    _, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
        agent_ids=[2, 3],
    )
    # Make decision steps continue for other agents
    mock_decision_steps_2, _ = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=False,
        grouped=True,
        agent_ids=[0, 1],
    )
    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    # Continue to add for remaining live agents
    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4

    # Get the first trajectory, which should have been agent 2 (one of the killed agents)
    trajectory = tqueue.put.call_args_list[0][0][-1]
    assert len(trajectory.steps) == 3
    # Make sure trajectory has the right Groupmate Experiences.
    # All three steps should contain all agents
    for step in trajectory.steps:
        assert len(step.group_status) == 3

    # Last trajectory should be the longest. It should be that of agent 1, one of
    # the surviving agents.
    trajectory = tqueue.put.call_args_list[-1][0][-1]
    assert len(trajectory.steps) == 5

    # Make sure trajectory has the right Groupmate Experiences.
    # The first 3 steps should contain all of the obs (that 3rd step is also the
    # terminal step of 2 of the agents)
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died, there should only be 1 group status.
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1
def create_behavior_spec(num_visual, num_vector, vector_size):
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector),
        ActionSpec.create_discrete((1,)),
    )
    return behavior_spec
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
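
# Hedged sketch of the grouped variant: with num_other_agents_in_group > 0 each
# AgentExperience carries that many AgentStatus entries, which POCA-style tests
# inspect. Field names are the ones constructed above.
traj = make_fake_trajectory(
    length=8,
    observation_specs=create_observation_specs_with_shapes([(4,)]),
    action_spec=ActionSpec.create_discrete((2,)),
    num_other_agents_in_group=2,
)
assert all(len(step.group_status) == 2 for step in traj.steps)
assert len(traj.next_group_obs) == 2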
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True
    )
    obs_shapes = [(obs_size,)]
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # action_spec = ActionSpec.create_continuous(act_size[0])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size)
        )
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Test get_action_stats_and_value
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], [], memories=memories, masks=mask
    )
    if lstm:
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)
    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
        else:
            assert value_out[stream].shape == (1,)

    # Test normalization
    actor.update_normalization(sample_obs)
    if isinstance(actor, SeparateActorCritic):
        for act_proc, crit_proc in zip(
            actor.network_body.vector_processors,
            actor.critic.network_body.vector_processors,
        ):
            assert compare_models(act_proc, crit_proc)