def test_publish_queue(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    )
    brain_name = parsed_behavior_id0.brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy encountered becomes the policy trained by the wrapped
    # PPO. This queue should remain empty after the snapshot swap.
    policy = trainer.create_policy(brain_params_team0)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.publish_policy_queue(policy_queue0)

    # The ghost trainer should use this queue for the ghost policy swap.
    policy = trainer.create_policy(brain_params_team1)
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team1.brain_name
    )
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.publish_policy_queue(policy_queue1)

    # Check that the ghost trainer's snapshot swap pushes to the ghost queue
    # and not to the trainer queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # Clear the swapped-in policy.
    policy_queue1.get_nowait()

    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval.
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is
    # full, the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
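
# The constants referenced above (VECTOR_ACTION_SPACE, VECTOR_OBS_SPACE,
# DISCRETE_ACTION_SPACE, BUFFER_INIT_SAMPLES) are module-level values defined
# in the surrounding test module and are not part of this excerpt. A minimal
# sketch with representative values; the exact numbers are assumptions, not
# the canonical ones:
VECTOR_ACTION_SPACE = [2]
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
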
def test_process_trajectory(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    ).brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy encountered becomes the policy trained by the wrapped PPO.
    policy = trainer.create_policy(brain_params_team0)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    )
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # The ghost trainer should ignore this queue because it is off-policy.
    policy = trainer.create_policy(brain_params_team1)
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team1.brain_name
    )
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that the trainer put the trajectory in the update buffer.
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that the ghost trainer ignored the off-policy queue.
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue.
    assert trajectory_queue1.empty()
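
# Both ghost-trainer tests above rely on the team suffix convention: a
# behavior id such as "test_brain?team=0" parses into a brain_name shared by
# all teams, which is why both teams resolve to a single wrapped trainer. A
# minimal usage sketch of the parsing the tests depend on:
def test_behavior_id_parsing():
    parsed = BehaviorIdentifiers.from_name_behavior_id("test_brain?team=0")
    # Both teams share this name, so they are keyed to the same trainer.
    assert parsed.brain_name == "test_brain"
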
def test_advance(dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config.hyperparameters.steps_per_update = 20
    dummy_config.hyperparameters.reward_signal_steps_per_update = 20
    dummy_config.hyperparameters.buffer_init_steps = 0
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    trajectory = make_fake_trajectory(
        length=15,
        max_step_complete=True,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the trainer put the trajectory in the update buffer.
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected, as the episode isn't complete.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory.
    trajectory = make_fake_trajectory(
        length=6,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset, as the episode is finished.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values.
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0

    # Make sure there is a policy on the queue.
    policy_queue.get_nowait()

    # Add another trajectory. Since the total is still less than
    # 2 * steps_per_update (not enough experience for a second update), there
    # should NOT be a policy on the queue.
    trajectory = make_fake_trajectory(
        length=5,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Call add_policy and check that we update the correct number of times.
    # This emulates a load from checkpoint.
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    policy.get_current_step = lambda: 200
    trainer.add_policy(brain_params.brain_name, policy)
    trainer.optimizer.update = mock.Mock()
    trainer.optimizer.update_reward_signals = mock.Mock()
    trainer.optimizer.update_reward_signals.return_value = {}
    trainer.optimizer.update.return_value = {}
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Make sure we did exactly 1 update.
    assert trainer.optimizer.update.call_count == 1
    assert trainer.optimizer.update_reward_signals.call_count == 1
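
# Step arithmetic behind the queue assertions in the test above, assuming the
# SAC trainer triggers one update per steps_per_update experiences: the first
# two trajectories give 15 + 6 = 21 steps, so with steps_per_update = 20 one
# update fires and one policy is pushed. The extra 5-step trajectory brings
# the total to 26, short of the next multiple of 20, so the queue stays empty
# until more experience arrives.
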
def test_advance(mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct.
    assert trainer.get_step == time_horizon

    # Check that we can turn off the trainer and that the buffer is cleared.
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue.
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue.
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
    # Check that there is nothing in the policy queue.
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Check that no model has been saved.
    assert not trainer.should_still_train
    assert mocked_save_model.call_count == 0
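
# This test_advance variant and the one below each take a mock as their only
# argument, which pytest cannot supply on its own; in the original modules the
# functions are wrapped with unittest.mock decorators that inject it. A sketch
# of that wiring (the patch target path is an assumption from context, not
# verified against the source):
#
# @mock.patch("mlagents.trainers.trainer.trainer.Trainer.save_model")
# def test_advance(mocked_save_model):
#     ...
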
def test_advance(mocked_clear_update_buffer):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct.
    assert trainer.get_step == time_horizon

    # Check that we can turn off the trainer and that the buffer is cleared.
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue.
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue.
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
    # Check that there is nothing in the policy queue.
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Check that the buffer has been cleared.
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
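
# The queue semantics every test here exercises: AgentManagerQueue wraps a
# standard queue keyed by a behavior id, get_nowait() raises the
# AgentManagerQueue.Empty exception when nothing has been pushed, and empty()
# is a non-blocking check. A minimal sketch:
def test_agent_manager_queue_sketch():
    queue = AgentManagerQueue("testbrain")
    assert queue.empty()
    queue.put("item")
    assert queue.get_nowait() == "item"
    # A second get on an exhausted queue raises rather than blocking.
    with pytest.raises(AgentManagerQueue.Empty):
        queue.get_nowait()
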
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy encountered becomes the policy trained by the wrapped
    # PPO. This queue should remain empty after the snapshot swap.
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # The ghost trainer should use this queue for the ghost policy swap.
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # Check that the ghost trainer's snapshot swap pushes to the ghost queue
    # and not to the trainer queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # Clear the swapped-in policy.
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval.
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is
    # full, the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
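
# The dummy_config fixture consumed by these tests is defined in the
# surrounding test modules and differs by trainer version: the older tests
# treat it as a dict (dummy_config["steps_per_update"]) while the newer ones
# treat it as a settings object (dummy_config.hyperparameters...). A minimal
# dict-style sketch for the ghost-trainer tests; the keys and values are
# illustrative assumptions, not the canonical fixture:
@pytest.fixture
def dummy_config():
    return {
        "trainer": "ppo",
        "batch_size": 16,
        "buffer_size": 64,
        "learning_rate": 3.0e-4,
        "max_steps": 100,
        # The ghost/self-play tests additionally expect a self_play section.
        "self_play": {"save_steps": 1000, "swap_steps": 1000, "window": 5},
    }
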
def test_advance(dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
    dummy_config["steps_per_update"] = 20
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    trajectory = make_fake_trajectory(
        length=15,
        max_step_complete=True,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the trainer put the trajectory in the update buffer.
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected, as the episode isn't complete.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory.
    trajectory = make_fake_trajectory(
        length=6,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset, as the episode is finished.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values.
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0

    # Make sure there is a policy on the queue.
    policy_queue.get_nowait()

    # Add another trajectory. Since the total is still less than
    # 2 * steps_per_update (not enough experience for a second update), there
    # should NOT be a policy on the queue.
    trajectory = make_fake_trajectory(
        length=5,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()
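
# make_brain_parameters is a helper from the shared mock-brain test utilities
# and is not shown in this excerpt. A plausible sketch: the keyword arguments
# match the two calls above, while the defaults and the discrete/continuous
# mapping are assumptions. It mirrors the BrainParameters construction used by
# the ghost-trainer tests.
def make_brain_parameters(discrete_action=False, visual_inputs=0, vec_obs_size=6):
    # visual_inputs is accepted for API parity but unused in this sketch,
    # since both calls above pass visual_inputs=0.
    return BrainParameters(
        brain_name="testbrain",
        vector_observation_space_size=vec_obs_size,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0 if discrete_action else 1,
    )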