def test_process_trajectory(dummy_config):
    """Ghost trainer trains on its first team's queue and drains the other team's."""
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    ).brain_name
    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0", False)
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # The first policy the ghost trainer sees becomes the one the wrapped
    # PPO trainer actually trains.
    policy = trainer.create_policy(brain_params_team0)
    trainer.add_policy(brain_params_team0.brain_name, policy)
    queue_team0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.subscribe_trajectory_queue(queue_team0)

    # The second team's queue is off-policy: it should be emptied but never
    # contribute experiences.
    policy = trainer.create_policy(brain_params_team1)
    trainer.add_policy(brain_params_team1.brain_name, policy)
    queue_team1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.subscribe_trajectory_queue(queue_team1)

    horizon = 15
    fake_traj = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )

    queue_team0.put(fake_traj)
    trainer.advance()
    # On-policy experiences land in the wrapped trainer's update buffer.
    assert trainer.trainer.update_buffer.num_experiences == 15

    queue_team1.put(fake_traj)
    trainer.advance()
    # Off-policy experiences were ignored (buffer unchanged)...
    assert trainer.trainer.update_buffer.num_experiences == 15
    # ...but the queue was still consumed.
    assert queue_team1.empty()
def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
    """Value estimates have one entry per step; the bootstrap zeroes on done."""
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    horizon = 15
    trajectory = make_fake_trajectory(
        length=horizon,
        observation_specs=optimizer.policy.behavior_spec.observation_specs,
        action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
        max_step_complete=True,
    )

    # Non-terminal: one estimate per step for every reward stream.
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for name, series in run_out.items():
        assert type(name) is str
        assert len(series) == 15

    # Terminal: the bootstrap value is zeroed out.
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True
    )
    for name, value in final_value_out.items():
        assert type(name) is str
        assert value == 0.0

    # With terminal-state handling disabled for the signal, the bootstrap
    # value stays nonzero.
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for name, value in final_value_out.items():
        assert type(name) is str
        assert value != 0.0
def test_process_trajectory(dummy_config):
    """SAC trainer buffers experiences and resets per-episode reward stats."""
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)
    traj_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(traj_queue)

    traj_queue.put(
        make_fake_trajectory(
            length=15,
            max_step_complete=True,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
        )
    )
    trainer.advance()
    # All 15 steps should have been moved into the update buffer.
    assert trainer.update_buffer.num_experiences == 15
    # Episode still in flight, so cumulative rewards are positive.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # A terminal trajectory ends the episode...
    traj_queue.put(
        make_fake_trajectory(
            length=15,
            max_step_complete=False,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
        )
    )
    trainer.advance()
    # ...which resets the per-agent reward accumulators.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
def test_process_trajectory(dummy_config):
    """Ghost trainer (controller API) trains team 0's queue and drains team 1's."""
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy registered becomes the one the wrapped PPO trains.
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    queue_team0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(queue_team0)

    # The second team's queue is off-policy and should only be emptied.
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    queue_team1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(queue_team1)

    horizon = 15
    fake_traj = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )

    queue_team0.put(fake_traj)
    trainer.advance()
    # On-policy experiences land in the wrapped trainer's buffer.
    assert trainer.trainer.update_buffer.num_experiences == 15

    queue_team1.put(fake_traj)
    trainer.advance()
    # Off-policy experiences were ignored...
    assert trainer.trainer.update_buffer.num_experiences == 15
    # ...but the queue was still consumed.
    assert queue_team1.empty()
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    """TF PPO policy value estimates are floats per signal and zero when done."""
    tf.reset_default_graph()
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    policy = PPOPolicy(0, brain_params, dummy_config, False, False)

    horizon = 15
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )

    # Mid-episode estimates are plain floats keyed by reward-signal name.
    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    # Terminal steps bootstrap with zero.
    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Unless the reward signal opts out of terminal-state handling.
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    # Batched estimates produce one value per experience.
    batched_values = policy.get_batched_value_estimates(trajectory.to_agentbuffer())
    for values in batched_values.values():
        assert len(values) == 15
def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
    """Value estimates cover multi-sequence trajectories; memories carry no grad."""
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Horizon exceeds the sequence length, so trajectory processing must
    # handle several sequences plus padding.
    horizon = 30
    trajectory = make_fake_trajectory(
        length=horizon,
        observation_specs=optimizer.policy.behavior_spec.observation_specs,
        action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
        max_step_complete=True,
    )

    run_out, final_value_out, all_memories = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    if rnn:
        # Critic memories must be detached from the autograd graph.
        for mem in optimizer.critic_memory_dict.values():
            assert not mem.requires_grad
    for key, val in run_out.items():
        assert type(key) is str
        assert len(val) == horizon
    if all_memories is not None:
        assert len(all_memories) == horizon

    # Terminal: the bootstrap value is zeroed.
    run_out, final_value_out, _ = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val == 0.0

    # Signals that ignore terminal states keep a nonzero bootstrap value.
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out, _ = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val != 0.0
def _create_fake_trajectory(use_discrete, use_visual, time_horizon):
    """Build a fake trajectory matching the requested action/observation setup."""
    act_space = DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
    if use_visual:
        # Visual-only: one camera, no vector observations.
        num_vis_obs, vec_obs_size = 1, 0
    else:
        num_vis_obs, vec_obs_size = 0, VECTOR_OBS_SPACE
    return make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=vec_obs_size,
        num_vis_obs=num_vis_obs,
        action_space=act_space,
    )
def test_large_normalization():
    """Running normalization stays numerically stable on large observation values."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Observations taken from Walker seed 3713, which produced NaNs without
    # proper normalizer initialization.
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )

    def _feed(obs_values):
        # Push a fake trajectory whose vector obs are exactly obs_values.
        traj = make_fake_trajectory(
            length=len(obs_values),
            max_step_complete=True,
            observation_shapes=[(1,)],
            action_space=[2],
        )
        for i, obs in enumerate(obs_values):
            traj.steps[i].obs[0] = np.array([obs], dtype=np.float32)
        policy.update_normalization(traj.to_agentbuffer()["vector_obs"])

    _feed(large_obs1)
    # Running mean and (variance / steps) should match the sample statistics.
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    _feed(large_obs2)
    # Stats now reflect the concatenation of both observation batches.
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
def test_advance(dummy_config):
    """SAC trainer buffers trajectories, resets episode stats, and gates updates."""
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config.hyperparameters.steps_per_update = 20
    dummy_config.hyperparameters.reward_signal_steps_per_update = 20
    dummy_config.hyperparameters.buffer_init_steps = 0
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    traj_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(traj_queue)
    trainer.publish_policy_queue(policy_queue)

    traj_queue.put(
        make_fake_trajectory(
            length=15,
            max_step_complete=True,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
            is_discrete=False,
        )
    )
    trainer.advance()
    # All 15 experiences end up in the update buffer.
    assert trainer.update_buffer.num_experiences == 15
    # Episode still in flight: cumulative rewards are positive.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # A terminal trajectory finishes the episode and resets the stats.
    traj_queue.put(
        make_fake_trajectory(
            length=6,
            max_step_complete=False,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
            is_discrete=False,
        )
    )
    trainer.advance()
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
    # Make sure these aren't just the default summary values.
    assert (
        trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0
    )
    # An update happened, so a policy should be on the queue.
    policy_queue.get_nowait()

    # Fewer than steps_per_update additional steps: no new policy expected.
    trajectory = make_fake_trajectory(
        length=5,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    traj_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Re-adding a policy (emulating a checkpoint load at step 200) should
    # trigger exactly one update on the next advance.
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    policy.get_current_step = lambda: 200
    trainer.add_policy(brain_params.brain_name, policy)
    trainer.optimizer.update = mock.Mock()
    trainer.optimizer.update_reward_signals = mock.Mock()
    trainer.optimizer.update_reward_signals.return_value = {}
    trainer.optimizer.update.return_value = {}
    traj_queue.put(trajectory)
    trainer.advance()
    assert trainer.optimizer.update.call_count == 1
    assert trainer.optimizer.update_reward_signals.call_count == 1
def test_normalization():
    """Running mean/variance track observations across multiple updates."""
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    horizon = 6
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Zero out the first half of the observations.
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = NNPolicy(
        0,
        brain_params,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        False,
        "testdir",
        False,
    )
    policy.update_normalization(trajectory.to_agentbuffer()["vector_obs"])

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 6
    # Half zeros, half ones -> mean 0.5.
    assert mean[0] == 0.5
    # Variance is accumulated (divided by step count) and seeded at 1 to
    # avoid a divide-by-zero; the true variance here is 0.25.
    assert (variance[0] - 1) / steps == 0.25

    # A second update with all-1 observations shifts the running stats.
    horizon = 10
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    policy.update_normalization(trajectory.to_agentbuffer()["vector_obs"])
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
def test_normalization(dummy_config):
    """PPO trainer updates the policy's running normalization stats from trajectories."""
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(
        brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
    )

    horizon = 6
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Zero out the first half of the observations.
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)

    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)
    trainer._process_trajectory(trajectory)

    steps, mean, variance = trainer.policy.sess.run(
        [
            trainer.policy.model.normalization_steps,
            trainer.policy.model.running_mean,
            trainer.policy.model.running_variance,
        ]
    )
    assert steps == 6
    # Half zeros, half ones -> mean 0.5.
    assert mean[0] == 0.5
    # Variance is divided by step count and seeded at 1 to avoid a
    # divide-by-zero; the true variance here is 0.25.
    assert (variance[0] - 1) / steps == 0.25

    # A second update with all-1 observations shifts the running stats.
    horizon = 10
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trainer._process_trajectory(trajectory)
    steps, mean, variance = trainer.policy.sess.run(
        [
            trainer.policy.model.normalization_steps,
            trainer.policy.model.running_mean,
            trainer.policy.model.running_variance,
        ]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
def test_normalizer_after_load(tmp_path):
    """Normalization statistics survive a checkpoint save/load round trip."""
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    horizon = 6
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    # Zero out the first half of the observations.
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)
    policy.update_normalization(trajectory.to_agentbuffer()["vector_obs"])

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 6
    # Half zeros, half ones -> mean 0.5, variance 0.25.
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)

    # Checkpoint this policy, then load the checkpoint into a fresh one.
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0

    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Continue updating the restored policy, this time with all-1 observations.
    horizon = 10
    trajectory = make_fake_trajectory(
        length=horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    policy1.update_normalization(trajectory.to_agentbuffer()["vector_obs"])

    # Restored stats must continue from the pre-save values.
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
def test_poca_get_value_estimates(dummy_config, rnn, visual, discrete):
    """POCA value/baseline estimates span the trajectory; next-value zeroes on done."""
    optimizer = create_test_poca_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    horizon = 30
    trajectory = make_fake_trajectory(
        length=horizon,
        observation_specs=optimizer.policy.behavior_spec.observation_specs,
        action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
        max_step_complete=True,
        num_other_agents_in_group=NUM_AGENTS,
    )

    # Non-terminal: one value and one baseline per step for every stream.
    (
        value_estimates,
        baseline_estimates,
        value_next,
        value_memories,
        baseline_memories,
    ) = optimizer.get_trajectory_and_baseline_value_estimates(
        trajectory.to_agentbuffer(),
        trajectory.next_obs,
        trajectory.next_group_obs,
        done=False,
    )
    for key, val in value_estimates.items():
        assert type(key) is str
        assert len(val) == horizon
    for key, val in baseline_estimates.items():
        assert type(key) is str
        assert len(val) == horizon
    # When RNN memories are produced, there is one per step.
    if value_memories is not None:
        assert len(value_memories) == horizon
        assert len(baseline_memories) == horizon

    # Terminal: next-value bootstrap is zeroed.
    (
        value_estimates,
        baseline_estimates,
        value_next,
        value_memories,
        baseline_memories,
    ) = optimizer.get_trajectory_and_baseline_value_estimates(
        trajectory.to_agentbuffer(),
        trajectory.next_obs,
        trajectory.next_group_obs,
        done=True,
    )
    for key, val in value_next.items():
        assert type(key) is str
        assert val == 0.0

    # Signals that ignore terminal states keep a nonzero next-value.
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    (
        value_estimates,
        baseline_estimates,
        value_next,
        value_memories,
        baseline_memories,
    ) = optimizer.get_trajectory_and_baseline_value_estimates(
        trajectory.to_agentbuffer(),
        trajectory.next_obs,
        trajectory.next_group_obs,
        done=False,
    )
    for key, val in value_next.items():
        assert type(key) is str
        assert val != 0.0
def test_advance(dummy_config):
    """SAC trainer consumes trajectories, resets stats on episode end, and gates policy pushes."""
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
    dummy_config["steps_per_update"] = 20
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    traj_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(traj_queue)
    trainer.publish_policy_queue(policy_queue)

    traj_queue.put(
        make_fake_trajectory(
            length=15,
            max_step_complete=True,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
            is_discrete=False,
        )
    )
    trainer.advance()
    # All 15 experiences end up in the update buffer.
    assert trainer.update_buffer.num_experiences == 15
    # Episode in progress: cumulative rewards are positive.
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # A terminal trajectory resets the per-agent reward accumulators.
    traj_queue.put(
        make_fake_trajectory(
            length=6,
            max_step_complete=False,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
            is_discrete=False,
        )
    )
    trainer.advance()
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
    # Make sure these aren't just the default summary values.
    assert (
        trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0
    )
    # An update happened, so a policy should be on the queue.
    policy_queue.get_nowait()

    # Fewer than steps_per_update additional steps: no new policy expected.
    traj_queue.put(
        make_fake_trajectory(
            length=5,
            max_step_complete=False,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
            is_discrete=False,
        )
    )
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()