def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic": RewardSignalResult(
                scaled_reward=np.array([1.0, 1.0]),
                unscaled_reward=np.array([1.0, 1.0]),
            )
        },
        environment=np.array([1.0, 1.0]),
    )
    values = {"extrinsic": np.array([[2.0]])}
    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0

def test_trainer_increment_step(dummy_config):
    trainer_params = dummy_config
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)

    trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0", False)
    policy_mock = mock.Mock()
    step_count = 10
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10

def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()

def test_trainer_update_policy(dummy_config, use_discrete):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()

def test_add_get_policy(ppo_optimizer, mock_create_model_saver, dummy_config):
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=TFPolicy)
    policy.get_current_step.return_value = 2000

    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy)
    assert trainer.get_policy("test_policy") == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000

def test_process_trajectory(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(
        brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
    )
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trainer.process_trajectory(trajectory)

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    trainer.process_trajectory(trajectory)

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0

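# For reference, a minimal sketch of generalized advantage estimation (GAE), the
# quantity the "advantages"/"discounted_returns" check above exercises. This is an
# independent illustration rather than the trainer's internal implementation; the
# gamma and lam defaults are assumptions made only for this sketch.
def _gae_sketch(rewards, value_estimates, value_next=0.0, gamma=0.99, lam=0.95):
    import numpy as np

    values = np.append(np.asarray(value_estimates, dtype=np.float32), value_next)
    returns = np.zeros(len(rewards), dtype=np.float32)
    advantage = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # TD error
        advantage = delta + gamma * lam * advantage
        returns[t] = advantage + values[t]  # lambda-return
    advantages = returns - values[:-1]
    return advantages, returns
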
def test_process_trajectory(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    ).brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0", False)
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    policy = trainer.create_policy(brain_params_team0)
    trainer.add_policy(brain_params_team0.brain_name, policy)
    trajectory_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because off policy
    policy = trainer.create_policy(brain_params_team1)
    trainer.add_policy(brain_params_team1.brain_name, policy)
    trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()

def test_resume(dummy_config, tmp_path):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    tmp_path = tmp_path.as_posix()
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
    )

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)

    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)

    trainer.save_model()

    # Make a new trainer, check that the policies are the same
    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
    trainer2 = GhostTrainer(
        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
    )
    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
    trainer2.add_policy(parsed_behavior_id0, policy)

    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
    trainer2.add_policy(parsed_behavior_id1, policy)

    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
    weights = trainer1_policy.get_weights()
    weights2 = trainer2_policy.get_weights()

    for w, lw in zip(weights, weights2):
        np.testing.assert_array_equal(w, lw)

def test_trainer_increment_step():
    trainer_params = {
        "trainer": "ppo",
        "batch_size": 2048,
        "beta": 0.005,
        "buffer_size": 20480,
        "epsilon": 0.2,
        "gamma": 0.995,
        "hidden_units": 512,
        "lambd": 0.95,
        "learning_rate": 0.0003,
        "max_steps": "2e6",
        "memory_size": 256,
        "normalize": True,
        "num_epoch": 3,
        "num_layers": 3,
        "time_horizon": 1000,
        "sequence_length": 64,
        "summary_freq": 3000,
        "use_recurrent": False,
        "use_curiosity": False,
        "curiosity_strength": 0.01,
        "curiosity_enc_size": 128,
        "summary_path": "./summaries/test_trainer_summary",
        "model_path": "./models/test_trainer_models/TestModel",
        "keep_checkpoints": 5,
        "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    }
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)

    trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0", False)
    policy_mock = mock.Mock()
    step_count = 10
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10

def test_add_get_policy(ppo_optimizer, dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy(brain_params.brain_name, policy)
    assert trainer.get_policy(brain_params.brain_name) == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000
    assert trainer.next_summary_step > 2000

    # Test incorrect class of policy
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy(brain_params, policy)

def test_bad_config(dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    # Test that we throw an error if we have sequence length greater than batch size
    dummy_config["sequence_length"] = 64
    dummy_config["batch_size"] = 32
    dummy_config["use_recurrent"] = True
    with pytest.raises(UnityTrainerException):
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")

def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(mock_behavior_spec.action_spec.continuous_size, dtype=np.float32)
            )

    trainer.update_buffer = buffer
    trainer._update_policy()

def initialize_trainers(self, trainer_config: Dict[str, Dict[str, str]]):
    """
    Initialization of the trainers
    :param trainer_config: The configurations of the trainers
    """
    trainer_parameters_dict = {}
    for brain_name in self.external_brains:
        trainer_parameters = trainer_config['default'].copy()
        trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
            basedir=self.summaries_dir,
            name=str(self.run_id) + '_' + brain_name)
        trainer_parameters['model_path'] = '{basedir}/{name}'.format(
            basedir=self.model_path,
            name=brain_name)
        trainer_parameters['keep_checkpoints'] = self.keep_checkpoints
        if brain_name in trainer_config:
            _brain_key = brain_name
            # Follow string aliases until we reach an actual config section
            while not isinstance(trainer_config[_brain_key], dict):
                _brain_key = trainer_config[_brain_key]
            for k in trainer_config[_brain_key]:
                trainer_parameters[k] = trainer_config[_brain_key][k]
        trainer_parameters_dict[brain_name] = trainer_parameters.copy()
    for brain_name in self.external_brains:
        if trainer_parameters_dict[brain_name]['trainer'] == 'offline_bc':
            self.trainers[brain_name] = OfflineBCTrainer(
                self.external_brains[brain_name],
                trainer_parameters_dict[brain_name], self.train_model,
                self.load_model, self.seed, self.run_id)
        elif trainer_parameters_dict[brain_name]['trainer'] == 'online_bc':
            self.trainers[brain_name] = OnlineBCTrainer(
                self.external_brains[brain_name],
                trainer_parameters_dict[brain_name], self.train_model,
                self.load_model, self.seed, self.run_id)
        elif trainer_parameters_dict[brain_name]['trainer'] == 'ppo':
            self.trainers[brain_name] = PPOTrainer(
                self.external_brains[brain_name],
                self.meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
                if self.meta_curriculum else 0,
                trainer_parameters_dict[brain_name], self.train_model,
                self.load_model, self.seed, self.run_id)
            self.trainer_metrics[brain_name] = self.trainers[brain_name].trainer_metrics
        else:
            raise UnityEnvironmentException('The trainer config contains '
                                            'an unknown trainer type for '
                                            'brain {}'.format(brain_name))

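# Illustrative shape of the trainer_config consumed by initialize_trainers above
# (a sketch; the brain names and values are hypothetical, not from the original
# source). A brain key may hold its own override dict, or a string alias that the
# while-loop follows until it reaches a dict:
EXAMPLE_TRAINER_CONFIG = {
    "default": {"trainer": "ppo", "batch_size": 1024, "buffer_size": 10240},
    "ExampleBrain": {"batch_size": 64},   # overrides merged on top of "default"
    "ExampleBrainAlias": "ExampleBrain",  # string alias resolved by the while-loop
}
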
def test_trainer_increment_step(dummy_config):
    trainer_params = dummy_config
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0", False)
    policy_mock = mock.Mock()
    step_count = 10
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10

def test_bad_config(dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    # Test that we throw an error if we have sequence length greater than batch size
    with pytest.raises(TrainerConfigError):
        TrainerSettings(
            network_settings=NetworkSettings(
                memory=NetworkSettings.MemorySettings(sequence_length=64)
            ),
            hyperparameters=PPOSettings(batch_size=32),
        )
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")

def test_normalization(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    trainer.process_trajectory(trajectory)

    # Check that the running mean and variance is correct
    steps, mean, variance = trainer.ppo_policy.sess.run(
        [
            trainer.policy.model.normalization_steps,
            trainer.policy.model.running_mean,
            trainer.policy.model.running_variance,
        ]
    )

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    trainer.process_trajectory(trajectory)

    # Check that the running mean and variance is correct
    steps, mean, variance = trainer.ppo_policy.sess.run(
        [
            trainer.policy.model.normalization_steps,
            trainer.policy.model.running_mean,
            trainer.policy.model.running_variance,
        ]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)

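# Quick arithmetic check of the values asserted in test_normalization above.
# This is an independent sketch of the idealized batch statistics, not the
# trainer's own incremental update (which is why the final assertion uses
# pytest.approx). The first trajectory contributes three 0.0 and three 1.0
# observations; the second adds ten more 1.0 observations.
def _expected_normalization_stats():
    import numpy as np

    first = np.array([0.0] * 3 + [1.0] * 3)
    both = np.concatenate([first, np.ones(10)])
    mean_6 = first.mean()                                 # 0.5
    var_6 = ((first - mean_6) ** 2).sum() / len(first)    # 0.25 -> (variance - 1) / steps
    mean_16 = both.mean()                                 # 0.8125
    var_16 = ((both - mean_16) ** 2).sum() / len(both)    # ~0.1523
    return mean_6, var_6, mean_16, var_16
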
def test_load_and_set(dummy_config, use_discrete):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    trainer.seed = 1
    policy = trainer.create_policy("test", mock_specs)
    trainer.seed = 20  # otherwise graphs are the same
    to_load_policy = trainer.create_policy("test", mock_specs)

    weights = policy.get_weights()
    load_weights = to_load_policy.get_weights()
    try:
        for w, lw in zip(weights, load_weights):
            np.testing.assert_array_equal(w, lw)
    except AssertionError:
        # Expected: the two freshly created policies should not have identical weights yet.
        pass

    to_load_policy.load_weights(weights)
    load_weights = to_load_policy.get_weights()

    for w, lw in zip(weights, load_weights):
        np.testing.assert_array_equal(w, lw)

def test_trainer_increment_step(ppo_optimizer, dummy_config):
    trainer_params = dummy_config
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(
        brain_params.brain_name, 0, trainer_params, True, False, 0, "0"
    )
    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy("testbehavior", policy_mock)

    trainer._increment_step(5, "testbehavior")
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # first policy encountered becomes policy trained by wrapped PPO
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because off policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()

def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic": RewardSignalResult(
                scaled_reward=np.array([1.0, 1.0], dtype=np.float32),
                unscaled_reward=np.array([1.0, 1.0], dtype=np.float32),
            )
        },
        environment=np.array([1.0, 1.0], dtype=np.float32),
    )
    values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}
    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.processing_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
    assert trainer.processing_buffer[agent_id]["extrinsic_rewards"][0] == 1.0

def test_trainer_increment_step(dummy_config):
    trainer_params = dummy_config
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(
        brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
    )
    policy_mock = mock.Mock()
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def _initialize_trainers(self, trainer_config, sess):
    trainer_parameters_dict = {}
    # TODO: This probably doesn't need to be reinitialized.
    self.trainers = {}
    for brain_name in self.env.external_brain_names:
        trainer_parameters = trainer_config['default'].copy()
        if len(self.env.external_brain_names) > 1:
            graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
            trainer_parameters['graph_scope'] = graph_scope
            trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                basedir=self.summaries_dir,
                name=str(self.run_id) + '_' + graph_scope)
        else:
            trainer_parameters['graph_scope'] = ''
            trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                basedir=self.summaries_dir,
                name=str(self.run_id))
        if brain_name in trainer_config:
            _brain_key = brain_name
            while not isinstance(trainer_config[_brain_key], dict):
                _brain_key = trainer_config[_brain_key]
            for k in trainer_config[_brain_key]:
                trainer_parameters[k] = trainer_config[_brain_key][k]
        trainer_parameters_dict[brain_name] = trainer_parameters.copy()
    for brain_name in self.env.external_brain_names:
        if trainer_parameters_dict[brain_name]['trainer'] == 'imitation':
            self.trainers[brain_name] = BehavioralCloningTrainer(
                sess, self.env.brains[brain_name],
                trainer_parameters_dict[brain_name], self.train_model,
                self.seed, self.run_id)
        elif trainer_parameters_dict[brain_name]['trainer'] == 'ppo':
            # The external brain becomes an internal brain here.
            self.trainers[brain_name] = PPOTrainer(
                sess, self.env.brains[brain_name],
                self.meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
                if self.meta_curriculum else 0,
                trainer_parameters_dict[brain_name], self.train_model,
                self.seed, self.run_id)
        else:
            raise UnityEnvironmentException('The trainer config contains '
                                            'an unknown trainer type for '
                                            'brain {}'.format(brain_name))

def test_add_get_policy(ppo_optimizer, dummy_config):
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy("test_policy", policy)
    assert trainer.get_policy("test_policy") == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000

    # Test incorrect class of policy
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy("test_policy", policy)

def test_trainer_increment_step(ppo_optimizer):
    trainer_params = PPO_CONFIG
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy("testbehavior", policy_mock)

    trainer._increment_step(5, "testbehavior")
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
    trainer_params = PPO_CONFIG
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=TFPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy_mock)

    trainer._increment_step(5, trainer.brain_name)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def test_trainer_update_policy(mock_env, dummy_config, use_discrete):
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock_env,
        use_discrete,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(mock_brain, 0, trainer_params, True, False, 0, "0", False)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["rewards"]
    buffer["extrinsic_returns"] = buffer["rewards"]
    buffer["extrinsic_value_estimates"] = buffer["rewards"]
    buffer["curiosity_rewards"] = buffer["rewards"]
    buffer["curiosity_returns"] = buffer["rewards"]
    buffer["curiosity_value_estimates"] = buffer["rewards"]

    trainer.update_buffer = buffer
    trainer.update_policy()

    # Make batch length a larger multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 128
    trainer.update_policy()

    # Make batch length a larger non-multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 100
    trainer.update_policy()

def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # when ghost trainer advance and wrapped trainer buffers full
    # the wrapped trainer pushes updated policy to correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()

def initialize_trainer(
    trainer_config: Any,
    brain_parameters: BrainParameters,
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Trainer:
    """
    Initializes a trainer given a provided trainer configuration and brain parameters, as well as
    some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param brain_parameters: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return:
    """
    brain_name = brain_parameters.brain_name
    if "default" not in trainer_config and brain_name not in trainer_config:
        raise TrainerConfigError(
            f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '
            "See config/trainer_config.yaml for an example."
        )

    trainer_parameters = trainer_config.get("default", {}).copy()
    trainer_parameters["summary_path"] = str(run_id) + "_" + brain_name
    trainer_parameters["model_path"] = "{basedir}/{name}".format(
        basedir=model_path, name=brain_name
    )
    trainer_parameters["keep_checkpoints"] = keep_checkpoints
    if brain_name in trainer_config:
        _brain_key: Any = brain_name
        while not isinstance(trainer_config[_brain_key], dict):
            _brain_key = trainer_config[_brain_key]
        trainer_parameters.update(trainer_config[_brain_key])

    min_lesson_length = 1
    if meta_curriculum:
        if brain_name in meta_curriculum.brains_to_curriculums:
            min_lesson_length = meta_curriculum.brains_to_curriculums[
                brain_name
            ].min_lesson_length
        else:
            logger.warning(
                f"Metacurriculum enabled, but no curriculum for brain {brain_name}. "
                f"Brains with curricula: {meta_curriculum.brains_to_curriculums.keys()}. "
            )

    trainer: Trainer = None  # type: ignore  # will be set to one of these, or raise
    if "trainer" not in trainer_parameters:
        raise TrainerConfigError(
            f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).'
        )
    trainer_type = trainer_parameters["trainer"]

    if trainer_type == "offline_bc":
        raise UnityTrainerException(
            "The offline_bc trainer has been removed. To train with demonstrations, "
            "please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
            "Behavioral Cloning feature enabled."
        )
    elif trainer_type == "ppo":
        trainer = PPOTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
            multi_gpu,
        )
    elif trainer_type == "sac":
        trainer = SACTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
        )
    else:
        raise TrainerConfigError(
            f'The trainer config contains an unknown trainer type "{trainer_type}" for brain {brain_name}'
        )
    return trainer

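# Illustrative call of initialize_trainer above, wrapped in a helper so it is not
# executed on import. This is a sketch only: the config, paths, and run_id are
# hypothetical placeholders, and a real config would need the full PPO
# hyperparameter set shown in the trainer_params dict of test_trainer_increment_step
# earlier in this file. BrainParameters uses the same positional signature as in
# the tests above.
def _example_initialize_trainer_usage():
    example_brain = BrainParameters("ExampleBrain", 1, 1, [], [2], [], 0)
    example_config = {"default": {"trainer": "ppo", "batch_size": 1024, "buffer_size": 10240}}
    return initialize_trainer(
        trainer_config=example_config,
        brain_parameters=example_brain,
        summaries_dir="./summaries",
        run_id="run-0",
        model_path="./models/run-0",
        keep_checkpoints=5,
        train_model=True,
        load_model=False,
        seed=0,
    )
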
def test_publish_queue(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    )
    brain_name = parsed_behavior_id0.brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team1.brain_name
    )
    policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # when ghost trainer advance and wrapped trainer buffers full
    # the wrapped trainer pushes updated policy to correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()

def initialize_trainers(
    trainer_config: Dict[str, Any],
    external_brains: Dict[str, BrainParameters],
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Dict[str, Trainer]:
    """
    Initializes trainers given a provided trainer configuration and set of brains from the environment,
    as well as some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param external_brains: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return:
    """
    trainers = {}
    trainer_parameters_dict = {}
    for brain_name in external_brains:
        trainer_parameters = trainer_config["default"].copy()
        trainer_parameters["summary_path"] = "{basedir}/{name}".format(
            basedir=summaries_dir, name=str(run_id) + "_" + brain_name
        )
        trainer_parameters["model_path"] = "{basedir}/{name}".format(
            basedir=model_path, name=brain_name
        )
        trainer_parameters["keep_checkpoints"] = keep_checkpoints
        if brain_name in trainer_config:
            _brain_key: Any = brain_name
            while not isinstance(trainer_config[_brain_key], dict):
                _brain_key = trainer_config[_brain_key]
            trainer_parameters.update(trainer_config[_brain_key])
        trainer_parameters_dict[brain_name] = trainer_parameters.copy()
    for brain_name in external_brains:
        if trainer_parameters_dict[brain_name]["trainer"] == "offline_bc":
            trainers[brain_name] = OfflineBCTrainer(
                external_brains[brain_name],
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "online_bc":
            trainers[brain_name] = OnlineBCTrainer(
                external_brains[brain_name],
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "ppo":
            trainers[brain_name] = PPOTrainer(
                external_brains[brain_name],
                meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
                if meta_curriculum
                else 1,
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
                multi_gpu,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "sac":
            trainers[brain_name] = SACTrainer(
                external_brains[brain_name],
                meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
                if meta_curriculum
                else 1,
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        else:
            raise UnityEnvironmentException(
                "The trainer config contains "
                "an unknown trainer type for "
                "brain {}".format(brain_name)
            )
    return trainers