def test_update(
    mock_get_devices, mock_construct_feed_dict, mock_execute_model, dummy_config
):
    # The mock_* arguments are injected by @mock.patch decorators (not shown here).
    tf.reset_default_graph()
    mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
    mock_construct_feed_dict.return_value = {}
    mock_execute_model.return_value = {
        "value_loss": 0.1,
        "policy_loss": 0.3,
        "update_batch": None,
    }

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    mock_mini_batch = mock.Mock()
    mock_mini_batch.items.return_value = [("action", [1, 2]), ("value", [3, 4])]
    run_out = policy.update(mock_mini_batch, 1)

    # The mini-batch should be consumed once per device, and one feed dict
    # should be constructed per tower.
    assert mock_mini_batch.items.call_count == len(mock_get_devices.return_value)
    assert mock_construct_feed_dict.call_count == len(mock_get_devices.return_value)
    assert run_out["Losses/Value Loss"] == 0.1
    assert run_out["Losses/Policy Loss"] == 0.3
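# Context for the call-count assertions above: a multi-GPU update is expected
# to shard the mini-batch once per device and build one feed dict per tower.
# The helper below is a hypothetical sketch of that sharding; its name and
# slicing scheme are assumptions, not the actual MultiGpuPPOPolicy code.
def split_minibatch_per_device(mini_batch, num_devices):
    device_batches = []
    for i in range(num_devices):
        shard = {}
        # mini_batch.items() is consumed once per device, which is exactly
        # what test_update's call-count assertion checks.
        for name, values in mini_batch.items():
            size = len(values) // num_devices
            shard[name] = values[i * size : (i + 1) * size]
        device_batches.append(shard)
    return device_batches


# Example: split_minibatch_per_device({"action": [1, 2], "value": [3, 4]}, 2)
# -> [{"action": [1], "value": [3]}, {"action": [2], "value": [4]}]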
def test_average_gradients(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    with tf.Session() as sess:
        policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
        var = tf.Variable(0)
        tower_grads = [
            [(tf.constant(0.1), var)],
            [(tf.constant(0.2), var)],
            [(tf.constant(0.3), var)],
            [(tf.constant(0.4), var)],
        ]
        avg_grads = policy.average_gradients(tower_grads)

        init = tf.global_variables_initializer()
        sess.run(init)
        run_out = sess.run(avg_grads)
        assert run_out == [(0.25, 0)]
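# The expected result [(0.25, 0)] is the mean of the four tower gradients,
# (0.1 + 0.2 + 0.3 + 0.4) / 4 = 0.25, paired with the shared variable's
# initial value 0. Below is a minimal TF1-style sketch of tower-gradient
# averaging (the classic in-graph replication pattern, assuming
# `import tensorflow as tf` as in the tests above); the actual
# MultiGpuPPOPolicy.average_gradients may differ in detail.
def average_gradients_sketch(tower_grads):
    average_grads = []
    # zip(*tower_grads) groups the (gradient, variable) pairs belonging to
    # the same variable across all towers.
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        # Stack the per-tower gradients along a new axis and take the mean.
        mean_grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # Every tower holds a reference to the same variable, so the first
        # entry's variable can represent them all.
        average_grads.append((mean_grad, grad_and_vars[0][1]))
    return average_grads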
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
    """
    Creates a PPO policy to add to the trainer's list of policies.
    :param brain_parameters: Specifications for the policy's construction.
    :return: The created policy.
    """
    if self.multi_gpu and len(get_devices()) > 1:
        policy: PPOPolicy = MultiGpuPPOPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
        )
    else:
        policy = PPOPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
        )
    for _reward_signal in policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

    return policy
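# create_policy only takes the multi-GPU path when get_devices() reports more
# than one GPU. A plausible sketch of such a helper using TensorFlow's
# device_lib (the exact implementation behind get_devices may differ):
from tensorflow.python.client import device_lib


def get_devices_sketch():
    # Return the names of all local GPU devices visible to TensorFlow,
    # e.g. ["/device:GPU:0", "/device:GPU:1"].
    return [d.name for d in device_lib.list_local_devices() if d.device_type == "GPU"]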
def __init__(
    self,
    brain,
    reward_buff_cap,
    trainer_parameters,
    training,
    load,
    seed,
    run_id,
    multi_gpu,
):
    """
    Responsible for collecting experiences and training the PPO model.
    :param brain: The brain this trainer is training.
    :param reward_buff_cap: Max reward history to track in the reward buffer.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with.
    :param run_id: The identifier of the current run.
    :param multi_gpu: Whether to train the policy on all available GPUs.
    """
    super(PPOTrainer, self).__init__(
        brain, trainer_parameters, training, run_id, reward_buff_cap
    )
    self.param_keys = [
        "batch_size",
        "beta",
        "buffer_size",
        "epsilon",
        "hidden_units",
        "lambd",
        "learning_rate",
        "max_steps",
        "normalize",
        "num_epoch",
        "num_layers",
        "time_horizon",
        "sequence_length",
        "summary_freq",
        "use_recurrent",
        "summary_path",
        "memory_size",
        "model_path",
        "reward_signals",
    ]
    self.check_param_keys()

    if multi_gpu and len(get_devices()) > 1:
        self.ppo_policy = MultiGpuPPOPolicy(
            seed, brain, trainer_parameters, self.is_training, load
        )
    else:
        self.ppo_policy = PPOPolicy(
            seed, brain, trainer_parameters, self.is_training, load
        )
    self.policy = self.ppo_policy

    for _reward_signal in self.policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = {}
def test_create_model(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    assert len(policy.towers) == len(mock_get_devices.return_value)
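# The assertion above ties one tower to each reported device. Below is a
# minimal sketch of the in-graph replication pattern such a towers list
# suggests: ops are pinned to one device per tower while variables are
# shared through AUTO_REUSE. This is illustrative only, not the actual
# PPO model graph.
def create_towers_sketch(devices):
    towers = []
    with tf.variable_scope("towers", reuse=tf.AUTO_REUSE):
        for device in devices:
            with tf.device(device):
                obs = tf.placeholder(tf.float32, [None, 8], name="obs")
                weights = tf.get_variable("weights", [8, 2])  # shared across towers
                towers.append(tf.matmul(obs, weights))
    return towers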