def _create_losses(
    self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
):
    """
    Creates training-specific Tensorflow ops for PPO models.
    :param probs: Current policy probabilities
    :param old_probs: Past policy probabilities
    :param value_heads: Value estimate tensors from each value stream
    :param entropy: Current policy entropy
    :param beta: Entropy regularization strength
    :param epsilon: Value for policy-divergence threshold
    :param lr: Learning rate
    :param max_step: Total number of training steps.
    """
    self.returns_holders = {}
    self.old_values = {}
    for name in value_heads.keys():
        returns_holder = tf.placeholder(
            shape=[None], dtype=tf.float32, name=f"{name}_returns"
        )
        old_value = tf.placeholder(
            shape=[None], dtype=tf.float32, name=f"{name}_value_estimate"
        )
        self.returns_holders[name] = returns_holder
        self.old_values[name] = old_value
    self.advantage = tf.placeholder(
        shape=[None], dtype=tf.float32, name="advantages"
    )
    advantage = tf.expand_dims(self.advantage, -1)

    self.decay_epsilon = ModelUtils.create_schedule(
        self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
    )
    self.decay_beta = ModelUtils.create_schedule(
        self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
    )

    value_losses = []
    for name, head in value_heads.items():
        clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
            tf.reduce_sum(head, axis=1) - self.old_values[name],
            -self.decay_epsilon,
            self.decay_epsilon,
        )
        v_opt_a = tf.squared_difference(
            self.returns_holders[name], tf.reduce_sum(head, axis=1)
        )
        v_opt_b = tf.squared_difference(
            self.returns_holders[name], clipped_value_estimate
        )
        value_loss = tf.reduce_mean(
            tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[1]
        )
        value_losses.append(value_loss)
    self.value_loss = tf.reduce_mean(value_losses)

    r_theta = tf.exp(probs - old_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = (
        tf.clip_by_value(r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon)
        * advantage
    )
    self.policy_loss = -tf.reduce_mean(
        tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
    )
    # For cleaner stats reporting
    self.abs_policy_loss = tf.abs(self.policy_loss)

    self.loss = (
        self.policy_loss
        + 0.5 * self.value_loss
        - self.decay_beta
        * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
    )
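
# Illustrative sketch (not part of the original source): the policy-loss portion of
# `_create_losses` above, re-expressed with plain NumPy arrays so the clipped
# surrogate objective is easy to inspect. Masking, the value loss, and the entropy
# bonus are omitted; the function name and arguments are hypothetical.
import numpy as np

def sketch_ppo_policy_loss(log_probs, old_log_probs, advantages, epsilon=0.2):
    # Probability ratio r_theta = pi_new(a|s) / pi_old(a|s), computed in log space.
    r_theta = np.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantages  # unclipped surrogate term
    p_opt_b = np.clip(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantages  # clipped term
    # PPO maximizes the elementwise minimum; the optimizer minimizes its negation.
    return -np.mean(np.minimum(p_opt_a, p_opt_b))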
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
    """
    Takes a Policy and a Dict of trainer parameters and creates an Optimizer
    around the policy. The SAC optimizer has twin Q heads, a value estimator,
    a target network, and a loss function.
    :param policy: A TFPolicy object that will be updated by this SAC Optimizer.
    :param trainer_params: Trainer parameters dictionary that specifies the
        properties of the trainer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope(""):
            super().__init__(policy, trainer_params)
            hyperparameters: SACSettings = cast(
                SACSettings, trainer_params.hyperparameters
            )
            lr = hyperparameters.learning_rate
            lr_schedule = hyperparameters.learning_rate_schedule
            max_step = trainer_params.max_steps
            self.tau = hyperparameters.tau
            self.init_entcoef = hyperparameters.init_entcoef

            self.policy = policy
            self.act_size = policy.act_size
            policy_network_settings = policy.network_settings
            h_size = policy_network_settings.hidden_units
            num_layers = policy_network_settings.num_layers
            vis_encode_type = policy_network_settings.vis_encode_type
            self.burn_in_ratio = 0.0

            # Non-exposed SAC parameters
            self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
            self.continuous_target_entropy_scale = 1.0

            stream_names = list(self.reward_signals.keys())
            # Use to reduce "survivor bonus" when using Curiosity or GAIL.
            self.gammas = [
                _val.gamma for _val in trainer_params.reward_signals.values()
            ]
            self.use_dones_in_backup = {
                name: tf.Variable(1.0) for name in stream_names
            }
            self.disable_use_dones = {
                name: self.use_dones_in_backup[name].assign(0.0)
                for name in stream_names
            }

            if num_layers < 1:
                num_layers = 1

            self.target_init_op: List[tf.Tensor] = []
            self.target_update_op: List[tf.Tensor] = []
            self.update_batch_policy: Optional[tf.Operation] = None
            self.update_batch_value: Optional[tf.Operation] = None
            self.update_batch_entropy: Optional[tf.Operation] = None

            self.policy_network = SACPolicyNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 3x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            self.target_network = SACTargetNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 1x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
            self.m_size = 3 * self.policy.m_size
            self._create_inputs_and_outputs()

            self.learning_rate = ModelUtils.create_schedule(
                lr_schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy_network.q1_heads,
                self.policy_network.q2_heads,
                lr,
                int(max_step),
                stream_names,
                discrete=not self.policy.use_continuous_act,
            )
            self._create_sac_optimizer_ops()

            self.selected_actions = (
                self.policy.selected_actions
            )  # For GAIL and other reward signals

            if self.policy.normalize:
                target_update_norm = self.target_network.copy_normalization(
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
                # Update the normalization of the optimizer when the policy does.
                self.policy.update_normalization_op = tf.group(
                    [self.policy.update_normalization_op, target_update_norm]
                )

            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Losses/Q1 Loss": "q1_loss",
                "Losses/Q2 Loss": "q2_loss",
                "Policy/Entropy Coeff": "entropy_coef",
                "Policy/Learning Rate": "learning_rate",
            }

            self.update_dict = {
                "value_loss": self.total_value_loss,
                "policy_loss": self.policy_loss,
                "q1_loss": self.q1_loss,
                "q2_loss": self.q2_loss,
                "entropy_coef": self.ent_coef,
                "update_batch": self.update_batch_policy,
                "update_value": self.update_batch_value,
                "update_entropy": self.update_batch_entropy,
                "learning_rate": self.learning_rate,
            }
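
# Illustrative sketch (not part of the original source): how `tau` is typically used
# in the soft target-network update that `target_update_op` performs in SAC. The
# function name and arguments are hypothetical; the real op is built from TF assign
# ops over the target network's variables.
def sketch_soft_update(target_vars, source_vars, tau=0.005):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    # tau = 1.0 reproduces the hard copy performed by `target_init_op`.
    return [tau * s + (1.0 - tau) * t for t, s in zip(target_vars, source_vars)]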
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
    """
    Takes a Policy and a Dict of trainer parameters and creates an Optimizer
    around the policy. The PPO optimizer has a value estimator and a loss function.
    :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
    :param trainer_params: Trainer parameters dictionary that specifies the
        properties of the trainer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope("optimizer/"):
            super().__init__(policy, trainer_params)
            hyperparameters: PPOSettings = cast(
                PPOSettings, trainer_params.hyperparameters
            )
            lr = float(hyperparameters.learning_rate)
            self._schedule = hyperparameters.learning_rate_schedule
            epsilon = float(hyperparameters.epsilon)
            beta = float(hyperparameters.beta)
            max_step = float(trainer_params.max_steps)
            policy_network_settings = policy.network_settings
            h_size = int(policy_network_settings.hidden_units)
            num_layers = policy_network_settings.num_layers
            vis_encode_type = policy_network_settings.vis_encode_type
            self.burn_in_ratio = 0.0

            self.stream_names = list(self.reward_signals.keys())

            self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
            self.grads = None
            self.update_batch: Optional[tf.Operation] = None

            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Policy/Learning Rate": "learning_rate",
                "Policy/Epsilon": "decay_epsilon",
                "Policy/Beta": "decay_beta",
            }

            if self.policy.use_recurrent:
                self.m_size = self.policy.m_size
                self.memory_in = tf.placeholder(
                    shape=[None, self.m_size],
                    dtype=tf.float32,
                    name="recurrent_value_in",
                )

            if num_layers < 1:
                num_layers = 1

            if policy.use_continuous_act:
                self._create_cc_critic(h_size, num_layers, vis_encode_type)
            else:
                self._create_dc_critic(h_size, num_layers, vis_encode_type)

            self.learning_rate = ModelUtils.create_schedule(
                self._schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy.total_log_probs,
                self.old_log_probs,
                self.value_heads,
                self.policy.entropy,
                beta,
                epsilon,
                lr,
                max_step,
            )
            self._create_ppo_optimizer_ops()

            self.update_dict.update(
                {
                    "value_loss": self.value_loss,
                    "policy_loss": self.abs_policy_loss,
                    "update_batch": self.update_batch,
                    "learning_rate": self.learning_rate,
                    "decay_epsilon": self.decay_epsilon,
                    "decay_beta": self.decay_beta,
                }
            )
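
# Illustrative sketch (not part of the original source): the kind of decayed value
# `ModelUtils.create_schedule` produces for the learning rate, epsilon, and beta
# above when the schedule is linear. It assumes a straight-line interpolation from
# the initial value down to `min_value` over `max_step` steps; the function name is
# a hypothetical stand-in for the TF op actually built by the optimizer.
def sketch_linear_schedule(initial_value, min_value, step, max_step):
    fraction_left = max(1.0 - step / max_step, 0.0)
    return min_value + (initial_value - min_value) * fraction_left

# e.g. a 3e-4 learning rate halfway through training:
# sketch_linear_schedule(3e-4, 1e-10, step=250_000, max_step=500_000) -> ~1.5e-4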