def _train(self, experience, weights=None, train_step_counter=None):
  # TODO(sfishman): Support batch dimensions >1.
  if experience.step_type.shape[0] != 1:
    raise NotImplementedError('ReinforceAgent does not yet support batch '
                              'dimensions greater than 1.')
  experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
  returns = common.compute_returns(experience.reward, experience.discount)
  if self._debug_summaries:
    tf.contrib.summary.histogram('rewards', experience.reward)
    tf.contrib.summary.histogram('discounts', experience.discount)
    tf.contrib.summary.histogram('returns', returns)

  # TODO(kbanoop): Replace with tensor normalizer.
  if self._normalize_returns:
    ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
    returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
    if self._debug_summaries:
      tf.contrib.summary.histogram('normalized_returns', returns)

  # TODO(kbanoop): Remove after changing network interface to accept
  # observations and step_types, instead of time_steps.
  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  # TODO(kbanoop): Filter boundary steps.
  loss_info = self._loss(time_step,
                         experience.action,
                         tf.stop_gradient(returns),
                         weights=weights)

  clip_gradients = None
  if self._gradient_clipping:
    clip_gradients = eager_utils.clip_gradient_norms_fn(
        self._gradient_clipping)

  loss_info = eager_utils.create_train_step(
      loss_info,
      self._optimizer,
      total_loss_fn=lambda loss_info: loss_info.loss,
      global_step=train_step_counter,
      transform_grads_fn=clip_gradients,
      summarize_gradients=self._summarize_grads_and_vars,
      variables_to_train=lambda: self._actor_network.trainable_weights,
  )

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      for var in self._actor_network.trainable_weights:
        tf.contrib.summary.histogram(var.name.replace(':', '_'), var)

  return loss_info
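# common.compute_returns above accumulates discounted returns backwards in
# time: G_t = reward_t + discount_t * G_{t+1}. A minimal sketch of that
# recurrence for 1-D reward/discount tensors, for orientation only; the
# library version is the authoritative implementation:
import tensorflow as tf

def _compute_returns_sketch(rewards, discounts):
  def accumulate(next_step_return, reward_and_discount):
    reward, discount = reward_and_discount
    return reward + discount * next_step_return

  # Scan from the last timestep to the first, starting from a zero return.
  return tf.scan(
      accumulate, (rewards, discounts),
      initializer=tf.constant(0.0), reverse=True)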
def testClipGradsFn(self):
  xs = tf.Variable(0.0)
  grads = tf.constant(4.0)
  gradients_to_variables = [(grads, xs)]
  clipped_gradients_to_variables = eager_utils.clip_gradient_norms_fn(3.0)(
      gradients_to_variables)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAlmostEqual(4.0, self.evaluate(gradients_to_variables[0][0]))
  self.assertAlmostEqual(3.0,
                         self.evaluate(clipped_gradients_to_variables[0][0]))
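# For reference, the contract the test above exercises:
# eager_utils.clip_gradient_norms_fn(max_norm) returns a transform_grads_fn
# that caps each gradient's norm and leaves the variables untouched. A minimal
# sketch of that observed behavior (not the library implementation):
import tensorflow as tf

def _clip_gradient_norms_fn_sketch(max_norm):
  def transform_grads_fn(grads_and_vars):
    # Clip each gradient independently; pass None gradients through.
    return [(tf.clip_by_norm(g, max_norm) if g is not None else None, v)
            for g, v in grads_and_vars]
  return transform_grads_fn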
def clip_and_summarize_gradients(grads_and_vars):
  """Clips gradients, and summarizes gradients and variables."""
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms_fn(
        self._gradient_clipping)(grads_and_vars)

  if self._summarize_grads_and_vars:
    # TODO(kbanoop): Move gradient summaries to train_op after we switch to
    # eager train op, and move variable summaries to critic_loss.
    for grad, var in grads_and_vars:
      with tf.name_scope('Gradients/'):
        if grad is not None:
          tf.contrib.summary.histogram(grad.op.name, grad)
      with tf.name_scope('Variables/'):
        if var is not None:
          tf.contrib.summary.histogram(var.op.name, var)
  return grads_and_vars
def _train(self, experience, weights=None):
  loss_info = self._loss(experience, weights=weights)

  transform_grads_fn = None
  if self._gradient_clipping is not None:
    transform_grads_fn = eager_utils.clip_gradient_norms_fn(
        self._gradient_clipping)

  loss_info = eager_utils.create_train_step(
      loss_info,
      self._optimizer,
      total_loss_fn=lambda loss_info: loss_info.loss,
      global_step=self.train_step_counter,
      transform_grads_fn=transform_grads_fn,
      summarize_gradients=self._summarize_grads_and_vars,
      variables_to_train=lambda: self._cloning_network.trainable_weights,
  )
  return loss_info
def _train(self, experience, train_step_counter=None, weights=None):
  time_steps, actions, next_time_steps = self._experience_to_transitions(
      experience)

  loss_info = self._loss(
      time_steps,
      actions,
      next_time_steps,
      td_errors_loss_fn=self._td_errors_loss_fn,
      gamma=self._gamma,
      reward_scale_factor=self._reward_scale_factor,
      weights=weights)

  transform_grads_fn = None
  if self._gradient_clipping is not None:
    transform_grads_fn = eager_utils.clip_gradient_norms_fn(
        self._gradient_clipping)

  loss_info = eager_utils.create_train_step(
      loss_info,
      self._optimizer,
      total_loss_fn=lambda loss_info: loss_info.loss,
      global_step=train_step_counter,
      transform_grads_fn=transform_grads_fn,
      summarize_gradients=self._summarize_grads_and_vars,
      variables_to_train=lambda: self._q_network.trainable_weights,
  )

  # Make sure the periodic update_targets op is only created once.
  if self._target_update_train_op is None:
    with tf.control_dependencies([loss_info.loss]):
      self._target_update_train_op = self._update_targets(
          self._target_update_tau, self._target_update_period)

  with tf.control_dependencies([self._target_update_train_op]):
    loss_info = tf.nest.map_structure(
        lambda t: tf.identity(t, name='loss_info'), loss_info)

  return loss_info
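# _update_targets above wires up a periodic soft (Polyak) update that moves
# the target Q-network towards the online Q-network every
# target_update_period train steps. A minimal sketch of the per-variable
# update, assuming the two variable lists line up; _soft_update_sketch is a
# hypothetical helper, for illustration only:
def _soft_update_sketch(source_variables, target_variables, tau):
  for source, target in zip(source_variables, target_variables):
    # tau = 1.0 is a hard copy; smaller tau moves the target slowly.
    target.assign(tau * source + (1.0 - tau) * target)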
def build_train_op(self, time_steps, actions, act_log_probs, returns,
                   normalized_advantages, action_distribution_parameters,
                   weights, train_step, summarize_gradients, gradient_clipping,
                   debug_summaries):
  """Compute the loss and create optimization op for one training epoch.

  All tensors should have a single batch dimension.

  Args:
    time_steps: A minibatch of TimeStep tuples.
    actions: A minibatch of actions.
    act_log_probs: A minibatch of action probabilities (probability under the
      sampling policy).
    returns: A minibatch of per-timestep returns.
    normalized_advantages: A minibatch of normalized per-timestep advantages.
    action_distribution_parameters: Parameters of data-collecting action
      distribution. Needed for KL computation.
    weights: Optional scalar or element-wise (per-batch-entry) importance
      weights. Includes a mask for invalid timesteps.
    train_step: A train_step variable to increment for each train step.
      Typically the global_step.
    summarize_gradients: If true, gradient summaries will be written.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: True if debug summaries should be created.

  Returns:
    A tf_agent.LossInfo named tuple with the total_loss and all intermediate
    losses in the extra field contained in a PPOLossInfo named tuple.
  """
  # Evaluate the current policy on timesteps.
  # batch_size from time_steps
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size)
  distribution_step = self._collect_policy.distribution(
      time_steps, policy_state)
  # TODO(eholly): Rename policy distributions to something clear and uniform.
  current_policy_distribution = distribution_step.action

  # Call all loss functions and add all loss values.
  value_estimation_loss = self.value_estimation_loss(time_steps, returns,
                                                     weights, debug_summaries)
  policy_gradient_loss = self.policy_gradient_loss(
      time_steps,
      actions,
      tf.stop_gradient(act_log_probs),
      tf.stop_gradient(normalized_advantages),
      current_policy_distribution,
      weights,
      debug_summaries=debug_summaries)

  if self._policy_l2_reg > 0.0 or self._value_function_l2_reg > 0.0:
    l2_regularization_loss = self.l2_regularization_loss(debug_summaries)
  else:
    l2_regularization_loss = tf.zeros_like(policy_gradient_loss)

  if self._entropy_regularization > 0.0:
    entropy_regularization_loss = self.entropy_regularization_loss(
        time_steps, current_policy_distribution, weights, debug_summaries)
  else:
    entropy_regularization_loss = tf.zeros_like(policy_gradient_loss)

  kl_penalty_loss = self.kl_penalty_loss(time_steps,
                                         action_distribution_parameters,
                                         current_policy_distribution, weights,
                                         debug_summaries)

  total_loss = (policy_gradient_loss + value_estimation_loss +
                l2_regularization_loss + entropy_regularization_loss +
                kl_penalty_loss)

  if gradient_clipping > 0:
    clip_gradients = eager_utils.clip_gradient_norms_fn(gradient_clipping)
  else:
    clip_gradients = lambda x: x

  # If summarize_gradients, create functions for summarizing both gradients
  # and variables.
  if summarize_gradients and debug_summaries:

    def _create_summaries(grads_and_vars):
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      grads_and_vars = clip_gradients(grads_and_vars)
      return grads_and_vars

    transform_grads_fn = _create_summaries
  else:
    transform_grads_fn = clip_gradients

  total_loss = eager_utils.create_train_op(
      total_loss,
      self._optimizer,
      global_step=train_step,
      transform_grads_fn=transform_grads_fn,
      variables_to_train=(self._actor_net.trainable_weights +
                          self._value_net.trainable_weights))

  return tf_agent.LossInfo(
      total_loss,
      PPOLossInfo(
          policy_gradient_loss=policy_gradient_loss,
          value_estimation_loss=value_estimation_loss,
          l2_regularization_loss=l2_regularization_loss,
          entropy_regularization_loss=entropy_regularization_loss,
          kl_penalty_loss=kl_penalty_loss,
      ))