def testMakeTimestepMaskWithPartialEpisode(self, allow_partial):
  first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

  next_step_types = tf.constant(
      [[mid, mid, last, first, mid, mid, last, first, mid, mid],
       [mid, mid, last, first, mid, mid, mid, mid, mid, last]])
  zeros = tf.zeros_like(next_step_types)
  next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

  if not allow_partial:
    # Mask should be 0.0 for transition timesteps (3, 7) and for all
    # timesteps belonging to the final, incomplete episode.
    expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
  else:
    # Zeros only between episodes. Incomplete episodes are valid and not
    # zeroed out.
    expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
                     [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]

  timestep_mask = ppo_utils.make_timestep_mask(
      next_time_step, allow_partial_episodes=allow_partial)

  timestep_mask_ = self.evaluate(timestep_mask)
  self.assertAllClose(expected_mask, timestep_mask_)
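# For reference, a minimal sketch of the masking behaviour the test above
# checks. This is an assumption for illustration, not TF-Agents' actual
# ppo_utils.make_timestep_mask implementation: zero out transitions that
# cross an episode boundary (next step_type == FIRST) and, when
# allow_partial_episodes is False, also zero every timestep after the final
# LAST step in each batch row.
import tensorflow as tf
from tf_agents.trajectories import time_step as ts


def make_timestep_mask_sketch(next_time_step, allow_partial_episodes=False):
  # 1.0 everywhere except at episode-boundary transitions.
  not_boundary = tf.cast(
      tf.not_equal(next_time_step.step_type, ts.StepType.FIRST), tf.float32)
  if allow_partial_episodes:
    return not_boundary
  # Keep only timesteps at or before the last LAST step in each row, so the
  # trailing, incomplete episode is zeroed out.
  is_last = tf.cast(
      tf.equal(next_time_step.step_type, ts.StepType.LAST), tf.float32)
  before_final_last = tf.cast(
      tf.cumsum(is_last, axis=1, reverse=True) > 0, tf.float32)
  return not_boundary * before_final_last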
def _train(self, experience, weights=None):
  # Unpack trajectories into transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)

  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]

  value_state = self._collect_policy.get_initial_value_state(
      batch_size=batch_size)

  weights = ppo_utils.make_timestep_mask(next_time_steps)

  value_preds, _ = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, value_state=value_state)
  value_preds = tf.stop_gradient(value_preds)

  rewards = next_time_steps.reward

  # Normalize rewards.
  if self._reward_normalizer is not None:
    rewards = self._reward_normalizer.normalize(
        rewards, center_mean=False, clip_value=self._reward_norm_clipping)

  returns, normalized_advantages = compute_return_and_advantage(
      self._discount_factor, self._lambda, rewards, next_time_steps,
      value_preds)

  policy_loss = self._update_policy(time_steps, policy_steps_,
                                    normalized_advantages, weights)
  value_loss = self._update_values(time_steps, returns, weights)

  return tf_agent.LossInfo(
      loss=value_loss + policy_loss,
      extra=TRPOLossInfo(
          value_estimation_loss=value_loss,
          policy_gradient_loss=policy_loss),
  )
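# The _train above delegates return and advantage computation to a
# compute_return_and_advantage helper. As a rough, hedged sketch of what such
# a helper can look like using generalized advantage estimation (the name,
# normalization choice, and shape conventions below are assumptions, not the
# library's implementation), assuming rewards is [batch, T] and value_preds
# is [batch, T + 1]:
import tensorflow as tf


def compute_return_and_advantage_sketch(discount_factor, lambda_, rewards,
                                        next_time_steps, value_preds):
  # Per-step discounts, combining the environment discount with gamma.
  discounts = next_time_steps.discount * discount_factor
  final_value = value_preds[:, -1]
  value_preds = value_preds[:, :-1]

  # TD residuals: delta_t = r_t + gamma_t * V(s_{t+1}) - V(s_t).
  next_values = tf.concat([value_preds[:, 1:], final_value[:, None]], axis=1)
  deltas = rewards + discounts * next_values - value_preds

  # GAE recursion: A_t = delta_t + gamma_t * lambda * A_{t+1}, accumulated
  # backwards in time with a reverse scan over time-major tensors.
  weighted_discounts = discounts * lambda_

  def gae_step(accumulated_advantage, elems):
    delta, weighted_discount = elems
    return delta + weighted_discount * accumulated_advantage

  advantages = tf.scan(
      gae_step,
      (tf.transpose(deltas), tf.transpose(weighted_discounts)),
      initializer=tf.zeros_like(final_value),
      reverse=True)
  advantages = tf.transpose(advantages)

  returns = advantages + value_preds
  # Normalize advantages to zero mean / unit variance over the whole batch.
  normalized_advantages = (
      (advantages - tf.reduce_mean(advantages)) /
      (tf.math.reduce_std(advantages) + 1e-8))
  return returns, normalized_advantages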
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action

  if self._debug_summaries:
    actions_list = tf.nest.flatten(actions)
    show_action_index = len(actions_list) != 1
    for i, single_action in enumerate(actions_list):
      action_name = ('actions_{}'.format(i)
                     if show_action_index else 'actions')
      tf.compat.v2.summary.histogram(
          name=action_name, data=single_action, step=self.train_step_counter)

  action_distribution_parameters = policy_steps_.info

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using
  # the collect policy distribution.
  act_log_probs = common.log_probability(old_actions_distribution, actions,
                                         self._action_spec)

  # Compute the value predictions for states using the current value
  # function. To be used for return & advantage computation.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(
      batch_size=batch_size)

  value_preds, unused_policy_state = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, policy_state=policy_state)
  value_preds = tf.stop_gradient(value_preds)

  valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  loss_info = None  # TODO(b/123627451): Remove.
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Only save debug summaries for first and last epochs.
      debug_summaries = (
          self._debug_summaries and
          (i_epoch == 0 or i_epoch == self._num_epochs - 1))

      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            self.train_step_counter, debug_summaries)

      variables_to_train = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      grads = tape.gradient(loss_info.loss, variables_to_train)
      # Tuple is used for py3, where zip is a generator producing values once.
      grads_and_vars = tuple(zip(grads, variables_to_train))
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      # If summarize_gradients, create functions for summarizing both
      # gradients and variables.
      if self._summarize_grads_and_vars and debug_summaries:
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)

      self._optimizer.apply_gradients(
          grads_and_vars, global_step=self.train_step_counter)

      policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
      value_estimation_losses.append(loss_info.extra.value_estimation_loss)
      l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
      entropy_regularization_losses.append(
          loss_info.extra.entropy_regularization_loss)
      kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size)

  # Compute the mean kl from previous action distribution.
  kl_divergence = self._kl_divergence(
      time_steps, action_distribution_parameters,
      self._collect_policy.distribution(time_steps, policy_state).action)
  self.update_adaptive_kl_beta(kl_divergence)

  if self._observation_normalizer:
    self._observation_normalizer.update(
        time_steps.observation, outer_dims=[0, 1])
  else:
    # TODO(b/127661780): Verify performance of reward_normalizer when obs
    # are not normalized.
    if self._reward_normalizer:
      self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])

  loss_info = tf.nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss across all epochs.
  # The *_losses lists will have been populated by calls to
  # self.get_epoch_loss.
  with tf.name_scope('Losses/'):
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
    total_value_estimation_loss = tf.add_n(value_estimation_losses)
    total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses)
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses)

    tf.compat.v2.summary.scalar(
        name='policy_gradient_loss',
        data=total_policy_gradient_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='value_estimation_loss',
        data=total_value_estimation_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='l2_regularization_loss',
        data=total_l2_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='entropy_regularization_loss',
        data=total_entropy_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='kl_penalty_loss',
        data=total_kl_penalty_loss,
        step=self.train_step_counter)

    total_abs_loss = (
        tf.abs(total_policy_gradient_loss) +
        tf.abs(total_value_estimation_loss) +
        tf.abs(total_entropy_regularization_loss) +
        tf.abs(total_l2_regularization_loss) +
        tf.abs(total_kl_penalty_loss))

    tf.compat.v2.summary.scalar(
        name='total_abs_loss',
        data=total_abs_loss,
        step=self.train_step_counter)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      for var in all_vars:
        tf.compat.v2.summary.histogram(
            name=var.name.replace(':', '_'),
            data=var,
            step=self.train_step_counter)

  return loss_info
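# The PPO _train above updates the adaptive KL penalty coefficient via
# self.update_adaptive_kl_beta(kl_divergence) after the update epochs. As a
# hedged sketch of the standard adaptive-KL heuristic such a method typically
# implements (the thresholds, factor, and argument names below are
# assumptions, not the agent's exact code):
import tensorflow as tf


def update_adaptive_kl_beta_sketch(kl_beta, kl_divergence, kl_target,
                                   tolerance=1.5, factor=2.0):
  """Grows or shrinks the KL penalty coefficient based on the observed KL.

  Args:
    kl_beta: tf.Variable holding the current KL penalty coefficient.
    kl_divergence: scalar tensor, mean KL between old and new policies.
    kl_target: desired mean KL per update.
    tolerance: how far from the target the KL may drift before adapting.
    factor: multiplicative adaptation step.
  """
  new_beta = tf.where(
      kl_divergence > tolerance * kl_target,
      kl_beta * factor,
      tf.where(kl_divergence < kl_target / tolerance,
               kl_beta / factor,
               kl_beta))
  return kl_beta.assign(new_beta)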
def _train(self, experience, weights, train_step_counter):
  # Change trajectory to transitions.
  trajectory0 = nest.map_structure(lambda t: t[:, :-1], experience)
  trajectory1 = nest.map_structure(lambda t: t[:, 1:], experience)

  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(trajectory0, trajectory1)
  actions = policy_steps_.action
  if self._debug_summaries:
    tf.contrib.summary.histogram('actions', actions)

  action_distribution_parameters = policy_steps_.info

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using
  # the collect policy distribution.
  act_log_probs = common_utils.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

  # Compute the value predictions for states using the current value
  # function. To be used for return & advantage computation.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(
      batch_size=batch_size)

  value_preds, unused_policy_state = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, policy_state=policy_state)
  value_preds = tf.stop_gradient(value_preds)

  valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  # For each epoch, create its own train op that depends on the previous one.
  loss_info = tf.no_op()
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      with tf.control_dependencies(nest.flatten(loss_info)):
        # Only save debug summaries for first and last epochs.
        debug_summaries = (
            self._debug_summaries and
            (i_epoch == 0 or i_epoch == self._num_epochs - 1))

        # Build one epoch train op.
        loss_info = self.build_train_op(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            train_step_counter, self._summarize_grads_and_vars,
            self._gradient_clipping, debug_summaries)

        policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
        value_estimation_losses.append(loss_info.extra.value_estimation_loss)
        l2_regularization_losses.append(
            loss_info.extra.l2_regularization_loss)
        entropy_regularization_losses.append(
            loss_info.extra.entropy_regularization_loss)
        kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  with tf.control_dependencies(nest.flatten(loss_info)):
    # Compute the mean kl from old.
    batch_size = nest_utils.get_outer_shape(time_steps,
                                            self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size)
    kl_divergence = self._kl_divergence(
        time_steps, action_distribution_parameters,
        self._collect_policy.distribution(time_steps, policy_state).action)
    update_adaptive_kl_beta_op = self.update_adaptive_kl_beta(kl_divergence)

  with tf.control_dependencies([update_adaptive_kl_beta_op]):
    if self._observation_normalizer:
      update_obs_norm = (
          self._observation_normalizer.update(
              time_steps.observation, outer_dims=[0, 1]))
    else:
      update_obs_norm = tf.no_op()
    if self._reward_normalizer:
      update_reward_norm = self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])
    else:
      update_reward_norm = tf.no_op()

  with tf.control_dependencies([update_obs_norm, update_reward_norm]):
    loss_info = nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss across all epochs.
  # The *_losses lists will have been populated by calls to
  # self.build_train_op.
  with tf.name_scope('Losses/'):
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
    total_value_estimation_loss = tf.add_n(value_estimation_losses)
    total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses)
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses)

    tf.contrib.summary.scalar('policy_gradient_loss',
                              total_policy_gradient_loss)
    tf.contrib.summary.scalar('value_estimation_loss',
                              total_value_estimation_loss)
    tf.contrib.summary.scalar('l2_regularization_loss',
                              total_l2_regularization_loss)
    if self._entropy_regularization:
      tf.contrib.summary.scalar('entropy_regularization_loss',
                                total_entropy_regularization_loss)
    tf.contrib.summary.scalar('kl_penalty_loss', total_kl_penalty_loss)

    total_abs_loss = (
        tf.abs(total_policy_gradient_loss) +
        tf.abs(total_value_estimation_loss) +
        tf.abs(total_entropy_regularization_loss) +
        tf.abs(total_l2_regularization_loss) +
        tf.abs(total_kl_penalty_loss))

    tf.contrib.summary.scalar('total_abs_loss', total_abs_loss)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      for var in all_vars:
        tf.contrib.summary.histogram(var.name.replace(':', '_'), var)

  return loss_info