def testToTransition(self):
  first = ts.StepType.FIRST
  mid = ts.StepType.MID
  last = ts.StepType.LAST

  # Define a batch size 1, 3-step trajectory.
  traj = trajectory.Trajectory(
      step_type=np.array([[first, mid, last]]),
      next_step_type=np.array([[mid, last, first]]),
      observation=np.array([[10.0, 20.0, 30.0]]),
      action=np.array([[11.0, 22.0, 33.0]]),
      # reward at step 0 is an invalid dummy reward.
      reward=np.array([[0.0, 1.0, 2.0]]),
      discount=np.array([[1.0, 1.0, 0.0]]),
      policy_info=np.array([[1.0, 2.0, 3.0]]))

  time_steps, policy_steps, next_time_steps = trajectory.to_transition(traj)

  self.assertAllEqual(time_steps.step_type, np.array([[first, mid]]))
  self.assertAllEqual(time_steps.observation, np.array([[10.0, 20.0]]))
  self.assertAllEqual(next_time_steps.step_type, np.array([[mid, last]]))
  self.assertAllEqual(next_time_steps.observation, np.array([[20.0, 30.0]]))
  self.assertAllEqual(next_time_steps.reward, np.array([[0.0, 1.0]]))
  self.assertAllEqual(next_time_steps.discount, np.array([[1.0, 1.0]]))
  self.assertAllEqual(policy_steps.action, np.array([[11.0, 22.0]]))
  self.assertAllEqual(policy_steps.info, np.array([[1.0, 2.0]]))
def testToTransitionHandlesTrajectoryFromDriverCorrectly(self):
  env = tf_py_environment.TFPyEnvironment(test_utils.PyEnvironmentMock())
  policy = test_utils.TFPolicyMock(env.time_step_spec(), env.action_spec())
  replay_buffer = test_utils.make_replay_buffer(policy)

  driver = dynamic_episode_driver.DynamicEpisodeDriver(
      env, policy, num_episodes=3, observers=[replay_buffer.add_batch])

  run_driver = driver.run()
  rb_gather_all = replay_buffer.gather_all()

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(run_driver)
  trajectories = self.evaluate(rb_gather_all)

  time_steps, policy_step, next_time_steps = trajectory.to_transition(
      trajectories)

  self.assertAllEqual(time_steps.observation,
                      trajectories.observation[:, :-1])
  self.assertAllEqual(time_steps.step_type, trajectories.step_type[:, :-1])
  self.assertAllEqual(next_time_steps.observation,
                      trajectories.observation[:, 1:])
  self.assertAllEqual(next_time_steps.step_type,
                      trajectories.step_type[:, 1:])
  self.assertAllEqual(next_time_steps.reward, trajectories.reward[:, :-1])
  self.assertAllEqual(next_time_steps.discount, trajectories.discount[:, :-1])
  self.assertAllEqual(policy_step.action, trajectories.action[:, :-1])
  self.assertAllEqual(policy_step.info, trajectories.policy_info[:, :-1])
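For reference, the two tests above pin down the slicing semantics of trajectory.to_transition on batch-major [B, T, ...] trajectories: time steps and policy steps come from index [:, :-1], while next time steps take step_type and observation from [:, 1:] but reward and discount from [:, :-1]. A minimal NumPy-style sketch of that mapping, using a hypothetical dict-based stand-in (the real API operates on Trajectory/TimeStep/PolicyStep structures):

def to_transition_sketch(traj):
  # traj is a dict of [B, T, ...] numpy arrays; illustrative only.
  time_steps = {'step_type': traj['step_type'][:, :-1],
                'observation': traj['observation'][:, :-1]}
  next_time_steps = {'step_type': traj['step_type'][:, 1:],
                     'observation': traj['observation'][:, 1:],
                     # Reward and discount keep the current-step index.
                     'reward': traj['reward'][:, :-1],
                     'discount': traj['discount'][:, :-1]}
  policy_steps = {'action': traj['action'][:, :-1],
                  'info': traj['policy_info'][:, :-1]}
  return time_steps, policy_steps, next_time_steps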
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  # TODO(eholly): Figure out how to properly deal with time dimension.
  time_steps, actions, next_time_steps = nest.map_structure(
      lambda t: tf.squeeze(t, axis=1),
      (time_steps, actions, next_time_steps))
  return time_steps, actions, next_time_steps
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  # Remove time dim if we are not using a recurrent network.
  if not self._actor_network.state_spec:
    transitions = nest.map_structure(lambda x: tf.squeeze(x, [1]),
                                     transitions)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  return time_steps, actions, next_time_steps
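The two _experience_to_transitions variants above differ only in when the time dimension is dropped: the first always squeezes axis 1, the second only when the actor network has no recurrent state (an empty state_spec). As a rough shape check of that squeeze (shapes here are hypothetical), a two-step trajectory yields single-step transitions of shape [batch, 1, ...], which feed-forward networks consume without the singleton time axis:

import tensorflow as tf

t = tf.zeros([4, 1, 3])        # e.g. [batch=4, time=1, obs_dim=3]
flat = tf.squeeze(t, axis=1)   # -> [4, 3], matching tf.squeeze(x, [1]) above
assert flat.shape.as_list() == [4, 3]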
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action
  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='actions', data=actions, step=self.train_step_counter)

  action_distribution_parameters = policy_steps_.info

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using
  # the collect policy distribution.
  act_log_probs = common.log_probability(old_actions_distribution, actions,
                                         self._action_spec)

  # Compute the value predictions for states using the current value
  # function. To be used for return & advantage computation.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size=batch_size)

  value_preds, unused_policy_state = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, policy_state=policy_state)
  value_preds = tf.stop_gradient(value_preds)

  valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  loss_info = None  # TODO(b/123627451): Remove.
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Only save debug summaries for first and last epochs.
      debug_summaries = (
          self._debug_summaries and
          (i_epoch == 0 or i_epoch == self._num_epochs - 1))

      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            self.train_step_counter, debug_summaries)

      variables_to_train = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      grads = tape.gradient(loss_info.loss, variables_to_train)
      grads_and_vars = zip(grads, variables_to_train)
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      # If summarize_gradients, create functions for summarizing both
      # gradients and variables.
      if self._summarize_grads_and_vars and debug_summaries:
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)

      self._optimizer.apply_gradients(
          grads_and_vars, global_step=self.train_step_counter)

      policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
      value_estimation_losses.append(loss_info.extra.value_estimation_loss)
      l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
      entropy_regularization_losses.append(
          loss_info.extra.entropy_regularization_loss)
      kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size)

  # Compute the mean kl from previous action distribution.
  kl_divergence = self._kl_divergence(
      time_steps, action_distribution_parameters,
      self._collect_policy.distribution(time_steps, policy_state).action)
  self.update_adaptive_kl_beta(kl_divergence)

  if self._observation_normalizer:
    self._observation_normalizer.update(
        time_steps.observation, outer_dims=[0, 1])
  else:
    # TODO(b/127661780): Verify performance of reward_normalizer when obs are
    # not normalized.
    if self._reward_normalizer:
      self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])

  loss_info = tf.nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss across all epochs.
  # The *_losses lists will have been populated by calls to
  # self.get_epoch_loss.
  with tf.name_scope('Losses/'):
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
    total_value_estimation_loss = tf.add_n(value_estimation_losses)
    total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses)
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
    tf.compat.v2.summary.scalar(
        name='policy_gradient_loss',
        data=total_policy_gradient_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='value_estimation_loss',
        data=total_value_estimation_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='l2_regularization_loss',
        data=total_l2_regularization_loss,
        step=self.train_step_counter)
    if self._entropy_regularization:
      tf.compat.v2.summary.scalar(
          name='entropy_regularization_loss',
          data=total_entropy_regularization_loss,
          step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='kl_penalty_loss',
        data=total_kl_penalty_loss,
        step=self.train_step_counter)

    total_abs_loss = (
        tf.abs(total_policy_gradient_loss) +
        tf.abs(total_value_estimation_loss) +
        tf.abs(total_entropy_regularization_loss) +
        tf.abs(total_l2_regularization_loss) +
        tf.abs(total_kl_penalty_loss))
    tf.compat.v2.summary.scalar(
        name='total_abs_loss',
        data=total_abs_loss,
        step=self.train_step_counter)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      for var in all_vars:
        tf.compat.v2.summary.histogram(
            name=var.name.replace(':', '_'),
            data=var,
            step=self.train_step_counter)

  return loss_info
def _train(self, experience, weights, train_step_counter):
  # Change trajectory to transitions.
  trajectory0 = nest.map_structure(lambda t: t[:, :-1], experience)
  trajectory1 = nest.map_structure(lambda t: t[:, 1:], experience)

  # Get individual tensors from transitions.
  (time_steps, policy_steps_, next_time_steps) = trajectory.to_transition(
      trajectory0, trajectory1)
  actions = policy_steps_.action
  if self._debug_summaries:
    tf.contrib.summary.histogram('actions', actions)

  action_distribution_parameters = policy_steps_.info

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using
  # the collect policy distribution.
  act_log_probs = common_utils.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

  # Compute the value predictions for states using the current value
  # function. To be used for return & advantage computation.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(
      batch_size=batch_size)

  value_preds, unused_policy_state = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, policy_state=policy_state)
  value_preds = tf.stop_gradient(value_preds)

  valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  # For each epoch, create its own train op that depends on the previous one.
  loss_info = tf.no_op()
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      with tf.control_dependencies(nest.flatten(loss_info)):
        # Only save debug summaries for first and last epochs.
        debug_summaries = (
            self._debug_summaries and
            (i_epoch == 0 or i_epoch == self._num_epochs - 1))

        # Build one epoch train op.
        loss_info = self.build_train_op(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            train_step_counter, self._summarize_grads_and_vars,
            self._gradient_clipping, debug_summaries)

        policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
        value_estimation_losses.append(loss_info.extra.value_estimation_loss)
        l2_regularization_losses.append(
            loss_info.extra.l2_regularization_loss)
        entropy_regularization_losses.append(
            loss_info.extra.entropy_regularization_loss)
        kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  with tf.control_dependencies(nest.flatten(loss_info)):
    # Compute the mean kl from old.
    batch_size = nest_utils.get_outer_shape(time_steps,
                                            self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size)
    kl_divergence = self._kl_divergence(
        time_steps, action_distribution_parameters,
        self._collect_policy.distribution(time_steps, policy_state).action)
    update_adaptive_kl_beta_op = self.update_adaptive_kl_beta(kl_divergence)

  with tf.control_dependencies([update_adaptive_kl_beta_op]):
    if self._observation_normalizer:
      update_obs_norm = (
          self._observation_normalizer.update(
              time_steps.observation, outer_dims=[0, 1]))
    else:
      update_obs_norm = tf.no_op()
    if self._reward_normalizer:
      update_reward_norm = self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])
    else:
      update_reward_norm = tf.no_op()

  with tf.control_dependencies([update_obs_norm, update_reward_norm]):
    loss_info = nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss across all epochs.
  # The *_losses lists will have been populated by calls to
  # self.build_train_op.
  with tf.name_scope('Losses/'):
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
    total_value_estimation_loss = tf.add_n(value_estimation_losses)
    total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses)
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
    tf.contrib.summary.scalar('policy_gradient_loss',
                              total_policy_gradient_loss)
    tf.contrib.summary.scalar('value_estimation_loss',
                              total_value_estimation_loss)
    tf.contrib.summary.scalar('l2_regularization_loss',
                              total_l2_regularization_loss)
    if self._entropy_regularization:
      tf.contrib.summary.scalar('entropy_regularization_loss',
                                total_entropy_regularization_loss)
    tf.contrib.summary.scalar('kl_penalty_loss', total_kl_penalty_loss)

    total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                      tf.abs(total_value_estimation_loss) +
                      tf.abs(total_entropy_regularization_loss) +
                      tf.abs(total_l2_regularization_loss) +
                      tf.abs(total_kl_penalty_loss))

    tf.contrib.summary.scalar('total_abs_loss', total_abs_loss)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (self._actor_net.trainable_weights +
                  self._value_net.trainable_weights)
      for var in all_vars:
        tf.contrib.summary.histogram(var.name.replace(':', '_'), var)

  return loss_info