def _train(self, experience, weights):
  rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self._time_step_spec.observation)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)

  with tf.GradientTape() as tape:
    loss_info = self.loss(
        observations, actions, rewards, weights=weights, training=True)
  tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
  self.compute_summaries(loss_info.loss)
  variables_to_train = self._reward_network.trainable_weights
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  training_lib.apply_gradients(
      self._optimizer, grads_and_vars, global_step=self.train_step_counter)

  return loss_info
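# Illustrative sketch (not part of the agent above): what the
# flatten_multi_batched_nested_tensors calls do. The [batch, time] outer
# dimensions of the experience are collapsed into a single outer dimension
# before the loss is computed. The shapes and spec below are made-up example
# values, not taken from the original code.
import tensorflow as tf
from tf_agents.utils import nest_utils

reward_spec = tf.TensorSpec(shape=(), dtype=tf.float32)
rewards = tf.zeros([4, 2])  # [batch_size=4, time_steps=2]
flat_rewards, outer_dims = nest_utils.flatten_multi_batched_nested_tensors(
    rewards, reward_spec)
# flat_rewards has shape [8]; outer_dims records the flattened [4, 2] shape.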
def _train(self, experience, weights):
  experience = self._as_trajectory(experience)

  with tf.GradientTape() as tape:
    loss_info = self._loss(experience, weights=weights, training=True)

  variables_to_train = self._variables_to_train()
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(grads_and_vars)
  self.train_step_counter.assign_add(1)

  if not self._accepts_per_arm_features and self._num_samples_list:
    # Compute the number of samples for each action in the current batch.
    actions_flattened = tf.reshape(experience.action, [-1])
    num_samples_per_action_current = [
        tf.reduce_sum(tf.cast(tf.equal(actions_flattened, k), tf.int64))
        for k in range(self._num_actions)
    ]
    # Update the number of samples for each action.
    for a, b in zip(self._num_samples_list, num_samples_per_action_current):
      tf.compat.v1.assign_add(a, b)

  return loss_info
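# Hedged sketch (assumption, not shown in the agent above): the per-action
# counters in self._num_samples_list are typically one int64 tf.Variable per
# action, created along these lines. The names and example batch are
# illustrative only.
import tensorflow as tf

num_actions = 3  # example value
num_samples_list = [
    tf.Variable(0, dtype=tf.int64, name='num_samples_{}'.format(k))
    for k in range(num_actions)
]
# With a batch of actions [0, 2, 2, 1], the update loop above would add
# 1, 1, and 2 to the counters for actions 0, 1, and 2 respectively.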
def _train(self, experience, weights):
  (observations, actions,
   rewards) = bandit_utils.process_experience_for_neural_agents(
       experience, self._accepts_per_arm_features, self.training_data_spec)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)

  with tf.GradientTape() as tape:
    loss_info = self.loss(
        observations, actions, rewards, weights=weights, training=True)

  variables_to_train = self._variables_to_train()
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(grads_and_vars)
  self.train_step_counter.assign_add(1)

  return loss_info
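# Hedged sketch (assumption): _variables_to_train is not defined in this
# snippet. For reward-prediction bandit agents it usually just returns the
# reward network's trainable weights, which is what the gradient tape above
# differentiates against. The attribute name below is an assumption.
def _variables_to_train(self):
  return self._reward_network.trainable_weights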
def _train(self, experience, weights):
  time_steps, actions, next_time_steps = self._experience_to_transitions(
      experience)

  with tf.GradientTape() as tape:
    loss_info = self.loss(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights)
  tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
  variables_to_train = self._q_network.trainable_weights
  assert list(variables_to_train), "No variables in the agent's q_network."
  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(grads_and_vars,
                                  global_step=self.train_step_counter)

  self._update_target()

  return loss_info
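# Hedged sketch (assumption): _update_target is not defined in this snippet.
# In DQN-style agents it typically copies (or Polyak-averages) the online
# Q-network weights into a target network, e.g. via
# tf_agents.utils.common.soft_variables_update. The _target_q_network
# attribute name and the tau default are assumptions.
from tf_agents.utils import common

def _update_target(self, tau=1.0):
  return common.soft_variables_update(
      self._q_network.variables, self._target_q_network.variables, tau=tau)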
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action

  if self._debug_summaries:
    actions_list = tf.nest.flatten(actions)
    show_action_index = len(actions_list) != 1
    for i, single_action in enumerate(actions_list):
      action_name = ('actions_{}'.format(i)
                     if show_action_index else 'actions')
      tf.compat.v2.summary.histogram(
          name=action_name, data=single_action, step=self.train_step_counter)

  action_distribution_parameters = policy_steps_.info

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using the
  # collect policy distribution.
  act_log_probs = common.log_probability(old_actions_distribution, actions,
                                         self._action_spec)

  # Compute the value predictions for states using the current value function.
  # To be used for return & advantage computation.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size=batch_size)

  value_preds, unused_policy_state = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, policy_state=policy_state)
  value_preds = tf.stop_gradient(value_preds)

  valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  loss_info = None  # TODO(b/123627451): Remove.
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Only save debug summaries for first and last epochs.
      debug_summaries = (
          self._debug_summaries and
          (i_epoch == 0 or i_epoch == self._num_epochs - 1))

      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            self.train_step_counter, debug_summaries)

      variables_to_train = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      grads = tape.gradient(loss_info.loss, variables_to_train)
      # Tuple is used for py3, where zip is a generator producing values once.
      grads_and_vars = tuple(zip(grads, variables_to_train))
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      # If summarize_gradients, create functions for summarizing both
      # gradients and variables.
      if self._summarize_grads_and_vars and debug_summaries:
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)

      self._optimizer.apply_gradients(
          grads_and_vars, global_step=self.train_step_counter)

      policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
      value_estimation_losses.append(loss_info.extra.value_estimation_loss)
      l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
      entropy_regularization_losses.append(
          loss_info.extra.entropy_regularization_loss)
      kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size)
  # Compute the mean kl from previous action distribution.
  kl_divergence = self._kl_divergence(
      time_steps, action_distribution_parameters,
      self._collect_policy.distribution(time_steps, policy_state).action)
  self.update_adaptive_kl_beta(kl_divergence)

  if self._observation_normalizer:
    self._observation_normalizer.update(
        time_steps.observation, outer_dims=[0, 1])
  else:
    # TODO(b/127661780): Verify performance of reward_normalizer when obs are
    # not normalized
    if self._reward_normalizer:
      self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])

  loss_info = tf.nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss across all epochs.
  # The *_losses lists will have been populated by
  # calls to self.get_epoch_loss.
  with tf.name_scope('Losses/'):
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
    total_value_estimation_loss = tf.add_n(value_estimation_losses)
    total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses)
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
    tf.compat.v2.summary.scalar(
        name='policy_gradient_loss',
        data=total_policy_gradient_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='value_estimation_loss',
        data=total_value_estimation_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='l2_regularization_loss',
        data=total_l2_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='entropy_regularization_loss',
        data=total_entropy_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='kl_penalty_loss',
        data=total_kl_penalty_loss,
        step=self.train_step_counter)

    total_abs_loss = (
        tf.abs(total_policy_gradient_loss) +
        tf.abs(total_value_estimation_loss) +
        tf.abs(total_entropy_regularization_loss) +
        tf.abs(total_l2_regularization_loss) +
        tf.abs(total_kl_penalty_loss))

    tf.compat.v2.summary.scalar(
        name='total_abs_loss',
        data=total_abs_loss,
        step=self.train_step_counter)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      for var in all_vars:
        tf.compat.v2.summary.histogram(
            name=var.name.replace(':', '_'),
            data=var,
            step=self.train_step_counter)

  return loss_info
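# Hedged sketch (assumption): update_adaptive_kl_beta is not shown here. A
# common rule, following the adaptive-KL schedule from the PPO paper, grows
# the penalty coefficient when the measured KL exceeds its target and shrinks
# it when the KL falls well below. The 1.5/2.0 constants are illustrative;
# the agent's own implementation may use different thresholds.
def adaptive_kl_beta_update(kl_beta, kl_divergence, kl_target):
  if kl_divergence > 1.5 * kl_target:
    return kl_beta * 2.0
  elif kl_divergence < kl_target / 1.5:
    return kl_beta / 2.0
  return kl_beta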
def _create_summaries(grads_and_vars):
  eager_utils.add_gradients_summaries(grads_and_vars)
  eager_utils.add_variables_summaries(grads_and_vars)
  grads_and_vars = clip_gradients(grads_and_vars)
  return grads_and_vars
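# Hedged sketch (assumption): clip_gradients is not defined in this snippet;
# a typical implementation clips each gradient's norm, similar in spirit to
# eager_utils.clip_gradient_norms. The max_norm default is illustrative.
import tensorflow as tf

def clip_gradients(grads_and_vars, max_norm=1.0):
  # Clip every gradient tensor to max_norm, leaving None gradients untouched.
  return [(tf.clip_by_norm(g, max_norm) if g is not None else None, v)
          for g, v in grads_and_vars]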
def _train(self, experience, weights):
  # Q-network training
  with tf.GradientTape() as tape:
    loss_info_q = self._loss(
        experience,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights,
        training=True)
  tf.debugging.check_numerics(loss_info_q.loss, 'Loss is inf or nan')

  variables_to_train = self._q_network.trainable_weights
  non_trainable_weights = self._q_network.non_trainable_weights
  assert list(variables_to_train), "No variables in the agent's q_network."
  grads = tape.gradient(loss_info_q.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = list(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    grads_and_vars_with_non_trainable = (
        grads_and_vars + [(None, v) for v in non_trainable_weights])
    eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)
  self._optimizer.apply_gradients(grads_and_vars)

  # H-network training
  with tf.GradientTape() as tape:
    loss_info_h = self._loss_h(
        experience,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights,
        training=True)
  tf.debugging.check_numerics(loss_info_h.loss, 'Loss is inf or nan')

  variables_to_train_h = self._h_network.trainable_weights
  non_trainable_weights_h = self._h_network.non_trainable_weights
  assert list(variables_to_train_h), "No variables in the agent's h_network."
  grads_h = tape.gradient(loss_info_h.loss, variables_to_train_h)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars_h = list(zip(grads_h, variables_to_train_h))
  if self._gradient_clipping is not None:
    grads_and_vars_h = eager_utils.clip_gradient_norms(
        grads_and_vars_h, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    grads_and_vars_with_non_trainable_h = (
        grads_and_vars_h + [(None, v) for v in non_trainable_weights_h])
    eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable_h,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars_h,
                                        self.train_step_counter)
  self._optimizer.apply_gradients(grads_and_vars_h)

  self.train_step_counter.assign_add(1)
  self._update_target()
  return loss_info_q
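# Hedged alternative sketch (not the agent's implementation): the two losses
# above could also be computed under a single persistent GradientTape and
# differentiated separately. Function and argument names are illustrative.
import tensorflow as tf

def train_two_heads(q_loss_fn, h_loss_fn, q_vars, h_vars, optimizer):
  with tf.GradientTape(persistent=True) as tape:
    q_loss = q_loss_fn()
    h_loss = h_loss_fn()
  optimizer.apply_gradients(zip(tape.gradient(q_loss, q_vars), q_vars))
  optimizer.apply_gradients(zip(tape.gradient(h_loss, h_vars), h_vars))
  del tape  # Release the persistent tape's resources.
  return q_loss, h_loss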