def _train(self, experience, weights):
    (observations, actions,
     rewards) = bandit_utils.process_experience_for_neural_agents(
         experience, self._observation_and_action_constraint_splitter,
         self._accepts_per_arm_features, self.training_data_spec)
    with tf.GradientTape() as tape:
        loss_info = self.loss(
            observations, actions, rewards, weights=weights, training=True)

    variables_to_train = self._variables_to_train()
    if not variables_to_train:
        logging.info('No variable to train in the agent.')
        return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    return loss_info

def _train(self, experience, weights):
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)
    with tf.GradientTape() as tape:
        loss_info = self.loss(
            time_steps,
            actions,
            next_time_steps,
            td_errors_loss_fn=self._td_errors_loss_fn,
            gamma=self._gamma,
            reward_scale_factor=self._reward_scale_factor,
            weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    self._update_target()

    return loss_info

def train_complete(self,
                   tape: tf.GradientTape,
                   training_info: TrainingInfo,
                   weight=1.0):
    """Complete one iteration of training.

    `train_complete` should calculate gradients and update parameters using
    those gradients.

    Args:
        tape (tf.GradientTape): the tape which is used for calculating
            gradients. All of the previous `train_interval` calls to
            `train_step()` are made under the context of this tape.
        training_info (TrainingInfo): information collected for training.
            `training_info.info` is batched from each `policy_step.info`
            returned by `train_step()`.
        weight (float): weight for this batch. Loss will be multiplied with
            this weight before calculating gradients.
    Returns:
        a tuple of the following:
            loss_info (LossInfo): loss information
            grads_and_vars (list[tuple]): list of gradient and variable tuples
    """
    valid_masks = tf.cast(
        tf.not_equal(training_info.step_type, StepType.LAST), tf.float32)

    # Reward shaping.
    if self._reward_shaping_fn is not None:
        # Record unshaped extrinsic rewards given by the environment.
        self.add_reward_summary("reward/raw", training_info.reward)
        training_info = training_info._replace(
            reward=self._reward_shaping_fn(training_info.reward))

    # Record shaped extrinsic rewards actually used for training.
    self.add_reward_summary("reward/extrinsic", training_info.reward)

    with tape:
        loss_info = self.calc_loss(training_info)
        loss_info = tf.nest.map_structure(
            lambda l: tf.reduce_mean(l * valid_masks), loss_info)
        loss = weight * loss_info.loss

    var_sets = self._get_cached_var_sets()
    all_grads_and_vars = ()
    for vars, optimizer in zip(var_sets, self._optimizers):
        grads = tape.gradient(loss, vars)
        grads_and_vars = tuple(zip(grads, vars))
        all_grads_and_vars = all_grads_and_vars + grads_and_vars
        if self._gradient_clipping is not None:
            if self._clip_by_global_norm:
                grads, _ = tf.clip_by_global_norm(grads,
                                                  self._gradient_clipping)
                grads_and_vars = tuple(zip(grads, vars))
            else:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)
        optimizer.apply_gradients(grads_and_vars)

    return loss_info, all_grads_and_vars

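# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the multi-optimizer update pattern used
# by `train_complete` above: one loss, several (variables, optimizer) sets,
# optional clipping by global norm. `apply_multi_optimizer_update`, `loss_fn`,
# `var_sets`, and the toy variables below are hypothetical stand-ins for
# illustration only; they are not part of the agent or of TF-Agents/ALF.
# ---------------------------------------------------------------------------
import tensorflow as tf


def apply_multi_optimizer_update(loss_fn, var_sets, optimizers, clip_norm=None):
    # A persistent tape is required because tape.gradient() is called once per
    # variable set.
    with tf.GradientTape(persistent=True) as tape:
        loss = loss_fn()
    all_grads_and_vars = []
    for variables, optimizer in zip(var_sets, optimizers):
        grads = tape.gradient(loss, variables)
        if clip_norm is not None:
            # Clip this set's gradients jointly by their global norm.
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)
        grads_and_vars = list(zip(grads, variables))
        all_grads_and_vars.extend(grads_and_vars)
        optimizer.apply_gradients(grads_and_vars)
    del tape  # Release the persistent tape.
    return loss, all_grads_and_vars


# Usage sketch: two variable sets, two SGD optimizers, one shared quadratic loss.
actor_vars = [tf.Variable([1.0, 2.0])]
critic_vars = [tf.Variable([3.0])]
loss, _ = apply_multi_optimizer_update(
    lambda: (tf.reduce_sum(tf.square(actor_vars[0])) +
             tf.reduce_sum(tf.square(critic_vars[0]))),
    [actor_vars, critic_vars],
    [tf.keras.optimizers.SGD(0.1), tf.keras.optimizers.SGD(0.1)],
    clip_norm=1.0)
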
def _train(self, experience, weights):
    with tf.GradientTape() as tape:
        loss_info = self._loss(
            experience,
            td_errors_loss_fn=self._td_errors_loss_fn,
            gamma=self._gamma,
            reward_scale_factor=self._reward_scale_factor,
            weights=weights,
            training=True)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    non_trainable_weights = self._q_network.non_trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = list(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        grads_and_vars_with_non_trainable = (
            grads_and_vars + [(None, v) for v in non_trainable_weights])
        eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    self._update_target()

    return loss_info

def _apply_loss(self, aggregated_losses, variables_to_train, tape, optimizer):
    total_loss = aggregated_losses.total_loss
    tf.debugging.check_numerics(total_loss, "Loss is inf or nan")
    assert list(variables_to_train), "No variables in the agent's network."
    grads = tape.gradient(total_loss, variables_to_train)
    grads_and_vars = list(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self.summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
    optimizer.apply_gradients(grads_and_vars)

    if self.summaries_enabled:
        dict_losses = {
            "loss": aggregated_losses.weighted,
            "reg_loss": aggregated_losses.regularization,
            "total_loss": total_loss,
        }
        common.summarize_scalar_dict(
            dict_losses, step=self.train_step_counter, name_scope="Losses/")

def _train_v1(self, experience, weights):
    with tf.GradientTape() as tape:
        loss_info = self._loss(
            experience,
            td_errors_loss_fn=self._td_errors_loss_fn,
            gamma=self._gamma,
            reward_scale_factor=self._reward_scale_factor,
            weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    train_op = self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    update_op = self._update_target()
    train_op = tf.group(train_op, update_op)

    return train_op, loss_info

def _train(self, experience: types.NestedTensor,
           weights: types.Tensor) -> tf_agent.LossInfo:
    experience = self._as_trajectory(experience)

    with tf.GradientTape() as tape:
        loss_info = self._loss(experience, weights=weights, training=True)

    variables_to_train = self._variables_to_train()
    if not variables_to_train:
        logging.info('No variable to train in the agent.')
        return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    return loss_info

def _train(self, experience, weights=None):
    # TODO(b/126593927): Support batch dimensions >1.
    if experience.step_type.shape[0] != 1:
        raise NotImplementedError('ReinforceAgent does not yet support batch '
                                  'dimensions greater than 1.')
    experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
    returns = common.compute_returns(experience.reward, experience.discount)
    if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='rewards', data=experience.reward,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='discounts',
            data=experience.discount,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='returns', data=returns, step=self.train_step_counter)

    # TODO(b/126592060): replace with tensor normalizer.
    if self._normalize_returns:
        ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
        returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(
                name='normalized_returns',
                data=returns,
                step=self.train_step_counter)

    time_step = ts.TimeStep(experience.step_type,
                            tf.zeros_like(experience.reward),
                            tf.zeros_like(experience.discount),
                            experience.observation)

    variables_to_train = self._actor_network.variables
    with tf.GradientTape() as tape:
        loss_info = self._loss(
            time_step,
            experience.action,
            tf.stop_gradient(returns),
            weights=weights)
        tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    grads = tape.gradient(loss_info.loss, variables_to_train)

    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    return tf.nest.map_structure(tf.identity, loss_info)

def train(self, x0, a0, y0, y1, r0, r1, vars_to_train):
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(vars_to_train)
        feat_x0, _ = self.forward_enc(x0, training=True)
        if self.action_condition:
            e_zx0_param, _ = self.forward_head([feat_x0, a0], training=True)
        else:
            e_zx0_param, _ = self.forward_head(feat_x0, training=True)
        e_zx0_loc, e_zx0_scale = e_zx0_param
        e_zx0 = tfd.MultivariateNormalDiag(
            loc=e_zx0_loc, scale_diag=e_zx0_scale)
        zx0 = e_zx0.sample()

        feat_y0, _ = self.backward_enc(y0, training=self.learn_backward_enc)
        if not self.learn_backward_enc:
            feat_y0 = tf.stop_gradient(feat_y0)
        if self.backward_encode_rewards:
            b_zy0_param, _ = self.backward_head([feat_y0, r0], training=True)
        else:
            b_zy0_param, _ = self.backward_head(feat_y0, training=True)
        b_zy0_loc, b_zy0_scale = b_zy0_param
        b_zy0 = tfd.MultivariateNormalDiag(
            loc=b_zy0_loc, scale_diag=b_zy0_scale)

        b_zy1 = None
        if self.ceb.smooth_mode is not None:
            feat_y1, _ = self.backward_enc(
                y1, training=self.learn_backward_enc)
            if not self.learn_backward_enc:
                feat_y1 = tf.stop_gradient(feat_y1)
            if self.backward_encode_rewards:
                b_zy1_param, _ = self.backward_head([feat_y1, r1],
                                                    training=True)
            else:
                b_zy1_param, _ = self.backward_head(feat_y1, training=True)
            b_zy1_loc, b_zy1_scale = b_zy1_param
            b_zy1 = tfd.MultivariateNormalDiag(
                loc=b_zy1_loc, scale_diag=b_zy1_scale)

        if self.y_decoders is None:
            # Pure contrastive CEB.
            loss = self.ceb.loss(zx0, e_zx0, b_zy0, b_zy1)
        else:
            # CEB with generative objectives.
            # y_targets0 = [y0, r0]
            y_targets0 = [tf.cast(y0, tf.float32) / 255.0, r0]
            y_preds0 = self.y_decoders(zx0, training=True)
            loss = self.ceb.loss(zx0, e_zx0, b_zy0, b_zy1, y_preds0,
                                 y_targets0)

    grads = tape.gradient(loss, vars_to_train)
    grads_and_vars = tuple(zip(grads, vars_to_train))
    if self.grad_clip is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self.grad_clip)
    self.optimizer.apply_gradients(grads_and_vars)
    return loss, feat_x0, zx0

def compute_loss_using_reward_layer(
        self,
        observation: types.NestedTensor,
        action: types.Tensor,
        reward: types.Tensor,
        weights: Optional[types.Float] = None,
        training: bool = False) -> tf_agent.LossInfo:
    """Computes loss using the reward layer.

    Args:
        observation: A batch of observations.
        action: A batch of actions.
        reward: A batch of rewards.
        weights: Optional scalar or elementwise (per-batch-entry) importance
            weights. The output batch loss will be scaled by these weights,
            and the final scalar loss is the mean of these values.
        training: Whether the loss is being used for training.

    Returns:
        loss: A `LossInfo` containing the loss for the training step.
    """
    # Update the neural network params.
    with tf.GradientTape() as tape:
        loss_info = self._loss_using_reward_layer(
            observation, action, reward, weights, training=training)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    tf.compat.v2.summary.scalar(
        name='using_reward_layer', data=1, step=self.train_step_counter)
    if self._summarize_grads_and_vars:
        self.compute_summaries(loss_info.loss)

    variables_to_train = (self._encoding_network.trainable_weights +
                          self._reward_layer.trainable_weights)
    if not variables_to_train:
        raise ValueError('No variable to train in the agent.')

    grads = tape.gradient(loss_info.loss, variables_to_train)
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        with tf.name_scope('Reward_network/'):
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    return loss_info

def _train(self, experience, weights=None):
    # Add a mask to ensure we reset the return calculation at episode
    # boundaries. This is needed in cases where episodes are truncated before
    # reaching a terminal state.
    non_last_mask = tf.cast(
        tf.math.not_equal(experience.next_step_type, ts.StepType.LAST),
        tf.float32)
    discounts = non_last_mask * experience.discount * self._gamma
    returns = value_ops.discounted_return(
        experience.reward, discounts, time_major=False)

    if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='rewards', data=experience.reward,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='discounts',
            data=experience.discount,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='returns', data=returns, step=self.train_step_counter)

    time_step = ts.TimeStep(experience.step_type,
                            tf.zeros_like(experience.reward),
                            tf.zeros_like(experience.discount),
                            experience.observation)

    with tf.GradientTape() as tape:
        loss_info = self.total_loss(
            time_step,
            experience.action,
            tf.stop_gradient(returns),
            weights=weights)
        tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    variables_to_train = self._actor_network.trainable_weights
    if self._baseline:
        variables_to_train += self._value_network.trainable_weights
    grads = tape.gradient(loss_info.loss, variables_to_train)

    grads_and_vars = list(zip(grads, variables_to_train))
    if self._gradient_clipping:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    return tf.nest.map_structure(tf.identity, loss_info)

def testClipGrads(self):
    xs = tf.Variable(0.0)
    grads = tf.constant(4.0)
    gradients_to_variables = [(grads, xs)]
    clipped_gradients_to_variables = eager_utils.clip_gradient_norms(
        gradients_to_variables, 3.0)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAlmostEqual(4.0, self.evaluate(gradients_to_variables[0][0]))
    self.assertAlmostEqual(
        3.0, self.evaluate(clipped_gradients_to_variables[0][0]))

def _train(self, experience, weights=None):
    returns = value_ops.discounted_return(
        experience.reward, experience.discount, time_major=False)

    if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='rewards', data=experience.reward,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='discounts',
            data=experience.discount,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='returns', data=returns, step=self.train_step_counter)

    # TODO(b/126592060): replace with tensor normalizer.
    if self._normalize_returns:
        returns = _standard_normalize(returns, axes=(0, 1))
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(
                name='normalized_returns',
                data=returns,
                step=self.train_step_counter)

    time_step = ts.TimeStep(experience.step_type,
                            tf.zeros_like(experience.reward),
                            tf.zeros_like(experience.discount),
                            experience.observation)

    variables_to_train = self._actor_network.variables
    with tf.GradientTape() as tape:
        loss_info = self._loss(
            time_step,
            experience.action,
            tf.stop_gradient(returns),
            weights=weights)
        tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    grads = tape.gradient(loss_info.loss, variables_to_train)

    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    return tf.nest.map_structure(tf.identity, loss_info)

def _train(self, experience, weights):
    rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.reward, self._time_step_spec.reward)
    actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.action, self._action_spec)
    observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.observation, self.training_data_spec.observation)
    if self._observation_and_action_constraint_splitter is not None:
        observations, _ = self._observation_and_action_constraint_splitter(
            observations)
    if self._accepts_per_arm_features:
        # The arm observation we train on needs to be copied from the
        # respective policy info field to the per-arm observation field.
        # Pretending there was only one action, we fill the action field with
        # zeros.
        chosen_action, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.policy_info.chosen_arm_features,
            self.policy.info_spec.chosen_arm_features)
        observations[bandit_spec_utils.PER_ARM_FEATURE_KEY] = tf.expand_dims(
            chosen_action, axis=1)
        actions = tf.zeros_like(actions)

    with tf.GradientTape() as tape:
        loss_info = self.loss(
            observations, actions, rewards, weights=weights, training=True)

    self.compute_summaries(loss_info.loss)
    variables_to_train = self._reward_network.trainable_weights
    if not variables_to_train:
        logging.info('No variable to train in the agent.')
        return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    training_lib.apply_gradients(
        self._optimizer, grads_and_vars, global_step=self.train_step_counter)

    return loss_info

def _apply_gradients(self, gradients, variables, optimizer):
    grads_and_vars = list(zip(gradients, variables))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    optimizer.apply_gradients(grads_and_vars)

def _train(self, experience, weights):
    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(experience)
    # observations = time_steps.observation
    actions = policy_steps_.action
    rewards = next_time_steps.reward
    print(rewards)
    discounts = next_time_steps.discount

    if self._reward_normalizer:
        rewards = self._reward_normalizer.normalize(
            rewards, center_mean=False, clip_value=self._reward_norm_clipping)

    value_preds = self.double_batch_pred(
        self._mod_net, experience.observation, is_training=True)
    # print("VPRED", value_preds.shape, value_preds_2.shape)

    returns = self.compute_return(next_time_steps, value_preds)

    value_estimation_losses = []
    loss_info = None
    # For each epoch, create its own train op that depends on the previous one.
    for i_epoch in range(self._num_epochs):
        with tf.name_scope('epoch_%d' % i_epoch):
            # Build one epoch train op.
            with tf.GradientTape() as tape:
                loss_info = self.get_epoch_loss(
                    time_steps, returns,
                    weights)  # action_distribution_parameters

            variables_to_train = self._mod_net.trainable_weights
            grads = tape.gradient(loss_info.loss, variables_to_train)
            # Tuple is used for py3, where zip is a generator producing values
            # once.
            grads_and_vars = tuple(zip(grads, variables_to_train))
            if self._gradient_clipping > 0:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)

            self._optimizer.apply_gradients(
                grads_and_vars)  # , global_step=self.train_step_counter
            value_estimation_losses.append(
                loss_info.extra.value_estimation_loss)

    loss_info = tf.nest.map_structure(tf.identity, loss_info)
    return loss_info

def testClipGradsIndexedSlices(self):
    xs = tf.Variable(0.0)
    grads = tf.IndexedSlices(
        values=tf.constant(4.0), indices=tf.constant([0]), dense_shape=None)
    gradients_to_variables = [(grads, xs)]
    clipped_gradients_to_variables = eager_utils.clip_gradient_norms(
        gradients_to_variables, 3.0)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAlmostEqual(
        4.0, self.evaluate(gradients_to_variables[0][0].values))
    self.assertAlmostEqual(
        3.0, self.evaluate(clipped_gradients_to_variables[0][0].values))

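# ---------------------------------------------------------------------------
# The two tests above pin down the behaviour these agents rely on:
# `eager_utils.clip_gradient_norms` clips every gradient to the given norm
# independently (per tensor, not a global norm across all gradients) and
# returns new pairs, leaving the inputs untouched. Below is a standalone
# sketch of that contract for dense gradients using only `tf.clip_by_norm`;
# `clip_each_gradient_norm` is a hypothetical stand-in, not the TF-Agents
# utility itself.
# ---------------------------------------------------------------------------
import tensorflow as tf


def clip_each_gradient_norm(grads_and_vars, max_norm):
    clipped = []
    for grad, var in grads_and_vars:
        if grad is not None:
            grad = tf.clip_by_norm(grad, max_norm)
        clipped.append((grad, var))
    return clipped


x = tf.Variable(0.0)
g = tf.constant(4.0)
(clipped_g, _), = clip_each_gradient_norm([(g, x)], 3.0)
print(float(g), float(clipped_g))  # 4.0 3.0 -- the original gradient is unchanged.
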
def _apply_gradients(self, gradients, variables, optimizer):
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(gradients, variables))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    return optimizer.apply_gradients(grads_and_vars)

def _train(self, experience, weights=None):
    # TODO(b/132914246): Use .is_last() to mask the end of each episode.
    returns = value_ops.discounted_return(
        experience.reward, experience.discount * self._gamma,
        time_major=False)

    if self._debug_summaries:
        tf.compat.v2.summary.histogram(
            name='rewards', data=experience.reward,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='discounts',
            data=experience.discount,
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(
            name='returns', data=returns, step=self.train_step_counter)

    time_step = ts.TimeStep(experience.step_type,
                            tf.zeros_like(experience.reward),
                            tf.zeros_like(experience.discount),
                            experience.observation)

    with tf.GradientTape() as tape:
        loss_info = self.total_loss(
            time_step,
            experience.action,
            tf.stop_gradient(returns),
            weights=weights)
        tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    variables_to_train = self._actor_network.trainable_weights
    if self._baseline:
        variables_to_train += self._value_network.trainable_weights
    grads = tape.gradient(loss_info.loss, variables_to_train)

    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    return tf.nest.map_structure(tf.identity, loss_info)

def _train(self, experience: types.NestedTensor,
           weights: types.Tensor) -> tf_agent.LossInfo:
    (observations, actions,
     objective_values) = bandit_utils.process_experience_for_neural_agents(
         experience, self._accepts_per_arm_features, self.training_data_spec)
    if self._observation_and_action_constraint_splitter is not None:
        observations, _ = self._observation_and_action_constraint_splitter(
            observations)
    if objective_values.shape.rank != 2:
        raise ValueError(
            'The objectives tensor should be rank-2 [batch_size, num_objectives],'
            ' but found to be rank-{}'.format(objective_values.shape.rank))
    if objective_values.shape[1] != self._num_objectives:
        raise ValueError(
            'The number of objectives in the objective_values tensor: {} '
            'is different from the number of objective networks: {}.'.format(
                objective_values.shape[1], self._num_objectives))

    with tf.GradientTape() as tape:
        loss_info = self.loss(
            observations,
            actions,
            objective_values,
            weights=weights,
            training=True)

    variables_to_train = self._variables_to_train()
    if not variables_to_train:
        logging.info('No variable to train in the agent.')
        return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    return loss_info

def _train(self, experience, weights):
    rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.reward, self._time_step_spec.reward)
    actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.action, self._action_spec)
    observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.observation, self._time_step_spec.observation)
    if self._observation_and_action_constraint_splitter is not None:
        observations, _ = self._observation_and_action_constraint_splitter(
            observations)

    with tf.GradientTape() as tape:
        loss_info = self.loss(
            observations, actions, rewards, weights=weights, training=True)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    self.compute_summaries(loss_info.loss)
    variables_to_train = self._reward_network.trainable_weights
    if not variables_to_train:
        logging.info('No variable to train in the agent.')
        return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    training_lib.apply_gradients(
        self._optimizer, grads_and_vars, global_step=self.train_step_counter)

    return loss_info

def _update_values(self, time_steps, returns, weights):
    """Update value function estimate by performing gradient descent on value loss."""
    variables_to_train = self._value_net.trainable_weights
    value_loss = 0.0
    for _ in range(self._value_train_iters):
        with tf.GradientTape() as tape:
            value_loss = self.value_estimation_loss(
                time_steps, returns, weights)
        grads = tape.gradient(value_loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values
        # once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping > 0:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)
        self._optimizer.apply_gradients(
            grads_and_vars, global_step=self.train_step_counter)

    return value_loss

def _train(self, experience, weights):
    experience = self._as_trajectory(experience)

    with tf.GradientTape() as tape:
        loss_info = self._loss(experience, weights=weights, training=True)

    variables_to_train = self._variables_to_train()
    if not variables_to_train:
        logging.info('No variable to train in the agent.')
        return loss_info

    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

    if self._summarize_grads_and_vars:
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    if not self._accepts_per_arm_features and self._num_samples_list:
        # Compute the number of samples for each action in the current batch.
        actions_flattened = tf.reshape(experience.action, [-1])
        num_samples_per_action_current = [
            tf.reduce_sum(tf.cast(tf.equal(actions_flattened, k), tf.int64))
            for k in range(self._num_actions)
        ]
        # Update the number of samples for each action.
        for a, b in zip(self._num_samples_list,
                        num_samples_per_action_current):
            tf.compat.v1.assign_add(a, b)

    return loss_info

def _train(self, experience, weights):
    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(experience)
    actions = policy_steps_.action

    if self._debug_summaries:
        actions_list = tf.nest.flatten(actions)
        show_action_index = len(actions_list) != 1
        for i, single_action in enumerate(actions_list):
            action_name = ('actions_{}'.format(i)
                           if show_action_index else 'actions')
            tf.compat.v2.summary.histogram(
                name=action_name,
                data=single_action,
                step=self.train_step_counter)

    action_distribution_parameters = policy_steps_.info

    # Reconstruct per-timestep policy distribution from stored distribution
    # parameters.
    old_actions_distribution = (
        distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec, action_distribution_parameters))

    # Compute log probability of actions taken during data collection, using
    # the collect policy distribution.
    act_log_probs = common.log_probability(old_actions_distribution, actions,
                                           self._action_spec)

    # Compute the value predictions for states using the current value
    # function. To be used for return & advantage computation.
    batch_size = nest_utils.get_outer_shape(time_steps,
                                            self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(
        batch_size=batch_size)

    value_preds, unused_policy_state = self._collect_policy.apply_value_network(
        experience.observation, experience.step_type,
        policy_state=policy_state)
    value_preds = tf.stop_gradient(value_preds)

    valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

    if weights is None:
        weights = valid_mask
    else:
        weights *= valid_mask

    returns, normalized_advantages = self.compute_return_and_advantage(
        next_time_steps, value_preds)

    # Loss tensors across batches will be aggregated for summaries.
    policy_gradient_losses = []
    value_estimation_losses = []
    l2_regularization_losses = []
    entropy_regularization_losses = []
    kl_penalty_losses = []

    loss_info = None  # TODO(b/123627451): Remove.
    # For each epoch, create its own train op that depends on the previous one.
    for i_epoch in range(self._num_epochs):
        with tf.name_scope('epoch_%d' % i_epoch):
            # Only save debug summaries for first and last epochs.
            debug_summaries = (
                self._debug_summaries and
                (i_epoch == 0 or i_epoch == self._num_epochs - 1))

            # Build one epoch train op.
            with tf.GradientTape() as tape:
                loss_info = self.get_epoch_loss(
                    time_steps, actions, act_log_probs, returns,
                    normalized_advantages, action_distribution_parameters,
                    weights, self.train_step_counter, debug_summaries)

            variables_to_train = (self._actor_net.trainable_weights +
                                  self._value_net.trainable_weights)
            grads = tape.gradient(loss_info.loss, variables_to_train)
            # Tuple is used for py3, where zip is a generator producing values
            # once.
            grads_and_vars = tuple(zip(grads, variables_to_train))
            if self._gradient_clipping > 0:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)

            # If summarize_gradients, create functions for summarizing both
            # gradients and variables.
            if self._summarize_grads_and_vars and debug_summaries:
                eager_utils.add_gradients_summaries(grads_and_vars,
                                                    self.train_step_counter)
                eager_utils.add_variables_summaries(grads_and_vars,
                                                    self.train_step_counter)

            self._optimizer.apply_gradients(
                grads_and_vars, global_step=self.train_step_counter)

            policy_gradient_losses.append(
                loss_info.extra.policy_gradient_loss)
            value_estimation_losses.append(
                loss_info.extra.value_estimation_loss)
            l2_regularization_losses.append(
                loss_info.extra.l2_regularization_loss)
            entropy_regularization_losses.append(
                loss_info.extra.entropy_regularization_loss)
            kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

    # After update epochs, update adaptive kl beta, then update observation
    # normalizer and reward normalizer.
    batch_size = nest_utils.get_outer_shape(time_steps,
                                            self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size)
    # Compute the mean kl from previous action distribution.
    kl_divergence = self._kl_divergence(
        time_steps, action_distribution_parameters,
        self._collect_policy.distribution(time_steps, policy_state).action)
    self.update_adaptive_kl_beta(kl_divergence)

    if self._observation_normalizer:
        self._observation_normalizer.update(
            time_steps.observation, outer_dims=[0, 1])
    else:
        # TODO(b/127661780): Verify performance of reward_normalizer when obs
        # are not normalized
        if self._reward_normalizer:
            self._reward_normalizer.update(
                next_time_steps.reward, outer_dims=[0, 1])

    loss_info = tf.nest.map_structure(tf.identity, loss_info)

    # Make summaries for total loss across all epochs.
    # The *_losses lists will have been populated by calls to
    # self.get_epoch_loss.
    with tf.name_scope('Losses/'):
        total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
        total_value_estimation_loss = tf.add_n(value_estimation_losses)
        total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
        total_entropy_regularization_loss = tf.add_n(
            entropy_regularization_losses)
        total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
        tf.compat.v2.summary.scalar(
            name='policy_gradient_loss',
            data=total_policy_gradient_loss,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='value_estimation_loss',
            data=total_value_estimation_loss,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='l2_regularization_loss',
            data=total_l2_regularization_loss,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='entropy_regularization_loss',
            data=total_entropy_regularization_loss,
            step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='kl_penalty_loss',
            data=total_kl_penalty_loss,
            step=self.train_step_counter)

        total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                          tf.abs(total_value_estimation_loss) +
                          tf.abs(total_entropy_regularization_loss) +
                          tf.abs(total_l2_regularization_loss) +
                          tf.abs(total_kl_penalty_loss))

        tf.compat.v2.summary.scalar(
            name='total_abs_loss',
            data=total_abs_loss,
            step=self.train_step_counter)

    if self._summarize_grads_and_vars:
        with tf.name_scope('Variables/'):
            all_vars = (self._actor_net.trainable_weights +
                        self._value_net.trainable_weights)
            for var in all_vars:
                tf.compat.v2.summary.histogram(
                    name=var.name.replace(':', '_'),
                    data=var,
                    step=self.train_step_counter)

    return loss_info

def apply_gradients(gradients, variables, optimizer, gradient_clipping):
    grads_and_vars = zip(gradients, variables)
    if gradient_clipping is not None:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, gradient_clipping)
    optimizer.apply_gradients(grads_and_vars)

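# ---------------------------------------------------------------------------
# Usage sketch for the helper above, assuming it is in scope and that
# tf-agents is installed (it provides eager_utils.clip_gradient_norms). The
# toy variable, loss, and optimizer are illustrative only.
# ---------------------------------------------------------------------------
import tensorflow as tf

w = tf.Variable([3.0, 4.0])
sgd = tf.keras.optimizers.SGD(learning_rate=0.1)

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(w))
grads = tape.gradient(loss, [w])

# Clip each gradient's norm to 1.0 before the update: the gradient [6., 8.]
# is scaled to [0.6, 0.8], so w becomes [2.94, 3.92] after the SGD step.
apply_gradients(grads, [w], sgd, gradient_clipping=1.0)
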
def _train(self, experience, weights):
    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(experience)
    # observations = time_steps.observation
    actions = policy_steps_.action
    # rewards = next_time_steps.reward
    # discounts = next_time_steps.discount

    old_actions_distribution = policy_steps_.info
    act_log_probs = get_neglopacs(
        logits=old_actions_distribution, labels=actions)

    # Compute the value predictions for states using the current value
    # function.
    value_preds = double_batch_pred2(
        self._value_net,
        experience.observation,
        self._observation_spec,
        is_training=True)
    value_preds = tf.squeeze(value_preds, -1)
    # Need value preds at all time_steps + 1 final step obs.
    # print("Weight", weights)
    # print("REW", rewards)
    # print("Dis", discounts)

    returns, normalized_advantages = self.compute_return_and_advantage(
        next_time_steps, value_preds)
    # print("RET", returns)
    # print(normalized_advantages)

    # Loss tensors across batches will be aggregated for summaries.
    policy_gradient_losses = []
    value_estimation_losses = []
    l2_regularization_losses = []
    entropy_regularization_losses = []
    kl_penalty_losses = []

    loss_info = None  # TODO(b/123627451): Remove.
    # For each epoch, create its own train op that depends on the previous one.
    for i_epoch in range(self._num_epochs):
        with tf.name_scope('epoch_%d' % i_epoch):
            # Build one epoch train op.
            with tf.GradientTape() as tape:
                loss_info = self.get_epoch_loss(
                    time_steps, actions, act_log_probs, returns,
                    normalized_advantages, old_actions_distribution,
                    weights)  # action_distribution_parameters

            variables_to_train = (self._actor_net.trainable_variables +
                                  self._value_net.trainable_variables)
            grads = tape.gradient(loss_info.loss, variables_to_train)
            # Tuple is used for py3, where zip is a generator producing values
            # once.
            grads_and_vars = tuple(zip(grads, variables_to_train))
            if self._gradient_clipping > 0:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)

            self._optimizer.apply_gradients(
                grads_and_vars)  # , global_step=self.train_step_counter

            policy_gradient_losses.append(
                loss_info.extra.policy_gradient_loss)
            value_estimation_losses.append(
                loss_info.extra.value_estimation_loss)
            l2_regularization_losses.append(
                loss_info.extra.l2_regularization_loss)
            entropy_regularization_losses.append(
                loss_info.extra.entropy_regularization_loss)
            kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

    # After update epochs, update adaptive kl beta, then update observation
    # normalizer and reward normalizer.
    # Compute the mean kl from previous action distribution.
    temp_ = double_batch_pred2(
        self._actor_net,
        time_steps.observation,
        self._observation_spec,
        is_training=True)
    kl_divergence = self._kl_divergence(time_steps, old_actions_distribution,
                                        temp_)
    self.update_adaptive_kl_beta(kl_divergence)

    if self._observation_normalizer:
        self._observation_normalizer.update(
            time_steps.observation, outer_dims=[0, 1])
    else:
        # TODO(b/127661780): Verify performance of reward_normalizer when obs
        # are not normalized
        if self._reward_normalizer:
            self._reward_normalizer.update(
                next_time_steps.reward, outer_dims=[0, 1])

    loss_info = tf.nest.map_structure(tf.identity, loss_info)
    return loss_info

def train_complete(self,
                   tape: tf.GradientTape,
                   training_info,
                   valid_masks=None,
                   weight=1.0):
    """Complete one iteration of training.

    `train_complete` should calculate gradients and update parameters using
    those gradients.

    Args:
        tape (tf.GradientTape): the tape which is used for calculating
            gradients. All of the previous `train_interval` calls to
            `train_step()` are made under the context of this tape.
        training_info (nested Tensor): information collected for training. It
            is batched from each `info` returned by `train_step()`.
        valid_masks (tf.Tensor): masks indicating which samples are valid.
            shape=(T, B), dtype=tf.float32
        weight (float): weight for this batch. Loss will be multiplied with
            this weight before calculating gradients.
    Returns:
        loss_info (LossInfo): loss information
        grads_and_vars (list[tuple]): list of gradient and variable tuples
    """
    with tape:
        loss_info = self.calc_loss(training_info)
        if valid_masks is not None:
            loss_info = tf.nest.map_structure(
                lambda l: tf.reduce_mean(l * valid_masks)
                if len(l.shape) == 2 else l, loss_info)
        else:
            loss_info = tf.nest.map_structure(lambda l: tf.reduce_mean(l),
                                              loss_info)
        if isinstance(loss_info.scalar_loss, tf.Tensor):
            assert len(loss_info.scalar_loss.shape) == 0
            loss_info = loss_info._replace(
                loss=loss_info.loss + loss_info.scalar_loss)
        loss = weight * loss_info.loss

    opt_and_var_sets = self._get_cached_opt_and_var_sets()
    all_grads_and_vars = ()
    for i, (optimizer, vars) in enumerate(opt_and_var_sets):
        if len(vars) == 0:
            continue
        assert optimizer is not None, (
            "optimizer needs to be provided at __init__()")
        grads = tape.gradient(loss, vars)
        grads_and_vars = tuple(zip(grads, vars))
        all_grads_and_vars = all_grads_and_vars + grads_and_vars
        if self._gradient_clipping is not None:
            if self._clip_by_global_norm:
                grads, global_norm = tf.clip_by_global_norm(
                    grads, self._gradient_clipping)
                grads_and_vars = tuple(zip(grads, vars))
                alf.utils.common.run_if(
                    alf.utils.common.should_record_summaries(),
                    lambda: tf.summary.scalar("global_grad_norm/%s" % i,
                                              global_norm))
            else:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)
        optimizer.apply_gradients(grads_and_vars)

    self.after_train(training_info)

    return loss_info, all_grads_and_vars

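# ---------------------------------------------------------------------------
# A toy check of the masked reduction used in `train_complete` above:
# per-step losses are multiplied by `valid_masks` before taking the mean over
# all T x B entries, so masked-out steps contribute zero to the numerator but
# still count in the denominator. The tensors below are illustrative only.
# ---------------------------------------------------------------------------
import tensorflow as tf

per_step_loss = tf.constant([[1.0, 2.0],
                             [3.0, 4.0],
                             [5.0, 6.0]])   # shape (T=3, B=2)
valid_masks = tf.constant([[1.0, 1.0],
                           [1.0, 1.0],
                           [1.0, 0.0]])     # last step of the second episode is invalid
masked_mean = tf.reduce_mean(per_step_loss * valid_masks)
print(float(masked_mean))  # (1 + 2 + 3 + 4 + 5 + 0) / 6 = 2.5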