def check_no_shared_variables(network_1, network_2):
  """Checks that there are no shared trainable variables in the two networks.

  Args:
    network_1: A network.Network.
    network_2: A network.Network.

  Raises:
    ValueError: if there are any common trainable variables.
    ValueError: if one of the networks has not yet been built (e.g. user must
      call `create_variables`).
  """
  variables_1 = object_identity.ObjectIdentitySet(
      network_1.trainable_variables)
  variables_2 = object_identity.ObjectIdentitySet(
      network_2.trainable_variables)
  shared_variables = variables_1 & variables_2
  if shared_variables:
    raise ValueError(
        'After making a copy of network \'{}\' to create a target '
        'network \'{}\', the target network shares weights with '
        'the original network. This is not allowed. If '
        'you want to explicitly share weights with the target network, or '
        'if your input network shares weights with others, please '
        'provide a target network which explicitly, selectively, shares '
        'layers/weights with the input network. If you are not intending to '
        'share weights, make sure all the weights are created inside the '
        'Network, since a copy will be created by creating a new Network '
        'with the same args but a new name. Shared variables found: '
        '\'{}\'.'.format(
            network_1.name, network_2.name,
            [x.name for x in shared_variables]))
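# A minimal usage sketch of the check above, assuming `import tensorflow as tf`
# as in the other snippets. The Keras models `online` and `target` are
# hypothetical stand-ins for network.Network instances; because they reuse the
# same Dense layer, the check is expected to raise.
layer = tf.keras.layers.Dense(2, name='shared_dense')
inputs = tf.keras.Input(shape=(3,))
online = tf.keras.Model(inputs, layer(inputs), name='online')
target = tf.keras.Model(inputs, layer(inputs), name='target')
try:
  check_no_shared_variables(online, target)
except ValueError as e:
  print(e)  # Reports the shared 'shared_dense' kernel and bias.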
def testDifference(self):

  class Element(object):
    pass

  a = Element()
  b = Element()
  c = Element()
  set1 = object_identity.ObjectIdentitySet([a, b])
  set2 = object_identity.ObjectIdentitySet([b, c])
  diff_set = set1.difference(set2)
  self.assertIn(a, diff_set)
  self.assertNotIn(b, diff_set)
  self.assertNotIn(c, diff_set)
def testDiscard(self):
  a = object()
  b = object()
  set1 = object_identity.ObjectIdentitySet([a, b])
  set1.discard(a)
  self.assertIn(b, set1)
  self.assertNotIn(a, set1)
def _train(self, experience, weights=None):
  # TODO(b/120034503): Move the conversion to transitions to the base class.
  squeeze_time_dim = not self._actor_network.state_spec
  time_steps, policy_steps, next_time_steps = (
      trajectory.experience_to_transitions(experience, squeeze_time_dim))
  actions = policy_steps.action

  trainable_critic_variables = list(
      object_identity.ObjectIdentitySet(
          self._critic_network_1.trainable_variables +
          self._critic_network_2.trainable_variables))
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_critic_variables, ('No trainable critic variables to '
                                        'optimize.')
    tape.watch(trainable_critic_variables)
    critic_loss = self.critic_loss(time_steps, actions, next_time_steps,
                                   weights=weights, training=True)
  tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
  critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
  self._apply_gradients(critic_grads, trainable_critic_variables,
                        self._critic_optimizer)

  trainable_actor_variables = self._actor_network.trainable_variables
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_actor_variables, ('No trainable actor variables to '
                                       'optimize.')
    tape.watch(trainable_actor_variables)
    actor_loss = self.actor_loss(time_steps, weights=weights, training=True)
  tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')

  # We only optimize the actor every actor_update_period training steps.
  def optimize_actor():
    actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
    return self._apply_gradients(actor_grads, trainable_actor_variables,
                                 self._actor_optimizer)

  remainder = tf.math.mod(self.train_step_counter, self._actor_update_period)
  tf.cond(
      pred=tf.equal(remainder, 0), true_fn=optimize_actor, false_fn=tf.no_op)

  self.train_step_counter.assign_add(1)
  self._update_target()

  # TODO(b/124382360): Compute per element TD loss and return in loss_info.
  total_loss = actor_loss + critic_loss

  return tf_agent.LossInfo(total_loss, Td3Info(actor_loss, critic_loss))
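# A hedged sketch of the dedup pattern used in the _train methods here:
# when the two critic networks share layers, simply concatenating their
# variable lists would repeat the shared variables, so the lists are
# round-tripped through an ObjectIdentitySet first. The Keras models below
# (`critic_1`, `critic_2`, `shared_layer`) are illustrative stand-ins only.
shared_layer = tf.keras.layers.Dense(4)
inputs = tf.keras.Input(shape=(3,))
critic_1 = tf.keras.Model(inputs, tf.keras.layers.Dense(1)(shared_layer(inputs)))
critic_2 = tf.keras.Model(inputs, tf.keras.layers.Dense(1)(shared_layer(inputs)))
merged = list(
    object_identity.ObjectIdentitySet(critic_1.trainable_variables +
                                      critic_2.trainable_variables))
# The merged list keeps one reference to each shared variable.
assert len(merged) < len(critic_1.trainable_variables +
                         critic_2.trainable_variables)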
def deduped_network_variables(network, *args):
  """Returns a list of variables in `network` that are not in any other nets.

  Args:
    network: A Keras network.
    *args: other networks to check for duplicate variables.

  Returns:
    A list of the variables of `network` that do not appear in any of the
    networks passed via `*args`.
  """
  other_vars = object_identity.ObjectIdentitySet(
      [v for n in args for v in n.variables])  # pylint:disable=g-complex-comprehension
  return [v for v in network.variables if v not in other_vars]
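# A hedged usage sketch with plain Keras functional models (assumed for
# illustration; `net_a`, `net_b`, and `shared` are not part of the snippet
# above). Variables reached through the shared layer are excluded.
shared = tf.keras.layers.Dense(4)
inputs = tf.keras.Input(shape=(3,))
net_a = tf.keras.Model(inputs, tf.keras.layers.Dense(2)(shared(inputs)))
net_b = tf.keras.Model(inputs, shared(inputs))
unique_to_a = deduped_network_variables(net_a, net_b)
# `unique_to_a` holds only the second Dense layer's kernel and bias.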
def trainable_variables(self):
  """Override trainable_variables property to remove encoder_variables."""
  if self._image_encoder:
    encoder_variables = object_identity.ObjectIdentitySet(
        self._image_encoder.trainable_variables)
    return [
        v for v in super(Actor, self).trainable_variables
        if v not in encoder_variables
    ]
  else:
    return super(Actor, self).trainable_variables
def extract_shared_variables(variables_1, variables_2):
  """Separates shared variables from the given collections.

  Args:
    variables_1: An iterable of Variables.
    variables_2: An iterable of Variables.

  Returns:
    A tuple of ObjectIdentitySets described by the set operations

    ```
    (variables_1 - variables_2,
     variables_2 - variables_1,
     variables_1 & variables_2)
    ```
  """
  var_refs1 = object_identity.ObjectIdentitySet(variables_1)
  var_refs2 = object_identity.ObjectIdentitySet(variables_2)

  shared_vars = var_refs1.intersection(var_refs2)
  return (var_refs1.difference(shared_vars),
          var_refs2.difference(shared_vars),
          shared_vars)
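# A hedged illustration of the split into (only-in-first, only-in-second,
# shared) using plain tf.Variables; the names `v1`..`v3` are hypothetical.
v1 = tf.Variable(1.0, name='a')
v2 = tf.Variable(2.0, name='b')
v3 = tf.Variable(3.0, name='c')
only_1, only_2, shared = extract_shared_variables([v1, v2], [v2, v3])
# only_1 contains v1, only_2 contains v3, and shared contains v2.
assert v2 in shared and v1 in only_1 and v3 in only_2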
def trainable_variables(self):
  tvars = super(DiagGuassianPolicy, self).trainable_variables
  if self.encoder is None:
    return tvars
  else:
    # Remove the encoder conv2d variables (the policy shouldn't update the
    # conv2d vars). Note that a call to stop_gradient on the fprop isn't
    # enough to ensure this, because the conv2d vars are shared with the
    # critic and so they can get updated when bpropping through the critic to
    # minimize the actor loss.
    encoder_variables = object_identity.ObjectIdentitySet(
        self.encoder.conv_stack.trainable_variables)
    return [v for v in tvars if v not in encoder_variables]
def _filter_empty_layer_containers(layer_list):
  """Remove empty layer containers."""
  existing = object_identity.ObjectIdentitySet()
  to_visit = layer_list[::-1]
  while to_visit:
    obj = to_visit.pop()
    if obj in existing:
      continue
    existing.add(obj)
    if _is_layer(obj):
      yield obj
    else:
      sub_layers = getattr(obj, "layers", None) or []
      # Trackable data structures will not show up in ".layers" lists, but
      # the layers they contain will.
      to_visit.extend(sub_layers[::-1])
def _train(self, experience, weights):
  """Returns a train op to update the agent's networks.

  This method trains with the provided batched experience.

  Args:
    experience: A time-stacked trajectory object.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights.

  Returns:
    A train_op.

  Raises:
    ValueError: If optimizers are None and no default value was provided to
      the constructor.
  """
  squeeze_time_dim = not self._critic_network_1.state_spec
  time_steps, policy_steps, next_time_steps = (
      trajectory.experience_to_transitions(experience, squeeze_time_dim))
  actions = policy_steps.action

  trainable_critic_variables = list(
      object_identity.ObjectIdentitySet(
          self._critic_network_1.trainable_variables +
          self._critic_network_2.trainable_variables))

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_critic_variables, ('No trainable critic variables to '
                                        'optimize.')
    tape.watch(trainable_critic_variables)
    critic_loss = self._critic_loss_weight * self.critic_loss(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights,
        training=True)

  tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
  critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
  self._apply_gradients(critic_grads, trainable_critic_variables,
                        self._critic_optimizer)

  critic_no_entropy_loss = None
  if self._critic_network_no_entropy_1 is not None:
    trainable_critic_no_entropy_variables = list(
        object_identity.ObjectIdentitySet(
            self._critic_network_no_entropy_1.trainable_variables +
            self._critic_network_no_entropy_2.trainable_variables))
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_critic_no_entropy_variables, (
          'No trainable critic_no_entropy variables to optimize.')
      tape.watch(trainable_critic_no_entropy_variables)
      critic_no_entropy_loss = self._critic_loss_weight * self.critic_no_entropy_loss(
          time_steps,
          actions,
          next_time_steps,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights,
          training=True)

    tf.debugging.check_numerics(
        critic_no_entropy_loss, 'Critic (without entropy) loss is inf or nan.')
    critic_no_entropy_grads = tape.gradient(
        critic_no_entropy_loss, trainable_critic_no_entropy_variables)
    self._apply_gradients(critic_no_entropy_grads,
                          trainable_critic_no_entropy_variables,
                          self._critic_no_entropy_optimizer)

  trainable_actor_variables = self._actor_network.trainable_variables
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_actor_variables, ('No trainable actor variables to '
                                       'optimize.')
    tape.watch(trainable_actor_variables)
    actor_loss = self._actor_loss_weight * self.actor_loss(
        time_steps, weights=weights)
  tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
  actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
  self._apply_gradients(actor_grads, trainable_actor_variables,
                        self._actor_optimizer)

  alpha_variable = [self._log_alpha]
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert alpha_variable, 'No alpha variable to optimize.'
    tape.watch(alpha_variable)
    alpha_loss = self._alpha_loss_weight * self.alpha_loss(
        time_steps, weights=weights)
  tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
  alpha_grads = tape.gradient(alpha_loss, alpha_variable)
  self._apply_gradients(alpha_grads, alpha_variable, self._alpha_optimizer)

  with tf.name_scope('Losses'):
    tf.compat.v2.summary.scalar(
        name='critic_loss_' + self.name,
        data=critic_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='actor_loss_' + self.name,
        data=actor_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='alpha_loss_' + self.name,
        data=alpha_loss,
        step=self.train_step_counter)
    if critic_no_entropy_loss is not None:
      tf.compat.v2.summary.scalar(
          name='critic_no_entropy_loss_' + self.name,
          data=critic_no_entropy_loss,
          step=self.train_step_counter)

  self.train_step_counter.assign_add(1)
  self._update_target()

  total_loss = critic_loss + actor_loss + alpha_loss
  if critic_no_entropy_loss is not None:
    total_loss += critic_no_entropy_loss

  extra = SacLossInfo(
      critic_loss=critic_loss,
      actor_loss=actor_loss,
      alpha_loss=alpha_loss,
      critic_no_entropy_loss=critic_no_entropy_loss)

  return tf_agent.LossInfo(loss=total_loss, extra=extra)
def _train(self, experience, weights):
  """Returns a train op to update the agent's networks.

  This method trains with the provided batched experience.

  Args:
    experience: A time-stacked trajectory object.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights.

  Returns:
    A train_op.

  Raises:
    ValueError: If optimizers are None and no default value was provided to
      the constructor.
  """
  experience, expert_experience = experience

  if self._n_step is None:
    transition = self._as_transition(experience)
    time_steps, policy_steps, next_time_steps = transition
    future_time_steps = next_time_steps
  else:
    experience_1 = experience._replace(
        observation=experience.observation[:, :2],
        action=experience.action[:, :2],
        discount=experience.discount[:, :2],
        reward=experience.reward[:, :2],
        step_type=experience.step_type[:, :2],
        next_step_type=experience.next_step_type[:, :2],
    )
    obs_2 = tf.stack(
        [experience.observation[:, 0], experience.observation[:, -1]], axis=1)
    action_2 = tf.stack(
        [experience.action[:, 0], experience.action[:, -1]], axis=1)
    discount_2 = tf.stack(
        [experience.discount[:, 0], experience.discount[:, -1]], axis=1)
    step_type_2 = tf.stack(
        [experience.step_type[:, 0], experience.step_type[:, -1]], axis=1)
    next_step_type_2 = tf.stack(
        [experience.next_step_type[:, 0], experience.next_step_type[:, -1]],
        axis=1)
    reward_2 = tf.stack(
        [experience.reward[:, 0], experience.reward[:, -1]], axis=1)
    experience_2 = experience._replace(
        observation=obs_2,
        action=action_2,
        discount=discount_2,
        step_type=step_type_2,
        next_step_type=next_step_type_2,
        reward=reward_2)
    time_steps, policy_steps, next_time_steps = self._as_transition(
        experience_1)
    _, _, future_time_steps = self._as_transition(experience_2)

  actions = policy_steps.action

  trainable_critic_variables = list(
      object_identity.ObjectIdentitySet(
          self._critic_network_1.trainable_variables +
          self._critic_network_2.trainable_variables))

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_critic_variables, ('No trainable critic variables to '
                                        'optimize.')
    tape.watch(trainable_critic_variables)
    critic_loss = self._critic_loss_weight * self.critic_loss(
        time_steps,
        expert_experience,
        actions,
        next_time_steps,
        future_time_steps,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights,
        training=True)

  tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
  critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
  self._apply_gradients(critic_grads, trainable_critic_variables,
                        self._critic_optimizer)

  trainable_actor_variables = self._actor_network.trainable_variables
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_actor_variables, ('No trainable actor variables to '
                                       'optimize.')
    tape.watch(trainable_actor_variables)
    actor_loss = self._actor_loss_weight * self.actor_loss(
        time_steps, actions, weights=weights)
  tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
  actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
  self._apply_gradients(actor_grads, trainable_actor_variables,
                        self._actor_optimizer)

  # Train the behavior policy.
  if self._use_behavior_policy:
    trainable_behavior_variables = (
        self._behavior_actor_network.trainable_variables)
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      assert trainable_behavior_variables, ('No trainable behavior variables '
                                            'to optimize.')
      tape.watch(trainable_behavior_variables)
      behavior_loss = self._actor_loss_weight * self.behavior_loss(
          time_steps, actions, weights=weights)
    tf.debugging.check_numerics(behavior_loss, 'Behavior loss is inf or nan.')
    behavior_grads = tape.gradient(behavior_loss,
                                   trainable_behavior_variables)
    self._apply_gradients(behavior_grads, trainable_behavior_variables,
                          self._actor_optimizer)
  else:
    behavior_loss = 0.0

  with tf.name_scope('Losses'):
    tf.compat.v2.summary.scalar(
        name='critic_loss', data=critic_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='behavior_loss',
        data=behavior_loss,
        step=self.train_step_counter)

  self.train_step_counter.assign_add(1)
  self._update_target()

  total_loss = critic_loss + actor_loss

  extra = RceLossInfo(critic_loss=critic_loss, actor_loss=actor_loss)

  return tf_agent.LossInfo(loss=total_loss, extra=extra)
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action

  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]

  if self._debug_summaries:
    actions_list = tf.nest.flatten(actions)
    show_action_index = len(actions_list) != 1
    for i, single_action in enumerate(actions_list):
      action_name = ('actions_{}'.format(i)
                     if show_action_index else 'actions')
      tf.compat.v2.summary.histogram(
          name=action_name, data=single_action, step=self.train_step_counter)

  action_distribution_parameters = policy_steps_.info['dist_params']

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using the
  # collect policy distribution.
  act_log_probs = common.log_probability(old_actions_distribution, actions,
                                         self._action_spec)

  valid_mask = ppo_utils.make_timestep_mask(
      next_time_steps, allow_partial_episodes=True)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  if self._compute_value_and_advantage_in_train:
    value_state = self._collect_policy.get_initial_value_state(batch_size)
    value_preds, _ = self._collect_policy.apply_value_network(
        experience.observation,
        experience.step_type,
        value_state=value_state,
        training=False)
  else:
    value_preds = experience.policy_info['value_prediction']

  value_preds = tf.stop_gradient(value_preds)

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  loss_info = None  # TODO(b/123627451): Remove.
  variables_to_train = list(
      object_identity.ObjectIdentitySet(self._actor_net.trainable_weights +
                                        self._value_net.trainable_weights))
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Only save debug summaries for first and last epochs.
      debug_summaries = (
          self._debug_summaries and
          (i_epoch == 0 or i_epoch == self._num_epochs - 1))

      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            self.train_step_counter, debug_summaries, training=True)

      grads = tape.gradient(loss_info.loss, variables_to_train)
      # Tuple is used for py3, where zip is a generator producing values once.
      grads_and_vars = tuple(zip(grads, variables_to_train))
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      # If summarize_gradients, create functions for summarizing both
      # gradients and variables.
      if self._summarize_grads_and_vars and debug_summaries:
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)

      self._optimizer.apply_gradients(
          grads_and_vars, global_step=self.train_step_counter)

      policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
      value_estimation_losses.append(loss_info.extra.value_estimation_loss)
      l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
      entropy_regularization_losses.append(
          loss_info.extra.entropy_regularization_loss)
      kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  policy_state = self._collect_policy.get_initial_state(batch_size)
  # Compute the mean kl from previous action distribution.
  kl_divergence = self._kl_divergence(
      time_steps, action_distribution_parameters,
      self._collect_policy.distribution(time_steps, policy_state).action)
  self.update_adaptive_kl_beta(kl_divergence)

  if self._observation_normalizer:
    self._observation_normalizer.update(
        time_steps.observation, outer_dims=[0, 1])
  else:
    # TODO(b/127661780): Verify performance of reward_normalizer when obs are
    # not normalized.
    if self._reward_normalizer:
      self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])

  loss_info = tf.nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss averaged across all epochs.
  # The *_losses lists will have been populated by calls to
  # self.get_epoch_loss. Assumes all the losses have the same length.
  with tf.name_scope('Losses/'):
    num_epochs = len(policy_gradient_losses)
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses) / num_epochs
    total_value_estimation_loss = tf.add_n(
        value_estimation_losses) / num_epochs
    total_l2_regularization_loss = tf.add_n(
        l2_regularization_losses) / num_epochs
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses) / num_epochs
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses) / num_epochs
    tf.compat.v2.summary.scalar(
        name='policy_gradient_loss',
        data=total_policy_gradient_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='value_estimation_loss',
        data=total_value_estimation_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='l2_regularization_loss',
        data=total_l2_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='entropy_regularization_loss',
        data=total_entropy_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='kl_penalty_loss',
        data=total_kl_penalty_loss,
        step=self.train_step_counter)

    total_abs_loss = (
        tf.abs(total_policy_gradient_loss) +
        tf.abs(total_value_estimation_loss) +
        tf.abs(total_entropy_regularization_loss) +
        tf.abs(total_l2_regularization_loss) +
        tf.abs(total_kl_penalty_loss))

    tf.compat.v2.summary.scalar(
        name='total_abs_loss',
        data=total_abs_loss,
        step=self.train_step_counter)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (
          self._actor_net.trainable_weights +
          self._value_net.trainable_weights)
      for var in all_vars:
        tf.compat.v2.summary.histogram(
            name=var.name.replace(':', '_'),
            data=var,
            step=self.train_step_counter)

  return loss_info
def _train(self, experience, weights):
  """Returns a train op to update the agent's networks.

  This method trains with the provided batched experience.

  Args:
    experience: A time-stacked trajectory object.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights.

  Returns:
    A train_op.

  Raises:
    ValueError: If optimizers are None and no default value was provided to
      the constructor.
  """
  transition = self._as_transition(experience)
  time_steps, policy_steps, next_time_steps = transition
  actions = policy_steps.action

  trainable_critic_variables = list(
      object_identity.ObjectIdentitySet(
          self._critic_network_1.trainable_variables +
          self._critic_network_2.trainable_variables))

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_critic_variables, ('No trainable critic variables to '
                                        'optimize.')
    tape.watch(trainable_critic_variables)
    critic_loss = self._critic_loss_with_optional_entropy_term(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights,
        training=True)
    critic_loss *= self._critic_loss_weight

    cql_alpha = self._get_cql_alpha()
    cql_loss = self._cql_loss(time_steps, actions, training=True)

    if self._bc_debug_mode:
      cql_critic_loss = cql_loss * cql_alpha
    else:
      cql_critic_loss = critic_loss + (cql_loss * cql_alpha)

  tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
  tf.debugging.check_numerics(cql_loss, 'CQL loss is inf or nan.')

  critic_grads = tape.gradient(cql_critic_loss, trainable_critic_variables)
  self._apply_gradients(critic_grads, trainable_critic_variables,
                        self._critic_optimizer)

  trainable_actor_variables = self._actor_network.trainable_variables
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_actor_variables, ('No trainable actor variables to '
                                       'optimize.')
    tape.watch(trainable_actor_variables)
    actor_loss = self._actor_loss_weight * self.actor_loss(
        time_steps, actions=actions, weights=weights)
  tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
  actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
  self._apply_gradients(actor_grads, trainable_actor_variables,
                        self._actor_optimizer)

  alpha_variable = [self._log_alpha]
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert alpha_variable, 'No alpha variable to optimize.'
    tape.watch(alpha_variable)
    alpha_loss = self._alpha_loss_weight * self.alpha_loss(
        time_steps, weights=weights)
  tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
  alpha_grads = tape.gradient(alpha_loss, alpha_variable)
  self._apply_gradients(alpha_grads, alpha_variable, self._alpha_optimizer)

  # Based on the equation (24), which automates CQL alpha with the "budget"
  # parameter tau. CQL(H) is now CQL-Lagrange(H):
  # ```
  # min_Q max_{alpha >= 0} alpha * (log_sum_exp(Q(s, a')) - Q(s, a) - tau)
  # ```
  # If the expected difference in Q-values is less than tau, alpha
  # will adjust to be closer to 0. If the difference is higher than tau,
  # alpha is likely to take on high values and more aggressively penalize
  # Q-values.
  cql_alpha_loss = tf.constant(0.)
  if self._use_lagrange_cql_alpha:
    cql_alpha_variable = [self._log_cql_alpha]
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(cql_alpha_variable)
      cql_alpha_loss = -self._get_cql_alpha() * (cql_loss - self._cql_tau)
    tf.debugging.check_numerics(cql_alpha_loss,
                                'CQL alpha loss is inf or nan.')
    cql_alpha_gradients = tape.gradient(cql_alpha_loss, cql_alpha_variable)
    self._apply_gradients(cql_alpha_gradients, cql_alpha_variable,
                          self._cql_alpha_optimizer)

  with tf.name_scope('Losses'):
    tf.compat.v2.summary.scalar(
        name='critic_loss', data=critic_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='alpha_loss', data=alpha_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='cql_loss', data=cql_loss, step=self.train_step_counter)
    if self._use_lagrange_cql_alpha:
      tf.compat.v2.summary.scalar(
          name='cql_alpha_loss',
          data=cql_alpha_loss,
          step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='cql_alpha', data=cql_alpha, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='sac_alpha',
        data=tf.exp(self._log_alpha),
        step=self.train_step_counter)

  self.train_step_counter.assign_add(1)
  self._update_target()

  total_loss = cql_critic_loss + actor_loss + alpha_loss

  extra = CqlSacLossInfo(
      critic_loss=critic_loss,
      actor_loss=actor_loss,
      alpha_loss=alpha_loss,
      cql_loss=cql_loss,
      cql_alpha=cql_alpha,
      cql_alpha_loss=cql_alpha_loss)

  return tf_agent.LossInfo(loss=total_loss, extra=extra)
def testClear(self):
  a = object()
  b = object()
  set1 = object_identity.ObjectIdentitySet([a, b])
  set1.clear()
  self.assertLen(set1, 0)
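# A small sketch of why ObjectIdentitySet appears throughout these snippets:
# membership is decided by object identity, not by `__eq__`/`__hash__`, so
# objects that compare equal are still tracked separately. The `AlwaysEqual`
# class is purely illustrative and not part of the tests above.
class AlwaysEqual(object):

  def __eq__(self, other):
    return True

  def __hash__(self):
    return 0


x, y = AlwaysEqual(), AlwaysEqual()
assert len({x, y}) == 1  # A regular set collapses the two instances.
assert len(object_identity.ObjectIdentitySet([x, y])) == 2  # Identity set does not.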
def _train(self, experience, weights):
  """Modifies the default _train step in two ways.

    1. Passes actions and next time steps to actor loss.
    2. Clips the dual parameter.

  Args:
    experience: A time-stacked trajectory object.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights.

  Returns:
    A train_op.
  """
  transition = self._as_transition(experience)
  time_steps, policy_steps, next_time_steps = transition
  actions = policy_steps.action

  trainable_critic_variables = list(
      object_identity.ObjectIdentitySet(
          self._critic_network_1.trainable_variables +
          self._critic_network_2.trainable_variables))

  tf.debugging.check_numerics(
      tf.reduce_mean(time_steps.reward), 'ts.reward is inf or nan.')
  tf.debugging.check_numerics(
      tf.reduce_mean(next_time_steps.reward), 'next_ts.reward is inf or nan.')
  tf.debugging.check_numerics(
      tf.reduce_mean(actions), 'Actions is inf or nan.')

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_critic_variables, ('No trainable critic variables to '
                                        'optimize.')
    tape.watch(trainable_critic_variables)
    critic_loss = self._critic_loss_weight * self.critic_loss(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights,
        training=True)

  tf.debugging.check_numerics(critic_loss, 'Critic loss is inf or nan.')
  critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
  self._apply_gradients(critic_grads, trainable_critic_variables,
                        self._critic_optimizer)

  trainable_actor_variables = self._actor_network.trainable_variables
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert trainable_actor_variables, ('No trainable actor variables to '
                                       'optimize.')
    tape.watch(trainable_actor_variables)
    actor_loss = self._actor_loss_weight * self.actor_loss(
        time_steps, actions, next_time_steps, weights=weights)
  tf.debugging.check_numerics(actor_loss, 'Actor loss is inf or nan.')
  actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
  self._apply_gradients(actor_grads, trainable_actor_variables,
                        self._actor_optimizer)

  alpha_variable = [self._log_alpha]
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    assert alpha_variable, 'No alpha variable to optimize.'
    tape.watch(alpha_variable)
    alpha_loss = self._alpha_loss_weight * self.alpha_loss(
        time_steps, weights=weights)
  tf.debugging.check_numerics(alpha_loss, 'Alpha loss is inf or nan.')
  alpha_grads = tape.gradient(alpha_loss, alpha_variable)
  self._apply_gradients(alpha_grads, alpha_variable, self._alpha_optimizer)

  with tf.name_scope('Losses'):
    tf.compat.v2.summary.scalar(
        name='critic_loss', data=critic_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='actor_loss', data=actor_loss, step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='alpha_loss', data=alpha_loss, step=self.train_step_counter)

  self.train_step_counter.assign_add(1)
  self._update_target()

  total_loss = critic_loss + actor_loss + alpha_loss

  extra = sac_agent.SacLossInfo(
      critic_loss=critic_loss, actor_loss=actor_loss, alpha_loss=alpha_loss)

  return LossInfo(loss=total_loss, extra=extra)