def _sample_and_transpose_actions_and_log_probs(
        self,
        time_steps: ts.TimeStep,
        num_action_samples: int,
        training: Optional[bool] = False
) -> Tuple[types.Tensor, types.Tensor]:
    """Samples actions and corresponding log probabilities from the policy."""
    # Get raw action distribution from policy.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._train_policy.get_initial_state(batch_size)
    if training:
        action_distribution = self._train_policy.distribution(
            time_steps, policy_state=policy_state).action
    else:
        action_distribution = self._policy.distribution(
            time_steps, policy_state=policy_state).action

    actions = tf.nest.map_structure(
        lambda d: d.sample(num_action_samples, seed=self._action_seed_stream()),
        action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)

    # Swap the first two axes for a [batch, self._num_cql_samples, ...] shape.
    actions = self._transpose_tile_and_batch_dims(actions)
    log_pi = self._transpose_tile_and_batch_dims(log_pi)
    return actions, log_pi
def testNestedLogProbability(self):
    action_spec = [
        tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
        [
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
        ]
    ]
    distribution = [
        tfp.distributions.Normal([0.0, 0.0], [1.0, 1.0]),
        [
            tfp.distributions.Normal([0.5], [1.0]),
            tfp.distributions.Normal([-0.5], [1.0])
        ]
    ]
    actions = [
        tf.constant([0.0, 0.0]),
        [tf.constant([0.5]), tf.constant([-0.5])]
    ]

    log_probs = common.log_probability(distribution, actions, action_spec)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    log_probs_ = self.evaluate(log_probs)
    self.assertEqual(len(log_probs_.shape), 0)
    self.assertNear(log_probs_, 4 * -0.5 * np.log(2 * 3.14159), 0.001)
def _construct(self, batch_size, graph):
    """Construct the agent graph through placeholders."""
    self._batch_size = batch_size
    self._batched = batch_size is not None
    outer_dims = [self._batch_size] if self._batched else [1]
    with graph.as_default():
        self._time_step = tensor_spec.to_nest_placeholder(
            self._tf_policy.time_step_spec, outer_dims=outer_dims)
        self._tf_initial_state = self._tf_policy.get_initial_state(
            batch_size=self._batch_size or 1)
        self._policy_state = tf.nest.map_structure(
            lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
                ps.dtype, ps.shape, name='policy_state'),
            self._tf_initial_state)
        self._action_step = self._tf_policy.action(self._time_step,
                                                   self._policy_state,
                                                   seed=self._seed)
        self._actions = tensor_spec.to_nest_placeholder(
            self._tf_policy.action_spec, outer_dims=outer_dims)
        self._action_distribution = self._tf_policy.distribution(
            self._time_step, policy_state=self._policy_state).action
        self._action_mean = self._action_distribution.mean()
        self._log_prob = common.log_probability(self._action_distribution,
                                                self._actions,
                                                self._tf_policy.action_spec)
def testBatchedNestedLogProbability(self):
    action_spec = [
        tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
        [
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
        ]
    ]
    distribution = [
        tfp.distributions.Normal([[0.0, 0.0], [0.0, 0.0]],
                                 [[1.0, 1.0], [2.0, 2.0]]),
        [
            tfp.distributions.Normal([[0.5], [0.5]], [[1.0], [2.0]]),
            tfp.distributions.Normal([[-0.5], [-0.5]], [[1.0], [2.0]])
        ]
    ]
    actions = [
        tf.constant([[0.0, 0.0], [0.0, 0.0]]),
        [tf.constant([[0.5], [0.5]]), tf.constant([[-0.5], [-0.5]])]
    ]

    log_probs = common.log_probability(distribution, actions, action_spec)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    log_probs_ = self.evaluate(log_probs)
    self.assertEqual(log_probs_.shape, (2,))
    self.assertAllClose(
        log_probs_,
        [4 * -0.5 * np.log(2 * 3.14159), 4 * -0.5 * np.log(8 * 3.14159)],
        0.001)
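# A minimal standalone sketch (not part of the original tests) of why the expected
# value in testNestedLogProbability is 4 * -0.5 * log(2*pi): each of the four scalar
# action dimensions in the nest (2 + 1 + 1) sits at the mean of a unit-scale Normal,
# whose log-density there is -0.5 * log(2*pi*sigma^2), and common.log_probability
# sums the per-dimension log-probs over the nest and the event dimensions. In the
# batched test, the second batch entry uses scale 2, giving 4 * -0.5 * log(8*pi).
import numpy as np


def _normal_logpdf(x, loc, scale):
    """Elementwise log-density of a univariate Normal."""
    return -0.5 * np.log(2.0 * np.pi * scale**2) - 0.5 * ((x - loc) / scale)**2


_total = (_normal_logpdf(np.array([0.0, 0.0]), 0.0, 1.0).sum() +
          _normal_logpdf(np.array([0.5]), 0.5, 1.0).sum() +
          _normal_logpdf(np.array([-0.5]), -0.5, 1.0).sum())
assert np.isclose(_total, 4 * -0.5 * np.log(2 * np.pi))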
def _get_safe_idx(self, safe_ac_mask, fail_prob, sampled_ac, safe_ac_idx,
                  actions, fail_prob_safe):
    if tf.math.count_nonzero(safe_ac_mask) == 0:
        # Picks the safest action.
        safe_idx = tf.argmin(fail_prob)
    else:
        sampled_ac = tf.gather(sampled_ac, safe_ac_idx)
        # Picks the most unsafe "safe" action:
        # safe_idx = tf.argmax(fail_prob_safe, axis=0)
        # Picks the safest action:
        # safe_idx = tf.argmin(fail_prob_safe)
        if self._training:
            # Picks a random safe action, weighted by 1 - fail_prob_safe
            # (so higher weight for safer actions):
            # safe_idx = tfp.distributions.Categorical([1 - fail_prob_safe]).sample()
            if self._sampling_method == 'rejection':
                # Standard rejection sampling with probability proportional to
                # the original policy.
                log_prob = common.log_probability(actions, sampled_ac,
                                                  self.action_spec)
                safe_idx = tfp.distributions.Categorical(log_prob).sample()
            elif self._sampling_method == 'risky':
                # Picks a random risky safe action, weighted by fail_prob_safe
                # (so higher weight for less safe actions).
                safe_idx = tfp.distributions.Categorical(
                    [fail_prob_safe]).sample()
            elif self._sampling_method == 'safe':
                safe_idx = tfp.distributions.Categorical(
                    [1 - fail_prob_safe]).sample()

    safe_idx = tf.reshape(safe_idx, [-1])[0]
    return safe_idx
def _actions_and_log_probs(self, time_steps):
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy, and initialize bijectors list.
    action_distribution = self.policy().distribution(time_steps).action
    if self._squash_actions:
        bijectors = []

        # Bijector to rescale actions to ranges in action spec.
        action_means, action_magnitudes = self._action_spec_means_magnitudes()
        bijectors.append(
            tfp.bijectors.AffineScalar(shift=action_means,
                                       scale=action_magnitudes))

        # Bijector to squash actions to range (-1.0, +1.0).
        bijectors.append(tanh_bijector_stable.Tanh())

        # Chain applies bijectors in reverse order, so squash will happen
        # before rescaling to action spec.
        bijector_chain = tfp.bijectors.Chain(bijectors)
        action_distribution = tfp.distributions.TransformedDistribution(
            distribution=action_distribution, bijector=bijector_chain)

    # Sample actions and log_pis from transformed distribution.
    actions = tf.nest.map_structure(lambda d: d.sample(), action_distribution)
    log_pi = common_utils.log_probability(action_distribution, actions,
                                          self.action_spec())
    return actions, log_pi
def policy_gradient_loss(self,
                         actions_distribution,
                         actions,
                         is_boundary,
                         returns,
                         num_episodes,
                         weights=None):
    """Computes the policy gradient loss.

    Args:
        actions_distribution: A possibly batched tuple of action distributions.
        actions: Tensor with a batch of actions.
        is_boundary: Tensor of booleans that indicate if the corresponding action
            was in a boundary trajectory and should be ignored.
        returns: Tensor with a return from each timestep, aligned on index. Works
            better when returns are normalized.
        num_episodes: Number of episodes contained in the training data.
        weights: Optional scalar or element-wise (per-batch-entry) importance
            weights. May include a mask for invalid timesteps.

    Returns:
        policy_gradient_loss: A tensor that will contain policy gradient loss for
            the on-policy experience.
    """
    # TODO(b/126594799): Add class IndependentNested(tfd.Distribution) to handle
    # nests of independent distributions like this.
    action_log_prob = common.log_probability(actions_distribution, actions,
                                             self.action_spec)

    # Filter out transitions between end state of previous episode and start
    # state of next episode.
    valid_mask = tf.cast(~is_boundary, tf.float32)
    action_log_prob *= valid_mask

    action_log_prob_times_return = action_log_prob * returns

    if weights is not None:
        action_log_prob_times_return *= weights

    if self._debug_summaries:
        tf.compat.v2.summary.histogram(name='action_log_prob',
                                       data=action_log_prob,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='action_log_prob_times_return',
                                       data=action_log_prob_times_return,
                                       step=self.train_step_counter)

    # Policy gradient loss is defined as the sum, over timesteps, of action
    # log-probability times the cumulative return from that timestep onward.
    # For more information, see (Williams, 1992).
    policy_gradient_loss = -tf.reduce_sum(
        input_tensor=action_log_prob_times_return)

    # We take the mean over episodes by dividing by num_episodes.
    policy_gradient_loss = policy_gradient_loss / num_episodes

    return policy_gradient_loss
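# Hedged toy sketch (illustrative tensors only, not the agent's real inputs) of the
# REINFORCE-style objective computed above: per-step log-probability times return,
# masked at episode boundaries, summed, negated, and averaged over episodes.
import tensorflow as tf

_action_log_prob = tf.constant([-1.2, -0.8, -1.5, -0.9])  # toy log pi(a_t | s_t)
_returns = tf.constant([2.0, 1.5, 1.0, 0.5])              # toy returns-to-go
_is_boundary = tf.constant([False, False, False, True])   # last step is a boundary
_num_episodes = 1.0

_valid_mask = tf.cast(~_is_boundary, tf.float32)
_toy_loss = (-tf.reduce_sum(_action_log_prob * _valid_mask * _returns)
             / _num_episodes)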
def testLogProbabilityOneHot(self):
    action_spec = tensor_spec.BoundedTensorSpec([3], tf.int32, 0, 1)
    distribution = tfp.distributions.OneHotCategorical(probs=[0.6, 0.3, 0.1])
    actions = tf.constant([1, 0, 0])

    log_probs = common.log_probability(distribution, actions, action_spec)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    log_probs_ = self.evaluate(log_probs)
    self.assertEqual(len(log_probs_.shape), 0)
    self.assertNear(log_probs_, np.log(0.6), 0.00001)
def behavior_loss(self, time_steps, actions, weights=None):
    with tf.name_scope('behavior_loss'):
        nest_utils.assert_same_structure(time_steps, self.time_step_spec)
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._behavior_policy.get_initial_state(batch_size)
        action_distribution = self._behavior_policy.distribution(
            time_steps, policy_state=policy_state).action
        log_pi = common.log_probability(action_distribution, actions,
                                        self.action_spec)
        return -1.0 * tf.reduce_mean(log_pi)
def _ml_pmi(self, x, y, y_distribution):
    num_outer_dims = get_outer_rank(x, self._x_spec)
    hidden = self._model(x)[0]
    batch_squash = BatchSquash(num_outer_dims)
    hidden = batch_squash.flatten(hidden)
    delta_loc = self._delta_loc_layer(hidden)
    delta_scale = tf.nn.softplus(self._delta_scale_layer(hidden))
    delta_loc = batch_squash.unflatten(delta_loc)
    delta_scale = batch_squash.unflatten(delta_scale)
    y_given_x_dist = tfp.distributions.Normal(
        loc=y_distribution.loc + delta_loc,
        scale=y_distribution.scale * delta_scale)

    # Because Normal.event_shape is [], the result of Normal.log_prob() is the
    # log-probabilities of individual dimensions. So we need to use
    # tfa_common.log_probability() instead.
    # TODO: implement a normal distribution with non-scalar event shape.
    pmi = tfa_common.log_probability(y_given_x_dist, y, self._y_spec)
    pmi -= tf.stop_gradient(
        tfa_common.log_probability(y_distribution, y, self._y_spec))
    return pmi
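# Hedged sketch (assumes the imports below are available) of the point made in the
# comment above: a scalar-event Normal's log_prob() returns one log-density per
# dimension, while tf_agents' common.log_probability sums over the event dimensions
# declared in the spec, yielding a single log-probability per outer batch element.
import tensorflow as tf
import tensorflow_probability as tfp
from tf_agents.specs import tensor_spec
from tf_agents.utils import common as tfa_common

_dist = tfp.distributions.Normal(loc=[0.0, 1.0, -1.0], scale=[1.0, 1.0, 1.0])
_y = tf.constant([0.0, 1.0, -1.0])
_y_spec = tensor_spec.TensorSpec([3], tf.float32)

_per_dim = _dist.log_prob(_y)                             # shape [3], one per dim
_summed = tfa_common.log_probability(_dist, _y, _y_spec)  # scalar, summed over dims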
def _line_search(self, time_steps, policy_steps_, advantages, natural_gradient,
                 coeff, weights):
    """Finds new policy parameters by line search in the natural gradient direction."""
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]

    # Old policy distribution.
    action_distribution_parameters = policy_steps_.info
    actions = policy_steps_.action
    actions_distribution = distribution_spec.nested_distributions_from_specs(
        self._action_distribution_spec,
        action_distribution_parameters["dist_params"])
    act_log_probs = common.log_probability(actions_distribution, actions,
                                           self._action_spec)

    # Loss for the old policy.
    loss_threshold = self.policy_gradient_loss(
        time_steps,
        actions,
        tf.stop_gradient(act_log_probs),
        tf.stop_gradient(advantages),
        actions_distribution,
        weights,
    )

    policy_params = flatten_tensors(self._actor_net.trainable_variables)

    # Try different step sizes; accept the first one that improves the loss and
    # satisfies the KL constraint.
    for it in range(self._backtrack_iters):
        new_params = (policy_params -
                      self._backtrack_coeff**it * coeff * natural_gradient)
        unflatten_tensor(new_params, self._opt_policy_parameters)
        opt_policy_state = self._opt_policy.get_initial_state(batch_size)
        dists = self._opt_policy.distribution(time_steps, opt_policy_state)
        new_policy_distribution = dists.action
        kl = tf.reduce_mean(
            self._kl_divergence(time_steps, action_distribution_parameters,
                                new_policy_distribution))
        loss = self.policy_gradient_loss(
            time_steps,
            actions,
            tf.stop_gradient(act_log_probs),
            tf.stop_gradient(advantages),
            new_policy_distribution,
            weights,
        )
        if kl < self._max_kl and loss < loss_threshold:
            return new_params

    # No improvement found.
    return policy_params
def _actions_and_log_probs(self, time_steps):
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy.
    action_distribution = self.policy.distribution(time_steps).action

    # Sample actions and log_pis from the distribution.
    actions = tf.nest.map_structure(lambda d: d.sample(), action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)
    return actions, log_pi
def policy_gradient(self, time_steps, policy_steps_, advantages, weights):
    """Compute the policy gradient with respect to actor_net parameters.

    :param time_steps: batch of TimeSteps with observations for each timestep
    :param policy_steps_: policy info for the time step sampling policy
    :param advantages: Tensor of advantage estimates for each timestep,
        aligned on index
    :param weights: mask for invalid timesteps
    :return: policy loss computed on the timesteps, list of gradient tensors
    """
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    actions = policy_steps_.action

    # Get policy info recorded before the update.
    action_distribution_parameters = policy_steps_.info

    # Reconstruct the per-timestep policy distribution.
    old_actions_distribution = distribution_spec.nested_distributions_from_specs(
        self._action_distribution_spec,
        action_distribution_parameters["dist_params"])

    # Log probability of actions taken during data collection.
    act_log_probs = common.log_probability(old_actions_distribution, actions,
                                           self._action_spec)

    with tf.GradientTape() as tape:
        # Current policy distribution.
        policy_state = self._collect_policy.get_initial_state(batch_size)
        distribution_step = self._collect_policy.distribution(
            time_steps, policy_state)
        current_policy_distribution = distribution_step.action

        policy_gradient_loss = self.policy_gradient_loss(
            time_steps,
            actions,
            tf.stop_gradient(act_log_probs),
            tf.stop_gradient(advantages),
            current_policy_distribution,
            weights,
        )

    trainable = self._actor_net.trainable_weights
    grads = tape.gradient(policy_gradient_loss, trainable)
    for g in grads:
        tf.debugging.check_numerics(g, "Gradient divergence", name="grad_check")
    return policy_gradient_loss, grads
def _actions_and_log_probs(self, time_steps):
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._train_policy.get_initial_state(batch_size)
    action_distribution = self._train_policy.distribution(
        time_steps, policy_state=policy_state).action

    # Sample actions and log_pis from the distribution.
    actions = tf.nest.map_structure(lambda d: d.sample(), action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)
    return actions, log_pi
def _actions_and_log_probs(self, time_steps, safety_constrained=False):
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy.
    batch_size = nest_utils.get_outer_shape(time_steps, self.time_step_spec)[0]
    policy = self.collect_policy
    policy_state = policy.get_initial_state(batch_size)
    action_distribution = policy.distribution(
        time_steps, policy_state=policy_state).action

    # Sample actions and log_pis from the distribution.
    if safety_constrained:
        actions, policy_state = self.safe_policy._apply_actor_network(
            time_steps.observation, time_steps.step_type, policy_state)
    else:
        actions = tf.nest.map_structure(lambda d: d.sample(),
                                        action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)
    return actions, log_pi
def policy_gradient_loss(self, time_steps, actions, returns, weights=None):
    """Computes the policy gradient loss.

    Args:
        time_steps: TimeStep object with a batch of observations.
        actions: Tensor with a batch of actions.
        returns: Tensor with a return from each timestep, aligned on index. Works
            better when returns are normalized.
        weights: Optional scalar or element-wise (per-batch-entry) importance
            weights. May include a mask for invalid timesteps.

    Returns:
        policy_gradient_loss: A tensor that will contain policy gradient loss for
            the on-policy experience.
    """
    tf.nest.assert_same_structure(time_steps, self.time_step_spec())
    actions_distribution = self.collect_policy().distribution(time_steps).action

    # TODO(kbanoop): Add class IndependentNested(tfd.Distribution) to handle
    # nests of independent distributions like this.
    action_log_prob = common.log_probability(actions_distribution, actions,
                                             self.action_spec())
    action_log_prob_times_return = action_log_prob * returns

    if weights is not None:
        action_log_prob_times_return *= weights

    if self._debug_summaries:
        tf.contrib.summary.histogram('action_log_prob', action_log_prob)
        tf.contrib.summary.histogram('action_log_prob_times_return',
                                     action_log_prob_times_return)

    # Policy gradient loss is defined as the sum, over timesteps, of action
    # log-probability times the cumulative return from that timestep onward.
    # For more information, see (Williams, 1992).
    policy_gradient_loss = -tf.reduce_sum(
        input_tensor=action_log_prob_times_return)

    with tf.name_scope('Losses/'):
        tf.contrib.summary.scalar('policy_gradient_loss', policy_gradient_loss)

    return tf_agent.LossInfo(policy_gradient_loss, ())
def train_step(self, exp: Experience, state: SacState):
    action_distribution, share_actor_state = self._actor_network(
        exp.observation,
        step_type=exp.step_type,
        network_state=state.share.actor)
    action = tf.nest.map_structure(lambda d: d.sample(), action_distribution)
    log_pi = tfa_common.log_probability(action_distribution, action,
                                        self._action_spec)

    actor_state, actor_info = self._actor_train_step(exp, state.actor,
                                                     action_distribution,
                                                     action, log_pi)
    critic_state, critic_info = self._critic_train_step(
        exp, state.critic, action, log_pi)
    alpha_info = self._alpha_train_step(log_pi)

    state = SacState(share=SacShareState(actor=share_actor_state),
                     actor=actor_state,
                     critic=critic_state)
    info = SacInfo(actor=actor_info, critic=critic_info, alpha=alpha_info)
    return PolicyStep(action_distribution, state, info)
def _actions_and_log_probs(self,
                           time_steps: ts.TimeStep,
                           training: Optional[bool] = False
                           ) -> Tuple[types.Tensor, types.Tensor]:
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._train_policy.get_initial_state(batch_size)
    if training:
        action_distribution = self._train_policy.distribution(
            time_steps, policy_state=policy_state).action
    else:
        action_distribution = self._policy.distribution(
            time_steps, policy_state=policy_state).action

    # Sample actions and log_pis from the distribution.
    actions = tf.nest.map_structure(
        lambda d: d.sample((), seed=self._action_seed_stream()),
        action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)
    return actions, log_pi
def policy_gradient_loss(
    self,
    time_steps,
    actions,
    sample_action_log_probs,
    advantages,
    current_policy_distribution,
    weights,
):
    """Create tensor for policy gradient loss.

    All tensors should have a single batch dimension.

    Args:
        time_steps: TimeSteps with observations for each timestep.
        actions: Tensor of actions for timesteps, aligned on index.
        sample_action_log_probs: Tensor of sample probability of each action.
        advantages: Tensor of advantage estimate for each timestep, aligned on
            index. Works better when advantage estimates are normalized.
        current_policy_distribution: The policy distribution, evaluated on all
            time_steps.
        weights: Optional scalar or element-wise (per-batch-entry) importance
            weights. Includes a mask for invalid timesteps.

    Returns:
        policy_gradient_loss: A tensor that will contain policy gradient loss for
            the on-policy experience.
    """
    tf.nest.assert_same_structure(time_steps, self.time_step_spec)
    action_log_prob = common.log_probability(current_policy_distribution,
                                             actions, self._action_spec)
    action_log_prob = tf.cast(action_log_prob, tf.float32)
    if self._log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob,
                                           -self._log_prob_clipping,
                                           self._log_prob_clipping)
    tf.debugging.check_numerics(action_log_prob, "action_log_prob")
    tf.debugging.check_numerics(sample_action_log_probs,
                                "sample_action_log_probs")

    # Prepare unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    tf.debugging.check_numerics(importance_ratio,
                                "importance_ratio",
                                name="importance_ratio")

    per_timestep_objective = importance_ratio * advantages
    policy_gradient_loss = -per_timestep_objective
    policy_gradient_loss = tf.reduce_mean(
        input_tensor=policy_gradient_loss * weights)

    tf.debugging.check_numerics(policy_gradient_loss,
                                "Policy Loss divergence",
                                name="policy_check")
    return policy_gradient_loss
def _soft_relabel(self, experience):
    # experience.observation.shape = [B x T=2 x obs_dim+state_dim]
    states, orig_tasks = self._task_distribution.split(
        experience.observation[:, 0])
    if self._task_distribution.tasks is None:
        tasks = orig_tasks
    else:
        tasks = tf.constant(self._task_distribution.tasks, dtype=tf.float32)
    next_states, _ = self._task_distribution.split(
        experience.observation[:, 1])

    if self._candidate_task_type == "states":
        candidate_tasks = self._task_distribution.state_to_task(states)
    elif self._candidate_task_type == "next_states":
        candidate_tasks = self._task_distribution.state_to_task(next_states)
    else:
        assert self._candidate_task_type == "tasks"
        candidate_tasks = tasks

    actions = experience.action[:, 0]
    num_tasks = tasks.shape[0]
    batch_size = states.shape[0]
    task_dim = tasks.shape[1]
    obs_dim = states.shape[1]
    action_dim = actions.shape[1]
    action_spec = self._actor.output_tensor_spec

    states_tiled = tf.tile(states[:, None], [1, num_tasks, 1])  # B x B x D
    states_tiled = tf.reshape(states_tiled,
                              [batch_size * num_tasks, obs_dim])  # B*B x D
    actions_tiled = tf.tile(actions[:, None], [1, num_tasks, 1])  # B x B x D
    actions_tiled = tf.reshape(actions_tiled,
                               [batch_size * num_tasks, action_dim])  # B*B x D
    tasks_tiled = tf.tile(tasks[None], [batch_size, 1, 1])  # B x B x D
    tasks_tiled = tf.reshape(tasks_tiled,
                             [batch_size * num_tasks, task_dim])  # B*B x D

    next_states_tiled = tf.tile(next_states[:, None], [1, num_tasks, 1])
    next_states_tiled = tf.reshape(next_states_tiled,
                                   [batch_size * num_tasks, obs_dim])  # B*B x D
    next_relabelled_obs = self._task_distribution.combine(
        next_states_tiled, tasks_tiled)

    sampled_actions_tiled = self._actor(next_relabelled_obs,
                                        step_type=(),
                                        network_state=())[0].sample()
    critic_input = (next_relabelled_obs, sampled_actions_tiled)
    q_vals, _ = self._critic(critic_input, training=False)
    q_vals_vec = tf.reshape(q_vals, (batch_size, num_tasks))

    rewards, dones = self._task_distribution.evaluate(states_tiled,
                                                      actions_tiled,
                                                      tasks_tiled)
    dones = tf.cast(dones, tf.float32)
    rewards_vec = tf.reshape(rewards, (batch_size, num_tasks))
    dones_vec = tf.reshape(dones, (batch_size, num_tasks))

    relabelled_obs = self._task_distribution.combine(states_tiled, tasks_tiled)
    action_distribution = self._actor(relabelled_obs,
                                      step_type=(),
                                      network_state=())[0]
    log_pi = common.log_probability(action_distribution, actions_tiled,
                                    action_spec)
    log_pi_vec = tf.reshape(log_pi, (batch_size, num_tasks))

    logits_vec = (rewards_vec - log_pi_vec +
                  self._gamma * (1.0 - dones_vec) * q_vals_vec)
    if self._relabel_type == "random":
        logits_vec = tf.ones_like(logits_vec)  # Hack to make sampling random.
    ## End new version

    if self._normalize_cols:
        logits_vec = logits_vec - tf.math.reduce_logsumexp(logits_vec,
                                                           axis=0)[None]
    relabel_indices = tf.random.categorical(logits=logits_vec, num_samples=1)

    ### Metrics
    global_step = tf.compat.v1.train.get_or_create_global_step()
    orig_indices = tf.range(self._sample_batch_size, dtype=relabel_indices.dtype)
    with tf.name_scope("relabelling"):
        # How often are the originally commanded goals most optimal?
        opt_indices = tf.argmax(logits_vec, axis=1)
        orig_is_opt = opt_indices == orig_indices
        orig_opt_frac = tf.reduce_mean(tf.cast(orig_is_opt, tf.float32))
        tf.compat.v2.summary.scalar(name="orig_task_optimal",
                                    data=orig_opt_frac,
                                    step=global_step)

        # How often is the relabelled goal optimal?
        # The relabel_indices are [B, 1], so we need to remove the extra dim.
        relabel_is_opt = tf.squeeze(relabel_indices) == orig_indices
        relabel_opt_frac = tf.reduce_mean(tf.cast(relabel_is_opt, tf.float32))
        tf.compat.v2.summary.scalar(name="relabel_task_optimal",
                                    data=relabel_opt_frac,
                                    step=global_step)

        # What are the average Q values of the original tasks?
        if batch_size == num_tasks:
            indices = tf.transpose(
                tf.stack([orig_indices, orig_indices], axis=0))
            orig_q_vals = tf.gather_nd(logits_vec, indices)
            tf.compat.v2.summary.scalar(
                name="orig_q_vals",
                data=tf.reduce_mean(orig_q_vals),
                step=global_step,
            )

        # What are the average Q values of the relabelled tasks?
        indices = tf.transpose(
            tf.stack([orig_indices, tf.squeeze(relabel_indices)], axis=0))
        relabel_q_vals = tf.gather_nd(logits_vec, indices)
        tf.compat.v2.summary.scalar(
            name="relabel_q_vals",
            data=tf.reduce_mean(relabel_q_vals),
            step=global_step,
        )

        max_q = tf.reduce_max(logits_vec, axis=1)
        tf.compat.v2.summary.scalar(name="max_q",
                                    data=tf.reduce_mean(max_q),
                                    step=global_step)
    ### End metrics

    # For both state-centric and goal-centric relabelling, the implementation of
    # mixing is the same: we randomly replace some of the indices with the
    # diagonal.
    relabelled_tasks = tf.gather(candidate_tasks, tf.squeeze(relabel_indices))
    if self._relabel_prob == 0:
        relabelled_tasks = orig_tasks
    elif 0 < self._relabel_prob < 1:
        logits = tf.math.log([1.0 - self._relabel_prob, self._relabel_prob])
        mask = tf.squeeze(
            tf.random.categorical(logits[None],
                                  num_samples=self._sample_batch_size))
        mask = tf.cast(mask, tf.float32)[:, None]
        relabelled_tasks = mask * orig_tasks + (1 - mask) * relabelled_tasks

    states_and_tasks = self._task_distribution.combine(states, relabelled_tasks)
    next_states_and_tasks = self._task_distribution.combine(
        next_states, relabelled_tasks)
    new_observation = tf.concat(
        [states_and_tasks[:, None], next_states_and_tasks[:, None]], axis=1)
    assert new_observation.shape == experience.observation.shape
    experience = experience.replace(observation=new_observation)
    return experience
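# Hedged sketch (toy shapes and hypothetical values) of the relabelling rule used
# above: each candidate task's logit is its immediate reward minus the policy
# log-prob plus the discounted bootstrap Q-value, and one candidate per batch row
# is then drawn from the resulting categorical distribution.
import tensorflow as tf

_gamma = 0.99
_rewards_vec = tf.random.uniform([4, 3])            # [batch, num_candidate_tasks]
_log_pi_vec = tf.random.uniform([4, 3], -2.0, 0.0)  # toy log pi(a | s, task)
_dones_vec = tf.zeros([4, 3])
_q_vals_vec = tf.random.uniform([4, 3])

_logits_vec = (_rewards_vec - _log_pi_vec +
               _gamma * (1.0 - _dones_vec) * _q_vals_vec)
_relabel_indices = tf.random.categorical(logits=_logits_vec, num_samples=1)  # [4, 1]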
def policy_gradient_loss(self,
                         time_steps,
                         actions,
                         sample_action_log_probs,
                         advantages,
                         current_policy_distribution,
                         valid_mask,
                         debug_summaries=False):
    """Create tensor for policy gradient loss.

    All tensors should have a single batch dimension.

    Args:
        time_steps: TimeSteps with observations for each timestep.
        actions: Tensor of actions for timesteps, aligned on index.
        sample_action_log_probs: Tensor of sample probability of each action.
        advantages: Tensor of advantage estimate for each timestep, aligned on
            index. Works better when advantage estimates are normalized.
        current_policy_distribution: The policy distribution, evaluated on all
            time_steps.
        valid_mask: Mask for invalid timesteps. Float value 1.0 for valid
            timesteps and 0.0 for invalid timesteps. (Timesteps which either are
            between two episodes, or part of an unfinished episode at the end of
            one batch dimension.)
        debug_summaries: True if debug summaries should be created.

    Returns:
        policy_gradient_loss: A tensor that will contain policy gradient loss for
            the on-policy experience.
    """
    nest.assert_same_structure(time_steps, self.time_step_spec())
    action_log_prob = common_utils.log_probability(current_policy_distribution,
                                                   actions, self._action_spec)
    action_log_prob = tf.to_float(action_log_prob)
    if self._log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob,
                                           -self._log_prob_clipping,
                                           self._log_prob_clipping)
    if self._check_numerics:
        action_log_prob = tf.check_numerics(action_log_prob, 'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    importance_ratio_clipped = tf.clip_by_value(
        importance_ratio, 1 - self._importance_ratio_clipping,
        1 + self._importance_ratio_clipping)

    if self._check_numerics:
        importance_ratio = tf.check_numerics(importance_ratio,
                                             'importance_ratio')
        if self._importance_ratio_clipping > 0.0:
            importance_ratio_clipped = tf.check_numerics(
                importance_ratio_clipped, 'importance_ratio_clipped')

    # Pessimistically choose the minimum objective value for clipped and
    # unclipped importance ratios.
    per_timestep_objective = importance_ratio * advantages
    per_timestep_objective_clipped = importance_ratio_clipped * advantages
    per_timestep_objective_min = tf.minimum(per_timestep_objective,
                                            per_timestep_objective_clipped)

    if self._importance_ratio_clipping > 0.0:
        policy_gradient_loss = -per_timestep_objective_min
    else:
        policy_gradient_loss = -per_timestep_objective
    policy_gradient_loss = tf.reduce_mean(policy_gradient_loss * valid_mask)

    if debug_summaries:
        if self._importance_ratio_clipping > 0.0:
            clip_fraction = tf.reduce_mean(
                tf.to_float(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               self._importance_ratio_clipping)))
            tf.contrib.summary.scalar('clip_fraction', clip_fraction)
        tf.contrib.summary.histogram('action_log_prob', action_log_prob)
        tf.contrib.summary.histogram('action_log_prob_sample',
                                     sample_action_log_probs)
        tf.contrib.summary.histogram('importance_ratio', importance_ratio)
        tf.contrib.summary.scalar('importance_ratio_mean',
                                  tf.reduce_mean(importance_ratio))
        tf.contrib.summary.histogram('importance_ratio_clipped',
                                     importance_ratio_clipped)
        tf.contrib.summary.histogram('per_timestep_objective',
                                     per_timestep_objective)
        tf.contrib.summary.histogram('per_timestep_objective_clipped',
                                     per_timestep_objective_clipped)
        tf.contrib.summary.histogram('per_timestep_objective_min',
                                     per_timestep_objective_min)
        entropy = common_utils.entropy(current_policy_distribution,
                                       self.action_spec())
        tf.contrib.summary.histogram('policy_entropy', entropy)
        tf.contrib.summary.scalar('policy_entropy_mean',
                                  tf.reduce_mean(entropy))
        # Categorical distribution (used for discrete actions) doesn't have a
        # mean.
        if not self.action_spec().is_discrete():
            tf.contrib.summary.histogram('actions_distribution_mean',
                                         current_policy_distribution.mean())
            tf.contrib.summary.histogram('actions_distribution_stddev',
                                         current_policy_distribution.stddev())
        tf.contrib.summary.histogram('policy_gradient_loss',
                                     policy_gradient_loss)

    if self._check_numerics:
        policy_gradient_loss = tf.check_numerics(policy_gradient_loss,
                                                 'policy_gradient_loss')
    return policy_gradient_loss
def policy_gradient_loss(self,
                         time_steps,
                         actions,
                         sample_action_log_probs,
                         advantages,
                         current_policy_distribution,
                         weights,
                         debug_summaries=False):
    """Create tensor for policy gradient loss.

    All tensors should have a single batch dimension.

    Args:
        time_steps: TimeSteps with observations for each timestep.
        actions: Tensor of actions for timesteps, aligned on index.
        sample_action_log_probs: Tensor of sample probability of each action.
        advantages: Tensor of advantage estimate for each timestep, aligned on
            index. Works better when advantage estimates are normalized.
        current_policy_distribution: The policy distribution, evaluated on all
            time_steps.
        weights: Optional scalar or element-wise (per-batch-entry) importance
            weights. Includes a mask for invalid timesteps.
        debug_summaries: True if debug summaries should be created.

    Returns:
        policy_gradient_loss: A tensor that will contain policy gradient loss for
            the on-policy experience.
    """
    tf.nest.assert_same_structure(time_steps, self.time_step_spec)
    action_log_prob = common.log_probability(current_policy_distribution,
                                             actions, self._action_spec)
    action_log_prob = tf.cast(action_log_prob, tf.float32)
    if self._log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob,
                                           -self._log_prob_clipping,
                                           self._log_prob_clipping)
    if self._check_numerics:
        action_log_prob = tf.debugging.check_numerics(action_log_prob,
                                                      'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    importance_ratio_clipped = tf.clip_by_value(
        importance_ratio, 1 - self._importance_ratio_clipping,
        1 + self._importance_ratio_clipping)

    if self._check_numerics:
        importance_ratio = tf.debugging.check_numerics(importance_ratio,
                                                       'importance_ratio')
        if self._importance_ratio_clipping > 0.0:
            importance_ratio_clipped = tf.debugging.check_numerics(
                importance_ratio_clipped, 'importance_ratio_clipped')

    # Pessimistically choose the minimum objective value for clipped and
    # unclipped importance ratios.
    per_timestep_objective = importance_ratio * advantages
    per_timestep_objective_clipped = importance_ratio_clipped * advantages
    per_timestep_objective_min = tf.minimum(per_timestep_objective,
                                            per_timestep_objective_clipped)

    if self._importance_ratio_clipping > 0.0:
        policy_gradient_loss = -per_timestep_objective_min
    else:
        policy_gradient_loss = -per_timestep_objective
    policy_gradient_loss = tf.reduce_mean(
        input_tensor=policy_gradient_loss * weights)

    if debug_summaries:
        if self._importance_ratio_clipping > 0.0:
            clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                tf.greater(tf.abs(importance_ratio - 1.0),
                           self._importance_ratio_clipping), tf.float32))
            tf.compat.v2.summary.scalar(name='clip_fraction',
                                        data=clip_fraction,
                                        step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='action_log_prob',
                                       data=action_log_prob,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='action_log_prob_sample',
                                       data=sample_action_log_probs,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='importance_ratio',
                                       data=importance_ratio,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.scalar(
            name='importance_ratio_mean',
            data=tf.reduce_mean(input_tensor=importance_ratio),
            step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='importance_ratio_clipped',
                                       data=importance_ratio_clipped,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='per_timestep_objective',
                                       data=per_timestep_objective,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='per_timestep_objective_clipped',
                                       data=per_timestep_objective_clipped,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='per_timestep_objective_min',
                                       data=per_timestep_objective_min,
                                       step=self.train_step_counter)

        entropy = common.entropy(current_policy_distribution, self.action_spec)
        tf.compat.v2.summary.histogram(name='policy_entropy',
                                       data=entropy,
                                       step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='policy_entropy_mean',
                                    data=tf.reduce_mean(input_tensor=entropy),
                                    step=self.train_step_counter)

        for i, (single_action, single_distribution) in enumerate(
                zip(tf.nest.flatten(self.action_spec),
                    tf.nest.flatten(current_policy_distribution))):
            # Categorical distribution (used for discrete actions) doesn't have
            # a mean.
            distribution_index = '_{}'.format(i) if i > 0 else ''
            if not tensor_spec.is_discrete(single_action):
                tf.compat.v2.summary.histogram(
                    name='actions_distribution_mean' + distribution_index,
                    data=single_distribution.mean(),
                    step=self.train_step_counter)
                tf.compat.v2.summary.histogram(
                    name='actions_distribution_stddev' + distribution_index,
                    data=single_distribution.stddev(),
                    step=self.train_step_counter)
        tf.compat.v2.summary.histogram(name='policy_gradient_loss',
                                       data=policy_gradient_loss,
                                       step=self.train_step_counter)

    if self._check_numerics:
        policy_gradient_loss = tf.debugging.check_numerics(
            policy_gradient_loss, 'policy_gradient_loss')
    return policy_gradient_loss
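# Minimal standalone sketch (toy tensors, not the agent's real inputs) of the clipped
# PPO surrogate assembled above: exponentiate the log-prob difference, clip the ratio
# to [1 - eps, 1 + eps], take the elementwise minimum of the clipped and unclipped
# objectives, then negate and average with the per-timestep weights.
import tensorflow as tf

_eps = 0.2
_action_log_prob = tf.constant([-1.0, -0.5, -2.0])
_sample_action_log_probs = tf.constant([-1.1, -0.9, -1.0])
_advantages = tf.constant([1.0, -0.5, 2.0])
_weights = tf.constant([1.0, 1.0, 0.0])  # last step masked out

_ratio = tf.exp(_action_log_prob - _sample_action_log_probs)
_ratio_clipped = tf.clip_by_value(_ratio, 1 - _eps, 1 + _eps)
_objective_min = tf.minimum(_ratio * _advantages, _ratio_clipped * _advantages)
_toy_loss = tf.reduce_mean(-_objective_min * _weights)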
def _train(self, experience, weights):
    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(experience)
    actions = policy_steps_.action

    if self._debug_summaries:
        actions_list = tf.nest.flatten(actions)
        show_action_index = len(actions_list) != 1
        for i, single_action in enumerate(actions_list):
            action_name = ('actions_{}'.format(i)
                           if show_action_index else 'actions')
            tf.compat.v2.summary.histogram(name=action_name,
                                           data=single_action,
                                           step=self.train_step_counter)

    action_distribution_parameters = policy_steps_.info

    # Reconstruct per-timestep policy distribution from stored distribution
    # parameters.
    old_actions_distribution = (
        distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec, action_distribution_parameters))

    # Compute log probability of actions taken during data collection, using the
    # collect policy distribution.
    act_log_probs = common.log_probability(old_actions_distribution, actions,
                                           self._action_spec)

    # Compute the value predictions for states using the current value function.
    # To be used for return & advantage computation.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size=batch_size)

    value_preds, unused_policy_state = self._collect_policy.apply_value_network(
        experience.observation, experience.step_type, policy_state=policy_state)
    value_preds = tf.stop_gradient(value_preds)

    valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

    if weights is None:
        weights = valid_mask
    else:
        weights *= valid_mask

    returns, normalized_advantages = self.compute_return_and_advantage(
        next_time_steps, value_preds)

    # Loss tensors across batches will be aggregated for summaries.
    policy_gradient_losses = []
    value_estimation_losses = []
    l2_regularization_losses = []
    entropy_regularization_losses = []
    kl_penalty_losses = []

    loss_info = None  # TODO(b/123627451): Remove.
    # For each epoch, create its own train op that depends on the previous one.
    for i_epoch in range(self._num_epochs):
        with tf.name_scope('epoch_%d' % i_epoch):
            # Only save debug summaries for first and last epochs.
            debug_summaries = (self._debug_summaries and
                               (i_epoch == 0 or
                                i_epoch == self._num_epochs - 1))

            # Build one epoch train op.
            with tf.GradientTape() as tape:
                loss_info = self.get_epoch_loss(
                    time_steps, actions, act_log_probs, returns,
                    normalized_advantages, action_distribution_parameters,
                    weights, self.train_step_counter, debug_summaries)

            variables_to_train = (self._actor_net.trainable_weights +
                                  self._value_net.trainable_weights)
            grads = tape.gradient(loss_info.loss, variables_to_train)
            # Tuple is used for py3, where zip is a generator producing values
            # once.
            grads_and_vars = tuple(zip(grads, variables_to_train))
            if self._gradient_clipping > 0:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)

            # If summarize_gradients, create functions for summarizing both
            # gradients and variables.
            if self._summarize_grads_and_vars and debug_summaries:
                eager_utils.add_gradients_summaries(grads_and_vars,
                                                    self.train_step_counter)
                eager_utils.add_variables_summaries(grads_and_vars,
                                                    self.train_step_counter)

            self._optimizer.apply_gradients(
                grads_and_vars, global_step=self.train_step_counter)

            policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
            value_estimation_losses.append(
                loss_info.extra.value_estimation_loss)
            l2_regularization_losses.append(
                loss_info.extra.l2_regularization_loss)
            entropy_regularization_losses.append(
                loss_info.extra.entropy_regularization_loss)
            kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

    # After update epochs, update adaptive kl beta, then update observation
    # normalizer and reward normalizer.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size)
    # Compute the mean kl from previous action distribution.
    kl_divergence = self._kl_divergence(
        time_steps, action_distribution_parameters,
        self._collect_policy.distribution(time_steps, policy_state).action)
    self.update_adaptive_kl_beta(kl_divergence)

    if self._observation_normalizer:
        self._observation_normalizer.update(time_steps.observation,
                                            outer_dims=[0, 1])
    else:
        # TODO(b/127661780): Verify performance of reward_normalizer when obs
        # are not normalized.
        if self._reward_normalizer:
            self._reward_normalizer.update(next_time_steps.reward,
                                           outer_dims=[0, 1])

    loss_info = tf.nest.map_structure(tf.identity, loss_info)

    # Make summaries for total loss across all epochs.
    # The *_losses lists will have been populated by calls to
    # self.get_epoch_loss.
    with tf.name_scope('Losses/'):
        total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
        total_value_estimation_loss = tf.add_n(value_estimation_losses)
        total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
        total_entropy_regularization_loss = tf.add_n(
            entropy_regularization_losses)
        total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
        tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                    data=total_policy_gradient_loss,
                                    step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                    data=total_value_estimation_loss,
                                    step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                    data=total_l2_regularization_loss,
                                    step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                    data=total_entropy_regularization_loss,
                                    step=self.train_step_counter)
        tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                    data=total_kl_penalty_loss,
                                    step=self.train_step_counter)

        total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                          tf.abs(total_value_estimation_loss) +
                          tf.abs(total_entropy_regularization_loss) +
                          tf.abs(total_l2_regularization_loss) +
                          tf.abs(total_kl_penalty_loss))
        tf.compat.v2.summary.scalar(name='total_abs_loss',
                                    data=total_abs_loss,
                                    step=self.train_step_counter)

    if self._summarize_grads_and_vars:
        with tf.name_scope('Variables/'):
            all_vars = (self._actor_net.trainable_weights +
                        self._value_net.trainable_weights)
            for var in all_vars:
                tf.compat.v2.summary.histogram(
                    name=var.name.replace(':', '_'),
                    data=var,
                    step=self.train_step_counter)

    return loss_info
def action_importance_ratio(action_distribution, collect_action_distribution,
                            action, action_spec, clipping_mode, scope,
                            importance_ratio_clipping, log_prob_clipping,
                            check_numerics, debug_summaries):
    """Ratio for importance sampling, used in the PPO loss and the V-trace loss.

    Caller has to save tf.name_scope() and pass scope to this function.

    Args:
        action_distribution (nested tf.distribution): distribution over actions
            under the target policy.
        collect_action_distribution (nested tf.distribution): distribution over
            actions from the behavior policy, used to sample actions for the
            rollout.
        action (nested Tensor): possibly batched action tuple taken during
            rollout.
        action_spec (nested BoundedTensorSpec): spec representing the actions.
        clipping_mode (str): mode for clipping the importance ratio.
            'double_sided': clips the range of importance ratio into
                [1 - importance_ratio_clipping, 1 + importance_ratio_clipping],
                which is used by PPOLoss.
            'capping': clips the range of importance ratio into
                min(1 + importance_ratio_clipping, importance_ratio),
                which is used by VTraceLoss, where c_bar or rho_bar =
                1 + importance_ratio_clipping.
        scope (name scope manager): returned by tf.name_scope(), set outside.
        importance_ratio_clipping (float): Epsilon in the clipped, surrogate PPO
            objective. See the cited paper for more detail.
        log_prob_clipping (float): If >0, clip log probs to the range
            (-log_prob_clipping, log_prob_clipping) to prevent inf / NaN values.
        check_numerics (bool): If true, add tf.debugging.check_numerics to help
            find NaN / Inf values. For debugging only.
        debug_summaries (bool): If true, output summary metrics to tf.

    Returns:
        importance_ratio (Tensor), importance_ratio_clipped (Tensor).
    """
    current_policy_distribution = action_distribution

    sample_action_log_probs = tfa_common.log_probability(
        collect_action_distribution, action, action_spec)
    sample_action_log_probs = tf.stop_gradient(sample_action_log_probs)

    action_log_prob = tfa_common.log_probability(current_policy_distribution,
                                                 action, action_spec)
    if log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob, -log_prob_clipping,
                                           log_prob_clipping)
    if check_numerics:
        action_log_prob = tf.debugging.check_numerics(action_log_prob,
                                                      'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    if check_numerics:
        importance_ratio = tf.debugging.check_numerics(importance_ratio,
                                                       'importance_ratio')

    if clipping_mode == 'double_sided':
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - importance_ratio_clipping,
            1 + importance_ratio_clipping)
    elif clipping_mode == 'capping':
        importance_ratio_clipped = tf.minimum(importance_ratio,
                                              1 + importance_ratio_clipping)
    else:
        raise Exception('Unsupported clipping mode: ' + clipping_mode)

    def _summary():
        with scope:
            if importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               importance_ratio_clipping), tf.float32))
                tf.summary.scalar('clip_fraction', clip_fraction)

            tf.summary.histogram('action_log_prob', action_log_prob)
            tf.summary.histogram('action_log_prob_sample',
                                 sample_action_log_probs)
            tf.summary.histogram('importance_ratio', importance_ratio)
            tf.summary.scalar('importance_ratio_mean',
                              tf.reduce_mean(input_tensor=importance_ratio))
            tf.summary.histogram('importance_ratio_clipped',
                                 importance_ratio_clipped)

    if debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return importance_ratio, importance_ratio_clipped
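# Hedged toy illustration of the two clipping modes documented above: 'double_sided'
# bounds the importance ratio on both sides (the PPO surrogate), while 'capping'
# only truncates it from above (the rho_bar / c_bar truncation used by V-trace).
import tensorflow as tf

_importance_ratio = tf.constant([0.3, 0.9, 1.8])
_eps = 0.2
_double_sided = tf.clip_by_value(_importance_ratio, 1 - _eps, 1 + _eps)  # [0.8, 0.9, 1.2]
_capping = tf.minimum(_importance_ratio, 1 + _eps)                       # [0.3, 0.9, 1.2]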
def _train(self, experience, weights, train_step_counter):
    # Change trajectory to transitions.
    trajectory0 = nest.map_structure(lambda t: t[:, :-1], experience)
    trajectory1 = nest.map_structure(lambda t: t[:, 1:], experience)

    # Get individual tensors from transitions.
    (time_steps, policy_steps_,
     next_time_steps) = trajectory.to_transition(trajectory0, trajectory1)
    actions = policy_steps_.action
    if self._debug_summaries:
        tf.contrib.summary.histogram('actions', actions)

    action_distribution_parameters = policy_steps_.info

    # Reconstruct per-timestep policy distribution from stored distribution
    # parameters.
    old_actions_distribution = (
        distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec, action_distribution_parameters))

    # Compute log probability of actions taken during data collection, using the
    # collect policy distribution.
    act_log_probs = common_utils.log_probability(old_actions_distribution,
                                                 actions, self._action_spec)

    # Compute the value predictions for states using the current value function.
    # To be used for return & advantage computation.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._collect_policy.get_initial_state(batch_size=batch_size)

    value_preds, unused_policy_state = self._collect_policy.apply_value_network(
        experience.observation, experience.step_type, policy_state=policy_state)
    value_preds = tf.stop_gradient(value_preds)

    valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

    if weights is None:
        weights = valid_mask
    else:
        weights *= valid_mask

    returns, normalized_advantages = self.compute_return_and_advantage(
        next_time_steps, value_preds)

    # Loss tensors across batches will be aggregated for summaries.
    policy_gradient_losses = []
    value_estimation_losses = []
    l2_regularization_losses = []
    entropy_regularization_losses = []
    kl_penalty_losses = []

    # For each epoch, create its own train op that depends on the previous one.
    loss_info = tf.no_op()
    for i_epoch in range(self._num_epochs):
        with tf.name_scope('epoch_%d' % i_epoch):
            with tf.control_dependencies(nest.flatten(loss_info)):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries and
                                   (i_epoch == 0 or
                                    i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                loss_info = self.build_train_op(
                    time_steps, actions, act_log_probs, returns,
                    normalized_advantages, action_distribution_parameters,
                    weights, train_step_counter,
                    self._summarize_grads_and_vars, self._gradient_clipping,
                    debug_summaries)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

    # After update epochs, update adaptive kl beta, then update observation
    # normalizer and reward normalizer.
    with tf.control_dependencies(nest.flatten(loss_info)):
        # Compute the mean kl from old.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        update_adaptive_kl_beta_op = self.update_adaptive_kl_beta(kl_divergence)

    with tf.control_dependencies([update_adaptive_kl_beta_op]):
        if self._observation_normalizer:
            update_obs_norm = (self._observation_normalizer.update(
                time_steps.observation, outer_dims=[0, 1]))
        else:
            update_obs_norm = tf.no_op()
        if self._reward_normalizer:
            update_reward_norm = self._reward_normalizer.update(
                next_time_steps.reward, outer_dims=[0, 1])
        else:
            update_reward_norm = tf.no_op()

    with tf.control_dependencies([update_obs_norm, update_reward_norm]):
        loss_info = nest.map_structure(tf.identity, loss_info)

    # Make summaries for total loss across all epochs.
    # The *_losses lists will have been populated by calls to
    # self.build_train_op.
    with tf.name_scope('Losses/'):
        total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
        total_value_estimation_loss = tf.add_n(value_estimation_losses)
        total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
        total_entropy_regularization_loss = tf.add_n(
            entropy_regularization_losses)
        total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
        tf.contrib.summary.scalar('policy_gradient_loss',
                                  total_policy_gradient_loss)
        tf.contrib.summary.scalar('value_estimation_loss',
                                  total_value_estimation_loss)
        tf.contrib.summary.scalar('l2_regularization_loss',
                                  total_l2_regularization_loss)
        if self._entropy_regularization:
            tf.contrib.summary.scalar('entropy_regularization_loss',
                                      total_entropy_regularization_loss)
        tf.contrib.summary.scalar('kl_penalty_loss', total_kl_penalty_loss)

        total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                          tf.abs(total_value_estimation_loss) +
                          tf.abs(total_entropy_regularization_loss) +
                          tf.abs(total_l2_regularization_loss) +
                          tf.abs(total_kl_penalty_loss))
        tf.contrib.summary.scalar('total_abs_loss', total_abs_loss)

    if self._summarize_grads_and_vars:
        with tf.name_scope('Variables/'):
            all_vars = (self._actor_net.trainable_weights +
                        self._value_net.trainable_weights)
            for var in all_vars:
                tf.contrib.summary.histogram(var.name.replace(':', '_'), var)

    return loss_info
def _pg_loss(self, training_info, advantages):
    action_log_prob = tfa_common.log_probability(
        training_info.action_distribution, training_info.action,
        self._action_spec)
    return -advantages * action_log_prob