Example No. 1
    def _sample_and_transpose_actions_and_log_probs(
            self,
            time_steps: ts.TimeStep,
            num_action_samples: int,
            training: Optional[bool] = False
    ) -> Tuple[types.Tensor, types.Tensor]:
        """Samples actions and corresponding log probabilities from policy."""
        # Get raw action distribution from policy, and initialize bijectors list.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._train_policy.get_initial_state(batch_size)
        if training:
            action_distribution = self._train_policy.distribution(
                time_steps, policy_state=policy_state).action
        else:
            action_distribution = self._policy.distribution(
                time_steps, policy_state=policy_state).action

        actions = tf.nest.map_structure(
            lambda d: d.sample(num_action_samples,
                               seed=self._action_seed_stream()),
            action_distribution)
        log_pi = common.log_probability(action_distribution, actions,
                                        self.action_spec)

        # Swap the first two axes for a [batch, self._num_cql_samples, ...] shape.
        actions = self._transpose_tile_and_batch_dims(actions)
        log_pi = self._transpose_tile_and_batch_dims(log_pi)
        return actions, log_pi
Example No. 2
    def testNestedLogProbability(self):
        action_spec = [
            tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
            [
                tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
                tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
            ]
        ]
        distribution = [
            tfp.distributions.Normal([0.0, 0.0], [1.0, 1.0]),
            [
                tfp.distributions.Normal([0.5], [1.0]),
                tfp.distributions.Normal([-0.5], [1.0])
            ]
        ]
        actions = [
            tf.constant([0.0, 0.0]), [tf.constant([0.5]),
                                      tf.constant([-0.5])]
        ]
        log_probs = common.log_probability(distribution, actions, action_spec)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        log_probs_ = self.evaluate(log_probs)
        self.assertEqual(len(log_probs_.shape), 0)
        self.assertNear(log_probs_, 4 * -0.5 * np.log(2 * 3.14159), 0.001)
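A minimal standalone sketch (assuming `tensorflow_probability` is installed and eager execution is enabled) of why the assertion expects `4 * -0.5 * np.log(2 * pi)`: every action component sits at the mean of a unit-scale Normal, so each of the four scalar dimensions contributes `-0.5 * log(2 * pi)`, and `log_probability` sums them across the nest.

    import numpy as np
    import tensorflow_probability as tfp

    # Log-density of a unit-scale Normal evaluated at its own mean.
    per_dim = tfp.distributions.Normal(loc=0.0, scale=1.0).log_prob(0.0)
    # Four scalar action dimensions in the nest, each evaluated at its mean.
    expected_total = 4 * float(per_dim)
    print(expected_total, 4 * -0.5 * np.log(2 * np.pi))  # both ~ -3.6757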
Example No. 3
    def _construct(self, batch_size, graph):
        """Construct the agent graph through placeholders."""

        self._batch_size = batch_size
        self._batched = batch_size is not None

        outer_dims = [self._batch_size] if self._batched else [1]
        with graph.as_default():
            self._time_step = tensor_spec.to_nest_placeholder(
                self._tf_policy.time_step_spec, outer_dims=outer_dims)
            self._tf_initial_state = self._tf_policy.get_initial_state(
                batch_size=self._batch_size or 1)

            self._policy_state = tf.nest.map_structure(
                lambda ps: tf.compat.v1.placeholder(  # pylint: disable=g-long-lambda
                    ps.dtype,
                    ps.shape,
                    name='policy_state'),
                self._tf_initial_state)
            self._action_step = self._tf_policy.action(self._time_step,
                                                       self._policy_state,
                                                       seed=self._seed)

            self._actions = tensor_spec.to_nest_placeholder(
                self._tf_policy.action_spec, outer_dims=outer_dims)
            self._action_distribution = self._tf_policy.distribution(
                self._time_step, policy_state=self._policy_state).action
            self._action_mean = self._action_distribution.mean()
            self._log_prob = common.log_probability(
                self._action_distribution, self._actions,
                self._tf_policy.action_spec)
Example No. 4
    def testBatchedNestedLogProbability(self):
        action_spec = [
            tensor_spec.BoundedTensorSpec([2], tf.float32, -1, 1),
            [
                tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1),
                tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
            ]
        ]
        distribution = [
            tfp.distributions.Normal([[0.0, 0.0], [0.0, 0.0]],
                                     [[1.0, 1.0], [2.0, 2.0]]),
            [
                tfp.distributions.Normal([[0.5], [0.5]], [[1.0], [2.0]]),
                tfp.distributions.Normal([[-0.5], [-0.5]], [[1.0], [2.0]])
            ]
        ]
        actions = [
            tf.constant([[0.0, 0.0], [0.0, 0.0]]),
            [tf.constant([[0.5], [0.5]]),
             tf.constant([[-0.5], [-0.5]])]
        ]
        log_probs = common.log_probability(distribution, actions, action_spec)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        log_probs_ = self.evaluate(log_probs)
        self.assertEqual(log_probs_.shape, (2, ))
        self.assertAllClose(
            log_probs_,
            [4 * -0.5 * np.log(2 * 3.14159), 4 * -0.5 * np.log(8 * 3.14159)],
            0.001)
Example No. 5
    def _get_safe_idx(self, safe_ac_mask, fail_prob, sampled_ac, safe_ac_idx,
                      actions, fail_prob_safe):
        if tf.math.count_nonzero(safe_ac_mask) == 0:
            # picks safest action
            safe_idx = tf.argmin(fail_prob)
        else:
            sampled_ac = tf.gather(sampled_ac, safe_ac_idx)
            # picks most unsafe "safe" action
            # safe_idx = tf.argmax(fail_prob_safe, axis=0)

            # picks the safest action
            # safe_idx = tf.argmin(fail_prob_safe)

            if self._training:
                # picks random safe_action, weighted by 1 - fail_prob_safe (so higher weight for safer actions)
                # safe_idx = tfp.distributions.Categorical([1 - fail_prob_safe]).sample()
                if self._sampling_method == 'rejection':
                    # standard rejection sampling with prob proportional to original policy
                    log_prob = common.log_probability(actions, sampled_ac,
                                                      self.action_spec)
                    safe_idx = tfp.distributions.Categorical(log_prob).sample()
                elif self._sampling_method == 'risky':
                    # picks random risky safe action, weighted by fail_prob_safe (so higher weight for less safe actions)
                    safe_idx = tfp.distributions.Categorical([fail_prob_safe
                                                              ]).sample()
                elif self._sampling_method == 'safe':
                    safe_idx = tfp.distributions.Categorical(
                        [1 - fail_prob_safe]).sample()
            safe_idx = tf.reshape(safe_idx, [-1])[0]
        return safe_idx
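Worth noting for the `'rejection'` branch above: `tfp.distributions.Categorical`'s first positional argument is `logits`, so passing per-candidate log-probabilities samples an index with probability proportional to `exp(log_prob)`, i.e. to the original policy density. A minimal sketch with hypothetical values:

    import tensorflow_probability as tfp

    # Hypothetical per-candidate log pi(a|s) values for three sampled safe actions.
    log_prob = [-0.5, -2.0, -0.1]
    # Sampling probabilities are softmax(log_prob), i.e. proportional to pi(a|s).
    safe_idx = tfp.distributions.Categorical(logits=log_prob).sample()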
Example No. 6
    def _actions_and_log_probs(self, time_steps):
        """Get actions and corresponding log probabilities from policy."""
        # Get raw action distribution from policy, and initialize bijectors list.
        action_distribution = self.policy().distribution(time_steps).action

        if self._squash_actions:
            bijectors = []

            # Bijector to rescale actions to ranges in action spec.
            action_means, action_magnitudes = self._action_spec_means_magnitudes(
            )
            bijectors.append(
                tfp.bijectors.AffineScalar(shift=action_means,
                                           scale=action_magnitudes))

            # Bijector to squash actions to range (-1.0, +1.0).
            bijectors.append(tanh_bijector_stable.Tanh())

            # Chain applies bijectors in reverse order, so squash will happen before
            # rescaling to action spec.
            bijector_chain = tfp.bijectors.Chain(bijectors)
            action_distribution = tfp.distributions.TransformedDistribution(
                distribution=action_distribution, bijector=bijector_chain)

        # Sample actions and log_pis from transformed distribution.
        actions = tf.nest.map_structure(lambda d: d.sample(),
                                        action_distribution)
        log_pi = common_utils.log_probability(action_distribution, actions,
                                              self.action_spec())

        return actions, log_pi
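A minimal sketch of the bijector ordering described in the comment above (assuming a recent `tensorflow_probability`, where `Shift`/`Scale` replace the deprecated `AffineScalar`): `Chain` applies its bijectors right to left, so the `Tanh` squash runs first and the affine rescaling then maps the result into the action-spec range.

    import tensorflow_probability as tfp

    squash_and_rescale = tfp.bijectors.Chain([
        tfp.bijectors.Shift(2.0),   # applied last: shift by the action mean
        tfp.bijectors.Scale(3.0),   # applied second: scale by the action magnitude
        tfp.bijectors.Tanh(),       # applied first: squash to (-1, 1)
    ])
    print(squash_and_rescale.forward(10.0))  # tanh(10) ~ 1, then *3 + 2 ~ 5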
Example No. 7
  def policy_gradient_loss(self,
                           actions_distribution,
                           actions,
                           is_boundary,
                           returns,
                           num_episodes,
                           weights=None):
    """Computes the policy gradient loss.

    Args:
      actions_distribution: A possibly batched tuple of action distributions.
      actions: Tensor with a batch of actions.
      is_boundary: Tensor of booleans that indicate if the corresponding action
        was in a boundary trajectory and should be ignored.
      returns: Tensor with a return from each timestep, aligned on index. Works
        better when returns are normalized.
      num_episodes: Number of episodes contained in the training data.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.  May include a mask for invalid timesteps.

    Returns:
      policy_gradient_loss: A tensor that will contain policy gradient loss for
        the on-policy experience.
    """
    # TODO(b/126594799): Add class IndependentNested(tfd.Distribution) to handle
    # nests of independent distributions like this.
    action_log_prob = common.log_probability(actions_distribution, actions,
                                             self.action_spec)

    # Filter out transitions between end state of previous episode and start
    # state of next episode.
    valid_mask = tf.cast(~is_boundary, tf.float32)
    action_log_prob *= valid_mask

    action_log_prob_times_return = action_log_prob * returns

    if weights is not None:
      action_log_prob_times_return *= weights

    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='action_log_prob',
          data=action_log_prob,
          step=self.train_step_counter)
      tf.compat.v2.summary.histogram(
          name='action_log_prob_times_return',
          data=action_log_prob_times_return,
          step=self.train_step_counter)

    # Policy gradient loss is defined as the sum, over timesteps, of action
    #   log-probability times the cumulative return from that timestep onward.
    #   For more information, see (Williams, 1992).
    policy_gradient_loss = -tf.reduce_sum(
        input_tensor=action_log_prob_times_return)

    # We take the mean over episodes by dividing by num_episodes.
    policy_gradient_loss = policy_gradient_loss / num_episodes

    return policy_gradient_loss
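A minimal NumPy sketch (with hypothetical values) of the same computation: mask out boundary transitions, sum log-probability times return over the remaining timesteps, negate, and divide by the episode count.

    import numpy as np

    action_log_prob = np.array([-1.2, -0.8, -1.5, -0.9])     # per-timestep log pi(a|s)
    returns         = np.array([ 2.0,  1.5,  1.0,  0.5])     # per-timestep returns
    is_boundary     = np.array([False, False, False, True])  # last transition is a boundary
    num_episodes    = 1.0

    valid_mask = (~is_boundary).astype(np.float32)
    loss = -np.sum(action_log_prob * valid_mask * returns) / num_episodes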
Example No. 8
  def testLogProbabilityOneHot(self):
    action_spec = tensor_spec.BoundedTensorSpec([3], tf.int32, 0, 1)
    distribution = tfp.distributions.OneHotCategorical(probs=[0.6, 0.3, 0.1])
    actions = tf.constant([1, 0, 0])
    log_probs = common.log_probability(distribution, actions, action_spec)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    log_probs_ = self.evaluate(log_probs)
    self.assertEqual(len(log_probs_.shape), 0)
    self.assertNear(log_probs_, np.log(0.6), 0.00001)
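For reference, the expected value in the assertion follows directly from the distribution: the one-hot action `[1, 0, 0]` selects the first category, whose probability under `OneHotCategorical(probs=[0.6, 0.3, 0.1])` is 0.6, hence a log-probability of `log(0.6)`. A minimal sketch (assuming eager execution):

    import numpy as np
    import tensorflow_probability as tfp

    dist = tfp.distributions.OneHotCategorical(probs=[0.6, 0.3, 0.1])
    print(float(dist.log_prob([1, 0, 0])), np.log(0.6))  # both ~ -0.5108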
Example No. 9
  def behavior_loss(self, time_steps, actions, weights=None):
      with tf.name_scope('behavior_loss'):
          nest_utils.assert_same_structure(time_steps, self.time_step_spec)
          batch_size = nest_utils.get_outer_shape(time_steps,
                                                  self._time_step_spec)[0]
          policy_state = self._behavior_policy.get_initial_state(batch_size)
          action_distribution = self._behavior_policy.distribution(
              time_steps, policy_state=policy_state).action
          log_pi = common.log_probability(action_distribution, actions,
                                          self.action_spec)
          return -1.0 * tf.reduce_mean(log_pi)
Example No. 10
    def _ml_pmi(self, x, y, y_distribution):
        num_outer_dims = get_outer_rank(x, self._x_spec)
        hidden = self._model(x)[0]
        batch_squash = BatchSquash(num_outer_dims)
        hidden = batch_squash.flatten(hidden)
        delta_loc = self._delta_loc_layer(hidden)
        delta_scale = tf.nn.softplus(self._delta_scale_layer(hidden))
        delta_loc = batch_squash.unflatten(delta_loc)
        delta_scale = batch_squash.unflatten(delta_scale)
        y_given_x_dist = tfp.distributions.Normal(
            loc=y_distribution.loc + delta_loc,
            scale=y_distribution.scale * delta_scale)

        # Because Normal.event_shape is [], the result of Normal.log_prob() is
        # the probabilities of individual dimensions. So we need to use
        # tfa_common.log_probability() instead.
        # TODO: implement a normal distribution with non-scalar event shape.
        pmi = tfa_common.log_probability(y_given_x_dist, y, self._y_spec)
        pmi -= tf.stop_gradient(
            tfa_common.log_probability(y_distribution, y, self._y_spec))
        return pmi
Example No. 11
    def _line_search(self, time_steps, policy_steps_, advantages,
                     natural_gradient, coeff, weights):
        """Find new policy parameters by line search in natural gradient direction"""

        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]

        # old policy distribution
        action_distribution_parameters = policy_steps_.info
        actions = policy_steps_.action
        actions_distribution = distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec,
            action_distribution_parameters["dist_params"])
        act_log_probs = common.log_probability(actions_distribution, actions,
                                               self._action_spec)

        # loss for the old policy
        loss_threshold = self.policy_gradient_loss(
            time_steps,
            actions,
            tf.stop_gradient(act_log_probs),
            tf.stop_gradient(advantages),
            actions_distribution,
            weights,
        )

        policy_params = flatten_tensors(self._actor_net.trainable_variables)

        # Try different step sizes; accept the first one that improves the loss and satisfies the KL constraint.
        for it in range(self._backtrack_iters):
            new_params = policy_params - self._backtrack_coeff**it * coeff * natural_gradient

            unflatten_tensor(new_params, self._opt_policy_parameters)
            opt_policy_state = self._opt_policy.get_initial_state(batch_size)
            dists = self._opt_policy.distribution(time_steps, opt_policy_state)
            new_policy_distribution = dists.action

            kl = tf.reduce_mean(
                self._kl_divergence(time_steps, action_distribution_parameters,
                                    new_policy_distribution))
            loss = self.policy_gradient_loss(
                time_steps,
                actions,
                tf.stop_gradient(act_log_probs),
                tf.stop_gradient(advantages),
                new_policy_distribution,
                weights,
            )
            if kl < self._max_kl and loss < loss_threshold:
                return new_params

        # no improvement found
        return policy_params
Example No. 12
    def _actions_and_log_probs(self, time_steps):
        """Get actions and corresponding log probabilities from policy."""
        # Get raw action distribution from policy, and initialize bijectors list.
        action_distribution = self.policy.distribution(time_steps).action

        # Sample actions and log_pis from transformed distribution.
        actions = tf.nest.map_structure(lambda d: d.sample(),
                                        action_distribution)
        log_pi = common.log_probability(action_distribution, actions,
                                        self.action_spec)

        return actions, log_pi
Example No. 13
    def policy_gradient(self, time_steps, policy_steps_, advantages, weights):
        """
        Compute policy gradient wrt actor_net parameters.

        :param time_steps: batch of TimeSteps with observations for each timestep
        :param policy_steps_: policy steps (actions and distribution info) from the policy that sampled the time steps
        :param advantages: Tensor of advantage estimate for each timestep, aligned on index.
        :param weights: mask for invalid timesteps
        :return: policy loss computed on the time steps, and a list of gradient tensors
        """
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        actions = policy_steps_.action

        # get policy info before update
        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution
        old_actions_distribution = distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec,
            action_distribution_parameters["dist_params"])

        # Log probability of actions taken during data collection
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        with tf.GradientTape() as tape:
            # current policy distribution
            policy_state = self._collect_policy.get_initial_state(batch_size)
            distribution_step = self._collect_policy.distribution(
                time_steps, policy_state)
            current_policy_distribution = distribution_step.action

            policy_gradient_loss = self.policy_gradient_loss(
                time_steps,
                actions,
                tf.stop_gradient(act_log_probs),
                tf.stop_gradient(advantages),
                current_policy_distribution,
                weights,
            )

        trainable = self._actor_net.trainable_weights

        grads = tape.gradient(policy_gradient_loss, trainable)

        for g in grads:
            tf.debugging.check_numerics(g,
                                        "Gradient divergence",
                                        name="grad_check")

        return policy_gradient_loss, grads
Example No. 14
  def _actions_and_log_probs(self, time_steps):
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy, and initialize bijectors list.
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy_state = self._train_policy.get_initial_state(batch_size)
    action_distribution = self._train_policy.distribution(
        time_steps, policy_state=policy_state).action

    # Sample actions and log_pis from transformed distribution.
    actions = tf.nest.map_structure(lambda d: d.sample(), action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)

    return actions, log_pi
Example No. 15
  def _actions_and_log_probs(self, time_steps, safety_constrained=False):
    """Get actions and corresponding log probabilities from policy."""
    # Get raw action distribution from policy, and initialize bijectors list.
    batch_size = nest_utils.get_outer_shape(time_steps, self.time_step_spec)[0]
    policy = self.collect_policy
    policy_state = policy.get_initial_state(batch_size)
    action_distribution = policy.distribution(
      time_steps, policy_state=policy_state).action
    # Sample actions and log_pis from transformed distribution.
    if safety_constrained:
      actions, policy_state = self.safe_policy._apply_actor_network(
          time_steps.observation, time_steps.step_type, policy_state)
    else:
      actions = tf.nest.map_structure(lambda d: d.sample(), action_distribution)
    log_pi = common.log_probability(action_distribution, actions,
                                    self.action_spec)

    return actions, log_pi
Example No. 16
    def policy_gradient_loss(self, time_steps, actions, returns, weights=None):
        """Computes the policy gradient loss.

    Args:
      time_steps: TimeStep object with a batch of observations.
      actions: Tensor with a batch of actions.
      returns: Tensor with a return from each timestep, aligned on index. Works
        better when returns are normalized.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.  May include a mask for invalid timesteps.

    Returns:
      policy_gradient_loss: A tensor that will contain policy gradient loss for
        the on-policy experience.
    """
        tf.nest.assert_same_structure(time_steps, self.time_step_spec())
        actions_distribution = self.collect_policy().distribution(
            time_steps).action

        # TODO(kbanoop): Add class IndependentNested(tfd.Distribution) to handle
        # nests of independent distributions like this.
        action_log_prob = common.log_probability(actions_distribution, actions,
                                                 self.action_spec())
        action_log_prob_times_return = action_log_prob * returns

        if weights is not None:
            action_log_prob_times_return *= weights

        if self._debug_summaries:
            tf.contrib.summary.histogram('action_log_prob', action_log_prob)
            tf.contrib.summary.histogram('action_log_prob_times_return',
                                         action_log_prob_times_return)

        # Policy gradient loss is defined as the sum, over timesteps, of action
        #   log-probability times the cumulative return from that timestep onward.
        #   For more information, see (Williams, 1992)
        policy_gradient_loss = -tf.reduce_sum(
            input_tensor=action_log_prob_times_return)

        with tf.name_scope('Losses/'):
            tf.contrib.summary.scalar('policy_gradient_loss',
                                      policy_gradient_loss)

        return tf_agent.LossInfo(policy_gradient_loss, ())
Example No. 17
    def train_step(self, exp: Experience, state: SacState):
        action_distribution, share_actor_state = self._actor_network(
            exp.observation,
            step_type=exp.step_type,
            network_state=state.share.actor)
        action = tf.nest.map_structure(lambda d: d.sample(),
                                       action_distribution)
        log_pi = tfa_common.log_probability(action_distribution, action,
                                            self._action_spec)

        actor_state, actor_info = self._actor_train_step(
            exp, state.actor, action_distribution, action, log_pi)
        critic_state, critic_info = self._critic_train_step(
            exp, state.critic, action, log_pi)
        alpha_info = self._alpha_train_step(log_pi)
        state = SacState(share=SacShareState(actor=share_actor_state),
                         actor=actor_state,
                         critic=critic_state)
        info = SacInfo(actor=actor_info, critic=critic_info, alpha=alpha_info)
        return PolicyStep(action_distribution, state, info)
Example No. 18
    def _actions_and_log_probs(self,
                               time_steps: ts.TimeStep,
                               training: Optional[bool] = False
                               ) -> Tuple[types.Tensor, types.Tensor]:
        """Get actions and corresponding log probabilities from policy."""
        # Get raw action distribution from policy, and initialize bijectors list.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._train_policy.get_initial_state(batch_size)
        if training:
            action_distribution = self._train_policy.distribution(
                time_steps, policy_state=policy_state).action
        else:
            action_distribution = self._policy.distribution(
                time_steps, policy_state=policy_state).action

        # Sample actions and log_pis from transformed distribution.
        actions = tf.nest.map_structure(
            lambda d: d.sample((), seed=self._action_seed_stream()),
            action_distribution)
        log_pi = common.log_probability(action_distribution, actions,
                                        self.action_spec)

        return actions, log_pi
Example No. 19
    def policy_gradient_loss(
        self,
        time_steps,
        actions,
        sample_action_log_probs,
        advantages,
        current_policy_distribution,
        weights,
    ):
        """Create tensor for policy gradient loss.

        All tensors should have a single batch dimension.

        Args:
          time_steps: TimeSteps with observations for each timestep.
          actions: Tensor of actions for timesteps, aligned on index.
          sample_action_log_probs: Tensor of sample probability of each action.
          advantages: Tensor of advantage estimate for each timestep, aligned on
            index. Works better when advantage estimates are normalized.
          current_policy_distribution: The policy distribution, evaluated on all
            time_steps.
          weights: Optional scalar or element-wise (per-batch-entry) importance
            weights.  Includes a mask for invalid timesteps.

        Returns:
          policy_gradient_loss: A tensor that will contain policy gradient loss for
            the on-policy experience.
        """
        tf.nest.assert_same_structure(time_steps, self.time_step_spec)
        action_log_prob = common.log_probability(current_policy_distribution,
                                                 actions, self._action_spec)

        action_log_prob = tf.cast(action_log_prob, tf.float32)
        if self._log_prob_clipping > 0.0:
            action_log_prob = tf.clip_by_value(action_log_prob,
                                               -self._log_prob_clipping,
                                               self._log_prob_clipping)

        tf.debugging.check_numerics(action_log_prob, "action_log_prob")

        tf.debugging.check_numerics(sample_action_log_probs,
                                    "sample_action_log_probs")

        # Prepare unclipped importance ratios.
        importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)

        tf.debugging.check_numerics(importance_ratio,
                                    "importance_ratio",
                                    name="importance_ratio")

        per_timestep_objective = importance_ratio * advantages
        policy_gradient_loss = -per_timestep_objective

        policy_gradient_loss = tf.reduce_mean(
            input_tensor=policy_gradient_loss * weights)

        tf.debugging.check_numerics(policy_gradient_loss,
                                    "Policy Loss divergence",
                                    name="policy_check")

        return policy_gradient_loss
Example No. 20
    def _soft_relabel(self, experience):
        # experience.observation.shape = [B x T=2 x obs_dim+state_dim]
        states, orig_tasks = self._task_distribution.split(
            experience.observation[:, 0])
        if self._task_distribution.tasks is None:
            tasks = orig_tasks
        else:
            tasks = tf.constant(self._task_distribution.tasks,
                                dtype=tf.float32)
        next_states, _ = self._task_distribution.split(
            experience.observation[:, 1])
        if self._candidate_task_type == "states":
            candidate_tasks = self._task_distribution.state_to_task(states)
        elif self._candidate_task_type == "next_states":
            candidate_tasks = self._task_distribution.state_to_task(
                next_states)
        else:
            assert self._candidate_task_type == "tasks"
            candidate_tasks = tasks

        actions = experience.action[:, 0]
        num_tasks = tasks.shape[0]
        batch_size = states.shape[0]
        task_dim = tasks.shape[1]
        obs_dim = states.shape[1]
        action_dim = actions.shape[1]
        action_spec = self._actor.output_tensor_spec

        states_tiled = tf.tile(states[:, None], [1, num_tasks, 1])  # B x B x D
        states_tiled = tf.reshape(states_tiled,
                                  [batch_size * num_tasks, obs_dim])  # B*B x D
        actions_tiled = tf.tile(actions[:, None],
                                [1, num_tasks, 1])  # B x B x D
        actions_tiled = tf.reshape(
            actions_tiled, [batch_size * num_tasks, action_dim])  # B*B x D
        tasks_tiled = tf.tile(tasks[None], [batch_size, 1, 1])  # B x B x D
        tasks_tiled = tf.reshape(tasks_tiled,
                                 [batch_size * num_tasks, task_dim])  # B*B x D

        next_states_tiled = tf.tile(next_states[:, None], [1, num_tasks, 1])
        next_states_tiled = tf.reshape(
            next_states_tiled, [batch_size * num_tasks, obs_dim])  # B*B x D
        next_relabelled_obs = self._task_distribution.combine(
            next_states_tiled, tasks_tiled)

        sampled_actions_tiled = self._actor(next_relabelled_obs,
                                            step_type=(),
                                            network_state=())[0].sample()
        critic_input = (next_relabelled_obs, sampled_actions_tiled)
        q_vals, _ = self._critic(critic_input, training=False)
        q_vals_vec = tf.reshape(q_vals, (batch_size, num_tasks))

        rewards, dones = self._task_distribution.evaluate(
            states_tiled, actions_tiled, tasks_tiled)
        dones = tf.cast(dones, tf.float32)
        rewards_vec = tf.reshape(rewards, (batch_size, num_tasks))
        dones_vec = tf.reshape(dones, (batch_size, num_tasks))

        relabelled_obs = self._task_distribution.combine(
            states_tiled, tasks_tiled)
        action_distribution = self._actor(relabelled_obs,
                                          step_type=(),
                                          network_state=())[0]
        log_pi = common.log_probability(action_distribution, actions_tiled,
                                        action_spec)
        log_pi_vec = tf.reshape(log_pi, (batch_size, num_tasks))

        logits_vec = (rewards_vec - log_pi_vec + self._gamma *
                      (1.0 - dones_vec) * q_vals_vec)
        if self._relabel_type == "random":
            logits_vec = tf.ones_like(
                logits_vec)  # Hack to make sampling random

        ## End new version
        if self._normalize_cols:
            logits_vec = logits_vec - tf.math.reduce_logsumexp(logits_vec,
                                                               axis=0)[None]
        relabel_indices = tf.random.categorical(logits=logits_vec,
                                                num_samples=1)

        ### Metrics
        global_step = tf.compat.v1.train.get_or_create_global_step()
        orig_indices = tf.range(self._sample_batch_size,
                                dtype=relabel_indices.dtype)
        with tf.name_scope("relabelling"):
            # How often are the originally commanded goals most optimal?
            opt_indices = tf.argmax(logits_vec, axis=1)
            orig_is_opt = opt_indices == orig_indices
            orig_opt_frac = tf.reduce_mean(tf.cast(orig_is_opt, tf.float32))
            tf.compat.v2.summary.scalar(name="orig_task_optimal",
                                        data=orig_opt_frac,
                                        step=global_step)

            # How often is the relabelled goal optimal?
            # The relabel_indices are [B, 1], so we need to remove the extra dim.
            relabel_is_opt = tf.squeeze(relabel_indices) == orig_indices
            relabel_opt_frac = tf.reduce_mean(
                tf.cast(relabel_is_opt, tf.float32))
            tf.compat.v2.summary.scalar(name="relabel_task_optimal",
                                        data=relabel_opt_frac,
                                        step=global_step)

            # What are the average Q values of the original tasks?
            if batch_size == num_tasks:
                indices = tf.transpose(
                    tf.stack([orig_indices, orig_indices], axis=0))
                orig_q_vals = tf.gather_nd(logits_vec, indices)
                tf.compat.v2.summary.scalar(
                    name="orig_q_vals",
                    data=tf.reduce_mean(orig_q_vals),
                    step=global_step,
                )

            # What are the average Q values of the relabelled tasks?
            indices = tf.transpose(
                tf.stack(
                    [orig_indices, tf.squeeze(relabel_indices)], axis=0))
            relabel_q_vals = tf.gather_nd(logits_vec, indices)
            tf.compat.v2.summary.scalar(
                name="relabel_q_vals",
                data=tf.reduce_mean(relabel_q_vals),
                step=global_step,
            )

            max_q = tf.reduce_max(logits_vec, axis=1)
            tf.compat.v2.summary.scalar(name="max_q",
                                        data=tf.reduce_mean(max_q),
                                        step=global_step)

        ### End metrics

        # For both state-centric and goal-centric relabelling, the implementation of
        # mixing is the same: we randomly replace some of the indices with the
        # diagonal.
        relabelled_tasks = tf.gather(candidate_tasks,
                                     tf.squeeze(relabel_indices))

        if self._relabel_prob == 0:
            relabelled_tasks = orig_tasks
        elif 0 < self._relabel_prob < 1:
            logits = tf.math.log([1.0 - self._relabel_prob, self._relabel_prob])
            mask = tf.squeeze(
                tf.random.categorical(logits[None],
                                      num_samples=self._sample_batch_size))
            mask = tf.cast(mask, tf.float32)[:, None]
            relabelled_tasks = mask * orig_tasks + (1 -
                                                    mask) * relabelled_tasks

        states_and_tasks = self._task_distribution.combine(
            states, relabelled_tasks)
        next_states_and_tasks = self._task_distribution.combine(
            next_states, relabelled_tasks)
        new_observation = tf.concat(
            [states_and_tasks[:, None], next_states_and_tasks[:, None]],
            axis=1)
        assert new_observation.shape == experience.observation.shape
        experience = experience.replace(observation=new_observation)
        return experience
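A minimal NumPy sketch (hypothetical values, with batch size equal to the number of candidate tasks) of the relabelling logits computed above: each (state, candidate-task) entry scores `r - log_pi + gamma * (1 - done) * Q`, columns are optionally normalized with a log-sum-exp, and one task index is then chosen per batch row (the code samples categorically; an argmax stands in here).

    import numpy as np

    gamma       = 0.99
    rewards_vec = np.array([[0.0, 1.0], [0.5, 0.0]])       # batch_size x num_tasks
    log_pi_vec  = np.array([[-1.0, -2.0], [-1.5, -0.5]])
    dones_vec   = np.array([[0.0, 0.0], [0.0, 1.0]])
    q_vals_vec  = np.array([[2.0, 3.0], [1.0, 4.0]])

    logits_vec = rewards_vec - log_pi_vec + gamma * (1.0 - dones_vec) * q_vals_vec
    # Column-wise normalization, as in the _normalize_cols branch above.
    logits_vec = logits_vec - np.log(np.exp(logits_vec).sum(axis=0, keepdims=True))
    # One relabelled task index per batch row (deterministic stand-in for sampling).
    relabel_indices = logits_vec.argmax(axis=1)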
Example No. 21
    def policy_gradient_loss(self,
                             time_steps,
                             actions,
                             sample_action_log_probs,
                             advantages,
                             current_policy_distribution,
                             valid_mask,
                             debug_summaries=False):
        """Create tensor for policy gradient loss.

    All tensors should have a single batch dimension.

    Args:
      time_steps: TimeSteps with observations for each timestep.
      actions: Tensor of actions for timesteps, aligned on index.
      sample_action_log_probs: Tensor of sample probability of each action.
      advantages: Tensor of advantage estimate for each timestep, aligned on
        index. Works better when advantage estimates are normalized.
      current_policy_distribution: The policy distribution, evaluated on all
        time_steps.
      valid_mask: Mask for invalid timesteps. Float value 1.0 for valid
        timesteps and 0.0 for invalid timesteps. (Timesteps which either are
        between two episodes, or part of an unfinished episode at the end of
        one batch dimension.)
      debug_summaries: True if debug summaries should be created.

    Returns:
      policy_gradient_loss: A tensor that will contain policy gradient loss for
        the on-policy experience.
    """
        nest.assert_same_structure(time_steps, self.time_step_spec())
        action_log_prob = common_utils.log_probability(
            current_policy_distribution, actions, self._action_spec)
        action_log_prob = tf.to_float(action_log_prob)
        if self._log_prob_clipping > 0.0:
            action_log_prob = tf.clip_by_value(action_log_prob,
                                               -self._log_prob_clipping,
                                               self._log_prob_clipping)
        if self._check_numerics:
            action_log_prob = tf.check_numerics(action_log_prob,
                                                'action_log_prob')

        # Prepare both clipped and unclipped importance ratios.
        importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - self._importance_ratio_clipping,
            1 + self._importance_ratio_clipping)

        if self._check_numerics:
            importance_ratio = tf.check_numerics(importance_ratio,
                                                 'importance_ratio')
            if self._importance_ratio_clipping > 0.0:
                importance_ratio_clipped = tf.check_numerics(
                    importance_ratio_clipped, 'importance_ratio_clipped')

        # Pessimistically choose the minimum objective value for clipped and
        #   unclipped importance ratios.
        per_timestep_objective = importance_ratio * advantages
        per_timestep_objective_clipped = importance_ratio_clipped * advantages
        per_timestep_objective_min = tf.minimum(
            per_timestep_objective, per_timestep_objective_clipped)

        if self._importance_ratio_clipping > 0.0:
            policy_gradient_loss = -per_timestep_objective_min
        else:
            policy_gradient_loss = -per_timestep_objective
        policy_gradient_loss = tf.reduce_mean(policy_gradient_loss *
                                              valid_mask)

        if debug_summaries:
            if self._importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(
                    tf.to_float(
                        tf.greater(tf.abs(importance_ratio - 1.0),
                                   self._importance_ratio_clipping)))
                tf.contrib.summary.scalar('clip_fraction', clip_fraction)
            tf.contrib.summary.histogram('action_log_prob', action_log_prob)
            tf.contrib.summary.histogram('action_log_prob_sample',
                                         sample_action_log_probs)
            tf.contrib.summary.histogram('importance_ratio', importance_ratio)
            tf.contrib.summary.scalar('importance_ratio_mean',
                                      tf.reduce_mean(importance_ratio))
            tf.contrib.summary.histogram('importance_ratio_clipped',
                                         importance_ratio_clipped)
            tf.contrib.summary.histogram('per_timestep_objective',
                                         per_timestep_objective)
            tf.contrib.summary.histogram('per_timestep_objective_clipped',
                                         per_timestep_objective_clipped)
            tf.contrib.summary.histogram('per_timestep_objective_min',
                                         per_timestep_objective_min)
            entropy = common_utils.entropy(current_policy_distribution,
                                           self.action_spec())
            tf.contrib.summary.histogram('policy_entropy', entropy)
            tf.contrib.summary.scalar('policy_entropy_mean',
                                      tf.reduce_mean(entropy))
            # Categorical distribution (used for discrete actions)
            # doesn't have a mean.
            if not self.action_spec().is_discrete():
                tf.contrib.summary.histogram(
                    'actions_distribution_mean',
                    current_policy_distribution.mean())
                tf.contrib.summary.histogram(
                    'actions_distribution_stddev',
                    current_policy_distribution.stddev())
            tf.contrib.summary.histogram('policy_gradient_loss',
                                         policy_gradient_loss)

        if self._check_numerics:
            policy_gradient_loss = tf.check_numerics(policy_gradient_loss,
                                                     'policy_gradient_loss')

        return policy_gradient_loss
Example No. 22
    def policy_gradient_loss(self,
                             time_steps,
                             actions,
                             sample_action_log_probs,
                             advantages,
                             current_policy_distribution,
                             weights,
                             debug_summaries=False):
        """Create tensor for policy gradient loss.

    All tensors should have a single batch dimension.

    Args:
      time_steps: TimeSteps with observations for each timestep.
      actions: Tensor of actions for timesteps, aligned on index.
      sample_action_log_probs: Tensor of sample probability of each action.
      advantages: Tensor of advantage estimate for each timestep, aligned on
        index. Works better when advantage estimates are normalized.
      current_policy_distribution: The policy distribution, evaluated on all
        time_steps.
      weights: Optional scalar or element-wise (per-batch-entry) importance
        weights.  Includes a mask for invalid timesteps.
      debug_summaries: True if debug summaries should be created.

    Returns:
      policy_gradient_loss: A tensor that will contain policy gradient loss for
        the on-policy experience.
    """
        tf.nest.assert_same_structure(time_steps, self.time_step_spec)
        action_log_prob = common.log_probability(current_policy_distribution,
                                                 actions, self._action_spec)
        action_log_prob = tf.cast(action_log_prob, tf.float32)
        if self._log_prob_clipping > 0.0:
            action_log_prob = tf.clip_by_value(action_log_prob,
                                               -self._log_prob_clipping,
                                               self._log_prob_clipping)
        if self._check_numerics:
            action_log_prob = tf.debugging.check_numerics(
                action_log_prob, 'action_log_prob')

        # Prepare both clipped and unclipped importance ratios.
        importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - self._importance_ratio_clipping,
            1 + self._importance_ratio_clipping)

        if self._check_numerics:
            importance_ratio = tf.debugging.check_numerics(
                importance_ratio, 'importance_ratio')
            if self._importance_ratio_clipping > 0.0:
                importance_ratio_clipped = tf.debugging.check_numerics(
                    importance_ratio_clipped, 'importance_ratio_clipped')

        # Pessimistically choose the minimum objective value for clipped and
        #   unclipped importance ratios.
        per_timestep_objective = importance_ratio * advantages
        per_timestep_objective_clipped = importance_ratio_clipped * advantages
        per_timestep_objective_min = tf.minimum(
            per_timestep_objective, per_timestep_objective_clipped)

        if self._importance_ratio_clipping > 0.0:
            policy_gradient_loss = -per_timestep_objective_min
        else:
            policy_gradient_loss = -per_timestep_objective

        policy_gradient_loss = tf.reduce_mean(
            input_tensor=policy_gradient_loss * weights)

        if debug_summaries:
            if self._importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               self._importance_ratio_clipping), tf.float32))
                tf.compat.v2.summary.scalar(name='clip_fraction',
                                            data=clip_fraction,
                                            step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='action_log_prob',
                                           data=action_log_prob,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='action_log_prob_sample',
                                           data=sample_action_log_probs,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='importance_ratio',
                                           data=importance_ratio,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='importance_ratio_mean',
                data=tf.reduce_mean(input_tensor=importance_ratio),
                step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='importance_ratio_clipped',
                                           data=importance_ratio_clipped,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='per_timestep_objective',
                                           data=per_timestep_objective,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(
                name='per_timestep_objective_clipped',
                data=per_timestep_objective_clipped,
                step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='per_timestep_objective_min',
                                           data=per_timestep_objective_min,
                                           step=self.train_step_counter)
            entropy = common.entropy(current_policy_distribution,
                                     self.action_spec)
            tf.compat.v2.summary.histogram(name='policy_entropy',
                                           data=entropy,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.scalar(
                name='policy_entropy_mean',
                data=tf.reduce_mean(input_tensor=entropy),
                step=self.train_step_counter)
            for i, (single_action, single_distribution) in enumerate(
                    zip(tf.nest.flatten(self.action_spec),
                        tf.nest.flatten(current_policy_distribution))):
                # Categorical distribution (used for discrete actions) doesn't have a
                # mean.
                distribution_index = '_{}'.format(i) if i > 0 else ''
                if not tensor_spec.is_discrete(single_action):
                    tf.compat.v2.summary.histogram(
                        name='actions_distribution_mean' + distribution_index,
                        data=single_distribution.mean(),
                        step=self.train_step_counter)
                    tf.compat.v2.summary.histogram(
                        name='actions_distribution_stddev' +
                        distribution_index,
                        data=single_distribution.stddev(),
                        step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='policy_gradient_loss',
                                           data=policy_gradient_loss,
                                           step=self.train_step_counter)

        if self._check_numerics:
            policy_gradient_loss = tf.debugging.check_numerics(
                policy_gradient_loss, 'policy_gradient_loss')

        return policy_gradient_loss
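A minimal NumPy sketch (hypothetical values) of the clipped surrogate computed above: exponentiate the log-probability difference into an importance ratio, clip it into `[1 - eps, 1 + eps]`, take the elementwise minimum of the unclipped and clipped ratio-times-advantage terms, then negate and average under the timestep weights.

    import numpy as np

    eps             = 0.2                            # importance_ratio_clipping
    action_log_prob = np.array([-0.9, -1.4, -0.6])   # under the current policy
    sample_log_prob = np.array([-1.0, -1.0, -1.0])   # under the collect policy
    advantages      = np.array([ 1.0, -0.5,  2.0])
    weights         = np.array([ 1.0,  1.0,  1.0])   # valid-timestep mask

    ratio         = np.exp(action_log_prob - sample_log_prob)
    ratio_clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps)
    objective_min = np.minimum(ratio * advantages, ratio_clipped * advantages)
    loss          = np.mean(-objective_min * weights)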
Example No. 23
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        if self._debug_summaries:
            actions_list = tf.nest.flatten(actions)
            show_action_index = len(actions_list) != 1
            for i, single_action in enumerate(actions_list):
                action_name = ('actions_{}'.format(i)
                               if show_action_index else 'actions')
                tf.compat.v2.summary.histogram(name=action_name,
                                               data=single_action,
                                               step=self.train_step_counter)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries
                                   and (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, self.train_step_counter, debug_summaries)

                variables_to_train = (self._actor_net.trainable_weights +
                                      self._value_net.trainable_weights)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                # If summarize_gradients, create functions for summarizing both
                # gradients and variables.
                if self._summarize_grads_and_vars and debug_summaries:
                    eager_utils.add_gradients_summaries(
                        grads_and_vars, self.train_step_counter)
                    eager_utils.add_variables_summaries(
                        grads_and_vars, self.train_step_counter)

                self._optimizer.apply_gradients(
                    grads_and_vars, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        # Compute the mean kl from previous action distribution.
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.get_epoch_loss.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                        data=total_policy_gradient_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                        data=total_value_estimation_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                        data=total_l2_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                        data=total_entropy_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                        data=total_kl_penalty_loss,
                                        step=self.train_step_counter)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.compat.v2.summary.scalar(name='total_abs_loss',
                                        data=total_abs_loss,
                                        step=self.train_step_counter)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.compat.v2.summary.histogram(
                        name=var.name.replace(':', '_'),
                        data=var,
                        step=self.train_step_counter)

        return loss_info
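For context, the update_adaptive_kl_beta call above typically follows the adaptive KL-penalty rule from the PPO paper: the penalty coefficient grows when the measured KL exceeds a target and shrinks when it falls well below it. A minimal sketch of that rule, where the names (beta, kl_target) and the 1.5x / 2x factors are illustrative assumptions rather than values taken from the snippet above:

# Minimal sketch of an adaptive KL-penalty update (Schulman et al., 2017).
# All names and constants here are illustrative assumptions.
def update_adaptive_kl_beta_sketch(beta, kl_divergence, kl_target=0.01):
    """Increase beta when KL overshoots the target, decrease when it undershoots."""
    if kl_divergence > 1.5 * kl_target:
        beta *= 2.0  # Penalize large policy updates more strongly.
    elif kl_divergence < kl_target / 1.5:
        beta *= 0.5  # Allow larger updates when the KL is small.
    return beta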
Exemplo n.º 24
0
def action_importance_ratio(action_distribution, collect_action_distribution,
                            action, action_spec, clipping_mode, scope,
                            importance_ratio_clipping, log_prob_clipping,
                            check_numerics, debug_summaries):
    """ ratio for importance sampling, used in PPO loss and vtrace loss.

        Caller has to save tf.name_scope() and pass scope to this function.

        Args:
            action_distribution (nested tf.distribution): Distribution over
                actions under the target policy.
            collect_action_distribution (nested tf.distribution): Distribution
                over actions from the behavior policy, used to sample actions
                for the rollout.
            action (nested Tensor): possibly batched action tuple taken during
                the rollout.
            action_spec (nested BoundedTensorSpec): representing the actions.
            clipping_mode (str): mode for clipping the importance ratio.
                'double_sided': clips the range of importance ratio into
                    [1-importance_ratio_clipping, 1+importance_ratio_clipping],
                    which is used by PPOLoss.
                'capping': clips the range of importance ratio into
                    min(1+importance_ratio_clipping, importance_ratio),
                    which is used by VTraceLoss, where c_bar or rho_bar =
                    1+importance_ratio_clipping.
            scope (name scope manager): returned by tf.name_scope(), set
                outside.
            importance_ratio_clipping (float): Epsilon in the clipped surrogate
                PPO objective. See the cited paper for more detail.
            log_prob_clipping (float): If >0, clipping log probs to the range
                (-log_prob_clipping, log_prob_clipping) to prevent inf / NaN
                values.
            check_numerics (bool):  If true, adds tf.debugging.check_numerics to
                help find NaN / Inf values. For debugging only.
            debug_summaries (bool): If true, write debug summary metrics via
                tf.summary.

        Returns:
            importance_ratio (Tensor), importance_ratio_clipped (Tensor).
    """
    current_policy_distribution = action_distribution

    sample_action_log_probs = tfa_common.log_probability(
        collect_action_distribution, action, action_spec)
    sample_action_log_probs = tf.stop_gradient(sample_action_log_probs)

    action_log_prob = tfa_common.log_probability(current_policy_distribution,
                                                 action, action_spec)
    if log_prob_clipping > 0.0:
        action_log_prob = tf.clip_by_value(action_log_prob, -log_prob_clipping,
                                           log_prob_clipping)
    if check_numerics:
        action_log_prob = tf.debugging.check_numerics(action_log_prob,
                                                      'action_log_prob')

    # Prepare both clipped and unclipped importance ratios.
    importance_ratio = tf.exp(action_log_prob - sample_action_log_probs)
    if check_numerics:
        importance_ratio = tf.debugging.check_numerics(importance_ratio,
                                                       'importance_ratio')

    if clipping_mode == 'double_sided':
        importance_ratio_clipped = tf.clip_by_value(
            importance_ratio, 1 - importance_ratio_clipping,
            1 + importance_ratio_clipping)
    elif clipping_mode == 'capping':
        importance_ratio_clipped = tf.minimum(importance_ratio,
                                              1 + importance_ratio_clipping)
    else:
        raise ValueError('Unsupported clipping mode: ' + clipping_mode)

    def _summary():
        with scope:
            if importance_ratio_clipping > 0.0:
                clip_fraction = tf.reduce_mean(input_tensor=tf.cast(
                    tf.greater(tf.abs(importance_ratio - 1.0),
                               importance_ratio_clipping), tf.float32))
                tf.summary.scalar('clip_fraction', clip_fraction)

            tf.summary.histogram('action_log_prob', action_log_prob)
            tf.summary.histogram('action_log_prob_sample',
                                 sample_action_log_probs)
            tf.summary.histogram('importance_ratio', importance_ratio)
            tf.summary.scalar('importance_ratio_mean',
                              tf.reduce_mean(input_tensor=importance_ratio))
            tf.summary.histogram('importance_ratio_clipped',
                                 importance_ratio_clipped)

    if debug_summaries:
        common.run_if(common.should_record_summaries(), _summary)

    return importance_ratio, importance_ratio_clipped
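The two returned ratios are typically combined with per-timestep advantages into the clipped surrogate objective. A minimal usage sketch, assuming a hypothetical advantages tensor of the same shape as the ratios and the 'double_sided' clipping mode above:

import tensorflow as tf

# Hypothetical usage sketch: combining the returned ratios with advantages
# into the clipped PPO surrogate loss; not part of the function above.
def ppo_surrogate_loss(importance_ratio, importance_ratio_clipped, advantages):
    per_timestep_objective = tf.minimum(importance_ratio * advantages,
                                        importance_ratio_clipped * advantages)
    # Negate because optimizers minimize; average over batch and time.
    return -tf.reduce_mean(per_timestep_objective)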
Exemplo n.º 25
0
    def _train(self, experience, weights, train_step_counter):
        # Change trajectory to transitions.
        trajectory0 = nest.map_structure(lambda t: t[:, :-1], experience)
        trajectory1 = nest.map_structure(lambda t: t[:, 1:], experience)

        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(trajectory0, trajectory1)
        actions = policy_steps_.action
        if self._debug_summaries:
            tf.contrib.summary.histogram('actions', actions)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common_utils.log_probability(old_actions_distribution,
                                                     actions,
                                                     self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        # For each epoch, create its own train op that depends on the previous one.
        loss_info = tf.no_op()
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                with tf.control_dependencies(nest.flatten(loss_info)):
                    # Only save debug summaries for first and last epochs.
                    debug_summaries = (self._debug_summaries and
                                       (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                    # Build one epoch train op.
                    loss_info = self.build_train_op(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, train_step_counter,
                        self._summarize_grads_and_vars,
                        self._gradient_clipping, debug_summaries)

                    policy_gradient_losses.append(
                        loss_info.extra.policy_gradient_loss)
                    value_estimation_losses.append(
                        loss_info.extra.value_estimation_loss)
                    l2_regularization_losses.append(
                        loss_info.extra.l2_regularization_loss)
                    entropy_regularization_losses.append(
                        loss_info.extra.entropy_regularization_loss)
                    kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After the update epochs, update the adaptive KL beta, then update the
        #   observation normalizer and the reward normalizer.
        with tf.control_dependencies(nest.flatten(loss_info)):
            # Compute the mean KL from the old action distribution.
            batch_size = nest_utils.get_outer_shape(time_steps,
                                                    self._time_step_spec)[0]
            policy_state = self._collect_policy.get_initial_state(batch_size)
            kl_divergence = self._kl_divergence(
                time_steps, action_distribution_parameters,
                self._collect_policy.distribution(time_steps,
                                                  policy_state).action)
            update_adaptive_kl_beta_op = self.update_adaptive_kl_beta(
                kl_divergence)

        with tf.control_dependencies([update_adaptive_kl_beta_op]):
            if self._observation_normalizer:
                update_obs_norm = (self._observation_normalizer.update(
                    time_steps.observation, outer_dims=[0, 1]))
            else:
                update_obs_norm = tf.no_op()
            if self._reward_normalizer:
                update_reward_norm = self._reward_normalizer.update(
                    next_time_steps.reward, outer_dims=[0, 1])
            else:
                update_reward_norm = tf.no_op()

        with tf.control_dependencies([update_obs_norm, update_reward_norm]):
            loss_info = nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.build_train_op.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.contrib.summary.scalar('policy_gradient_loss',
                                      total_policy_gradient_loss)
            tf.contrib.summary.scalar('value_estimation_loss',
                                      total_value_estimation_loss)
            tf.contrib.summary.scalar('l2_regularization_loss',
                                      total_l2_regularization_loss)
            if self._entropy_regularization:
                tf.contrib.summary.scalar('entropy_regularization_loss',
                                          total_entropy_regularization_loss)
            tf.contrib.summary.scalar('kl_penalty_loss', total_kl_penalty_loss)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.contrib.summary.scalar('total_abs_loss', total_abs_loss)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.contrib.summary.histogram(var.name.replace(':', '_'),
                                                 var)

        return loss_info
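The body of compute_return_and_advantage is not shown in this snippet; in PPO implementations it is commonly generalized advantage estimation (GAE). A minimal NumPy sketch of GAE under that assumption, with illustrative names and per-step discounts rather than the agent's actual implementation:

import numpy as np

# Illustrative GAE sketch (Schulman et al., 2016); not the agent's actual
# compute_return_and_advantage, whose body is not shown above.
def gae_advantages(rewards, values, next_values, discounts, lambda_=0.95):
    """All inputs are 1-D float arrays over time for a single trajectory."""
    deltas = rewards + discounts * next_values - values
    advantages = np.zeros_like(deltas)
    gae = 0.0
    # Accumulate the exponentially weighted TD errors backwards in time.
    for t in reversed(range(len(deltas))):
        gae = deltas[t] + discounts[t] * lambda_ * gae
        advantages[t] = gae
    returns = advantages + values
    return returns, advantages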
Exemplo n.º 26
0
    def _pg_loss(self, training_info, advantages):
        action_log_prob = tfa_common.log_probability(
            training_info.action_distribution, training_info.action,
            self._action_spec)
        return -advantages * action_log_prob
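As a standalone illustration of the same -advantage * log_prob(action) loss, here is a small sketch with made-up distribution parameters, actions, and advantages; it does not reuse the training_info structure above:

import tensorflow as tf
import tensorflow_probability as tfp

# Made-up numbers, for demonstration of the vanilla policy-gradient loss only.
dist = tfp.distributions.Normal(loc=[0.0, 0.5], scale=[1.0, 1.0])
actions = tf.constant([0.1, 0.4])
advantages = tf.constant([1.2, -0.3])

per_action_loss = -advantages * dist.log_prob(actions)
total_loss = tf.reduce_sum(per_action_loss)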