Example #1
    def testScanSumEquivalenceWithSeqLen(self):
        with self.test_session() as sess:
            sequence_lengths = [0, 2]
            bootstrap = tf.constant([0.5, 1.5], dtype=tf.float32)

            sequence = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
            decays = [[.1, .2, .3, .4, .5], [.6, .7, .8, .9, .10]]

            eq_sequence = [[0, 0, 0, 0, 0], [6, 7, 0, 0, 0]]
            eq_decays = [[0, 0, 0, 0, 0], [.6, .7, 0, 0, 0]]

            eq_reverse_sequence = [[0, 0, 0, 0, 0], [7, 6, 0, 0, 0]]
            eq_reverse_decays = [[0, 0, 0, 0, 0], [.7, .6, 0, 0, 0]]

            # We use transpose because it is easier to define the input data in
            # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
            sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32))
            decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32))
            eq_sequence_in = tf.transpose(
                tf.constant(eq_sequence, dtype=tf.float32))
            eq_decays_in = tf.transpose(
                tf.constant(eq_decays, dtype=tf.float32))
            eq_reverse_sequence_in = tf.transpose(
                tf.constant(eq_reverse_sequence, dtype=tf.float32))
            eq_reverse_decays_in = tf.transpose(
                tf.constant(eq_reverse_decays, dtype=tf.float32))

            eq_result = sequence_ops.scan_discounted_sum(
                sequence_in,
                decays_in,
                bootstrap,
                reverse=False,
                sequence_lengths=sequence_lengths)
            exp_eq_result = sequence_ops.scan_discounted_sum(
                eq_sequence_in, eq_decays_in, bootstrap)

            eq_reverse_result = sequence_ops.scan_discounted_sum(
                sequence_in,
                decays_in,
                bootstrap,
                reverse=True,
                sequence_lengths=sequence_lengths)
            exp_eq_reverse_result = sequence_ops.scan_discounted_sum(
                eq_reverse_sequence_in, eq_reverse_decays_in, bootstrap)
            exp_eq_reverse_result = tf.reverse_sequence(exp_eq_reverse_result,
                                                        sequence_lengths,
                                                        seq_axis=0,
                                                        batch_axis=1)

            self.assertAllClose(sess.run(eq_result), sess.run(exp_eq_result))
            self.assertAllClose(sess.run(eq_reverse_result),
                                sess.run(exp_eq_reverse_result))
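
For readers checking these equivalences by hand, here is a minimal NumPy sketch of the recurrence the test above exercises. `scan_discounted_sum_ref` is a hypothetical helper written against the behaviour these tests assert, not the TRFL implementation: it computes `result[t] = sequence[t] + decay[t] * result[t -/+ 1]` over each batch element's valid prefix (given by `sequence_lengths`), seeds the recurrence with `bootstrap` at the open end, and leaves entries past the valid length at zero.

import numpy as np

def scan_discounted_sum_ref(sequence, decay, bootstrap,
                            reverse=False, sequence_lengths=None):
  """Reference discounted scan for time-major inputs of shape [T, B] or [T, B, ...]."""
  sequence = np.asarray(sequence, dtype=np.float32)
  decay = np.asarray(decay, dtype=np.float32)
  bootstrap = np.asarray(bootstrap, dtype=np.float32)
  num_steps, batch_size = sequence.shape[0], sequence.shape[1]
  if sequence_lengths is None:
    sequence_lengths = [num_steps] * batch_size
  result = np.zeros_like(sequence)
  for b, length in enumerate(sequence_lengths):
    acc = bootstrap[b]
    # Accumulate result[t] = sequence[t] + decay[t] * acc over the valid
    # prefix, walking backwards when reverse=True.
    steps = reversed(range(length)) if reverse else range(length)
    for t in steps:
      acc = sequence[t, b] + decay[t, b] * acc
      result[t, b] = acc
  return result

Under these assumptions, `scan_discounted_sum_ref(np.transpose(sequence), np.transpose(decays), [0.5, 1.5], reverse=True, sequence_lengths=[0, 2])` should reproduce the value of `eq_reverse_result` asserted above.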
Example #2
 def testScanSumWithDecaysReverse3D(self):
   """scan_discounted_sum vs. higher-dimensional arguments."""
   with self.test_session() as sess:
     sequence = [[[3, 33], [1, 11], [5, 55]],
                 [[-1.7, -17], [1.2, 12], [2.3, 23]]]
     decays = [[[0.5, 5], [0.9, 9], [1.0, 10]],
               [[0.9, 9], [0.5, 5], [0.3, 3]]]
     # We use transpose because it is easier to define the input data in
     # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
     sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32),
                                perm=[1, 0, 2])
     decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32),
                              perm=[1, 0, 2])
     bootstrap = tf.constant([[0, 0], [1.5, 15]], dtype=tf.float32)
     result = sequence_ops.scan_discounted_sum(sequence_in, decays_in,
                                               bootstrap,
                                               reverse=True)
     expected_result = tf.constant(
         [[[(5 * 0.9 + 1) * 0.5 + 3,
            (55 * 9 + 11) * 5 + 33],
           [5 * 0.9 + 1,
            55 * 9 + 11],
           [5,
            55]],
          [[((2.3 + 0.3 * 1.5) * 0.5 + 1.2) * 0.9 - 1.7,
            ((23 + 3 * 15) * 5 + 12) * 9 - 17],
           [(2.3 + 0.3 * 1.5) * 0.5 + 1.2,
            (23 + 3 * 15) * 5 + 12],
           [2.3 + 0.3 * 1.5,
            23 + 3 * 15]]],
         dtype=tf.float32)
     self.assertAllClose(sess.run(result),
                         sess.run(tf.transpose(expected_result,
                                               perm=[1, 0, 2])))
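
The 3-D case applies the same scalar recurrence elementwise over the trailing dimension. As a hand check (a hypothetical walk-through, not part of the test), the first channel of the first batch row of `expected_result` unrolls as:

# Reverse recurrence for batch row 0, channel 0, with bootstrap 0:
acc = 0.0
trace = [0.0, 0.0, 0.0]
for t in reversed(range(3)):
  acc = [3, 1, 5][t] + [0.5, 0.9, 1.0][t] * acc
  trace[t] = acc
# trace == [5.75, 5.5, 5.0] == [(5 * 0.9 + 1) * 0.5 + 3, 5 * 0.9 + 1, 5]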
Example #3
 def testScanSumWithDecays(self):
   with self.test_session() as sess:
     sequence = [[3, 1, 5, 2, 1], [-1.7, 1.2, 2.3, 0, 1]]
     decays = [[0.5, 0.9, 1.0, 0.1, 0.5], [0.9, 0.5, 0.0, 2, 0.8]]
     # We use transpose because it is easier to define the input data in
     # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
     sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32))
     decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32))
     bootstrap = tf.constant([0, 1.5], dtype=tf.float32)
     result = sequence_ops.scan_discounted_sum(sequence_in, decays_in,
                                               bootstrap,
                                               reverse=False)
     expected_result = tf.constant(
         [[3,
           3 * 0.9 + 1,
           (3 * 0.9 + 1) * 1.0 + 5,
           ((3 * 0.9 + 1) * 1.0 + 5) * 0.1 + 2,
           (((3 * 0.9 + 1) * 1.0 + 5) * 0.1 + 2) * 0.5 + 1],
          [-1.7 + 1.5 * 0.9,
           (-1.7 + 1.5 * 0.9) * 0.5 + 1.2,
           ((-1.7 + 1.5 * 0.9) * 0.5 + 1.2) * 0.0 + 2.3,
           (((-1.7 + 1.5 * 0.9) * 0.5 + 1.2) * 0.0 + 2.3) * 2 + 0,
           ((((-1.7 + 1.5 * 0.9) * 0.5 + 1.2) * 0.0 + 2.3) * 2 + 0) * 0.8 + 1,
          ]], dtype=tf.float32)
     self.assertAllClose(sess.run(result),
                         sess.run(tf.transpose(expected_result)))
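
The first row of `expected_result` is simply the forward recurrence unrolled. A small standalone check (plain Python, independent of the test) looks like:

# Forward recurrence result[t] = sequence[t] + decays[t] * result[t - 1],
# applied to the first batch row with bootstrap 0:
acc = 0.0
row = []
for x, d in zip([3, 1, 5, 2, 1], [0.5, 0.9, 1.0, 0.1, 0.5]):
  acc = x + d * acc
  row.append(acc)
# row == [3.0, 3.7, 8.7, 2.87, 2.435], matching the first row above.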
Example #4
 def testScanSumWithDecaysReverse3DWithSeqLen(self):
     """scan_discounted_sum vs. higher-dimensional arguments."""
     with self.test_session() as sess:
         sequence = [[[3, 33], [1, 11], [5, 55]],
                     [[-1.7, -17], [1.2, 12], [2.3, 23]]]
         decays = [[[0.5, 5], [0.9, 9], [1.0, 10]],
                   [[0.9, 9], [0.5, 5], [0.3, 3]]]
         sequence_lengths = [2, 0]
         # We use transpose because it is easier to define the input data in
         # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
         sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32),
                                    perm=[1, 0, 2])
         decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32),
                                  perm=[1, 0, 2])
         bootstrap = tf.constant([[0, 0], [1.5, 15]], dtype=tf.float32)
         result = sequence_ops.scan_discounted_sum(
             sequence_in,
             decays_in,
             bootstrap,
             reverse=True,
             sequence_lengths=sequence_lengths)
         expected_result = np.asarray([[[1 * 0.5 + 3, 11 * 5 + 33], [1, 11],
                                        [0, 0]], [[0, 0], [0, 0], [0, 0]]],
                                      dtype=np.float32)
         self.assertAllClose(sess.run(result),
                             np.transpose(expected_result, axes=[1, 0, 2]))
Example #5
  def testScanSumEquivalenceWithSeqLen(self):
    with self.test_session() as sess:
      sequence_lengths = [0, 2]
      bootstrap = tf.constant([0.5, 1.5], dtype=tf.float32)

      sequence = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
      decays = [[.1, .2, .3, .4, .5], [.6, .7, .8, .9, .10]]

      eq_sequence = [[0, 0, 0, 0, 0], [6, 7, 0, 0, 0]]
      eq_decays = [[0, 0, 0, 0, 0], [.6, .7, 0, 0, 0]]

      eq_reverse_sequence = [[0, 0, 0, 0, 0], [7, 6, 0, 0, 0]]
      eq_reverse_decays = [[0, 0, 0, 0, 0], [.7, .6, 0, 0, 0]]

      # We use transpose because it is easier to define the input data in
      # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
      sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32))
      decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32))
      eq_sequence_in = tf.transpose(tf.constant(eq_sequence, dtype=tf.float32))
      eq_decays_in = tf.transpose(tf.constant(eq_decays, dtype=tf.float32))
      eq_reverse_sequence_in = tf.transpose(
          tf.constant(eq_reverse_sequence, dtype=tf.float32))
      eq_reverse_decays_in = tf.transpose(
          tf.constant(eq_reverse_decays, dtype=tf.float32))

      eq_result = sequence_ops.scan_discounted_sum(
          sequence_in, decays_in, bootstrap, reverse=False,
          sequence_lengths=sequence_lengths)
      exp_eq_result = sequence_ops.scan_discounted_sum(
          eq_sequence_in, eq_decays_in, bootstrap)

      eq_reverse_result = sequence_ops.scan_discounted_sum(
          sequence_in, decays_in, bootstrap, reverse=True,
          sequence_lengths=sequence_lengths)
      exp_eq_reverse_result = sequence_ops.scan_discounted_sum(
          eq_reverse_sequence_in, eq_reverse_decays_in, bootstrap)
      exp_eq_reverse_result = tf.reverse_sequence(
          exp_eq_reverse_result, sequence_lengths, seq_axis=0, batch_axis=1)

      self.assertAllClose(sess.run(eq_result),
                          sess.run(exp_eq_result))
      self.assertAllClose(sess.run(eq_reverse_result),
                          sess.run(exp_eq_reverse_result))
Example #6
  def testScanSumShapeInference(self):
    """scan_discounted_sum should support static shape inference."""
    # No session needed since we're not evaluating any ops.
    sequence_in = tf.placeholder(tf.float32, shape=[1647, 2001])
    decays_in = tf.placeholder(tf.float32, shape=[1647, 2001])
    bootstrap = tf.placeholder(tf.float32, shape=[2001])
    result = sequence_ops.scan_discounted_sum(sequence_in, decays_in,
                                              bootstrap,
                                              reverse=False)
    self.assertAllEqual(result.get_shape(), [1647, 2001])

    # Let's do it again with higher-dimensional inputs.
    sequence_in = tf.placeholder(tf.float32, shape=[4, 8, 15, 16, 23, 42])
    decays_in = tf.placeholder(tf.float32, shape=[4, 8, 15, 16, 23, 42])
    bootstrap = tf.placeholder(tf.float32, shape=[8, 15, 16, 23, 42])
    result = sequence_ops.scan_discounted_sum(sequence_in, decays_in,
                                              bootstrap,
                                              reverse=False)
    self.assertAllEqual(result.get_shape(), [4, 8, 15, 16, 23, 42])
Example #7
 def testScanSumWithDecaysReverseWithSeqLen(self):
   with self.test_session() as sess:
     sequence = [[3, 1, 5], [-1.7, 1.2, 2.3]]
     decays = [[0.5, 0.9, 1.0], [0.9, 0.5, 0.3]]
     sequence_lengths = [2, 0]
     # We use transpose because it is easier to define the input data in
     # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
     sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32))
     decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32))
     bootstrap = tf.constant([2.5, 1.5], dtype=tf.float32)
     result = sequence_ops.scan_discounted_sum(
         sequence_in, decays_in, bootstrap, reverse=True,
         sequence_lengths=sequence_lengths)
     expected_result = tf.constant(
         [[(0.9 * 2.5 + 1) * 0.5 + 3, (0.9 * 2.5 + 1), 0], [0, 0, 0]],
         dtype=tf.float32)
     self.assertAllClose(sess.run(result),
                         sess.run(tf.transpose(expected_result)))
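
With `reverse=True` and `sequence_lengths`, the scan runs backwards over each batch element's valid prefix only, and the bootstrap enters at position `length - 1`. As a hand check (a hypothetical walk-through, not part of the test), the first row of `expected_result` unrolls as:

# Reverse recurrence over the first sequence_lengths[0] == 2 steps,
# with bootstrap 2.5; entries beyond the valid length stay zero.
acc = 2.5
row = [0.0, 0.0, 0.0]
for t in reversed(range(2)):
  acc = [3, 1, 5][t] + [0.5, 0.9, 1.0][t] * acc
  row[t] = acc
# row == [4.625, 3.25, 0.0] == [(0.9 * 2.5 + 1) * 0.5 + 3, 0.9 * 2.5 + 1, 0]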
Example #8
 def testScanSumWithDecaysReverse3DWithSeqLen(self):
   """scan_discounted_sum vs. higher-dimensional arguments."""
   with self.test_session() as sess:
     sequence = [[[3, 33], [1, 11], [5, 55]],
                 [[-1.7, -17], [1.2, 12], [2.3, 23]]]
     decays = [[[0.5, 5], [0.9, 9], [1.0, 10]],
               [[0.9, 9], [0.5, 5], [0.3, 3]]]
     sequence_lengths = [2, 0]
     # We use transpose because it is easier to define the input data in
     # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
     sequence_in = tf.transpose(tf.constant(sequence, dtype=tf.float32),
                                perm=[1, 0, 2])
     decays_in = tf.transpose(tf.constant(decays, dtype=tf.float32),
                              perm=[1, 0, 2])
     bootstrap = tf.constant([[0, 0], [1.5, 15]], dtype=tf.float32)
     result = sequence_ops.scan_discounted_sum(
         sequence_in, decays_in, bootstrap, reverse=True,
         sequence_lengths=sequence_lengths)
     expected_result = np.asarray(
         [[[1 * 0.5 + 3, 11 * 5 + 33], [1, 11], [0, 0]],
          [[0, 0], [0, 0], [0, 0]]], dtype=np.float32)
     self.assertAllClose(sess.run(result),
                         np.transpose(expected_result, axes=[1, 0, 2]))
Example #9
def _general_off_policy_corrected_multistep_target(r_t,
                                                   pcont_t,
                                                   target_policy_t,
                                                   c_t,
                                                   q_t,
                                                   a_t,
                                                   back_prop=False,
                                                   name=None):
    """Evaluates targets for various off-policy value correction based algorithms.

  `target_policy_t` is the policy that this function aims to evaluate. New
  action-value estimates (target values `T`) must be expressible in this
  recurrent form:
  ```none
  T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
                                                 c_t T(x_t, a_t) ]
  ```
  `T(x_t, a_t)` is an estimate of expected discounted future returns based
  on the current Q value estimates `Q(x_t, a_t)` and rewards `r_t`. The
  evaluated target values can be used as supervised targets for learning the Q
  function itself or as returns for various policy gradient algorithms.
  `Q==T` if convergence is reached. As the formula is recurrent, it will
  evaluate multistep returns for non-zero importance weights `c_t`.

  In the usual moving and target network setup `q_t` should be calculated by
  the target network while the `target_policy_t` may be evaluated by either of
  the networks. If `target_policy_t` is evaluated by the current moving network
  the algorithm implemented will have a similar flavour to double DQN.

  Depending on the choice of c_t, the algorithm can implement:
  ```none
  Importance Sampling             c_t = π(x_t, a_t) / μ(x_t, a_t),
  Harutyunyan et al.'s Q(lambda)  c_t = λ,
  Precup et al.'s Tree-Backup     c_t = π(x_t, a_t),
  Munos et al.'s Retrace          c_t = λ min(1, π(x_t, a_t) / μ(x_t, a_t)).
  ```
  Please refer to page 3 for more details:
  https://arxiv.org/pdf/1606.02647v1.pdf

  Args:
    r_t: 2-D tensor holding rewards received during the transition
      that corresponds to each major index.
      Shape is `[T, B]`.
    pcont_t: 2-D tensor holding pcontinue values received during the
      transition that corresponds to each major index.
      Shape is `[T, B]`.
    target_policy_t:  3-D tensor holding per-action policy probabilities for
      the states encountered just AFTER the transitions that correspond to
      each major index, according to the target policy (i.e. the policy we
      wish to learn). These usually derive from the learning net.
      Shape is `[T, B, num_actions]`.
    c_t: 2-D tensor holding importance weights; see discussion above.
      Shape is `[T, B]`.
    q_t: 3-D tensor holding per-action Q-values for the states
      encountered just AFTER taking the transitions that correspond to each
      major index. Shape is `[T, B, num_actions]`.
    a_t: 2-D tensor holding the indices of actions executed during the
      transition AFTER the transition that corresponds to each major index.
      Shape is `[T, B]`.
    back_prop: whether to backpropagate gradients through time.
    name: name of the op.

  Returns:
    Tensor of shape `[T, B]` containing the computed target values.
  """
    # Formula (4) in https://arxiv.org/pdf/1606.02647v1.pdf can be expressed
    # in a recursive form where T is a new target value:
    # T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
    #                                                c_t T(x_t, a_t) ]
    # This recurrent form allows us to express Retrace by using
    # `scan_discounted_sum`.
    # Define:
    #   T_tm1   = T(x_{t-1}, a_{t-1})
    #   T_t     = T(x_t, a_t)
    #   exp_q_t = 𝔼_π Q(x_t,.)
    #   qa_t    = Q(x_t, a_t)
    # Hence:
    #   T_tm1   = r_t + γ * (exp_q_t - c_t * qa_t) + γ * c_t * T_t
    # Define:
    #   current = r_t + γ * (exp_q_t - c_t * qa_t)
    # Thus:
    #   T_tm1 = scan_discounted_sum(current, γ * c_t, reverse=True)
    args = [r_t, pcont_t, target_policy_t, c_t, q_t, a_t]
    with tf.name_scope(name,
                       'general_returns_based_off_policy_target',
                       values=args):
        exp_q_t = tf.reduce_sum(target_policy_t * q_t, axis=2)
        qa_t = indexing_ops.batched_index(q_t, a_t)
        current = r_t + pcont_t * (exp_q_t - c_t * qa_t)
        initial_value = qa_t[-1]
        return sequence_ops.scan_discounted_sum(current,
                                                pcont_t * c_t,
                                                initial_value,
                                                reverse=True,
                                                back_prop=back_prop)
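
To make the comment above concrete, here is a minimal NumPy sketch of the backward scan that the final `scan_discounted_sum` call performs. `multistep_target_ref` is a hypothetical name, and it assumes the already-reduced `[T, B]` inputs built in the code above (`current`, `discounts = pcont_t * c_t`, and `bootstrap = qa_t[-1]`); it is not the trfl API.

import numpy as np

def multistep_target_ref(current, discounts, bootstrap):
  """Backward recurrence T[t] = current[t] + discounts[t] * T[t + 1],
  seeded with the bootstrap value (qa_t[-1] in the code above)."""
  current = np.asarray(current, dtype=np.float32)
  discounts = np.asarray(discounts, dtype=np.float32)
  num_steps = current.shape[0]
  targets = np.zeros_like(current)
  acc = np.asarray(bootstrap, dtype=np.float32)
  for t in reversed(range(num_steps)):
    acc = current[t] + discounts[t] * acc
    targets[t] = acc
  return targets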
Example #10
def _general_off_policy_corrected_multistep_target(r_t,
                                                   pcont_t,
                                                   target_policy_t,
                                                   c_t,
                                                   q_t,
                                                   a_t,
                                                   back_prop=False,
                                                   name=None):
  """Evaluates targets for various off-policy value correction based algorithms.

  `target_policy_t` is the policy that this function aims to evaluate. New
  action-value estimates (target values `T`) must be expressible in this
  recurrent form:
  ```none
  T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
                                                 c_t T(x_t, a_t) ]
  ```
  `T(x_t, a_t)` is an estimate of expected discounted future returns based
  on the current Q value estimates `Q(x_t, a_t)` and rewards `r_t`. The
  evaluated target values can be used as supervised targets for learning the Q
  function itself or as returns for various policy gradient algorithms.
  `Q==T` if convergence is reached. As the formula is recurrent, it will
  evaluate multistep returns for non-zero importance weights `c_t`.

  In the usual moving and target network setup `q_t` should be calculated by
  the target network while the `target_policy_t` may be evaluated by either of
  the networks. If `target_policy_t` is evaluated by the current moving network
  the algorithm implemented will have a similar flavour to double DQN.

  Depending on the choice of c_t, the algorithm can implement:
  ```none
  Importance Sampling             c_t = π(x_t, a_t) / μ(x_t, a_t),
  Harutyunyan et al.'s Q(lambda)  c_t = λ,
  Precup et al.'s Tree-Backup     c_t = π(x_t, a_t),
  Munos et al.'s Retrace          c_t = λ min(1, π(x_t, a_t) / μ(x_t, a_t)).
  ```
  Please refer to page 3 for more details:
  https://arxiv.org/pdf/1606.02647v1.pdf

  Args:
    r_t: 2-D tensor holding rewards received during the transition
      that corresponds to each major index.
      Shape is `[T, B]`.
    pcont_t: 2-D tensor holding pcontinue values received during the
      transition that corresponds to each major index.
      Shape is `[T, B]`.
    target_policy_t:  3-D tensor holding per-action policy probabilities for
      the states encountered just AFTER the transitions that correspond to
      each major index, according to the target policy (i.e. the policy we
      wish to learn). These usually derive from the learning net.
      Shape is `[T, B, num_actions]`.
    c_t: 2-D tensor holding importance weights; see discussion above.
      Shape is `[T, B]`.
    q_t: 3-D tensor holding per-action Q-values for the states
      encountered just AFTER taking the transitions that correspond to each
      major index. Shape is `[T, B, num_actions]`.
    a_t: 2-D tensor holding the indices of actions executed during the
      transition AFTER the transition that corresponds to each major index.
      Shape is `[T, B]`.
    back_prop: whether to backpropagate gradients through time.
    name: name of the op.

  Returns:
    Tensor of shape `[T, B]` containing the computed target values.
  """
  # Formula (4) in https://arxiv.org/pdf/1606.02647v1.pdf can be expressed
  # in a recursive form where T is a new target value:
  # T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
  #                                                c_t T(x_t, a_t) ]
  # This recurrent form allows us to express Retrace by using
  # `scan_discounted_sum`.
  # Define:
  #   T_tm1   = T(x_{t-1}, a_{t-1})
  #   T_t     = T(x_t, a_t)
  #   exp_q_t = 𝔼_π Q(x_t,.)
  #   qa_t    = Q(x_t, a_t)
  # Hence:
  #   T_tm1   = r_t + γ * (exp_q_t - c_t * qa_t) + γ * c_t * T_t
  # Define:
  #   current = r_t + γ * (exp_q_t - c_t * qa_t)
  # Thus:
  #   T_tm1 = scan_discounted_sum(current, γ * c_t, reverse=True)
  args = [r_t, pcont_t, target_policy_t, c_t, q_t, a_t]
  with tf.name_scope(
      name, 'general_returns_based_off_policy_target', values=args):
    exp_q_t = tf.reduce_sum(target_policy_t * q_t, axis=2)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    current = r_t + pcont_t * (exp_q_t - c_t * qa_t)
    initial_value = qa_t[-1]
    return sequence_ops.scan_discounted_sum(
        current,
        pcont_t * c_t,
        initial_value,
        reverse=True,
        back_prop=back_prop)
Example #11
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
    """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For in-between values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond then. If you pass an entire trajectory with zeros for
  bootstrap_value though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T`.

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
    values.get_shape().assert_has_rank(2)
    rewards.get_shape().assert_has_rank(2)
    pcontinues.get_shape().assert_has_rank(2)
    bootstrap_value.get_shape().assert_has_rank(1)
    scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
    with tf.name_scope(name, values=scoped_values):
        if lambda_ == 1:
            # This is actually equivalent to the branch below, just an optimisation
            # to avoid unnecessary work in this case:
            return sequence_ops.scan_discounted_sum(
                rewards,
                pcontinues,
                initial_value=bootstrap_value,
                reverse=True,
                back_prop=False,
                name="multistep_returns")
        else:
            v_tp1 = tf.concat(
                axis=0,
                values=[values[1:, :],
                        tf.expand_dims(bootstrap_value, 0)])
            # `back_prop=False` prevents gradients flowing into values and
            # bootstrap_value, which is what you want when using the bootstrapped
            # lambda-returns in an update as targets for values.
            return sequence_ops.multistep_forward_view(
                rewards,
                pcontinues,
                v_tp1,
                lambda_,
                back_prop=False,
                name="generalized_lambda_returns")
Example #12
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
  """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For in-between values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond then. If you pass an entire trajectory with zeros for
  bootstrap_value though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T`.

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
  with tf.name_scope(name, values=scoped_values):
    if lambda_ == 1:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")
    else:
      v_tp1 = tf.concat(
          axis=0, values=[values[1:, :],
                          tf.expand_dims(bootstrap_value, 0)])
      # `back_prop=False` prevents gradients flowing into values and
      # bootstrap_value, which is what you want when using the bootstrapped
      # lambda-returns in an update as targets for values.
      return sequence_ops.multistep_forward_view(
          rewards,
          pcontinues,
          v_tp1,
          lambda_,
          back_prop=False,
          name="generalized_lambda_returns")