def testMultistepForwardView(self):
   with self.test_session() as sess:
     # Define input data.
     rewards = [[1, 0, -1, 0, 1], [0.5, 0.8, -0.7, 0.0, 2.1]]
     pcontinues = [[0.5, 0.9, 1.0, 0.5, 0.8], [0.9, 0.5, 0.3, 0.8, 0.7]]
     state_values = [[3, 1, 5, -5, 3], [-1.7, 1.2, 2.3, 2.2, 2.7]]
     lambda_ = 0.75
     # Evaluate expected complex backups at all time-steps for both batches.
     expected_result = []
     for b in range(2):
       expected_result.append(
           get_complex_n_step_backup_at_all_times(rewards[b], pcontinues[b],
                                                  state_values[b], lambda_))
     # Only partially-specify the input shapes - verifies that the
     # dynamically sized Tensors are handled correctly.
     state_values_pl = tf.placeholder(tf.float32, shape=[None, None])
     rewards_pl = tf.placeholder(tf.float32, shape=[None, None])
     pcontinues_pl = tf.placeholder(tf.float32, shape=[None, None])
     # We use transpose because it is easier to define the input data in
     # BxT (batch x time) form, while scan_discounted_sum assumes TxB form.
     state_values_in = tf.transpose(state_values_pl)
     rewards_in = tf.transpose(rewards_pl)
     pcontinues_in = tf.transpose(pcontinues_pl)
     expected = tf.transpose(tf.constant(expected_result, dtype=tf.float32))
     # Evaluate complex backups.
     result = sequence_ops.multistep_forward_view(rewards_in, pcontinues_in,
                                                  state_values_in, lambda_)
     feed_dict = {state_values_pl: state_values,
                  rewards_pl: rewards,
                  pcontinues_pl: pcontinues}
     self.assertAllClose(sess.run(result, feed_dict=feed_dict),
                         sess.run(expected))
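
The helper `get_complex_n_step_backup_at_all_times` is not shown in this snippet. Assuming it evaluates the usual forward-view recursion (mixing the bootstrapped value and the accumulated return with weight `lambda_`, and bootstrapping purely on the value at the final step), a minimal pure-Python reference sketch could look like this; it is a hypothetical stand-in, not the original helper.

def get_complex_n_step_backup_at_all_times(rewards, pcontinues, state_values,
                                           lambda_):
  # Hypothetical reference implementation (assumption, not the original helper).
  # state_values[t] is assumed to be the value of the state reached *after*
  # rewards[t]; the recursion works backwards through a single sequence.
  num_steps = len(rewards)
  result = [0.0] * num_steps
  result[-1] = rewards[-1] + pcontinues[-1] * state_values[-1]
  for t in reversed(range(num_steps - 1)):
    result[t] = rewards[t] + pcontinues[t] * (
        (1.0 - lambda_) * state_values[t] + lambda_ * result[t + 1])
  return result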
Example #2
def qlambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"):
  """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op.

  This function is general enough to implement both Peng's and Watkins'
  Q-lambda algorithms.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node78.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: `Tensor` holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for the second timestep;
      shape `[T, B, num_actions]`. In a target network setting,
      this quantity is often supplied by the target network.
    lambda_: a scalar or `Tensor` of shape `[T, B]`
      specifying the ratio of mixing between bootstrapped and MC returns;
      if lambda_ is the same for all time steps then the function implements
      Peng's Q-learning algorithm; if lambda_ = 0 at every sub-optimal action
      and a constant otherwise, then the function implements Watkins'
      Q-learning algorithm. In general, lambda_ can be a `Tensor` with values
      in the range [0, 1] supplied by the user.
    name: the name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name)
  if (isinstance(lambda_, tf.Tensor)
      and lambda_.get_shape().ndims is not None
      and lambda_.get_shape().ndims > 0):
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t, lambda_]], [2], name)
  else:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name)

  # QLambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      state_values = tf.reduce_max(q_t, axis=2)
      target = sequence_ops.multistep_forward_view(
          r_t, pcont_t, state_values, lambda_, back_prop=False)
      target = tf.stop_gradient(target)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
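
As a usage sketch (an illustration, not part of the snippet above): the same op covers both Peng's and Watkins' Q(lambda) purely through the choice of `lambda_`. The trajectory placeholders, the extra `a_t` actions and the greedy-action comparison below are hypothetical; only `qlambda` itself comes from the code above, and the Watkins-style construction is one common way to zero the trace at non-greedy actions.

import tensorflow as tf

# Hypothetical trajectory tensors: T timesteps, B trajectories, 3 actions.
T, B, num_actions = 5, 2, 3
q_tm1 = tf.placeholder(tf.float32, [T, B, num_actions])
q_t = tf.placeholder(tf.float32, [T, B, num_actions])
a_tm1 = tf.placeholder(tf.int32, [T, B])
a_t = tf.placeholder(tf.int32, [T, B])  # actions taken at the second timestep
r_t = tf.placeholder(tf.float32, [T, B])
pcont_t = tf.placeholder(tf.float32, [T, B])

# Peng's Q(lambda): one constant lambda for every timestep and batch element.
peng_loss, _ = qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=0.9)

# Watkins' Q(lambda): cut the trace (lambda = 0) wherever the behaviour action
# differs from the greedy action under q_t.
greedy_a_t = tf.argmax(q_t, axis=2, output_type=tf.int32)
watkins_lambda = 0.9 * tf.cast(tf.equal(a_t, greedy_a_t), tf.float32)
watkins_loss, _ = qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t,
                          lambda_=watkins_lambda)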
Example #3
def sarsa_lambda(q_tm1,
                 a_tm1,
                 r_t,
                 pcont_t,
                 q_t,
                 a_t,
                 lambda_,
                 name="SarsaLambda"):
  """Implements SARSA(lambda) loss as a TensorFlow op.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node77.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: `Tensor` holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for the second timestep;
      shape `[T, B, num_actions]`.
    a_t: `Tensor` holding a sequence of action indices for the second timestep;
      shape `[T, B]`
    lambda_: a scalar specifying the ratio of mixing between bootstrapped and
      MC returns.
    name: the name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t, a_t]], [3, 2], name)

  # SARSALambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):

    # Select head to update and build target.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    target = sequence_ops.multistep_forward_view(
        r_t, pcont_t, qa_t, lambda_, back_prop=False)
    target = tf.stop_gradient(target)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
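
A hedged sketch of the alignment the docstring describes: given a hypothetical rollout of T+1 Q-value/action steps and the T rewards and pcontinues observed in between, the `_tm1`/`_t` inputs are just one-step shifts of the same sequences. Everything except `sarsa_lambda` itself is assumed for illustration.

import tensorflow as tf

# Hypothetical rollout tensors: T+1 Q-value and action steps, T transitions.
T, B, num_actions = 10, 4, 6
q_seq = tf.placeholder(tf.float32, [T + 1, B, num_actions])
a_seq = tf.placeholder(tf.int32, [T + 1, B])
rewards = tf.placeholder(tf.float32, [T, B])
pcontinues = tf.placeholder(tf.float32, [T, B])

# Index i of (q_tm1, a_tm1, rewards, pcontinues) lines up with index i of
# (q_t, a_t), i.e. the state reached one step later.
q_tm1, q_t = q_seq[:-1], q_seq[1:]
a_tm1, a_t = a_seq[:-1], a_seq[1:]

loss, (target, td_error) = sarsa_lambda(
    q_tm1, a_tm1, rewards, pcontinues, q_t, a_t, lambda_=0.8)
total_loss = tf.reduce_mean(loss)  # scalar to hand to an optimiser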
Example #4
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
    """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For in-between values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond that point. If you pass an entire trajectory with zeros for
  bootstrap_value, though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T`.

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
    values.get_shape().assert_has_rank(2)
    rewards.get_shape().assert_has_rank(2)
    pcontinues.get_shape().assert_has_rank(2)
    bootstrap_value.get_shape().assert_has_rank(1)
    scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
    with tf.name_scope(name, values=scoped_values):
        if lambda_ == 1:
            # This is actually equivalent to the branch below, just an optimisation
            # to avoid unnecessary work in this case:
            return sequence_ops.scan_discounted_sum(
                rewards,
                pcontinues,
                initial_value=bootstrap_value,
                reverse=True,
                back_prop=False,
                name="multistep_returns")
        else:
            v_tp1 = tf.concat(
                axis=0,
                values=[values[1:, :],
                        tf.expand_dims(bootstrap_value, 0)])
            # `back_prop=False` prevents gradients flowing into values and
            # bootstrap_value, which is what you want when using the bootstrapped
            # lambda-returns in an update as targets for values.
            return sequence_ops.multistep_forward_view(
                rewards,
                pcontinues,
                v_tp1,
                lambda_,
                back_prop=False,
                name="generalized_lambda_returns")
Example #5
def qlambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"):
  """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op.

  This function is general enough to implement both Peng's and Watkins'
  Q-lambda algorithms.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node78.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`
    r_t: `Tensor` holding a sequence of rewards, shape `[T, B]`
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`
    q_t: `Tensor` holding a sequence of Q-values for the second timestep;
      shape `[T, B, num_actions]`. In a target network setting,
      this quantity is often supplied by the target network.
    lambda_: a scalar or `Tensor` of shape `[T, B]`
      specifying the ratio of mixing between bootstrapped and MC returns;
      if lambda_ is the same for all time steps then the function implements
      Peng's Q-learning algorithm; if lambda_ = 0 at every sub-optimal action
      and a constant otherwise, then the function implements Watkins'
      Q-learning algorithm. In general, lambda_ can be a `Tensor` with values
      in the range [0, 1] supplied by the user.
    name: the name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name)
  if isinstance(
      lambda_, tf.Tensor
  ) and lambda_.get_shape().ndims is not None and lambda_.get_shape().ndims > 0:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t, lambda_]], [2], name)
  else:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name)

  # QLambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      state_values = tf.reduce_max(q_t, axis=2)
      target = sequence_ops.multistep_forward_view(
          r_t, pcont_t, state_values, lambda_, back_prop=False)
      target = tf.stop_gradient(target)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
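
This version guards against `lambda_.get_shape().ndims` being `None` before the rank comparison. A minimal illustration of why that matters (the placeholder below is hypothetical):

import tensorflow as tf

# A TF1-style placeholder created without a shape has no static rank:
lam = tf.placeholder(tf.float32)
print(lam.get_shape().ndims)  # None

# `None > 0` raises a TypeError under Python 3, so the rank-2 assertion is only
# applied when the rank is statically known; otherwise lambda_ is treated like
# a scalar and the extra assertion is skipped.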
Example #6
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
  """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For in-between values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond that point. If you pass an entire trajectory with zeros for
  bootstrap_value, though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T`.

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
  with tf.name_scope(name, values=scoped_values):
    if lambda_ == 1:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")
    else:
      v_tp1 = tf.concat(
          axis=0, values=[values[1:, :],
                          tf.expand_dims(bootstrap_value, 0)])
      # `back_prop=False` prevents gradients flowing into values and
      # bootstrap_value, which is what you want when using the bootstrapped
      # lambda-returns in an update as targets for values.
      return sequence_ops.multistep_forward_view(
          rewards,
          pcontinues,
          v_tp1,
          lambda_,
          back_prop=False,
          name="generalized_lambda_returns")