def testMultistepForwardView(self):
  with self.test_session() as sess:
    # Define input data.
    rewards = [[1, 0, -1, 0, 1], [0.5, 0.8, -0.7, 0.0, 2.1]]
    pcontinues = [[0.5, 0.9, 1.0, 0.5, 0.8], [0.9, 0.5, 0.3, 0.8, 0.7]]
    state_values = [[3, 1, 5, -5, 3], [-1.7, 1.2, 2.3, 2.2, 2.7]]
    lambda_ = 0.75
    # Evaluate expected complex backups at all time-steps for both batches.
    expected_result = []
    for b in range(2):
      expected_result.append(
          get_complex_n_step_backup_at_all_times(rewards[b], pcontinues[b],
                                                 state_values[b], lambda_))
    # Only partially specify the input shapes - this verifies that the
    # dynamically sized Tensors are handled correctly.
    state_values_pl = tf.placeholder(tf.float32, shape=[None, None])
    rewards_pl = tf.placeholder(tf.float32, shape=[None, None])
    pcontinues_pl = tf.placeholder(tf.float32, shape=[None, None])
    # We use transpose because it is easier to define the input data in
    # BxT (batch x time) form, while multistep_forward_view assumes TxB form.
    state_values_in = tf.transpose(state_values_pl)
    rewards_in = tf.transpose(rewards_pl)
    pcontinues_in = tf.transpose(pcontinues_pl)
    expected = tf.transpose(tf.constant(expected_result, dtype=tf.float32))
    # Evaluate complex backups.
    result = sequence_ops.multistep_forward_view(rewards_in, pcontinues_in,
                                                 state_values_in, lambda_)
    feed_dict = {state_values_pl: state_values,
                 rewards_pl: rewards,
                 pcontinues_pl: pcontinues}
    self.assertAllClose(sess.run(result, feed_dict=feed_dict),
                        sess.run(expected))
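# The reference helper `get_complex_n_step_backup_at_all_times` is defined
# elsewhere in the test module. As a hedged sketch (an assumption, not the
# library's implementation), the recursion it is expected to evaluate for a
# single trajectory can be written in plain Python as follows.
def _reference_multistep_forward_view(rewards, pcontinues, state_values,
                                      lambda_):
  """Pure-Python sketch of the mixed n-step backup recursion (illustrative).

  Assumes equal-length Python lists for one trajectory and a scalar lambda_:
    result[t]    = rewards[t] + pcontinues[t] * (
                       lambda_ * result[t + 1]
                       + (1 - lambda_) * state_values[t])
    result[last] = rewards[last] + pcontinues[last] * state_values[last]
  """
  result = [0.0] * len(rewards)
  # Terminal step bootstraps entirely on the supplied state value.
  result[-1] = rewards[-1] + pcontinues[-1] * state_values[-1]
  # Work backwards, mixing the bootstrapped value and the accumulated return.
  for t in reversed(range(len(rewards) - 1)):
    result[t] = rewards[t] + pcontinues[t] * (
        lambda_ * result[t + 1] + (1.0 - lambda_) * state_values[t])
  return result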
def sarsa_lambda(q_tm1,
                 a_tm1,
                 r_t,
                 pcont_t,
                 q_t,
                 a_t,
                 lambda_,
                 name="SarsaLambda"):
  """Implements SARSA(lambda) loss as a TensorFlow op.

  See "Reinforcement Learning: An Introduction" by Sutton and Barto.
  (http://incompleteideas.net/book/ebook/node77.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`.
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`.
    r_t: `Tensor` holding a sequence of rewards, shape `[T, B]`.
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`.
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`.
    a_t: `Tensor` holding a sequence of action indices for second timestep;
      shape `[T, B]`.
    lambda_: a scalar specifying the ratio of mixing between bootstrapped and
      MC returns.
    name: a name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert(
      [[q_tm1, q_t], [a_tm1, r_t, pcont_t, a_t]], [3, 2], name)

  # SARSALambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):

    # Select head to update and build target.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    target = sequence_ops.multistep_forward_view(
        r_t, pcont_t, qa_t, lambda_, back_prop=False)
    target = tf.stop_gradient(target)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
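# A minimal usage sketch for `sarsa_lambda` (not part of the library; the
# tensor names, shapes and constants below are illustrative assumptions). In
# practice q_tm1/q_t come from the learned Q-network evaluated over a length-T
# trajectory with batch size B, and a_tm1/a_t are the actions actually taken.
def _example_sarsa_lambda_usage():
  """Builds a SARSA(lambda) loss on random stand-in inputs."""
  sequence_length, batch_size, num_actions = 5, 2, 3
  q_tm1 = tf.random_normal([sequence_length, batch_size, num_actions])
  q_t = tf.random_normal([sequence_length, batch_size, num_actions])
  a_tm1 = tf.random_uniform(
      [sequence_length, batch_size], maxval=num_actions, dtype=tf.int32)
  a_t = tf.random_uniform(
      [sequence_length, batch_size], maxval=num_actions, dtype=tf.int32)
  r_t = tf.random_normal([sequence_length, batch_size])
  pcont_t = 0.99 * tf.ones([sequence_length, batch_size])
  loss, (target, td_error) = sarsa_lambda(
      q_tm1, a_tm1, r_t, pcont_t, q_t, a_t, lambda_=0.9)
  # The per-timestep losses would typically be averaged and minimised.
  return tf.reduce_mean(loss)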
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
  """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each state to
  the end of the chunk, where bootstrap_value is used. If you pass an entire
  trajectory and zeros for bootstrap_value, this is just the Monte-Carlo return
  / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For inbetween values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond then. If you pass an entire trajectory with zeros for
  bootstrap_value though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T`.

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`.
  """
  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
  with tf.name_scope(name, values=scoped_values):
    if lambda_ == 1:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")
    else:
      v_tp1 = tf.concat(
          axis=0, values=[values[1:, :], tf.expand_dims(bootstrap_value, 0)])
      # `back_prop=False` prevents gradients flowing into values and
      # bootstrap_value, which is what you want when using the bootstrapped
      # lambda-returns in an update as targets for values.
      return sequence_ops.multistep_forward_view(
          rewards,
          pcontinues,
          v_tp1,
          lambda_,
          back_prop=False,
          name="generalized_lambda_returns")
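# A minimal usage sketch for `generalized_lambda_returns` (illustrative only;
# the shapes and the 0.95 mixing value below are assumptions). Subtracting the
# value estimates from the returned targets gives advantage estimates, as
# described in the docstring above.
def _example_generalized_lambda_returns_usage():
  """Builds lambda-return targets and advantage estimates on random inputs."""
  sequence_length, batch_size = 5, 2
  rewards = tf.random_normal([sequence_length, batch_size])
  pcontinues = 0.99 * tf.ones([sequence_length, batch_size])
  values = tf.random_normal([sequence_length, batch_size])
  bootstrap_value = tf.random_normal([batch_size])
  targets = generalized_lambda_returns(
      rewards, pcontinues, values, bootstrap_value, lambda_=0.95)
  # Advantage estimates for e.g. a policy-gradient loss; no gradient flows
  # into `values` through `targets` because back_prop=False is used above.
  advantages = targets - values
  return advantages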
def qlambda(
    q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_, name="GeneralizedQLambda"):
  """Implements Peng's and Watkins' Q(lambda) loss as a TensorFlow op.

  This function is general enough to implement both Peng's and Watkins'
  Q-lambda algorithms. See "Reinforcement Learning: An Introduction" by Sutton
  and Barto. (http://incompleteideas.net/book/ebook/node78.html).

  Args:
    q_tm1: `Tensor` holding a sequence of Q-values starting at the first
      timestep; shape `[T, B, num_actions]`.
    a_tm1: `Tensor` holding a sequence of action indices, shape `[T, B]`.
    r_t: `Tensor` holding a sequence of rewards, shape `[T, B]`.
    pcont_t: `Tensor` holding a sequence of pcontinue values, shape `[T, B]`.
    q_t: `Tensor` holding a sequence of Q-values for second timestep;
      shape `[T, B, num_actions]`. In a target network setting, this quantity
      is often supplied by the target network.
    lambda_: a scalar or `Tensor` of shape `[T, B]` specifying the ratio of
      mixing between bootstrapped and MC returns; if lambda_ is the same for
      all time steps then the function implements Peng's Q-learning algorithm;
      if lambda_ = 0 at every sub-optimal action and a constant otherwise,
      then the function implements Watkins' Q-learning algorithm. Generally
      lambda_ can be a Tensor of any values in the range [0, 1] supplied by
      the user.
    name: a name of the op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[T, B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape `[T, B]`.
        * `td_error`: batch of temporal difference errors, shape `[T, B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[q_tm1, q_t]], [3], name)
  if (isinstance(lambda_, tf.Tensor)
      and lambda_.get_shape().ndims is not None
      and lambda_.get_shape().ndims > 0):
    base_ops.wrap_rank_shape_assert(
        [[a_tm1, r_t, pcont_t, lambda_]], [2], name)
  else:
    base_ops.wrap_rank_shape_assert([[a_tm1, r_t, pcont_t]], [2], name)

  # QLambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t]):

    # Build target and select head to update.
    with tf.name_scope("target"):
      state_values = tf.reduce_max(q_t, axis=2)
      target = sequence_ops.multistep_forward_view(
          r_t, pcont_t, state_values, lambda_, back_prop=False)
      target = tf.stop_gradient(target)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
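# A minimal usage sketch for `qlambda` (illustrative only; the names, shapes
# and constant lambda below are assumptions). With a constant lambda this
# corresponds to Peng's Q(lambda); passing a `[T, B]` tensor of per-step
# lambdas that is zero at sub-optimal actions would give Watkins' Q(lambda).
def _example_qlambda_usage():
  """Builds a Peng's Q(lambda) loss on random stand-in inputs."""
  sequence_length, batch_size, num_actions = 5, 2, 3
  q_tm1 = tf.random_normal([sequence_length, batch_size, num_actions])
  # In a target-network setup, q_t would come from the target network.
  q_t = tf.random_normal([sequence_length, batch_size, num_actions])
  a_tm1 = tf.random_uniform(
      [sequence_length, batch_size], maxval=num_actions, dtype=tf.int32)
  r_t = tf.random_normal([sequence_length, batch_size])
  pcont_t = 0.99 * tf.ones([sequence_length, batch_size])
  loss, (target, td_error) = qlambda(
      q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=0.9)
  return tf.reduce_mean(loss)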