def testScanSumEquivalenceWithSeqLen(self):
  """Scanning with `sequence_lengths` matches scanning zero-padded inputs."""

  def as_time_major(batch_major_rows):
    # The data is easier to write down in BxT (batch x time) form, while
    # scan_discounted_sum assumes TxB form, hence the transpose.
    return tf.transpose(tf.constant(batch_major_rows, dtype=tf.float32))

  with self.test_session() as sess:
    lengths = [0, 2]
    bootstrap = tf.constant([0.5, 1.5], dtype=tf.float32)

    # Full-length inputs that are scanned together with `lengths`.
    full_sequence = as_time_major([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
    full_decays = as_time_major(
        [[.1, .2, .3, .4, .5], [.6, .7, .8, .9, .10]])
    # The same data with everything past each valid prefix zeroed out.
    padded_sequence = as_time_major([[0, 0, 0, 0, 0], [6, 7, 0, 0, 0]])
    padded_decays = as_time_major([[0, 0, 0, 0, 0], [.6, .7, 0, 0, 0]])
    # Zero-padded data whose valid prefix is additionally reversed in time.
    flipped_sequence = as_time_major([[0, 0, 0, 0, 0], [7, 6, 0, 0, 0]])
    flipped_decays = as_time_major([[0, 0, 0, 0, 0], [.7, .6, 0, 0, 0]])

    masked_forward = sequence_ops.scan_discounted_sum(
        full_sequence,
        full_decays,
        bootstrap,
        reverse=False,
        sequence_lengths=lengths)
    padded_forward = sequence_ops.scan_discounted_sum(
        padded_sequence, padded_decays, bootstrap)

    masked_backward = sequence_ops.scan_discounted_sum(
        full_sequence,
        full_decays,
        bootstrap,
        reverse=True,
        sequence_lengths=lengths)
    # A forward scan over the time-flipped prefix, flipped back again, is the
    # reference for the reverse scan.
    padded_backward = tf.reverse_sequence(
        sequence_ops.scan_discounted_sum(
            flipped_sequence, flipped_decays, bootstrap),
        lengths,
        seq_axis=0,
        batch_axis=1)

    self.assertAllClose(sess.run(masked_forward), sess.run(padded_forward))
    self.assertAllClose(sess.run(masked_backward), sess.run(padded_backward))
def testScanSumWithDecaysReverse3D(self):
  """scan_discounted_sum vs. higher-dimensional arguments."""
  with self.test_session() as sess:
    # Inputs are written batch-major (B x T x D) because that is easier to
    # read; perm=[1, 0, 2] moves time to the leading axis, which is the
    # layout scan_discounted_sum assumes.
    time_major = [1, 0, 2]
    sequence_in = tf.transpose(
        tf.constant(
            [[[3, 33], [1, 11], [5, 55]],
             [[-1.7, -17], [1.2, 12], [2.3, 23]]],
            dtype=tf.float32),
        perm=time_major)
    decays_in = tf.transpose(
        tf.constant(
            [[[0.5, 5], [0.9, 9], [1.0, 10]],
             [[0.9, 9], [0.5, 5], [0.3, 3]]],
            dtype=tf.float32),
        perm=time_major)
    bootstrap = tf.constant([[0, 0], [1.5, 15]], dtype=tf.float32)

    result = sequence_ops.scan_discounted_sum(
        sequence_in, decays_in, bootstrap, reverse=True)

    # Expected values, unrolled by hand from the last timestep backwards.
    # First batch entry (bootstrap is zero, so the last step is the input).
    first_t2 = [5, 55]
    first_t1 = [first_t2[0] * 0.9 + 1, first_t2[1] * 9 + 11]
    first_t0 = [first_t1[0] * 0.5 + 3, first_t1[1] * 5 + 33]
    # Second batch entry (bootstrap is [1.5, 15]).
    second_t2 = [2.3 + 0.3 * 1.5, 23 + 3 * 15]
    second_t1 = [second_t2[0] * 0.5 + 1.2, second_t2[1] * 5 + 12]
    second_t0 = [second_t1[0] * 0.9 - 1.7, second_t1[1] * 9 - 17]
    expected_result = tf.constant(
        [[first_t0, first_t1, first_t2],
         [second_t0, second_t1, second_t2]],
        dtype=tf.float32)

    self.assertAllClose(
        sess.run(result),
        sess.run(tf.transpose(expected_result, perm=time_major)))
def testScanSumWithDecays(self):
  """Forward scan_discounted_sum on 2-D inputs against hand-computed values."""
  with self.test_session() as sess:
    # Inputs are written in BxT (batch x time) form for readability and
    # transposed, because scan_discounted_sum assumes TxB form.
    sequence_in = tf.transpose(
        tf.constant(
            [[3, 1, 5, 2, 1], [-1.7, 1.2, 2.3, 0, 1]], dtype=tf.float32))
    decays_in = tf.transpose(
        tf.constant(
            [[0.5, 0.9, 1.0, 0.1, 0.5], [0.9, 0.5, 0.0, 2, 0.8]],
            dtype=tf.float32))
    bootstrap = tf.constant([0, 1.5], dtype=tf.float32)

    result = sequence_ops.scan_discounted_sum(
        sequence_in, decays_in, bootstrap, reverse=False)

    # Expected values, unrolled by hand from the first timestep forwards.
    # First batch entry: the bootstrap is zero.
    a0 = 3
    a1 = a0 * 0.9 + 1
    a2 = a1 * 1.0 + 5
    a3 = a2 * 0.1 + 2
    a4 = a3 * 0.5 + 1
    # Second batch entry: the bootstrap of 1.5 enters at the first step.
    b0 = -1.7 + 1.5 * 0.9
    b1 = b0 * 0.5 + 1.2
    b2 = b1 * 0.0 + 2.3
    b3 = b2 * 2 + 0
    b4 = b3 * 0.8 + 1
    expected_result = tf.constant(
        [[a0, a1, a2, a3, a4], [b0, b1, b2, b3, b4]], dtype=tf.float32)

    self.assertAllClose(
        sess.run(result), sess.run(tf.transpose(expected_result)))
def testScanSumWithDecaysReverse3DWithSeqLen(self):
  """scan_discounted_sum vs. higher-dimensional arguments."""
  with self.test_session() as sess:
    lengths = [2, 0]
    # Inputs are written batch-major (B x T x D) because that is easier to
    # read; the axes permutation moves time to the leading axis, which is the
    # layout scan_discounted_sum assumes.
    time_major = [1, 0, 2]
    sequence_in = tf.transpose(
        tf.constant(
            [[[3, 33], [1, 11], [5, 55]],
             [[-1.7, -17], [1.2, 12], [2.3, 23]]],
            dtype=tf.float32),
        perm=time_major)
    decays_in = tf.transpose(
        tf.constant(
            [[[0.5, 5], [0.9, 9], [1.0, 10]],
             [[0.9, 9], [0.5, 5], [0.3, 3]]],
            dtype=tf.float32),
        perm=time_major)
    bootstrap = tf.constant([[0, 0], [1.5, 15]], dtype=tf.float32)

    result = sequence_ops.scan_discounted_sum(
        sequence_in,
        decays_in,
        bootstrap,
        reverse=True,
        sequence_lengths=lengths)

    # First batch entry has two valid steps (and a zero bootstrap); the second
    # has none, so it is expected to be zero everywhere.
    first_entry = [[1 * 0.5 + 3, 11 * 5 + 33], [1, 11], [0, 0]]
    second_entry = [[0, 0], [0, 0], [0, 0]]
    expected_result = np.asarray([first_entry, second_entry], dtype=np.float32)

    self.assertAllClose(
        sess.run(result), np.transpose(expected_result, axes=time_major))
def testScanSumEquivalenceWithSeqLen(self):
  """Scanning with `sequence_lengths` matches scanning zero-padded inputs."""

  def as_time_major(batch_major_rows):
    # The data is easier to write down in BxT (batch x time) form, while
    # scan_discounted_sum assumes TxB form, hence the transpose.
    return tf.transpose(tf.constant(batch_major_rows, dtype=tf.float32))

  with self.test_session() as sess:
    lengths = [0, 2]
    bootstrap = tf.constant([0.5, 1.5], dtype=tf.float32)

    # Full-length inputs that are scanned together with `lengths`.
    full_sequence = as_time_major([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
    full_decays = as_time_major(
        [[.1, .2, .3, .4, .5], [.6, .7, .8, .9, .10]])
    # The same data with everything past each valid prefix zeroed out.
    padded_sequence = as_time_major([[0, 0, 0, 0, 0], [6, 7, 0, 0, 0]])
    padded_decays = as_time_major([[0, 0, 0, 0, 0], [.6, .7, 0, 0, 0]])
    # Zero-padded data whose valid prefix is additionally reversed in time.
    flipped_sequence = as_time_major([[0, 0, 0, 0, 0], [7, 6, 0, 0, 0]])
    flipped_decays = as_time_major([[0, 0, 0, 0, 0], [.7, .6, 0, 0, 0]])

    masked_forward = sequence_ops.scan_discounted_sum(
        full_sequence,
        full_decays,
        bootstrap,
        reverse=False,
        sequence_lengths=lengths)
    padded_forward = sequence_ops.scan_discounted_sum(
        padded_sequence, padded_decays, bootstrap)

    masked_backward = sequence_ops.scan_discounted_sum(
        full_sequence,
        full_decays,
        bootstrap,
        reverse=True,
        sequence_lengths=lengths)
    # A forward scan over the time-flipped prefix, flipped back again, is the
    # reference for the reverse scan.
    padded_backward = tf.reverse_sequence(
        sequence_ops.scan_discounted_sum(
            flipped_sequence, flipped_decays, bootstrap),
        lengths,
        seq_axis=0,
        batch_axis=1)

    self.assertAllClose(sess.run(masked_forward), sess.run(padded_forward))
    self.assertAllClose(sess.run(masked_backward), sess.run(padded_backward))
def testScanSumShapeInference(self):
  """scan_discounted_sum should support static shape inference."""
  # Only static shapes are inspected, so no session is opened. The second
  # case repeats the check with higher-dimensional inputs.
  for full_shape in ([1647, 2001], [4, 8, 15, 16, 23, 42]):
    sequence_in = tf.placeholder(tf.float32, shape=full_shape)
    decays_in = tf.placeholder(tf.float32, shape=full_shape)
    # The bootstrap has the sequence shape without the leading (time) axis.
    bootstrap = tf.placeholder(tf.float32, shape=full_shape[1:])
    result = sequence_ops.scan_discounted_sum(
        sequence_in, decays_in, bootstrap, reverse=False)
    self.assertAllEqual(result.get_shape(), full_shape)
def testScanSumWithDecaysReverseWithSeqLen(self):
  """Reverse scan_discounted_sum on 2-D inputs with `sequence_lengths`."""
  with self.test_session() as sess:
    lengths = [2, 0]
    # Inputs are written in BxT (batch x time) form for readability and
    # transposed, because scan_discounted_sum assumes TxB form.
    sequence_in = tf.transpose(
        tf.constant([[3, 1, 5], [-1.7, 1.2, 2.3]], dtype=tf.float32))
    decays_in = tf.transpose(
        tf.constant([[0.5, 0.9, 1.0], [0.9, 0.5, 0.3]], dtype=tf.float32))
    bootstrap = tf.constant([2.5, 1.5], dtype=tf.float32)

    result = sequence_ops.scan_discounted_sum(
        sequence_in,
        decays_in,
        bootstrap,
        reverse=True,
        sequence_lengths=lengths)

    # First batch entry: the bootstrap enters at the last valid step (index 1)
    # and the step beyond the valid length is zero. The second batch entry has
    # length zero and is expected to be zero everywhere.
    last_valid = 0.9 * 2.5 + 1
    first_step = last_valid * 0.5 + 3
    expected_result = tf.constant(
        [[first_step, last_valid, 0], [0, 0, 0]], dtype=tf.float32)

    self.assertAllClose(
        sess.run(result), sess.run(tf.transpose(expected_result)))
def testScanSumWithDecaysReverse3DWithSeqLen(self):
  """scan_discounted_sum vs. higher-dimensional arguments."""
  with self.test_session() as sess:
    lengths = [2, 0]
    # Inputs are written batch-major (B x T x D) because that is easier to
    # read; the axes permutation moves time to the leading axis, which is the
    # layout scan_discounted_sum assumes.
    time_major = [1, 0, 2]
    sequence_in = tf.transpose(
        tf.constant(
            [[[3, 33], [1, 11], [5, 55]],
             [[-1.7, -17], [1.2, 12], [2.3, 23]]],
            dtype=tf.float32),
        perm=time_major)
    decays_in = tf.transpose(
        tf.constant(
            [[[0.5, 5], [0.9, 9], [1.0, 10]],
             [[0.9, 9], [0.5, 5], [0.3, 3]]],
            dtype=tf.float32),
        perm=time_major)
    bootstrap = tf.constant([[0, 0], [1.5, 15]], dtype=tf.float32)

    result = sequence_ops.scan_discounted_sum(
        sequence_in,
        decays_in,
        bootstrap,
        reverse=True,
        sequence_lengths=lengths)

    # First batch entry has two valid steps (and a zero bootstrap); the second
    # has none, so it is expected to be zero everywhere.
    first_entry = [[1 * 0.5 + 3, 11 * 5 + 33], [1, 11], [0, 0]]
    second_entry = [[0, 0], [0, 0], [0, 0]]
    expected_result = np.asarray([first_entry, second_entry], dtype=np.float32)

    self.assertAllClose(
        sess.run(result), np.transpose(expected_result, axes=time_major))
def _general_off_policy_corrected_multistep_target(r_t,
                                                   pcont_t,
                                                   target_policy_t,
                                                   c_t,
                                                   q_t,
                                                   a_t,
                                                   back_prop=False,
                                                   name=None):
  """Evaluates targets for various off-policy value correction based algorithms.

  `target_policy_t` is the policy that this function aims to evaluate. New
  action-value estimates (target values `T`) must be expressible in this
  recurrent form:
  ```none
  T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
                                 c_t T(x_t, a_t) ]
  ```
  `T(x_t, a_t)` is an estimate of expected discounted future returns based on
  the current Q value estimates `Q(x_t, a_t)` and rewards `r_t`. The evaluated
  target values can be used as supervised targets for learning the Q function
  itself or as returns for various policy gradient algorithms. `Q==T` if
  convergence is reached. As the formula is recurrent, it will evaluate
  multistep returns for non-zero importance weights `c_t`.

  In the usual moving and target network setup `q_t` should be calculated by
  the target network while the `target_policy_t` may be evaluated by either of
  the networks. If `target_policy_t` is evaluated by the current moving network
  the algorithm implemented will have a similar flavour as double DQN.

  Depending on the choice of c_t, the algorithm can implement:
  ```none
  Importance Sampling             c_t = π(x_t, a_t) / μ(x_t, a_t),
  Harutyunyan's et al. Q(lambda)  c_t = λ,
  Precup's et al. Tree-Backup     c_t = π(x_t, a_t),
  Munos' et al. Retrace           c_t = λ min(1, π(x_t, a_t) / μ(x_t, a_t)).
  ```
  Please refer to page 3 for more details:
  https://arxiv.org/pdf/1606.02647v1.pdf

  Args:
    r_t: 2-D tensor holding rewards received during the transition that
      corresponds to each major index. Shape is `[T, B]`.
    pcont_t: 2-D tensor holding pcontinue values received during the
      transition that corresponds to each major index. Shape is `[T, B]`.
    target_policy_t: 3-D tensor holding per-action policy probabilities for
      the states encountered just AFTER the transitions that correspond to
      each major index, according to the target policy (i.e. the policy we
      wish to learn). These usually derive from the learning net. Shape is
      `[T, B, num_actions]`.
    c_t: 2-D tensor holding importance weights; see discussion above. Shape is
      `[T, B]`.
    q_t: 3-D tensor holding per-action Q-values for the states encountered
      just AFTER taking the transitions that correspond to each major index.
      Shape is `[T, B, num_actions]`.
    a_t: 2-D tensor holding the indices of actions executed during the
      transition AFTER the transition that corresponds to each major index.
      Shape is `[T, B]`.
    back_prop: whether to backpropagate gradients through time.
    name: name of the op.

  Returns:
    Tensor of shape `[T, B]` containing the target values: the action
    dimension is summed out / indexed away before the scan.
  """
  # Formula (4) in https://arxiv.org/pdf/1606.02647v1.pdf can be written
  # recursively, with T denoting the new target value:
  #   T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
  #                                  c_t T(x_t, a_t) ]
  # With the shorthands
  #   T_tm1   = T(x_{t-1}, a_{t-1})
  #   T_t     = T(x_t, a_t)
  #   exp_q_t = 𝔼_π Q(x_t, .)
  #   qa_t    = Q(x_t, a_t)
  # this reads
  #   T_tm1 = r_t + γ * (exp_q_t - c_t * qa_t) + γ * c_t * T_t
  # so, taking
  #   current = r_t + γ * (exp_q_t - c_t * qa_t)
  # the targets are
  #   T_tm1 = scan_discounted_sum(current, γ * c_t, reverse=True)
  # which is how Retrace-style targets are expressed below.
  scope_inputs = [r_t, pcont_t, target_policy_t, c_t, q_t, a_t]
  with tf.name_scope(
      name, 'general_returns_based_off_policy_target', values=scope_inputs):
    # Expected Q-value under the target policy: sum over the action axis.
    expected_q = tf.reduce_sum(target_policy_t * q_t, axis=2)
    # Q-value of the action that was actually taken.
    taken_q = indexing_ops.batched_index(q_t, a_t)
    per_step_term = r_t + pcont_t * (expected_q - c_t * taken_q)
    # The recursion is seeded with the taken-action Q-value of the last step.
    seed = taken_q[-1]
    step_decay = pcont_t * c_t
    return sequence_ops.scan_discounted_sum(
        per_step_term, step_decay, seed, reverse=True, back_prop=back_prop)
def _general_off_policy_corrected_multistep_target(r_t,
                                                   pcont_t,
                                                   target_policy_t,
                                                   c_t,
                                                   q_t,
                                                   a_t,
                                                   back_prop=False,
                                                   name=None):
  """Evaluates targets for various off-policy value correction based algorithms.

  `target_policy_t` is the policy that this function aims to evaluate. New
  action-value estimates (target values `T`) must be expressible in this
  recurrent form:
  ```none
  T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
                                 c_t T(x_t, a_t) ]
  ```
  `T(x_t, a_t)` is an estimate of expected discounted future returns based on
  the current Q value estimates `Q(x_t, a_t)` and rewards `r_t`. The evaluated
  target values can be used as supervised targets for learning the Q function
  itself or as returns for various policy gradient algorithms. `Q==T` if
  convergence is reached. As the formula is recurrent, it will evaluate
  multistep returns for non-zero importance weights `c_t`.

  In the usual moving and target network setup `q_t` should be calculated by
  the target network while the `target_policy_t` may be evaluated by either of
  the networks. If `target_policy_t` is evaluated by the current moving network
  the algorithm implemented will have a similar flavour as double DQN.

  Depending on the choice of c_t, the algorithm can implement:
  ```none
  Importance Sampling             c_t = π(x_t, a_t) / μ(x_t, a_t),
  Harutyunyan's et al. Q(lambda)  c_t = λ,
  Precup's et al. Tree-Backup     c_t = π(x_t, a_t),
  Munos' et al. Retrace           c_t = λ min(1, π(x_t, a_t) / μ(x_t, a_t)).
  ```
  Please refer to page 3 for more details:
  https://arxiv.org/pdf/1606.02647v1.pdf

  Args:
    r_t: 2-D tensor holding rewards received during the transition that
      corresponds to each major index. Shape is `[T, B]`.
    pcont_t: 2-D tensor holding pcontinue values received during the
      transition that corresponds to each major index. Shape is `[T, B]`.
    target_policy_t: 3-D tensor holding per-action policy probabilities for
      the states encountered just AFTER the transitions that correspond to
      each major index, according to the target policy (i.e. the policy we
      wish to learn). These usually derive from the learning net. Shape is
      `[T, B, num_actions]`.
    c_t: 2-D tensor holding importance weights; see discussion above. Shape is
      `[T, B]`.
    q_t: 3-D tensor holding per-action Q-values for the states encountered
      just AFTER taking the transitions that correspond to each major index.
      Shape is `[T, B, num_actions]`.
    a_t: 2-D tensor holding the indices of actions executed during the
      transition AFTER the transition that corresponds to each major index.
      Shape is `[T, B]`.
    back_prop: whether to backpropagate gradients through time.
    name: name of the op.

  Returns:
    Tensor of shape `[T, B]` containing the target values: the action
    dimension is summed out / indexed away before the scan.
  """
  # Formula (4) in https://arxiv.org/pdf/1606.02647v1.pdf can be written
  # recursively, with T denoting the new target value:
  #   T(x_{t-1}, a_{t-1}) = r_t + γ[ 𝔼_π Q(x_t, .) - c_t Q(x_t, a_t) +
  #                                  c_t T(x_t, a_t) ]
  # With the shorthands
  #   T_tm1   = T(x_{t-1}, a_{t-1})
  #   T_t     = T(x_t, a_t)
  #   exp_q_t = 𝔼_π Q(x_t, .)
  #   qa_t    = Q(x_t, a_t)
  # this reads
  #   T_tm1 = r_t + γ * (exp_q_t - c_t * qa_t) + γ * c_t * T_t
  # so, taking
  #   current = r_t + γ * (exp_q_t - c_t * qa_t)
  # the targets are
  #   T_tm1 = scan_discounted_sum(current, γ * c_t, reverse=True)
  # which is how Retrace-style targets are expressed below.
  scope_inputs = [r_t, pcont_t, target_policy_t, c_t, q_t, a_t]
  with tf.name_scope(
      name, 'general_returns_based_off_policy_target', values=scope_inputs):
    # Expected Q-value under the target policy: sum over the action axis.
    expected_q = tf.reduce_sum(target_policy_t * q_t, axis=2)
    # Q-value of the action that was actually taken.
    taken_q = indexing_ops.batched_index(q_t, a_t)
    per_step_term = r_t + pcont_t * (expected_q - c_t * taken_q)
    # The recursion is seeded with the taken-action Q-value of the last step.
    seed = taken_q[-1]
    step_decay = pcont_t * c_t
    return sequence_ops.scan_discounted_sum(
        per_step_term, step_decay, seed, reverse=True, back_prop=back_prop)
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
  """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For inbetween values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond then. If you pass an entire trajectory with zeros for
  bootstrap_value though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T - 1` (the last index).

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
  import numbers  # Function-scope import keeps this block self-contained.

  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]

  # Only take the fast path for a genuine scalar. `lambda_` may also be a
  # `[T, B]` tensor or array; for those `lambda_ == 1` can be an elementwise
  # comparison (NumPy arrays, or TF tensors when tensor equality is enabled)
  # whose result cannot be used as a Python bool, so it must not be evaluated
  # inside an `if`.
  lambda_is_scalar_one = isinstance(lambda_, numbers.Number) and lambda_ == 1

  with tf.name_scope(name, values=scoped_values):
    if lambda_is_scalar_one:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")

    # Values of the successor states: `values` shifted by one step, with the
    # bootstrap value appended as the successor of the final step.
    v_tp1 = tf.concat(
        axis=0, values=[values[1:, :], tf.expand_dims(bootstrap_value, 0)])
    # `back_prop=False` prevents gradients flowing into values and
    # bootstrap_value, which is what you want when using the bootstrapped
    # lambda-returns in an update as targets for values.
    return sequence_ops.multistep_forward_view(
        rewards,
        pcontinues,
        v_tp1,
        lambda_,
        back_prop=False,
        name="generalized_lambda_returns")
def generalized_lambda_returns(rewards,
                               pcontinues,
                               values,
                               bootstrap_value,
                               lambda_=1,
                               name="generalized_lambda_returns"):
  """Computes lambda-returns along a batch of (chunks of) trajectories.

  For lambda=1 these will be multistep returns looking ahead from each
  state to the end of the chunk, where bootstrap_value is used. If you pass an
  entire trajectory and zeros for bootstrap_value, this is just the Monte-Carlo
  return / TD(1) target.

  For lambda=0 these are one-step TD(0) targets.

  For inbetween values of lambda these are lambda-returns / TD(lambda) targets,
  except that traces are always cut off at the end of the chunk, since we can't
  see returns beyond then. If you pass an entire trajectory with zeros for
  bootstrap_value though, then they're plain TD(lambda) targets.

  lambda can also be a tensor of values in [0, 1], determining the mix of
  bootstrapping vs further accumulation of multistep returns at each timestep.
  This can be used to implement Retrace and other algorithms. See
  `sequence_ops.multistep_forward_view` for more info on this. Another way to
  think about the end-of-chunk cutoff is that lambda is always effectively zero
  on the timestep after the end of the chunk, since at the end of the chunk we
  rely entirely on bootstrapping and can't accumulate returns looking further
  into the future.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` transitions into another state with value `V'`, receiving
  reward `r` and pcontinue `p`. Then `V`, `r` and `p` are all at the same index
  `i` in the corresponding tensors. `V'` is at index `i+1`, or in the
  `bootstrap_value` tensor if `i == T - 1` (the last index).

  Subtracting `values` from these lambda-returns will yield estimates of the
  advantage function which can be used for both the policy gradient loss and
  the baseline value function loss in A3C / GAE.

  Args:
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    values: 2-D Tensor containing estimates of the state values for timesteps
      0 to `T-1`. Shape `[T, B]`.
    bootstrap_value: 1-D Tensor containing an estimate of the value of the
      final state at time `T`, used for bootstrapping the target n-step
      returns. Shape `[B]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
  import numbers  # Function-scope import keeps this block self-contained.

  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]

  # Only take the fast path for a genuine scalar. `lambda_` may also be a
  # `[T, B]` tensor or array; for those `lambda_ == 1` can be an elementwise
  # comparison (NumPy arrays, or TF tensors when tensor equality is enabled)
  # whose result cannot be used as a Python bool, so it must not be evaluated
  # inside an `if`.
  lambda_is_scalar_one = isinstance(lambda_, numbers.Number) and lambda_ == 1

  with tf.name_scope(name, values=scoped_values):
    if lambda_is_scalar_one:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")

    # Values of the successor states: `values` shifted by one step, with the
    # bootstrap value appended as the successor of the final step.
    v_tp1 = tf.concat(
        axis=0, values=[values[1:, :], tf.expand_dims(bootstrap_value, 0)])
    # `back_prop=False` prevents gradients flowing into values and
    # bootstrap_value, which is what you want when using the bootstrapped
    # lambda-returns in an update as targets for values.
    return sequence_ops.multistep_forward_view(
        rewards,
        pcontinues,
        v_tp1,
        lambda_,
        back_prop=False,
        name="generalized_lambda_returns")