def averaged_particle_returns(reward: Tensor, discount: Tensor, number_of_particles: int) -> Tensor:
    """Monte-Carlo estimate of the return of each action trajectory.

    :param reward: Batch of step-reward trajectories. The batch size is the
        number of action trajectories multiplied by ``number_of_particles``
        (the Monte-Carlo rollouts per action trajectory).
    :param discount: Batch of step-discount trajectories with the same batch
        layout as ``reward``.
    :param number_of_particles: Number of Monte-Carlo rollouts of each action
        trajectory.
    :return: Monte-carlo estimate of the returns from each action trajectory.
    """
    # Looks weird but is correct! At first sight it evokes the impression that
    # the last reward signal is missed; however, tf's cumprod (inside
    # get_contiguous_sub_episodes) is called with exclusive=True, so the last
    # reward signal is in fact included.
    sub_episode_mask = get_contiguous_sub_episodes(discount)
    returns_per_particle = tf.reduce_sum(reward * sub_episode_mask, axis=1)  # shape = (batch_size,)
    grouped_returns = reshape_create_particle_axis(returns_per_particle, number_of_particles)
    return tf.reduce_mean(grouped_returns, axis=1)
def testNumSteps(self):
    # Discount trajectories; a 0.0 discount marks an episode termination.
    discounts = [
        [0.9, 0.9, 0.9, 0.9],  # No episode termination.
        [0.0, 0.9, 0.9, 0.9],  # Episode terminates on first step.
        [0.9, 0.9, 0.0, 0.9],  # Episode terminates on third step.
    ]
    discount_tensor = tf.constant(discounts, dtype=tf.float32)
    mask = common.get_contiguous_sub_episodes(discount_tensor)
    # Steps up to and including the first zero discount stay active (1.0);
    # everything after is masked out (0.0).
    expected = [
        [1.0, 1.0, 1.0, 1.0],
        [1.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 1.0, 0.0],
    ]
    self.assertAllClose(expected, self.evaluate(mask))