def testInconsistentSupportsAndWeightsParameters(self):
    """Checks that mismatched supports and weights shapes are rejected.

    Each supports row holds five atoms while each weights row holds only
    four, and the call is expected to raise a ValueError whose message
    matches 'are incompatible'.
    """
    source_supports = tf.constant(
        [[0, 2, 4, 6, 8],
         [3, 4, 5, 6, 7]],
        dtype=tf.float32)
    # One entry short per row compared with the supports above.
    source_weights = tf.constant(
        [[0.1, 0.2, 0.3, 0.2],
         [0.1, 0.2, 0.3, 0.2]],
        dtype=tf.float32)
    new_support = tf.constant([4, 5, 6, 7, 8], dtype=tf.float32)

    # The error is expected while the projection op is being built.
    with self.assertRaisesRegexp(ValueError, 'are incompatible'):
        rainbow_agent.project_distribution(
            source_supports, source_weights, new_support)
def testZeroDimensionalTargetSupport(self):
    """Checks that a scalar (rank-0) target support is rejected.

    The target support is the scalar constant 3 rather than a vector of
    atoms; the call is expected to raise a ValueError whose message matches
    'Index out of range'.
    """
    source_supports = tf.constant(
        [[0, 2, 4, 6, 8],
         [3, 4, 5, 6, 7]],
        dtype=tf.float32)
    source_weights = tf.constant(
        [[0.1, 0.2, 0.3, 0.2, 0.2],
         [0.1, 0.2, 0.3, 0.2, 0.2]],
        dtype=tf.float32)
    scalar_support = tf.constant(3, dtype=tf.float32)

    # The error is expected while the projection op is being built.
    with self.assertRaisesRegexp(ValueError, 'Index out of range'):
        rainbow_agent.project_distribution(
            source_supports, source_weights, scalar_support)
def _build_target_distribution(self):
    """Builds the C51 target distribution as per Bellemare et al. (2017).

    First, we compute the support of the Bellman target, r + gamma Z'. Where
    Z' is the support of the next state distribution:

      * Evenly spaced in [-vmax, vmax] if the current state is nonterminal;
      * 0 otherwise (duplicated num_atoms times).

    Second, we compute the next-state probabilities, corresponding to the
    action with highest expected value.

    Finally we project the Bellman target (support + probabilities) onto the
    original support.

    Returns:
      target_distribution: tf.tensor, the target distribution from the replay.
    """
    batch_size = self._replay.batch_size

    # size of rewards: batch_size x 1
    rewards = self._replay.rewards[:, None]

    # size of tiled_support: batch_size x num_atoms
    tiled_support = tf.tile(self._support, [batch_size])
    tiled_support = tf.reshape(tiled_support, [batch_size, self._num_atoms])

    # 1 for non-terminal transitions, 0 for terminal ones.
    is_terminal_multiplier = 1. - tf.cast(self._replay.terminals, tf.float32)
    # Incorporate terminal state to discount factor.
    # size of gamma_with_terminal: batch_size x 1
    gamma_with_terminal = self.cumulative_gamma * is_terminal_multiplier
    gamma_with_terminal = gamma_with_terminal[:, None]

    # size of target_support: batch_size x num_atoms
    target_support = rewards + gamma_with_terminal * tiled_support

    # size of next_qt_argmax: batch_size x 1
    next_qt_argmax = tf.argmax(
        self._replay_next_target_net_outputs.q_values, axis=1)[:, None]
    # Cast to int64 so the indices share tf.argmax's default output dtype and
    # can be concatenated with it. tf.cast replaces the deprecated
    # tf.to_int64.
    batch_indices = tf.range(tf.cast(batch_size, tf.int64))[:, None]
    # size of batch_indexed_next_qt_argmax: batch_size x 2
    batch_indexed_next_qt_argmax = tf.concat(
        [batch_indices, next_qt_argmax], axis=1)

    # Select, for each batch element, the atom probabilities of its greedy
    # next action.
    # size of next_probabilities: batch_size x num_atoms
    next_probabilities = tf.gather_nd(
        self._replay_next_target_net_outputs.probabilities,
        batch_indexed_next_qt_argmax)

    return rainbow_agent.project_distribution(
        target_support, next_probabilities, self._support)
def testProjectFromNonMonotonicSupport(self):
    """Projects a distribution whose source support is in decreasing order.

    The source atoms run 4, 3, 2, 1, 0 and are projected onto the target
    atoms 3..7; the result is compared against a fixed expected projection.
    """
    descending_supports = tf.constant([[4, 3, 2, 1, 0]], dtype=tf.float32)
    source_weights = tf.constant(
        [[0.1, 0.2, 0.1, 0.3, 0.3]], dtype=tf.float32)
    new_support = tf.constant([3, 4, 5, 6, 7], dtype=tf.float32)

    projection_op = rainbow_agent.project_distribution(
        descending_supports, source_weights, new_support)

    want = [[0.9, 0.1, 0.0, 0.0, 0.0]]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        got = sess.run(projection_op)
        self.assertAllClose(want, got)
def testProjectSingleIdenticalDistribution(self):
    """Projects one distribution onto a support identical to its own.

    Source and target atoms are both 0..4, so the test asserts that the
    projected weights equal the input weights.
    """
    unchanged_weights = [0.1, 0.2, 0.1, 0.3, 0.3]

    source_supports = tf.constant([[0, 1, 2, 3, 4]], dtype=tf.float32)
    source_weights = tf.constant([unchanged_weights], dtype=tf.float32)
    new_support = tf.constant([0, 1, 2, 3, 4], dtype=tf.float32)

    projection_op = rainbow_agent.project_distribution(
        source_supports, source_weights, new_support)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        got = sess.run(projection_op)
        # The result keeps the leading batch dimension of size one.
        self.assertAllClose([unchanged_weights], got)
def testProjectFromNonMonotonicSupport(self):
    """Verifies projection when the source atoms are listed high-to-low.

    A single distribution over the atoms 4, 3, 2, 1, 0 is projected onto the
    target support 3..7 and checked against a hard-coded expected result.
    """
    reversed_atoms = tf.constant([[4, 3, 2, 1, 0]], dtype=tf.float32)
    atom_probs = tf.constant([[0.1, 0.2, 0.1, 0.3, 0.3]], dtype=tf.float32)
    projected_atoms = tf.constant([3, 4, 5, 6, 7], dtype=tf.float32)

    result_op = rainbow_agent.project_distribution(
        reversed_atoms, atom_probs, projected_atoms)
    expected = [[0.9, 0.1, 0.0, 0.0, 0.0]]

    with self.test_session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        actual = sess.run(result_op)
        self.assertAllClose(expected, actual)
def testProjectSingleIdenticalDistribution(self):
    """Verifies that projecting onto the same support returns the weights.

    The single source distribution and the target support both use the
    atoms 0..4; the output is asserted to be close to the input weights.
    """
    atom_probs = [0.1, 0.2, 0.1, 0.3, 0.3]
    same_atoms = [0, 1, 2, 3, 4]

    result_op = rainbow_agent.project_distribution(
        tf.constant([same_atoms], dtype=tf.float32),
        tf.constant([atom_probs], dtype=tf.float32),
        tf.constant(same_atoms, dtype=tf.float32))

    with self.test_session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        actual = sess.run(result_op)
        self.assertAllClose([atom_probs], actual)
def testProjectNewSupportHasInconsistentDeltask(self):
    """Checks that an unevenly spaced target support fails validation.

    The target atoms 3, 4, 6, 7, 8 have gaps of 1, 2, 1, 1. With
    validate_args=True the op is built successfully, and running it is
    expected to raise InvalidArgumentError matching 'assertion failed'.
    """
    source_supports = tf.constant(
        [[0, 2, 4, 6, 8],
         [3, 4, 5, 6, 7]],
        dtype=tf.float32)
    source_weights = tf.constant(
        [[0.1, 0.2, 0.3, 0.2, 0.2],
         [0.1, 0.2, 0.3, 0.2, 0.2]],
        dtype=tf.float32)
    uneven_support = tf.constant([3, 4, 6, 7, 8], dtype=tf.float32)

    projection_op = rainbow_agent.project_distribution(
        source_supports, source_weights, uneven_support, validate_args=True)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        # The failure is expected at run time, not at graph construction.
        with self.assertRaisesRegexp(
                tf.errors.InvalidArgumentError, 'assertion failed'):
            sess.run(projection_op)
def testExampleFromCodeComments(self):
    """Runs a two-distribution projection example with fixed expectations.

    Two source distributions over different supports are projected onto the
    target atoms 4..8 and compared against hard-coded expected projections.
    """
    source_supports = tf.constant(
        [[0, 2, 4, 6, 8],
         [1, 3, 4, 5, 6]],
        dtype=tf.float32)
    source_weights = tf.constant(
        [[0.1, 0.6, 0.1, 0.1, 0.1],
         [0.1, 0.2, 0.5, 0.1, 0.1]],
        dtype=tf.float32)
    new_support = tf.constant([4, 5, 6, 7, 8], dtype=tf.float32)

    projection_op = rainbow_agent.project_distribution(
        source_supports, source_weights, new_support)

    want = [
        [0.8, 0.0, 0.1, 0.0, 0.1],
        [0.8, 0.1, 0.1, 0.0, 0.0],
    ]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        got = sess.run(projection_op)
        self.assertAllClose(want, got)
def testProjectBatchOfDifferentDistributionsWithLargerDelta(self):
    """Projects a batch onto a target support spaced four units apart.

    Two source distributions are projected onto the target atoms
    0, 4, 8, 12, 16 and compared against hard-coded expected projections.
    """
    source_supports = tf.constant(
        [[0, 2, 4, 6, 8],
         [8, 9, 10, 12, 14]],
        dtype=tf.float32)
    source_weights = tf.constant(
        [[0.1, 0.2, 0.2, 0.2, 0.3],
         [0.1, 0.2, 0.4, 0.1, 0.2]],
        dtype=tf.float32)
    coarse_support = tf.constant([0, 4, 8, 12, 16], dtype=tf.float32)

    projection_op = rainbow_agent.project_distribution(
        source_supports, source_weights, coarse_support)

    want = [
        [0.2, 0.4, 0.4, 0.0, 0.0],
        [0.0, 0.0, 0.45, 0.45, 0.1],
    ]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        got = sess.run(projection_op)
        self.assertAllClose(want, got)
def testProjectBatchOfDifferentDistributionsWithLargerDelta(self):
    """Verifies batch projection onto a coarser, evenly spaced support.

    The target atoms are 0, 4, 8, 12, 16; each of the two source
    distributions is checked against its hard-coded expected projection.
    """
    batch_atoms = tf.constant(
        [[0, 2, 4, 6, 8],
         [8, 9, 10, 12, 14]],
        dtype=tf.float32)
    batch_probs = tf.constant(
        [[0.1, 0.2, 0.2, 0.2, 0.3],
         [0.1, 0.2, 0.4, 0.1, 0.2]],
        dtype=tf.float32)
    projected_atoms = tf.constant([0, 4, 8, 12, 16], dtype=tf.float32)

    result_op = rainbow_agent.project_distribution(
        batch_atoms, batch_probs, projected_atoms)
    expected = [
        [0.2, 0.4, 0.4, 0.0, 0.0],
        [0.0, 0.0, 0.45, 0.45, 0.1],
    ]

    with self.test_session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        actual = sess.run(result_op)
        self.assertAllClose(expected, actual)
def testProjectBatchOfDifferentDistributions(self):
    """Projects a batch of three different distributions onto one support.

    The three source supports differ from one another (the last equals the
    target atoms 3..7); each row is checked against a hard-coded expected
    projection.
    """
    source_supports = tf.constant(
        [[0, 2, 4, 6, 8],
         [0, 1, 2, 3, 4],
         [3, 4, 5, 6, 7]],
        dtype=tf.float32)
    source_weights = tf.constant(
        [[0.1, 0.2, 0.3, 0.2, 0.2],
         [0.1, 0.2, 0.1, 0.3, 0.3],
         [0.1, 0.2, 0.3, 0.2, 0.2]],
        dtype=tf.float32)
    new_support = tf.constant([3, 4, 5, 6, 7], dtype=tf.float32)

    projection_op = rainbow_agent.project_distribution(
        source_supports, source_weights, new_support)

    want = [
        [0.3, 0.3, 0.0, 0.2, 0.2],
        [0.7, 0.3, 0.0, 0.0, 0.0],
        [0.1, 0.2, 0.3, 0.2, 0.2],
    ]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        got = sess.run(projection_op)
        self.assertAllClose(want, got)
def testMultiDimensionalTargetSupportWithPlaceholders(self):
    """Checks that a rank-2 target support fed at run time is rejected.

    All three inputs are placeholders of unspecified shape, and the op is
    built with validate_args=True. Feeding the nested list [[3]] as the
    target support is expected to raise InvalidArgumentError when the
    projection is run.
    """
    supports_value = [[0, 2, 4, 6, 8], [3, 4, 5, 6, 7]]
    weights_value = [[0.1, 0.2, 0.3, 0.2, 0.2], [0.1, 0.2, 0.3, 0.2, 0.2]]
    nested_support_value = [[3]]

    supports_ph = tf.placeholder(tf.float32, None)
    weights_ph = tf.placeholder(tf.float32, None)
    target_support_ph = tf.placeholder(tf.float32, None)

    projection_op = rainbow_agent.project_distribution(
        supports_ph, weights_ph, target_support_ph, validate_args=True)

    feeds = {
        supports_ph: supports_value,
        weights_ph: weights_value,
        target_support_ph: nested_support_value,
    }
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        with self.assertRaises(tf.errors.InvalidArgumentError):
            sess.run(projection_op, feed_dict=feeds)
def testUsingPlaceholders(self):
    """Projects a batch of distributions supplied through placeholders.

    Supports, weights and target support are all fed at run time through
    placeholders of unspecified shape; the three projected rows are
    compared against hard-coded expected values.
    """
    supports_value = [
        [0, 2, 4, 6, 8],
        [0, 1, 2, 3, 4],
        [3, 4, 5, 6, 7],
    ]
    weights_value = [
        [0.1, 0.2, 0.3, 0.2, 0.2],
        [0.1, 0.2, 0.1, 0.3, 0.3],
        [0.1, 0.2, 0.3, 0.2, 0.2],
    ]
    target_support_value = [3, 4, 5, 6, 7]

    supports_ph = tf.placeholder(tf.float32, None)
    weights_ph = tf.placeholder(tf.float32, None)
    target_support_ph = tf.placeholder(tf.float32, None)

    projection_op = rainbow_agent.project_distribution(
        supports_ph, weights_ph, target_support_ph)

    want = [
        [0.3, 0.3, 0.0, 0.2, 0.2],
        [0.7, 0.3, 0.0, 0.0, 0.0],
        [0.1, 0.2, 0.3, 0.2, 0.2],
    ]
    feeds = {
        supports_ph: supports_value,
        weights_ph: weights_value,
        target_support_ph: target_support_value,
    }
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        got = sess.run(projection_op, feed_dict=feeds)
        self.assertAllClose(want, got)
def testUsingPlaceholders(self):
    """Verifies batch projection when every input is fed via a placeholder.

    Three source distributions and the target atoms 3..7 are provided
    through feed_dict; the output is checked against hard-coded expected
    projections.
    """
    batch_atoms = [[0, 2, 4, 6, 8], [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]]
    batch_probs = [[0.1, 0.2, 0.3, 0.2, 0.2],
                   [0.1, 0.2, 0.1, 0.3, 0.3],
                   [0.1, 0.2, 0.3, 0.2, 0.2]]
    projected_atoms = [3, 4, 5, 6, 7]
    expected = [[0.3, 0.3, 0.0, 0.2, 0.2],
                [0.7, 0.3, 0.0, 0.0, 0.0],
                [0.1, 0.2, 0.3, 0.2, 0.2]]

    # Shapes are left unspecified; they are only known once values are fed.
    supports_ph = tf.placeholder(tf.float32, None)
    weights_ph = tf.placeholder(tf.float32, None)
    target_support_ph = tf.placeholder(tf.float32, None)
    result_op = rainbow_agent.project_distribution(
        supports_ph, weights_ph, target_support_ph)

    with self.test_session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        actual = sess.run(
            result_op,
            feed_dict={
                supports_ph: batch_atoms,
                weights_ph: batch_probs,
                target_support_ph: projected_atoms,
            })
        self.assertAllClose(expected, actual)