  def testDynamicBatchSize(self):
    policy_logits = tf.placeholder(tf.float32, shape=[None, 3])
    action_values = tf.placeholder(tf.float32, shape=[None])
    actions = tf.placeholder(tf.int32, shape=[None])
    loss = pg_ops.discrete_policy_gradient(policy_logits, actions,
                                           action_values)
    self.assertEqual(loss.get_shape().as_list(), [None])
    gradients = tf.gradients(tf.reduce_sum(loss), [policy_logits])
    self.assertAllEqual(gradients[0].get_shape().as_list(), [None, 3])
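  def testDynamicBatchSizeRun(self):
    # Hedged sketch, not part of the original tests: feed batches of two
    # different sizes through the same placeholder-built graph to confirm that
    # the dynamic batch dimension resolves at runtime (assumes the TF1-style
    # test_session used by the other tests in this file).
    with self.test_session() as sess:
      policy_logits = tf.placeholder(tf.float32, shape=[None, 3])
      action_values = tf.placeholder(tf.float32, shape=[None])
      actions = tf.placeholder(tf.int32, shape=[None])
      loss = pg_ops.discrete_policy_gradient(policy_logits, actions,
                                             action_values)
      for batch_size in (1, 5):
        loss_value = sess.run(loss, feed_dict={
            policy_logits: [[0., 0., 0.]] * batch_size,
            action_values: [1.] * batch_size,
            actions: [0] * batch_size})
        self.assertEqual(loss_value.shape, (batch_size,))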
  def testLoss(self):
    with self.test_session() as sess:
      policy_logits = tf.constant([[0, 1], [0, 1], [1, 1], [0, 100]],
                                  dtype=tf.float32)
      action_values = tf.constant([0, 1, 2, 1], dtype=tf.float32)
      actions = tf.constant([0, 0, 1, 1], dtype=tf.int32)
      loss = pg_ops.discrete_policy_gradient(policy_logits, actions,
                                             action_values)
      self.assertEqual(loss.get_shape(), tf.TensorShape(4))

      # Calculate the targets with:
      #     loss = action_value*(-logits[action] + log(sum_a(exp(logits[a]))))
      # The final case (with large logits) runs out of precision and gets
      # truncated to 0, but isn't `nan`.
      self.assertAllClose(sess.run(loss), [0, 1.313262, 1.386294, 0])
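  def testLossMatchesReferenceFormula(self):
    # Hedged sketch, not part of the original tests: recompute the targets
    # directly from the documented formula
    #     loss = action_value*(-logits[action] + log(sum_a(exp(logits[a]))))
    # with standard TF ops and compare against discrete_policy_gradient.
    with self.test_session() as sess:
      policy_logits = tf.constant([[0, 1], [0, 1], [1, 1], [0, 100]],
                                  dtype=tf.float32)
      action_values = tf.constant([0, 1, 2, 1], dtype=tf.float32)
      actions = tf.constant([0, 0, 1, 1], dtype=tf.int32)
      loss = pg_ops.discrete_policy_gradient(policy_logits, actions,
                                             action_values)
      # logits[action] via a one-hot mask; log(sum(exp)) via reduce_logsumexp.
      chosen_logits = tf.reduce_sum(
          policy_logits * tf.one_hot(actions, 2, dtype=tf.float32), axis=1)
      reference = action_values * (
          tf.reduce_logsumexp(policy_logits, axis=1) - chosen_logits)
      self.assertAllClose(*sess.run([loss, reference]))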
  def testGradients(self):
    with self.test_session() as sess:
      policy_logits = tf.constant([[0, 1], [0, 1], [1, 1], [0, 100]],
                                  dtype=tf.float32)
      action_values = tf.constant([0, 1, 2, 1], dtype=tf.float32)
      actions = tf.constant([0, 0, 1, 1], dtype=tf.int32)
      loss = pg_ops.discrete_policy_gradient(policy_logits, actions,
                                             action_values)
      total_loss = tf.reduce_sum(loss)
      gradients = tf.gradients([total_loss], [policy_logits])
      grad_policy_logits = sess.run(gradients[0])
      # The final case (with large logits) runs out of precision and gets
      # truncated to 0, but isn't `nan`.
      self.assertAllClose(grad_policy_logits,
                          [[0, 0], [-0.731, 0.731], [1, -1], [0, 0]], atol=1e-4)

      self.assertAllEqual(tf.gradients([total_loss], [actions, action_values]),
                          [None, None])
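  def testGradientsMatchSoftmaxFormula(self):
    # Hedged sketch, not part of the original tests: the analytic gradient of
    # the summed loss w.r.t. the logits is
    #     action_value * (softmax(logits) - one_hot(action)),
    # which is where the [-0.731, 0.731] and [1, -1] rows above come from.
    with self.test_session() as sess:
      policy_logits = tf.constant([[0, 1], [0, 1], [1, 1], [0, 100]],
                                  dtype=tf.float32)
      action_values = tf.constant([0, 1, 2, 1], dtype=tf.float32)
      actions = tf.constant([0, 0, 1, 1], dtype=tf.int32)
      loss = pg_ops.discrete_policy_gradient(policy_logits, actions,
                                             action_values)
      gradients = tf.gradients([tf.reduce_sum(loss)], [policy_logits])
      expected = tf.expand_dims(action_values, 1) * (
          tf.nn.softmax(policy_logits) - tf.one_hot(actions, 2,
                                                    dtype=tf.float32))
      self.assertAllClose(*sess.run([gradients[0], expected]), atol=1e-4)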