  def testScalarizeObjectivesWrongInputRankRaisesError(self):
    objectives_tensor = tf.constant([1], dtype=tf.float32)
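    # A rank-1 tensor should be rejected: scalarize_objectives requires a
    # rank-3 input.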
    with self.assertRaisesRegex(
        ValueError, 'The objectives_tensor should be rank-3, but is rank-1'):
      greedy_multi_objective_policy.scalarize_objectives(
          objectives_tensor, self._scalarizer)

  def testPredictedRewards(self):
    policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        self._time_step_spec,
        self._action_spec,
        self._scalarizer,
        self._create_objective_networks(),
        emit_policy_info=(
            policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
            policy_utilities.InfoFields.
            MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN))
    observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)
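    # A batch of two 2-dimensional observations; the policy should emit one
    # int32 action per batch member.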
    action_step = policy.action(time_step)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables.
    self.evaluate(tf.compat.v1.global_variables_initializer())
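    # The greedy policy picks, for each batch member, the action with the
    # highest scalarized predicted reward.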
    self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
    # The expected values are obtained by passing the observation through the
    # Keras dense layer of the DummyNet (defined above).
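    # Shape: [batch_size, num_objectives, num_actions] = [2, 3, 3].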
    predicted_rewards_expected_array = np.array(
        [[[8, 11, 14], [12, 8, 13], [11, 14, 8]],
         [[5, 8, 11], [10, 5, 9], [8, 11, 5]]])
    p_info = self.evaluate(action_step.info)
    predicted_rewards = getattr(
        p_info, policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
    self.assertAllClose(predicted_rewards, predicted_rewards_expected_array)
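    # The scalarized predicted rewards reported in the policy info should agree
    # with applying scalarize_objectives to the predicted rewards directly.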
    self.assertAllClose(
        getattr(
            p_info, policy_utilities.InfoFields.
            MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN),
        greedy_multi_objective_policy.scalarize_objectives(
            predicted_rewards, policy.scalarizer))

  def testScalarizeObjectivesWrongNumberOfObjectivesRaisesError(self):
    objectives_tensor = tf.constant(
        [[[1, 2, 3]], [[4, 5, 6]]], dtype=tf.float32)
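    # Rank-3 tensor of shape [2, 1, 3]: only one objective along axis 1, while
    # the scalarizer expects three.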
    with self.assertRaisesRegex(
        ValueError, 'The number of input objectives should be 3, but is 1'):
      self.evaluate(
          greedy_multi_objective_policy.scalarize_objectives(
              objectives_tensor, self._scalarizer))

  def testScalarizeObjectives(self):
    objectives_tensor = tf.constant(
        [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
        dtype=tf.float32)
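    # Shape [2, 3, 2]: two batch entries with three objectives each.
    # Scalarizing collapses the objectives dimension, yielding a [2, 2] result.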
    scalarized_reward = greedy_multi_objective_policy.scalarize_objectives(
        objectives_tensor, self._scalarizer)
    self.assertAllClose(
        self.evaluate(scalarized_reward), [[3, 4], [9, 10]], rtol=1e-4,
        atol=1e-3)