    def testUpdate(self):
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, self._action_spec, self._scalarizer,
            self._create_objective_networks())
        new_policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, self._action_spec, self._scalarizer,
            self._create_objective_networks())

        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)

        action_step = policy.action(time_step)
        new_action_step = new_policy.action(time_step)

        self.assertEqual(len(policy.variables()), 6)
        self.assertEqual(len(new_policy.variables()), 6)
        self.assertEqual(action_step.action.shape,
                         new_action_step.action.shape)
        self.assertEqual(action_step.action.dtype,
                         new_action_step.action.dtype)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertIsNone(self.evaluate(new_policy.update(policy)))

        # After the update, the new policy carries the same variable values as
        # the source policy, so both emit identical greedy actions.
        self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
        self.assertAllEqual(self.evaluate(new_action_step.action), [2, 0])
    def testBuild(self):
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, self._action_spec, self._scalarizer,
            self._create_objective_networks())

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)
    def _create_arm_policy_and_observations(
        self
    ) -> Tuple[greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy,
               Dict[Text, tf.Tensor]]:
        """Builds a per-arm-features policy and a batch of observations."""
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
        objective_networks = [
            global_and_arm_feature_network.
            create_feed_forward_common_tower_network(
                obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
        ]
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            time_step_spec,
            action_spec,
            self._scalarizer,
            objective_networks,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean',))
        action_feature = tf.cast(
            tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
            dtype=tf.float32)
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
                tf.constant([[1, 2], [2, 1]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
                action_feature,
        }
        return policy, observations
    def testPredictedRewards(self):
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec,
            self._action_spec,
            self._scalarizer,
            self._create_objective_networks(),
            emit_policy_info=(
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
                policy_utilities.InfoFields.
                MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN))
        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
        # The expected values are obtained by passing the observation through
        # the Keras dense layer of the DummyNet (defined above).
        predicted_rewards_expected_array = np.array(
            [[[8, 11, 14], [12, 8, 13], [11, 14, 8]],
             [[5, 8, 11], [10, 5, 9], [8, 11, 5]]])
        p_info = self.evaluate(action_step.info)
        predicted_rewards = getattr(
            p_info, policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
        self.assertAllClose(predicted_rewards,
                            predicted_rewards_expected_array)
        self.assertAllClose(
            getattr(
                p_info, policy_utilities.InfoFields.
                MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN),
            greedy_multi_objective_policy.scalarize_objectives(
                predicted_rewards, policy.scalarizer))
    def testTooFewNetworksRaiseError(self):
        with self.assertRaisesRegexp(
                ValueError,
                'Number of objectives should be at least two, but found to be 1'
        ):
            greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
                self._time_step_spec, self._action_spec, self._scalarizer,
                [self._create_objective_networks()[0]])
    def testMultipleActionsRaiseError(self):
        action_spec = [tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)] * 2
        with self.assertRaisesRegexp(
                NotImplementedError,
                'action_spec can only contain a single BoundedTensorSpec'):
            greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
                self._time_step_spec, action_spec, self._scalarizer,
                self._create_objective_networks())
    def testWrongActionsRaiseError(self):
        action_spec = tensor_spec.BoundedTensorSpec((5, 6, 7), tf.float32, 0, 2)
        with self.assertRaisesRegexp(
                NotImplementedError,
                'action_spec must be a BoundedTensorSpec of type int32.*'):
            greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
                self._time_step_spec, action_spec, self._scalarizer,
                self._create_objective_networks())
    def testUnmatchingPolicyState(self):
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, self._action_spec, self._scalarizer,
            self._create_objective_networks())
        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        with self.assertRaisesRegexp(
                ValueError,
                'policy_state and policy_state_spec structures do not match:'):
            policy.action(time_step, policy_state=[()])
    def testActionHeteroscedastic(self):
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, self._action_spec, self._scalarizer,
            self._create_heteroscedastic_networks())
        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
    def testWrongOutputLayerRaiseError(self):
        # The spec covers 11 actions (10 through 20), but each objective
        # network only outputs 3 values per observation.
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 20)
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, action_spec, self._scalarizer,
            self._create_objective_networks())
        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        with self.assertRaisesRegexp(
                ValueError,
                r'The number of actions \(11\) does not match objective network 0'
                r' output size \(3\)\.'):
            policy.action(time_step)
    def testActionScalarSpecWithShift(self):
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 12)
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, action_spec, self._scalarizer,
            self._create_objective_networks())

        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
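        # The action spec's minimum is 10, so the greedy indices 2 and 0 are
        # shifted to 12 and 10.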
        self.assertAllEqual(self.evaluate(action_step.action), [12, 10])
    def testNoneTimeStepSpecForPerArmFeaturesRaisesError(self):
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
        objective_networks = [
            global_and_arm_feature_network.
            create_feed_forward_common_tower_network(
                obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
        ]
        with self.assertRaisesRegexp(
                ValueError,
                'time_step_spec should not be None for per-arm-features policies'
        ):
            greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
                None,
                action_spec,
                self._scalarizer,
                objective_networks,
                accepts_per_arm_features=True,
                emit_policy_info=('predicted_rewards_mean',))
    def testSetScalarizationParameters(self):
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            self._time_step_spec, self._action_spec, self._scalarizer,
            self._create_objective_networks())
        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        policy.scalarizer.set_parameters(
            direction=tf.constant([[0, 1, 0], [0, 0, 1]], dtype=tf.float32),
            transform_params={
                multi_objective_scalarizer.HyperVolumeScalarizer.SLOPE_KEY:
                    tf.constant([[0.2, 0.2, 0.2], [0.1, 0.1, 0.1]],
                                dtype=tf.float32),
                multi_objective_scalarizer.HyperVolumeScalarizer.OFFSET_KEY:
                    tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.float32)
            })
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        # With per-example scalarization parameters, the greedy choice for the
        # second observation shifts from the default action 0 to action 1.
        self.assertAllEqual(self.evaluate(action_step.action), [2, 1])
    def testMaskedAction(self):
        observation_spec = (tensor_spec.TensorSpec([2], tf.float32),
                            tensor_spec.TensorSpec([3], tf.int32))
        time_step_spec = ts.time_step_spec(observation_spec)

        # The observation is a (features, mask) pair; the splitter hands the
        # features to the networks and the mask to the policy.
        def split_fn(obs):
            return obs[0], obs[1]

        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            time_step_spec,
            self._action_spec,
            self._scalarizer,
            self._create_objective_networks(),
            observation_and_action_constraint_splitter=split_fn)

        observations = (tf.constant([[1, 2], [2, 1]], dtype=tf.float32),
                        tf.constant([[0, 0, 1], [0, 1, 0]], dtype=tf.int32))
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
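        # The masks [0, 0, 1] and [0, 1, 0] leave only actions 2 and 1
        # feasible, respectively.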
        self.assertAllEqual(self.evaluate(action_step.action), [2, 1])
    def __init__(
            self,
            time_step_spec: Optional[ts.TimeStep],
            action_spec: Optional[types.NestedBoundedTensorSpec],
            scalarizer: multi_objective_scalarizer.Scalarizer,
            objective_networks: Sequence[Network],
            optimizer: tf.keras.optimizers.Optimizer,
            observation_and_action_constraint_splitter: types.Splitter = None,
            accepts_per_arm_features: bool = False,
            # Params for training.
            error_loss_fn: Callable[
                ..., tf.Tensor] = tf.compat.v1.losses.mean_squared_error,
            gradient_clipping: Optional[float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            enable_summaries: bool = True,
            emit_policy_info: Tuple[Text, ...] = (),
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates a Greedy Multi-objective Neural Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      scalarizer: A
       `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
        object that implements scalarization of multiple objectives into a
        single scalar reward.
      objective_networks: A Sequence of `tf_agents.network.Network` objects to
        be used by the agent. Each network will be called with
        call(observation, step_type) and is expected to provide a prediction for
        a specific objective for all actions.
      optimizer: A `tf.keras.optimizers.Optimizer` object, the optimizer to use
        for training.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask of shape `[batch_size, num_actions]`.
        This function should also work with a `TensorSpec` as input, and should
        output `TensorSpec` objects for the observation and mask.
      accepts_per_arm_features: (bool) Whether the agent accepts per-arm
        features.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping).
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError:
        - If the action spec contains more than one action or it is not a
          bounded scalar int32 spec with minimum 0.
        - If `objective_networks` has fewer than two networks.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)
        self._accepts_per_arm_features = accepts_per_arm_features

        self._num_objectives = len(objective_networks)
        if self._num_objectives < 2:
            raise ValueError(
                'Number of objectives should be at least two, but found to be {}'
                .format(self._num_objectives))
        self._objective_networks = objective_networks
        self._optimizer = optimizer
        self._error_loss_fn = error_loss_fn
        self._gradient_clipping = gradient_clipping
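        # Track which objective networks are heteroscedastic, i.e. predict a
        # variance alongside the mean reward for each action.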
        self._heteroscedastic = [
            isinstance(network,
                       heteroscedastic_q_network.HeteroscedasticQNetwork)
            for network in objective_networks
        ]

        # The greedy policy scalarizes the per-objective predictions; it is
        # used both as the agent's policy and as its collect policy.
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            time_step_spec,
            action_spec,
            scalarizer,
            self._objective_networks,
            observation_and_action_constraint_splitter,
            accepts_per_arm_features=accepts_per_arm_features,
            emit_policy_info=emit_policy_info)
        training_data_spec = None
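        # For per-arm agents, the per-arm observations are dropped from the
        # training data spec; the chosen arm's features are carried in the
        # policy info instead.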
        if accepts_per_arm_features:
            training_data_spec = bandit_spec_utils.drop_arm_observation(
                policy.trajectory_spec)

        super(GreedyMultiObjectiveNeuralAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy=policy,
                             train_sequence_length=None,
                             training_data_spec=training_data_spec,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_step_counter=train_step_counter)
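
    # A minimal construction sketch (illustrative only): `my_scalarizer` and
    # `my_objective_networks` stand in for a Scalarizer and a sequence of
    # per-objective networks built elsewhere; the optimizer settings shown here
    # are placeholders, not recommendations.
    #
    #   agent = GreedyMultiObjectiveNeuralAgent(
    #       time_step_spec,
    #       action_spec,
    #       scalarizer=my_scalarizer,
    #       objective_networks=my_objective_networks,
    #       optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    #       emit_policy_info=(
    #           policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))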
    def testPerArmRewardsSparseObs(self):
        obs_spec = {
            'global': {
                'sport': tensor_spec.TensorSpec((), tf.string)
            },
            'per_arm': {
                'name': tensor_spec.TensorSpec((3, ), tf.string),
                'fruit': tensor_spec.TensorSpec((3, ), tf.string)
            }
        }
        columns_a = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'name', ['bob', 'george', 'wanda']))
        columns_b = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'fruit', ['banana', 'kiwi', 'pear']))
        columns_c = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                'sport', ['bridge', 'chess', 'snooker']))

        objective_networks = []
        for _ in range(3):
            objective_networks.append(
                global_and_arm_feature_network.
                create_feed_forward_common_tower_network(
                    observation_spec=obs_spec,
                    global_layers=(4, 3, 2),
                    arm_layers=(6, 5, 4),
                    common_layers=(7, 6, 5),
                    global_preprocessing_combiner=(
                        tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
                    arm_preprocessing_combiner=tf.compat.v2.keras.layers.
                    DenseFeatures([columns_a, columns_b])))
        time_step_spec = ts.time_step_spec(obs_spec)
        action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
        policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
            time_step_spec,
            action_spec,
            self._scalarizer,
            objective_networks,
            accepts_per_arm_features=True,
            emit_policy_info=('predicted_rewards_mean', ))
        observations = {
            'global': {
                'sport': tf.constant(['snooker', 'chess'])
            },
            'per_arm': {
                'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
                'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
            }
        }

        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        action, p_info, first_arm_name_feature = self.evaluate([
            action_step.action, action_step.info,
            observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
        ])
        self.assertAllEqual(action.shape, [2])
        self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3, 3])
        self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
        self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
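        # The chosen arm features recorded in the policy info must match the
        # features of the arm actually selected for the first example.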
        first_action = action[0]
        self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                            first_arm_name_feature[first_action])