def testUpdate(self):
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, self._action_spec, self._scalarizer,
      self._create_objective_networks())
  new_policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, self._action_spec, self._scalarizer,
      self._create_objective_networks())
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  new_action_step = new_policy.action(time_step)
  self.assertEqual(len(policy.variables()), 6)
  self.assertEqual(len(new_policy.variables()), 6)
  self.assertEqual(action_step.action.shape, new_action_step.action.shape)
  self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertIsNone(self.evaluate(new_policy.update(policy)))
  self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
  self.assertAllEqual(self.evaluate(new_action_step.action), [2, 0])

def testBuild(self):
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, self._action_spec, self._scalarizer,
      self._create_objective_networks())
  self.assertEqual(policy.time_step_spec, self._time_step_spec)
  self.assertEqual(policy.action_spec, self._action_spec)

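# The fixtures self._time_step_spec, self._action_spec, self._scalarizer, and
# self._create_objective_networks() used throughout these tests come from a
# setUp method that is not part of this excerpt. A minimal sketch of such a
# fixture, assuming a 2-dimensional observation, three actions, and a
# HyperVolumeScalarizer over three objectives; the method name and all
# parameter values below are illustrative assumptions, not the exact fixture:
def setUpSketch(self):
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
  # The direction vector and per-objective slope/offset transforms
  # parameterize the hypervolume scalarization.
  self._scalarizer = multi_objective_scalarizer.HyperVolumeScalarizer(
      direction=[0.0, 0.0, 1.0],
      transform_params=[{
          multi_objective_scalarizer.HyperVolumeScalarizer.SLOPE_KEY: 1.0,
          multi_objective_scalarizer.HyperVolumeScalarizer.OFFSET_KEY: 0.0,
      }] * 3)
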
def _create_arm_policy_and_observations(
    self
) -> Tuple[greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy,
           Dict[Text, tf.Tensor]]:
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  objective_networks = [
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
  ]
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      time_step_spec,
      action_spec,
      self._scalarizer,
      objective_networks,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [2, 1]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature,
  }
  return policy, observations

def testPredictedRewards(self):
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      self._create_objective_networks(),
      emit_policy_info=(
          policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
          policy_utilities.InfoFields
          .MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN))
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
  # The expected values are obtained by passing the observation through the
  # Keras dense layer of the DummyNet (defined above).
  predicted_rewards_expected_array = np.array([[[8, 11, 14], [12, 8, 13],
                                                [11, 14, 8]],
                                               [[5, 8, 11], [10, 5, 9],
                                                [8, 11, 5]]])
  p_info = self.evaluate(action_step.info)
  predicted_rewards = getattr(
      p_info, policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
  self.assertAllClose(predicted_rewards, predicted_rewards_expected_array)
  self.assertAllClose(
      getattr(
          p_info, policy_utilities.InfoFields
          .MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN),
      greedy_multi_objective_policy.scalarize_objectives(
          predicted_rewards, policy.scalarizer))

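# The DummyNet fixture referenced in the comment above is not included in this
# excerpt. Below is a minimal sketch of such a fixed-weight network, assuming
# `from tf_agents.networks import network`; the class name, constructor, and
# weights are illustrative assumptions, not the exact fixture. For instance, a
# kernel of [[1, 2, 3], [4, 5, 6]] with bias [-1, -1, -1] reproduces the first
# objective's expected rows [8, 11, 14] and [5, 8, 11] asserted above.
class SketchDummyNet(network.Network):
  """A linear network with constant weights for reproducible predictions."""

  def __init__(self, observation_spec, kernel, bias, name=None):
    super(SketchDummyNet, self).__init__(
        input_tensor_spec=observation_spec, state_spec=(), name=name)
    # Constant initializers pin the layer weights, so every call maps an
    # observation to the same per-action reward estimates.
    self._layer = tf.keras.layers.Dense(
        units=3,
        kernel_initializer=tf.constant_initializer(kernel),
        bias_initializer=tf.constant_initializer(bias))

  def call(self, observation, step_type=None, network_state=()):
    del step_type  # Deterministic output; the step type is not used.
    return self._layer(tf.cast(observation, tf.float32)), network_state
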
def testTooFewNetworksRaiseError(self):
  with self.assertRaisesRegexp(
      ValueError,
      'Number of objectives should be at least two, but found to be 1'):
    greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        self._time_step_spec, self._action_spec, self._scalarizer,
        [self._create_objective_networks()[0]])

def testMultipleActionsRaiseError(self):
  action_spec = [tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)] * 2
  with self.assertRaisesRegexp(
      NotImplementedError,
      'action_spec can only contain a single BoundedTensorSpec'):
    greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        self._time_step_spec, action_spec, self._scalarizer,
        self._create_objective_networks())

def testWrongActionsRaiseError(self):
  action_spec = tensor_spec.BoundedTensorSpec((5, 6, 7), tf.float32, 0, 2)
  with self.assertRaisesRegexp(
      NotImplementedError,
      'action_spec must be a BoundedTensorSpec of type int32.*'):
    greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        self._time_step_spec, action_spec, self._scalarizer,
        self._create_objective_networks())

def testUnmatchingPolicyState(self):
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, self._action_spec, self._scalarizer,
      self._create_objective_networks())
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  with self.assertRaisesRegexp(
      ValueError,
      'policy_state and policy_state_spec structures do not match:'):
    policy.action(time_step, policy_state=[()])

def testActionHeteroscedastic(self):
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, self._action_spec, self._scalarizer,
      self._create_heteroscedastic_networks())
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [2, 0])

def testWrongOutputLayerRaiseError(self):
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 20)
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, action_spec, self._scalarizer,
      self._create_objective_networks())
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  with self.assertRaisesRegexp(
      ValueError,
      r'The number of actions \(11\) does not match objective network 0'
      r' output size \(3\)\.'):
    policy.action(time_step)

def testActionScalarSpecWithShift(self):
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 10, 12)
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, action_spec, self._scalarizer,
      self._create_objective_networks())
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [12, 10])

def testNoneTimeStepSpecForPerArmFeaturesRaisesError(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  objective_networks = [
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
  ]
  with self.assertRaisesRegexp(
      ValueError,
      'time_step_spec should not be None for per-arm-features policies'):
    greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        None,
        action_spec,
        self._scalarizer,
        objective_networks,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))

def testSetScalarizationParameters(self):
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      self._time_step_spec, self._action_spec, self._scalarizer,
      self._create_objective_networks())
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  policy.scalarizer.set_parameters(
      direction=tf.constant([[0, 1, 0], [0, 0, 1]], dtype=tf.float32),
      transform_params={
          multi_objective_scalarizer.HyperVolumeScalarizer.SLOPE_KEY:
              tf.constant([[0.2, 0.2, 0.2], [0.1, 0.1, 0.1]],
                          dtype=tf.float32),
          multi_objective_scalarizer.HyperVolumeScalarizer.OFFSET_KEY:
              tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.float32),
      })
  action_step = policy.action(time_step)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [2, 1])

def testMaskedAction(self):
  observation_spec = (tensor_spec.TensorSpec([2], tf.float32),
                      tensor_spec.TensorSpec([3], tf.int32))
  time_step_spec = ts.time_step_spec(observation_spec)

  def split_fn(obs):
    return obs[0], obs[1]

  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      time_step_spec,
      self._action_spec,
      self._scalarizer,
      self._create_objective_networks(),
      observation_and_action_constraint_splitter=split_fn)
  observations = (tf.constant([[1, 2], [2, 1]], dtype=tf.float32),
                  tf.constant([[0, 0, 1], [0, 1, 0]], dtype=tf.int32))
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [2, 1])

def __init__(
    self,
    time_step_spec: Optional[ts.TimeStep],
    action_spec: Optional[types.NestedBoundedTensorSpec],
    scalarizer: multi_objective_scalarizer.Scalarizer,
    objective_networks: Sequence[Network],
    optimizer: tf.keras.optimizers.Optimizer,
    observation_and_action_constraint_splitter: types.Splitter = None,
    accepts_per_arm_features: bool = False,
    # Params for training.
    error_loss_fn: Callable[
        ..., tf.Tensor] = tf.compat.v1.losses.mean_squared_error,
    gradient_clipping: Optional[float] = None,
    # Params for debugging.
    debug_summaries: bool = False,
    summarize_grads_and_vars: bool = False,
    enable_summaries: bool = True,
    emit_policy_info: Tuple[Text, ...] = (),
    train_step_counter: Optional[tf.Variable] = None,
    name: Optional[Text] = None):
  """Creates a Greedy Multi-objective Neural Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    scalarizer: A
      `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
      object that implements scalarization of multiple objectives into a
      single scalar reward.
    objective_networks: A sequence of `tf_agents.network.Network` objects to
      be used by the agent. Each network will be called with
      call(observation, step_type) and is expected to provide a prediction
      for a specific objective for all actions.
    optimizer: A `tf.keras.optimizers.Optimizer` object, the optimizer to
      use for training.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit agent and
      policy, and 2) the boolean mask of shape `[batch_size, num_actions]`.
      This function should also work with a `TensorSpec` as input, and
      should output `TensorSpec` objects for the observation and mask.
    accepts_per_arm_features: (bool) Whether the agent accepts per-arm
      features.
    error_loss_fn: A function for computing the error loss, taking
      parameters labels, predictions, and weights (any function from
      tf.losses would work). The default is `tf.losses.mean_squared_error`.
    gradient_clipping: A float representing the norm length to clip
      gradients (or None for no clipping).
    debug_summaries: A Python bool, default False. When True, debug
      summaries are gathered.
    summarize_grads_and_vars: A Python bool, default False. When True,
      gradients and network variable summaries are written during training.
    enable_summaries: A Python bool, default True. When False, all
      summaries (debug or otherwise) should not be written.
    emit_policy_info: (tuple of strings) What side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    train_step_counter: An optional `tf.Variable` to increment every time
      the train op is run. Defaults to the `global_step`.
    name: Python str name of this agent. All variables in this module will
      fall under that name. Defaults to the class name.

  Raises:
    ValueError:
      - If the action spec contains more than one action, or if it is not a
        bounded scalar int32 spec with minimum 0.
      - If `objective_networks` has fewer than two networks.
""" tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._accepts_per_arm_features = accepts_per_arm_features self._num_objectives = len(objective_networks) if self._num_objectives < 2: raise ValueError( 'Number of objectives should be at least two, but found to be {}' .format(self._num_objectives)) self._objective_networks = objective_networks self._optimizer = optimizer self._error_loss_fn = error_loss_fn self._gradient_clipping = gradient_clipping self._heteroscedastic = [ isinstance(network, heteroscedastic_q_network.HeteroscedasticQNetwork) for network in objective_networks ] policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy( time_step_spec, action_spec, scalarizer, self._objective_networks, observation_and_action_constraint_splitter, accepts_per_arm_features=accepts_per_arm_features, emit_policy_info=emit_policy_info) training_data_spec = None if accepts_per_arm_features: training_data_spec = bandit_spec_utils.drop_arm_observation( policy.trajectory_spec) super(GreedyMultiObjectiveNeuralAgent, self).__init__(time_step_spec, action_spec, policy, collect_policy=policy, train_sequence_length=None, training_data_spec=training_data_spec, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, train_step_counter=train_step_counter)
def testPerArmRewardsSparseObs(self):
  obs_spec = {
      'global': {
          'sport': tensor_spec.TensorSpec((), tf.string)
      },
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))
  objective_networks = []
  for _ in range(3):
    objective_networks.append(
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            observation_spec=obs_spec,
            global_layers=(4, 3, 2),
            arm_layers=(6, 5, 4),
            common_layers=(7, 6, 5),
            global_preprocessing_combiner=(
                tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures([columns_a, columns_b])))
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
  policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
      time_step_spec,
      action_spec,
      self._scalarizer,
      objective_networks,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  observations = {
      'global': {
          'sport': tf.constant(['snooker', 'chess'])
      },
      'per_arm': {
          'name':
              tf.constant([['george', 'george', 'george'],
                           ['bob', 'bob', 'bob']]),
          'fruit':
              tf.constant([['banana', 'banana', 'banana'],
                           ['kiwi', 'kiwi', 'kiwi']])
      }
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
  ])
  action, p_info, first_arm_name_feature = self.evaluate([
      action_step.action,
      action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])