def testTrainPerArmAgentWithConstraint(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  reward_spec = {
      'reward': tensor_spec.TensorSpec(
          shape=(), dtype=tf.float32, name='reward'),
      'constraint': tensor_spec.TensorSpec(
          shape=(), dtype=tf.float32, name='constraint')
  }
  time_step_spec = ts.time_step_spec(obs_spec, reward_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)

  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      accepts_per_arm_features=True,
      optimizer=optimizer,
      constraints=[neural_constraint])
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = {
      'reward': np.array([0.5, 3.0], dtype=np.float32),
      'constraint': np.array([6.0, 4.0], dtype=np.float32)
  }
  initial_step, final_step = _get_initial_and_final_steps_nested_rewards(
      observations, rewards)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
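# Several tests in this excerpt call a module-level helper, `_get_experience`,
# whose definition is not shown here. The sketch below is one plausible
# implementation, assuming the TF-Agents bandit driver utilities
# (`tf_agents.bandits.drivers.driver_utils`) are available; the actual helper
# in the original test module may differ in detail.
from tf_agents.bandits.drivers import driver_utils


def _get_experience(initial_step, action_step, final_step):
  # Build a single-step bandit trajectory from the (initial step, action,
  # final step) triple ...
  single_experience = driver_utils.trajectory_for_bandit(
      initial_step, action_step, final_step)
  # ... and add an outer time dimension of size 1, so the result matches the
  # [batch, time, ...] layout that agent.train() expects.
  return tf.nest.map_structure(
      lambda x: tf.expand_dims(tf.convert_to_tensor(x), 1), single_experience)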
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 3)
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      optimizer=optimizer,
      epsilon=0.1,
      accepts_per_arm_features=True)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
  }
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  action_step = policy.action(time_steps)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  actions = self.evaluate(action_step.action)
  self.assertAllEqual(actions.shape, (2,))
def testComputeLossWithArmFeatures(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      global_dim=2, per_arm_dim=3, max_num_actions=3)
  time_step_spec = ts.time_step_spec(obs_spec)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec,
          global_layers=(4,),
          arm_layers=(4,),
          common_layers=(4,)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
  }
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

  init_op = neural_constraint.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss = neural_constraint.compute_loss(observations, actions, rewards)
  self.assertGreater(self.evaluate(loss), 0.0)
def testTrainPerArmAgentWithMask(self):
  num_actions = 4
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, num_actions, add_action_mask=True)
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec[0], (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = ({
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
  }, tf.ones([2, num_actions], dtype=tf.int32))
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
      observations, rewards)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testCreateFeedForwardCommonTowerNetworkWithFeatureColumns(
    self, batch_size=2, feature_dim=4, num_actions=3):
  obs_spec = {
      'global': {
          'dense':
              tensor_spec.TensorSpec(shape=(feature_dim,), dtype=tf.float32),
          'composer':
              tensor_spec.TensorSpec((), tf.string)
      },
      'per_arm': {
          'name': tensor_spec.TensorSpec((num_actions,), tf.string),
          'fruit': tensor_spec.TensorSpec((num_actions,), tf.string)
      }
  }
  columns_dense = tf.feature_column.numeric_column(
      'dense', shape=(feature_dim,))
  columns_composer = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'composer', ['wolfgang', 'amadeus', 'mozart']))
  columns_name = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_fruit = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  net = gafn.create_feed_forward_common_tower_network(
      observation_spec=obs_spec,
      global_layers=(4, 3, 2),
      arm_layers=(6, 5, 4),
      common_layers=(7, 6, 5),
      global_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
          [columns_dense, columns_composer]),
      arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
          [columns_name, columns_fruit]))

  input_nest = {
      'global': {
          'dense': tf.constant(np.random.rand(batch_size, feature_dim)),
          'composer': tf.constant(['wolfgang', 'mozart'])
      },
      'per_arm': {
          'name':
              tf.constant([[['george'], ['george'], ['george']],
                           [['bob'], ['bob'], ['bob']]]),
          'fruit':
              tf.constant([[['banana'], ['banana'], ['banana']],
                           [['kiwi'], ['kiwi'], ['kiwi']]])
      }
  }
  output, _ = net(input_nest)
  self.evaluate([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
  ])
  output = self.evaluate(output)
  self.assertAllEqual(output.shape, (batch_size, num_actions))
def testPerArmObservation(self, batch_size, actions_from_reward_layer):
  global_obs_dim = 7
  arm_obs_dim = 3
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      global_obs_dim, arm_obs_dim, self._num_actions)
  time_step_spec = ts.time_step_spec(obs_spec)
  dummy_net = arm_network.create_feed_forward_common_tower_network(
      obs_spec,
      global_layers=(3, 4, 5),
      arm_layers=(3, 2),
      common_layers=(4, 3),
      output_dim=self._encoding_dim)
  reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a[0:1],
      data_vector=self._b[0:1],
      num_samples=self._num_samples_per_arm[0:1],
      epsilon_greedy=0.0,
      time_step_spec=time_step_spec,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))

  current_time_step = self._per_arm_time_step_batch(
      batch_size=batch_size,
      global_obs_dim=global_obs_dim,
      arm_obs_dim=arm_obs_dim)
  action_step = policy.action(current_time_step)
  self.assertEqual(action_step.action.dtype, tf.int32)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(current_time_step)

  input_observation = current_time_step.observation
  encoded_observation, _ = dummy_net(input_observation)

  if actions_from_reward_layer:
    predicted_rewards_from_reward_layer = reward_layer(encoded_observation)
    predicted_rewards_expected = self.evaluate(
        predicted_rewards_from_reward_layer).reshape((-1, self._num_actions))
  else:
    observation_numpy = self.evaluate(encoded_observation)
    predicted_rewards_expected = (
        self._get_predicted_rewards_from_per_arm_linucb(
            observation_numpy, batch_size))

  p_info = self.evaluate(action_step.info)
  self.assertEqual(p_info.predicted_rewards_mean.dtype, np.float32)
  self.assertAllClose(p_info.predicted_rewards_mean,
                      predicted_rewards_expected)
def testTrainPerArmAgentVariableActions(self):
  num_actions = 5
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, num_actions, add_num_actions_feature=True)
  time_step_spec = time_step.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoding_dim = 10
  encoder = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2), encoding_dim))
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=10,
      encoding_dim=encoding_dim,
      accepts_per_arm_features=True,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32),
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.constant([3, 4], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST,
          dtype=tf.int32,
          shape=[2],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST,
          dtype=tf.int32,
          shape=[2],
          name='step_type'),
      tf.constant(rewards, dtype=tf.float32, name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  loss_info, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  loss_value = self.evaluate(loss_info)
  self.assertGreater(loss_value, 0.0)
def testCreateFeedForwardCommonTowerNetwork(self, batch_size, feature_dim,
                                            num_actions):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      7, feature_dim, num_actions)
  net = gafn.create_feed_forward_common_tower_network(
      obs_spec, (4, 3, 2), (6, 5, 4), (7, 6, 5))
  input_nest = tensor_spec.sample_spec_nest(
      obs_spec, outer_dims=(batch_size,))
  output, _ = self.evaluate(net(input_nest))
  self.assertAllEqual(output.shape, (batch_size, num_actions))
def testCreateFeedForwardCommonTowerNetworkWithEmptyLayers(
    self, batch_size, feature_dim, num_actions):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      7, feature_dim, num_actions)
  net = gafn.create_feed_forward_common_tower_network(
      obs_spec, global_layers=(), arm_layers=(), common_layers=())
  input_nest = tensor_spec.sample_spec_nest(
      obs_spec, outer_dims=(batch_size,))
  output, _ = net(input_nest)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  output = self.evaluate(output)
  self.assertAllEqual(output.shape, (batch_size, num_actions))
def testPerArmRewardsVariableNumActions(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))

  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature,
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.constant([2, 3], dtype=tf.int32)
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action, p_info, first_arm_features = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
  self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features[0],
                      first_arm_features[first_action])
def testComputeMaskFromMultipleSourcesMask(self):
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(4, 5, 6)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec, (3, 4), (4, 3), (2, 3)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec, action_spec, constraint_network=constraint_net)
  original_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
  observations = ({
      'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
      'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
  }, original_mask)
  mask = constraints.construct_mask_from_multiple_sources(
      observations, lambda x: (x[0], x[1]), [neural_constraint], 6)
  self.assertAllGreaterEqual(original_mask - mask, 0)
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(
      observation_spec=obs_spec,
      reward_spec=tensor_spec.TensorSpec([3], tf.float32))
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  objective_networks = [
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)) for _ in range(3)
  ]
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      time_step_spec,
      action_spec,
      self._scalarizer,
      objective_networks=objective_networks,
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32),
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.ones([2], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testComputeMaskFromMultipleSourcesNumActionsFeature(self):
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      4, 5, 6, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec, (3, 4), (4, 3), (2, 3)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec, action_spec, constraint_network=constraint_net)
  observations = {
      'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
      'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
      'num_actions': tf.constant([4, 3], dtype=tf.int32)
  }
  mask = constraints.construct_mask_from_multiple_sources(
      observations, None, [neural_constraint], 6)
  implied_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
  self.assertAllGreaterEqual(implied_mask - mask, 0)
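# For reference, the `implied_mask` asserted in the test above can be derived
# directly from a `num_actions` feature with `tf.sequence_mask`. The helper
# below is only an illustrative sketch and is not part of the constraints API.
def _implied_mask_from_num_actions(num_actions, max_num_actions):
  # For num_actions == [4, 3] and max_num_actions == 6 this returns
  # [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]].
  return tf.cast(
      tf.sequence_mask(num_actions, maxlen=max_num_actions), tf.int32)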
def testPerArmRewardsSparseObs(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = {
      'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))

  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec=obs_spec,
          global_layers=(4, 3, 2),
          arm_layers=(6, 5, 4),
          common_layers=(7, 6, 5),
          global_preprocessing_combiner=(
              tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
          arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
              [columns_a, columns_b])))

  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))

  observations = {
      'global': {
          'sport': tf.constant(['snooker', 'chess'])
      },
      'per_arm': {
          'name':
              tf.constant([['george', 'george', 'george'],
                           ['bob', 'bob', 'bob']]),
          'fruit':
              tf.constant([['banana', 'banana', 'banana'],
                           ['kiwi', 'kiwi', 'kiwi']])
      }
  }

  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
  ])
  action, p_info, first_arm_name_feature = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])
def testPerArmRewards(self):
  tf.compat.v1.set_random_seed(3000)
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))

  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  action_feature = tf.cast(
      tf.reshape(tf.random.shuffle(tf.range(24)), shape=[2, 4, 3]),
      dtype=tf.float32)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          action_feature
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action, p_info, first_arm_features = self.evaluate([
      action_step.action, action_step.info,
      observations[bandit_spec_utils.PER_ARM_FEATURE_KEY][0]
  ])
  self.assertAllEqual(action.shape, [2])
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 4])
  self.assertAllEqual(p_info.chosen_arm_features.shape, [2, 3])
  first_action = action[0]
  self.assertAllEqual(p_info.chosen_arm_features[0],
                      first_arm_features[first_action])

  # Check that zeroing out some of the actions does not affect the predicted
  # rewards for unchanged actions. This is to make sure that action feature
  # padding does not influence the behavior.
  if not tf.executing_eagerly():
    # The comparison below only works in TF2, because in TF1 the random
    # per-arm observations get re-drawn between calls.
    return
  padded_action_feature = tf.concat(
      [
          action_feature[:, 0:1, :],
          tf.zeros(shape=[2, 3, 3], dtype=tf.float32)
      ],
      axis=1)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          padded_action_feature
  }
  time_step = ts.restart(observations, batch_size=2)
  padded_action_step = policy.action(time_step, seed=1)
  padded_p_info = self.evaluate(padded_action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean[:, 0],
                      padded_p_info.predicted_rewards_mean[:, 0])
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  feature_dict = np.array([str(i) for i in range(DICTIONARY_SIZE)])

  def _global_context_sampling_fn():
    """Generates one sample of global features.

    It generates a dictionary of size `NUM_GLOBAL_FEATURES`, with the
    following syntax:

    {...,
     'global_feature_4': ['43'],
     ...}

    That is, the values are one-element numpy arrays of strings.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_GLOBAL_FEATURES])]
    global_features = {
        'global_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_GLOBAL_FEATURES)
    }
    return global_features

  def _arm_context_sampling_fn():
    """Generates one sample of arm features.

    It generates a dictionary of size `NUM_ARM_FEATURES`, with the following
    syntax:

    {...,
     'arm_feature_7': ['29'],
     ...}

    That is, the values are one-element numpy arrays of strings. Note that
    the output sample is for one arm and one non-batched time step.

    Returns:
      A dictionary with string keys and numpy string array values.
    """
    generated_features = feature_dict[np.random.randint(
        0, DICTIONARY_SIZE, [NUM_ARM_FEATURES])]
    arm_features = {
        'arm_feature_{}'.format(i): generated_features[[i]]
        for i in range(NUM_ARM_FEATURES)
    }
    return arm_features

  def _reward_fn(global_features, arm_features):
    """Outputs a [0, 1] float given a sample.

    The output reward is generated by hashing the concatenation of feature
    keys and values, then adding all up, taking modulo by 1000, and
    normalizing.

    Args:
      global_features: A dictionary with string keys and 1d string numpy
        array values.
      arm_features: A dictionary with string keys and 1d string numpy array
        values.

    Returns:
      A float value between 0 and 1.
    """
    hashed_global = 0
    for x, y in global_features.items():
      hashed_global += hash(x + y[0])
    hashed_arm = 0
    for x, y in arm_features.items():
      hashed_arm += hash(x + y[0])
    return (hashed_global + hashed_arm) % 1000 / 1000

  env = sspe.StationaryStochasticStructuredPyEnvironment(
      _global_context_sampling_fn,
      _arm_context_sampling_fn,
      NUM_ACTIONS,
      _reward_fn,
      batch_size=BATCH_SIZE)
  environment = tf_py_environment.TFPyEnvironment(env)

  def make_string_feature(name):
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, feature_dict))

  global_columns = [
      make_string_feature('global_feature_{}'.format(i))
      for i in range(NUM_GLOBAL_FEATURES)
  ]
  arm_columns = [
      make_string_feature('arm_feature_{}'.format(i))
      for i in range(NUM_ARM_FEATURES)
  ]
  obs_spec = environment.observation_spec()
  if FLAGS.agent == 'epsGredy':
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2),
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(arm_columns)))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
  elif FLAGS.agent == 'NeuralLinUCB':
    network = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM,
            global_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(global_columns),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers
            .DenseFeatures(arm_columns)))
    agent = neural_linucb_agent.NeuralLinUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        encoding_network=network,
        encoding_network_num_train_steps=EPS_PHASE_STEPS,
        encoding_dim=ENCODING_DIM,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        alpha=1.0,
        gamma=1.0,
        epsilon_greedy=EPSILON,
        accepts_per_arm_features=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)

  if FLAGS.drop_arm_obs:
    drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
  else:
    drop_arm_feature_fn = None
  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      training_data_spec_transformation_fn=drop_arm_feature_fn)
def testSparseObs(self, batch_size, actions_from_reward_layer):
  obs_spec = {
      'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))
  dummy_net = arm_network.create_feed_forward_common_tower_network(
      obs_spec,
      global_layers=(3, 4, 5),
      arm_layers=(3, 2),
      common_layers=(4, 3),
      output_dim=self._encoding_dim,
      global_preprocessing_combiner=(
          tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
      arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
          [columns_a, columns_b]))
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)

  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a[0:1],
      data_vector=self._b[0:1],
      num_samples=self._num_samples_per_arm[0:1],
      epsilon_greedy=0.0,
      time_step_spec=time_step_spec,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))

  observations = {
      'global': {
          'sport': tf.constant(['snooker', 'chess'])
      },
      'per_arm': {
          'name':
              tf.constant([['george', 'george', 'george'],
                           ['bob', 'bob', 'bob']]),
          'fruit':
              tf.constant([['banana', 'banana', 'banana'],
                           ['kiwi', 'kiwi', 'kiwi']])
      }
  }

  time_step = ts.restart(observations, batch_size=2)
  action_fn = common.function_in_tf1()(policy.action)
  action_step = action_fn(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables
  self.evaluate([
      tf.compat.v1.global_variables_initializer(),
      tf.compat.v1.tables_initializer()
  ])
  action = self.evaluate(action_step.action)
  self.assertAllEqual(action.shape, [2])
  p_info = self.evaluate(action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  first_arm_name_feature = observations[
      bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])