Example #1
    def testNeuralLinUCBUpdateNumTrainSteps10(self,
                                              batch_size=1,
                                              context_dim=10):
        """Check NeuralLinUCBAgent updates when behaving like eps-greedy."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        encoder = DummyNet(observation_spec)
        encoding_dim = 10
        variable_collection = neural_linucb_agent.NeuralLinUCBVariableCollection(
            num_actions, encoding_dim)
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            encoding_network=encoder,
            encoding_network_num_train_steps=10,
            encoding_dim=encoding_dim,
            variable_collection=variable_collection,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))

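        # In graph mode, `agent.train` below only builds the training op; the
        # variable initializers that follow run before the loss is evaluated.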
        loss_info, _ = agent.train(experience)
        self.evaluate(agent.initialize())
        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_value = self.evaluate(loss_info)
        self.assertGreater(loss_value, 0.0)
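
The test above relies on helper functions defined elsewhere in its test module. A minimal sketch of what they might look like, as a hypothetical reconstruction assuming single-step, batched bandit trajectories (the real TF-Agents helpers may differ in detail):

# Sketch only: plausible reconstructions of the helpers used above.
from tf_agents.trajectories import policy_step, trajectory


def _get_initial_and_final_steps(batch_size, context_dim):
    observation = np.random.rand(batch_size, context_dim).astype(np.float32)

    def _step(step_type_value, reward):
        return time_step.TimeStep(
            tf.constant(step_type_value, dtype=tf.int32, shape=[batch_size],
                        name='step_type'),
            tf.constant(reward, dtype=tf.float32, name='reward'),
            tf.ones([batch_size], dtype=tf.float32, name='discount'),
            tf.constant(observation, name='observation'))

    initial_step = _step(time_step.StepType.FIRST,
                         np.zeros(batch_size, dtype=np.float32))
    final_step = _step(time_step.StepType.LAST,
                       np.random.rand(batch_size).astype(np.float32))
    return initial_step, final_step


def _get_action_step(action):
    return policy_step.PolicyStep(action=tf.convert_to_tensor(action))


def _get_experience(initial_step, action_step, final_step):
    # Assemble a single-step Trajectory and add a time dimension of size 1,
    # so every tensor has shape [batch_size, 1, ...] as `agent.train` expects.
    single_experience = trajectory.Trajectory(
        step_type=initial_step.step_type,
        observation=initial_step.observation,
        action=action_step.action,
        policy_info=action_step.info,
        next_step_type=final_step.step_type,
        reward=final_step.reward,
        discount=final_step.discount)
    return tf.nest.map_structure(
        lambda x: tf.expand_dims(tf.convert_to_tensor(x), axis=1),
        single_experience)
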
Example #2
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    class LinearNormalReward(object):
        def __init__(self, theta):
            self.theta = theta

        def __call__(self, x):
            mu = np.dot(x, self.theta)
            return np.random.normal(mu, 1)

    def _global_context_sampling_fn():
        return np.random.randint(-10, 10, [4]).astype(np.float32)

    def _arm_context_sampling_fn():
        return np.random.randint(-2, 3, [5]).astype(np.float32)

    reward_fn = LinearNormalReward(HIDDEN_PARAM)

    observation_and_action_constraint_splitter = None
    num_actions_fn = None
    variable_action_method = bandit_spec_utils.VariableActionMethod.FIXED
    if FLAGS.add_num_actions_feature:
        num_actions_fn = lambda: NUM_ACTIONS
        variable_action_method = (
            bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE)

    env = sspe.StationaryStochasticPerArmPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        reward_fn,
        num_actions_fn,
        batch_size=BATCH_SIZE,
        variable_action_method=variable_action_method)
    environment = tf_py_environment.TFPyEnvironment(env)

    if FLAGS.agent == 'LinUCB':
        agent = lin_ucb_agent.LinearUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            accepts_per_arm_features=True,
            dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
        agent = lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=True,
            dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
        obs_spec = environment.observation_spec()
        if FLAGS.network == 'commontower':
            network = (global_and_arm_feature_network.
                       create_feed_forward_common_tower_network(
                           obs_spec, (40, 30), (30, 40), (40, 20)))
        elif FLAGS.network == 'dotproduct':
            network = (global_and_arm_feature_network.
                       create_feed_forward_dot_product_network(
                           obs_spec, (4, 3, 6), (3, 4, 6)))
        agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )
    elif FLAGS.agent == 'NeuralLinUCB':
        obs_spec = environment.observation_spec()
        network = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM))
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            encoding_network=network,
            encoding_network_num_train_steps=EPS_PHASE_STEPS,
            encoding_dim=ENCODING_DIM,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=EPSILON,
            accepts_per_arm_features=True,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )

    def _all_rewards(observation, hidden_param):
        """Outputs rewards for all actions, given an observation."""
        hidden_param = tf.cast(hidden_param, dtype=tf.float32)
        global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
        per_arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
        num_actions = tf.shape(per_arm_obs)[1]
        tiled_global = tf.tile(tf.expand_dims(global_obs, axis=1),
                               [1, num_actions, 1])
        concatenated = tf.concat([tiled_global, per_arm_obs], axis=-1)
        rewards = tf.linalg.matvec(concatenated, hidden_param)
        return rewards
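    # Shape note for `_all_rewards` above, assuming `HIDDEN_PARAM` has length
    # 4 + 5 = 9 to match the sampling functions: global_obs is [batch, 4],
    # per_arm_obs is [batch, num_actions, 5]; tiling and concatenating gives
    # [batch, num_actions, 9], and the matvec with hidden_param yields a
    # [batch, num_actions] matrix of per-arm rewards.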

    def optimal_reward(observation, hidden_param):
        return tf.reduce_max(_all_rewards(observation, hidden_param), axis=1)

    def optimal_action(observation, hidden_param):
        return tf.argmax(_all_rewards(observation, hidden_param),
                         axis=1,
                         output_type=tf.int32)

    optimal_reward_fn = functools.partial(optimal_reward,
                                          hidden_param=HIDDEN_PARAM)
    optimal_action_fn = functools.partial(optimal_action,
                                          hidden_param=HIDDEN_PARAM)
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    if FLAGS.drop_arm_obs:
        drop_arm_feature_fn = functools.partial(
            bandit_spec_utils.drop_arm_observation)
    else:
        drop_arm_feature_fn = None
    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  additional_metrics=[regret_metric, suboptimal_arms_metric],
                  training_data_spec_transformation_fn=drop_arm_feature_fn)
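
The trainer script above also depends on module-level flags and constants that are not part of this excerpt. The following is an illustrative sketch only: the names come from the snippet, but the concrete values are assumptions, except that HIDDEN_PARAM must have length 4 + 5 = 9 to match the two context sampling functions.

# Assumed setup, values for illustration only.
from absl import flags

flags.DEFINE_string('root_dir', '/tmp/per_arm_bandit/', 'Output directory.')
flags.DEFINE_enum('agent', 'LinUCB',
                  ['LinUCB', 'LinTS', 'epsGreedy', 'NeuralLinUCB'],
                  'Which agent to train.')
flags.DEFINE_enum('network', 'commontower', ['commontower', 'dotproduct'],
                  'Reward network used by the epsGreedy agent.')
flags.DEFINE_bool('drop_arm_obs', False,
                  'Whether to drop arm features before training.')
flags.DEFINE_bool('add_num_actions_feature', False,
                  'Whether to expose a num_actions feature.')
FLAGS = flags.FLAGS

BATCH_SIZE = 8
NUM_ACTIONS = 70
HIDDEN_PARAM = list(range(9))  # 4 global + 5 per-arm feature dimensions.
AGENT_ALPHA = 10.0
EPSILON = 0.05
LR = 0.002
EPS_PHASE_STEPS = 1000
ENCODING_DIM = 9
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2
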
Example #3
 def testTrainPerArmAgent(self):
     num_actions = 5
     mask_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                               shape=(num_actions, ),
                                               minimum=0,
                                               maximum=1)
     obs_spec = (bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, num_actions), mask_spec)
     time_step_spec = time_step.time_step_spec(obs_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     encoding_dim = 10
     encoder = (global_and_arm_feature_network.
                create_feed_forward_common_tower_network(
                    obs_spec[0], (4, 3), (3, 4), (4, 2), encoding_dim))
     agent = neural_linucb_agent.NeuralLinUCBAgent(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         encoding_network=encoder,
         encoding_network_num_train_steps=10,
         encoding_dim=encoding_dim,
         observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
         accepts_per_arm_features=True,
         optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
     observations = ({
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(30), shape=[2, 5, 3]),
                 dtype=tf.float32)
     }, tf.ones(shape=(2, num_actions), dtype=tf.int32))
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step = time_step.TimeStep(
         tf.constant(time_step.StepType.FIRST,
                     dtype=tf.int32,
                     shape=[2],
                     name='step_type'),
         tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
         tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
         observations)
     final_step = time_step.TimeStep(
         tf.constant(time_step.StepType.LAST,
                     dtype=tf.int32,
                     shape=[2],
                     name='step_type'),
         tf.constant(rewards, dtype=tf.float32, name='reward'),
         tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
         observations)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     loss_info, _ = agent.train(experience, None)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     loss_value = self.evaluate(loss_info)
     self.assertGreater(loss_value, 0.0)
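
In this per-arm test, `chosen_arm_features` inside `PerArmPolicyInfo` is hard-coded to arbitrary values. In a real rollout, a per-arm policy would fill it with the per-arm observations of the selected actions; a minimal sketch of that relationship, reusing the test's `observations` and `actions` (illustration only, not part of the test):

# per_arm has shape [2, 5, 3]; gathering along the arm axis with the chosen
# actions yields one 3-dimensional feature vector per batch entry.
per_arm = observations[0][bandit_spec_utils.PER_ARM_FEATURE_KEY]
chosen = tf.gather(per_arm, tf.convert_to_tensor(actions), batch_dims=1)
# chosen has shape [2, 3].
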
Example #4
    def testNeuralLinUCBUpdateDistributed(self, batch_size=1, context_dim=10):
        """Same as above but with distributed LinUCB updates."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        encoder = DummyNet(observation_spec)
        encoding_dim = 10
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            encoding_network=encoder,
            encoding_network_num_train_steps=0,
            encoding_dim=encoding_dim,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2))

        self.evaluate(agent.initialize())
        self.evaluate(tf.compat.v1.global_variables_initializer())
        # Call the distributed LinUCB training instead of agent.train().
        train_fn = common.function_in_tf1()(
            agent.compute_loss_using_linucb_distributed)
        reward = tf.cast(experience.reward, agent._dtype)
        loss_info = train_fn(experience.observation,
                             action,
                             reward,
                             weights=None)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(tf.cast(experience.observation, tf.float64),
                            [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(tf.cast(experience.reward, tf.float64),
                            [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        for observations_for_arm, rewards_for_arm in zip(
                observations_list, rewards_list):

            encoded_observations_for_arm, _ = encoder(observations_for_arm)
            encoded_observations_for_arm = tf.cast(
                encoded_observations_for_arm, dtype=tf.float64)

            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float64)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(encoded_observations_for_arm,
                                  encoded_observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, encoded_observations_for_arm)
                return a_new, b_new

            def false_fn():
                return (tf.zeros([encoding_dim, encoding_dim],
                                 dtype=tf.float64),
                        tf.zeros([encoding_dim], dtype=tf.float64))

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
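
The expected values computed in this test are the per-arm LinUCB sufficient statistics: the covariance estimate A = X^T X and the data vector b = X^T r over the encoded observations of each arm. A minimal equivalent of the `bandit_utils.sum_reward_weighted_observations` call, written out as a sketch (an assumption about its semantics, not the library implementation):

# b_new[j] = sum_i rewards_for_arm[i] * encoded_observations_for_arm[i, j]
b_new = tf.linalg.matvec(encoded_observations_for_arm, rewards_for_arm,
                         transpose_a=True)
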
Example #5
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    feature_dict = np.array([str(i) for i in range(DICTIONARY_SIZE)])

    def _global_context_sampling_fn():
        """Generates one sample of global features.

        It generates a dictionary of size `NUM_GLOBAL_FEATURES`, with the
        following syntax:

        {...,
         'global_feature_4': ['43'],
         ...
        }

        That is, the values are one-element numpy arrays of strings.

        Returns:
          A dictionary with string keys and numpy string array values.
        """
        generated_features = feature_dict[np.random.randint(
            0, DICTIONARY_SIZE, [NUM_GLOBAL_FEATURES])]
        global_features = {
            'global_feature_{}'.format(i): generated_features[[i]]
            for i in range(NUM_GLOBAL_FEATURES)
        }
        return global_features

    def _arm_context_sampling_fn():
        """Generates one sample of arm features.

        It generates a dictionary of size `NUM_ARM_FEATURES`, with the
        following syntax:

        {...,
         'arm_feature_7': ['29'],
         ...
        }

        That is, the values are one-element numpy arrays of strings. Note that
        the output sample is for one arm and one non-batched time step.

        Returns:
          A dictionary with string keys and numpy string array values.
        """
        generated_features = feature_dict[np.random.randint(
            0, DICTIONARY_SIZE, [NUM_ARM_FEATURES])]
        arm_features = {
            'arm_feature_{}'.format(i): generated_features[[i]]
            for i in range(NUM_ARM_FEATURES)
        }
        return arm_features

    def _reward_fn(global_features, arm_features):
        """Outputs a [0, 1] float given a sample.

        The reward is generated by hashing the concatenation of each feature
        key and value, summing the hashes, taking the result modulo 1000, and
        dividing by 1000.

        Args:
          global_features: A dictionary with string keys and 1d string numpy
            array values.
          arm_features: A dictionary with string keys and 1d string numpy
            array values.

        Returns:
          A float value between 0 and 1.
        """
        hashed_global = 0
        for x, y in global_features.items():
            hashed_global += hash(x + y[0])
        hashed_arm = 0
        for x, y in arm_features.items():
            hashed_arm += hash(x + y[0])
        return (hashed_global + hashed_arm) % 1000 / 1000

    env = sspe.StationaryStochasticStructuredPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        _reward_fn,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    def make_string_feature(name):
        return tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                name, feature_dict))

    global_columns = [
        make_string_feature('global_feature_{}'.format(i))
        for i in range(NUM_GLOBAL_FEATURES)
    ]
    arm_columns = [
        make_string_feature('arm_feature_{}'.format(i))
        for i in range(NUM_ARM_FEATURES)
    ]
    obs_spec = environment.observation_spec()
    if FLAGS.agent == 'epsGreedy':
        network = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2),
                       global_preprocessing_combiner=tf.compat.v2.keras.layers.
                       DenseFeatures(global_columns),
                       arm_preprocessing_combiner=tf.compat.v2.keras.layers.
                       DenseFeatures(arm_columns)))
        agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            accepts_per_arm_features=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )
    elif FLAGS.agent == 'NeuralLinUCB':
        network = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (40, 30), (30, 40), (40, 20),
                       ENCODING_DIM,
                       global_preprocessing_combiner=tf.compat.v2.keras.layers.
                       DenseFeatures(global_columns),
                       arm_preprocessing_combiner=tf.compat.v2.keras.layers.
                       DenseFeatures(arm_columns)))
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            encoding_network=network,
            encoding_network_num_train_steps=EPS_PHASE_STEPS,
            encoding_dim=ENCODING_DIM,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=EPSILON,
            accepts_per_arm_features=True,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )

    if FLAGS.drop_arm_obs:
        drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
    else:
        drop_arm_feature_fn = None
    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  training_data_spec_transformation_fn=drop_arm_feature_fn)
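
As with the earlier trainer, this script assumes module-level flags and constants that are not shown in the excerpt. The names below are taken from the snippet; the concrete values are illustrative assumptions only.

# Assumed setup, values for illustration only.
from absl import app, flags

flags.DEFINE_string('root_dir', '/tmp/structured_bandit/', 'Output directory.')
flags.DEFINE_enum('agent', 'epsGreedy', ['epsGreedy', 'NeuralLinUCB'],
                  'Which agent to train.')
flags.DEFINE_bool('drop_arm_obs', False,
                  'Whether to drop arm features before training.')
FLAGS = flags.FLAGS

DICTIONARY_SIZE = 100    # Size of the string feature vocabulary.
NUM_GLOBAL_FEATURES = 10
NUM_ARM_FEATURES = 12
NUM_ACTIONS = 20
BATCH_SIZE = 8
EPSILON = 0.05
LR = 0.002
ENCODING_DIM = 20
EPS_PHASE_STEPS = 1000
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2

if __name__ == '__main__':
    app.run(main)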