Example #1
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    class LinearNormalReward(object):
        """Reward that is linear in the features, with unit-variance Gaussian noise."""
        def __init__(self, theta):
            self.theta = theta

        def __call__(self, x):
            mu = np.dot(x, self.theta)
            return np.random.normal(mu, 1)

    def _global_context_sampling_fn():
        """Samples 4 global context features, integer-valued in [-10, 10)."""
        return np.random.randint(-10, 10, [4]).astype(np.float32)

    def _arm_context_sampling_fn():
        """Samples 5 per-arm context features, integer-valued in [-2, 3)."""
        return np.random.randint(-2, 3, [5]).astype(np.float32)

    reward_fn = LinearNormalReward(HIDDEN_PARAM)

    env = sspe.StationaryStochasticPerArmPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        reward_fn,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    obs_spec = environment.observation_spec()
    if FLAGS.network == 'commontower':
        network = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2)))
    elif FLAGS.network == 'dotproduct':
        network = (global_and_arm_feature_network.
                   create_feed_forward_dot_product_network(
                       obs_spec, (4, 3, 6), (3, 4, 6)))
    agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        accepts_per_arm_features=True,
        emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)

    optimal_reward_fn = functools.partial(optimal_reward,
                                          hidden_param=HIDDEN_PARAM)
    optimal_action_fn = functools.partial(optimal_action,
                                          hidden_param=HIDDEN_PARAM)
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  additional_metrics=[regret_metric, suboptimal_arms_metric])
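
This example is excerpted from a larger TF-Agents script, so its imports, flag definitions, and module-level constants are not shown. A minimal sketch of what it assumes follows; the tf_agents module paths are the standard ones, but the constant values are illustrative, the flag definitions are sketched after Example #4, and optimal_reward / optimal_action are the module-level helpers listed in full in Example #4.

import functools

import numpy as np
import tensorflow as tf
from absl import flags

from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as sspe
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import policy_utilities
from tf_agents.environments import tf_py_environment

FLAGS = flags.FLAGS

# Illustrative hyperparameters; HIDDEN_PARAM needs global_dim + per_arm_dim = 9 entries.
BATCH_SIZE = 8
NUM_ACTIONS = 7
HIDDEN_PARAM = list(range(9))
LR = 0.05
EPSILON = 0.05
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2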

Example #2

    def test_with_variable_num_actions_masking(self):
        def _global_context_sampling_fn():
            return np.random.randint(-10, 10, [4])

        def _arm_context_sampling_fn():
            return np.random.randint(-2, 3, [5])

        def _num_actions_fn():
            # The number of available actions varies between 0 and 6 per step.
            return np.random.randint(0, 7)

        reward_fn = LinearNormalReward([0, 1, 2, 3, 4, 5, 6, 7, 8])

        env = sspe.StationaryStochasticPerArmPyEnvironment(
            _global_context_sampling_fn,
            _arm_context_sampling_fn,
            6,
            reward_fn,
            _num_actions_fn,
            batch_size=2,
            add_num_actions_feature=False)
        time_step_spec = env.time_step_spec()
        self.assertAllEqual(time_step_spec.observation[1].shape, [6])
        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  minimum=0,
                                                  maximum=5,
                                                  dtype=np.int32)

        random_policy = random_py_policy.RandomPyPolicy(
            time_step_spec=time_step_spec, action_spec=action_spec)

        for _ in range(5):
            time_step = env.reset()
            self.assertTrue(
                check_unbatched_time_step_spec(time_step=time_step,
                                               time_step_spec=time_step_spec,
                                               batch_size=env.batch_size))

            action = random_policy.action(time_step).action
            self.assertAllEqual(action.shape, [2])
            self.assertAllGreaterEqual(action, 0)
            self.assertAllLess(action, 6)
            time_step = env.step(action)
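
The test above relies on a check_unbatched_time_step_spec helper and a LinearNormalReward class defined elsewhere in its test module; LinearNormalReward is the same linear-plus-Gaussian-noise reward as in Example #1. A plausible sketch of the spec-checking helper, assuming tf_agents' array_spec utilities (an approximation, not the original implementation):

from tf_agents.specs import array_spec


def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
    """Checks a batched TimeStep against an unbatched spec."""
    # Add an outer batch dimension to the unbatched spec, then verify every
    # array in the TimeStep against the corresponding batched spec.
    batched_spec = array_spec.add_outer_dims_nest(
        time_step_spec, outer_dims=(batch_size,))
    return array_spec.check_arrays_nest(time_step, batched_spec)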

Example #3

    def test_with_variable_num_actions(self, variable_action_method,
                                       batch_size):
        def _global_context_sampling_fn():
            return np.random.randint(-10, 10, [4])

        def _arm_context_sampling_fn():
            return np.random.randint(-2, 3, [5])

        def _num_actions_fn():
            # The number of available actions varies between 5 and 6 per step.
            return np.random.randint(5, 7)

        reward_fn = LinearNormalReward([0, 1, 2, 3, 4, 5, 6, 7, 8])

        env = sspe.StationaryStochasticPerArmPyEnvironment(
            _global_context_sampling_fn,
            _arm_context_sampling_fn,
            6,
            reward_fn,
            _num_actions_fn,
            batch_size=batch_size,
            variable_action_method=variable_action_method)

        time_step_spec = env.time_step_spec()
        self._check_arm_obs_spec(time_step_spec.observation,
                                 variable_action_method, 6, 5)

        for _ in range(5):
            time_step = env.reset()
            actual_batch_size = time_step.step_type.shape[0]
            self.assertTrue(
                check_unbatched_time_step_spec(time_step=time_step,
                                               time_step_spec=time_step_spec,
                                               batch_size=actual_batch_size))

            action = np.random.randint(0, 4, [batch_size])
            time_step = env.step(action)
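
Since the method takes variable_action_method and batch_size as arguments (and calls a _check_arm_obs_spec helper of its test class that is not shown), it is presumably a parameterized test. A hedged sketch of how its cases could be declared with absl's parameterized decorator; the case names and values are illustrative:

from absl.testing import parameterized

from tf_agents.bandits.specs import utils as bandit_spec_utils

# Inside the test class, the method above would carry a decorator along these lines:
@parameterized.named_parameters(
    dict(testcase_name='_num_actions_feature_batch_1',
         variable_action_method=(
             bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE),
         batch_size=1),
    dict(testcase_name='_num_actions_feature_batch_2',
         variable_action_method=(
             bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE),
         batch_size=2),
)
def test_with_variable_num_actions(self, variable_action_method, batch_size):
    ...  # test body as in Example #3 above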

Example #4

def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    class LinearNormalReward(object):
        def __init__(self, theta):
            self.theta = theta

        def __call__(self, x):
            mu = np.dot(x, self.theta)
            return np.random.normal(mu, 1)

    def _global_context_sampling_fn():
        return np.random.randint(-10, 10, [4]).astype(np.float32)

    def _arm_context_sampling_fn():
        return np.random.randint(-2, 3, [5]).astype(np.float32)

    reward_fn = LinearNormalReward(HIDDEN_PARAM)

    observation_and_action_constraint_splitter = None
    num_actions_fn = None
    variable_action_method = bandit_spec_utils.VariableActionMethod.FIXED
    if FLAGS.add_num_actions_feature:
        # Expose a `num_actions` feature in the observation; the number of
        # actions is constant here, so the sampling fn always returns NUM_ACTIONS.
        num_actions_fn = lambda: NUM_ACTIONS
        variable_action_method = (
            bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE)

    env = sspe.StationaryStochasticPerArmPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        reward_fn,
        num_actions_fn,
        batch_size=BATCH_SIZE,
        variable_action_method=variable_action_method)
    environment = tf_py_environment.TFPyEnvironment(env)

    if FLAGS.agent == 'LinUCB':
        agent = lin_ucb_agent.LinearUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            accepts_per_arm_features=True,
            dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
        agent = lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=True,
            dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
        obs_spec = environment.observation_spec()
        if FLAGS.network == 'commontower':
            network = (global_and_arm_feature_network.
                       create_feed_forward_common_tower_network(
                           obs_spec, (40, 30), (30, 40), (40, 20)))
        elif FLAGS.network == 'dotproduct':
            network = (global_and_arm_feature_network.
                       create_feed_forward_dot_product_network(
                           obs_spec, (4, 3, 6), (3, 4, 6)))
        agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )
    elif FLAGS.agent == 'NeuralLinUCB':
        obs_spec = environment.observation_spec()
        network = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM))
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            encoding_network=network,
            encoding_network_num_train_steps=EPS_PHASE_STEPS,
            encoding_dim=ENCODING_DIM,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=EPSILON,
            accepts_per_arm_features=True,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )

    def _all_rewards(observation, hidden_param):
        """Outputs rewards for all actions, given an observation."""
        hidden_param = tf.cast(hidden_param, dtype=tf.float32)
        global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
        per_arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
        num_actions = tf.shape(per_arm_obs)[1]
        tiled_global = tf.tile(tf.expand_dims(global_obs, axis=1),
                               [1, num_actions, 1])
        concatenated = tf.concat([tiled_global, per_arm_obs], axis=-1)
        rewards = tf.linalg.matvec(concatenated, hidden_param)
        return rewards

    def optimal_reward(observation, hidden_param):
        return tf.reduce_max(_all_rewards(observation, hidden_param), axis=1)

    def optimal_action(observation, hidden_param):
        return tf.argmax(_all_rewards(observation, hidden_param),
                         axis=1,
                         output_type=tf.int32)

    optimal_reward_fn = functools.partial(optimal_reward,
                                          hidden_param=HIDDEN_PARAM)
    optimal_action_fn = functools.partial(optimal_action,
                                          hidden_param=HIDDEN_PARAM)
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    if FLAGS.drop_arm_obs:
        # Optionally strip per-arm observations from the training data; the
        # chosen arm's features are already stored in the policy info.
        drop_arm_feature_fn = functools.partial(
            bandit_spec_utils.drop_arm_observation)
    else:
        drop_arm_feature_fn = None
    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  additional_metrics=[regret_metric, suboptimal_arms_metric],
                  training_data_spec_transformation_fn=drop_arm_feature_fn)
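
To turn Example #4 into a runnable script, the flags it reads have to be defined and main handed to absl's app runner, with the remaining constants (AGENT_ALPHA, ENCODING_DIM, EPS_PHASE_STEPS, etc.) defined alongside those sketched after Example #1. A minimal sketch follows; the flag names are taken from the FLAGS references above, while the defaults and help strings are illustrative:

from absl import app
from absl import flags

flags.DEFINE_string('root_dir', '/tmp/per_arm_bandit/',
                    'Root directory for training artifacts.')
flags.DEFINE_enum('agent', 'epsGreedy',
                  ['LinUCB', 'LinTS', 'epsGreedy', 'NeuralLinUCB'],
                  'Which bandit agent to train.')
flags.DEFINE_enum('network', 'commontower', ['commontower', 'dotproduct'],
                  'Reward network used by the epsilon-greedy agent.')
flags.DEFINE_bool('add_num_actions_feature', False,
                  'Whether to expose a `num_actions` observation feature.')
flags.DEFINE_bool('drop_arm_obs', False,
                  'Whether to drop per-arm observations from the training data.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)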