Example #1
def get_agent_by_name(agent_name, time_step_spec, action_spec):
    if agent_name == 'LinUCB':
        return lin_ucb_agent.LinearUCBAgent(time_step_spec=time_step_spec,
                                            action_spec=action_spec,
                                            dtype=tf.float32)
    elif agent_name == 'LinTS':
        return lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dtype=tf.float32)
    elif agent_name == 'epsGreedy':
        network = q_network.QNetwork(
            input_tensor_spec=time_step_spec.observation,
            action_spec=action_spec,
            fc_layer_params=(50, 50, 50))
        return neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            epsilon=0.1)
    elif agent_name == 'mix':
        emit_policy_info = (
            policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, )
        network = q_network.QNetwork(
            input_tensor_spec=time_step_spec.observation,
            action_spec=action_spec,
            fc_layer_params=(50, 50, 50))
        agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            emit_policy_info=emit_policy_info,
            epsilon=0.1)
        agent_linucb = lin_ucb_agent.LinearUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            emit_policy_info=emit_policy_info,
            dtype=tf.float32)
        agent_random = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            emit_policy_info=emit_policy_info,
            epsilon=1.)
        agent_halfrandom = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            emit_policy_info=emit_policy_info,
            epsilon=0.5)
        return exp3_mixture_agent.Exp3MixtureAgent(
            (agent_epsgreedy, agent_linucb, agent_random, agent_halfrandom))
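A minimal usage sketch for the factory above; the `environment` object and its construction are illustrative assumptions, not part of the original example.

# Hypothetical usage: `environment` is assumed to be an already-built
# TF-Agents bandit environment (e.g. a TFPyEnvironment); `get_agent_by_name`
# is the factory defined above.
agent = get_agent_by_name(
    agent_name='LinUCB',
    time_step_spec=environment.time_step_spec(),
    action_spec=environment.action_spec())
agent.initialize()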
Example #2
 def testUCBandThompsonSamplingShareVariables(self):
     if not tf.executing_eagerly():
         self.skipTest('Test only works in eager mode.')
     context_dim = 9
     num_actions = 4
     batch_size = 7
     variable_collection = linear_agent.LinearBanditVariableCollection(
         context_dim=context_dim, num_models=num_actions)
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = time_step.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     ucb_agent = lin_ucb_agent.LinearUCBAgent(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         variable_collection=variable_collection)
     ts_agent = linear_thompson_sampling_agent.LinearThompsonSamplingAgent(
         time_step_spec=time_step_spec,
         action_spec=action_spec,
         variable_collection=variable_collection)
     initial_step, final_step = _get_initial_and_final_steps(
         batch_size, context_dim)
     action = np.random.randint(num_actions,
                                size=batch_size,
                                dtype=np.int32)
     action_step = _get_action_step(action)
     experience = _get_experience(initial_step, action_step, final_step)
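     # Both agents were constructed with the same `variable_collection`, so
     # training the UCB agent below also updates the covariance matrices and
     # data vectors that the Thompson sampling agent reads.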
     self.evaluate(ucb_agent.train(experience))
     self.assertAllEqual(ucb_agent._variable_collection.cov_matrix_list[0],
                         ts_agent._variable_collection.cov_matrix_list[0])
     self.evaluate(ts_agent.train(experience))
     self.assertAllEqual(ucb_agent._variable_collection.data_vector_list[0],
                         ts_agent._variable_collection.data_vector_list[0])
Example #3
 def testAgentUpdate(self, batch_size, context_dim, num_agents):
     num_actions = 5
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = time_step.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     agents = []
     for _ in range(num_agents):
         agents.append(
             lin_ucb_agent.LinearUCBAgent(
                 time_step_spec,
                 action_spec,
                 emit_policy_info=(
                     policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, )))
     mixture_agent = static_mixture_agent.StaticMixtureAgent(
         [1] * num_agents, agents)
     initial_step, final_step = _get_initial_and_final_steps(
         batch_size, context_dim)
     action = np.random.randint(num_actions,
                                size=batch_size,
                                dtype=np.int32)
     action_step = _get_action_step(action, num_agents, num_actions)
     experience = _get_experience(initial_step, action_step, final_step)
     for agent in agents:
         self.evaluate(agent.initialize())
     self.evaluate(mixture_agent.initialize())
     loss_info = mixture_agent.train(experience)
     self.evaluate(loss_info)
Example #4
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')
  env = movielens_py_environment.MovieLensPyEnvironment(
      data_path, RANK_K, BATCH_SIZE, num_movies=20)
  environment = tf_py_environment.TFPyEnvironment(env)

  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)

  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32)
  elif FLAGS.agent == 'epsGreedy':
    network = q_network.QNetwork(
        input_tensor_spec=environment.time_step_spec().observation,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON)
  elif FLAGS.agent == 'DropoutTS':
    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=DROPOUT_RATE,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #5
 def testMixtureUpdate(self, batch_size, context_dim, num_agents):
     num_actions = 5
     observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
     time_step_spec = time_step.time_step_spec(observation_spec)
     action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                 shape=(),
                                                 minimum=0,
                                                 maximum=num_actions - 1)
     agents = []
     for _ in range(num_agents):
         agents.append(
             lin_ucb_agent.LinearUCBAgent(
                 time_step_spec,
                 action_spec,
                 emit_policy_info=(
                     policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, )))
     mixed_agent = exp3_mixture_agent.Exp3MixtureAgent(agents)
     initial_step, final_step = _get_initial_and_final_steps(
         batch_size, context_dim)
     action = np.random.randint(num_actions,
                                size=batch_size,
                                dtype=np.int32)
     action_step = _get_action_step(action, num_agents, num_actions)
     experience = _get_experience(initial_step, action_step, final_step)
     self.evaluate(mixed_agent.initialize())
     self.evaluate(mixed_agent._variable_collection.reward_aggregates)
     self.evaluate(mixed_agent.train(experience))
     reward_aggregates = self.evaluate(
         mixed_agent._variable_collection.reward_aggregates)
     self.assertAllInSet(reward_aggregates[:num_agents - 1], [0.999])
     agent_prob = 1 / num_agents
     est_rewards = 0.5 / agent_prob
     per_step_update = est_rewards
     last_agent_update = 1 - batch_size * per_step_update
     self.assertAllClose(reward_aggregates[-1], last_agent_update * 0.999)
Example #6
 def testAgentWithDifferentSubagentsUpdate(self):
   num_actions = 3
   context_dim = 2
   batch_size = 7
   observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
   time_step_spec = time_step.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   agent1 = lin_ucb_agent.LinearUCBAgent(
       time_step_spec,
       action_spec,
       emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
   reward_net = q_network.QNetwork(
       input_tensor_spec=observation_spec,
       action_spec=action_spec,
       fc_layer_params=(4, 3, 2))
   agent2 = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
       time_step_spec,
       action_spec,
       reward_network=reward_net,
       emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,),
       optimizer=tf.compat.v1.train.GradientDescentOptimizer(
           learning_rate=0.1),
       epsilon=0.1)
   agents = [agent1, agent2]
   dist = tfd.Categorical(probs=tf.Variable([0., 1.]))
   mixed_agent = WeightRotatingMixtureAgent(dist, agents)
   initial_step, final_step = _get_initial_and_final_steps(
       batch_size, context_dim)
   action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
   action_step = _get_action_step(action, 2, num_actions)
   experience = _get_experience(initial_step, action_step, final_step)
   self.evaluate(mixed_agent.initialize())
   loss_info = mixed_agent.train(experience)
   self.evaluate(loss_info)
Example #7
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994

        covertype_dataset = dataset_utilities.convert_covertype_dataset(
            FLAGS.covertype_csv)
        covertype_reward_distribution = tfd.Independent(
            tfd.Deterministic(tf.eye(7)), reinterpreted_batch_ndims=2)
        environment = ce.ClassificationBanditEnvironment(
            covertype_dataset, covertype_reward_distribution, BATCH_SIZE)

        optimal_reward_fn = functools.partial(
            env_util.compute_optimal_reward_with_classification_environment,
            environment=environment)

        optimal_action_fn = functools.partial(
            env_util.compute_optimal_action_with_classification_environment,
            environment=environment)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                emit_log_probability=False,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=LAYERS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #8
 def testInitializeAgent(
     self, batch_size, context_dim, dtype, use_eigendecomp=False):
   num_actions = 5
   observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
   time_step_spec = time_step.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   agent = lin_ucb_agent.LinearUCBAgent(
       time_step_spec=time_step_spec,
       action_spec=action_spec,
       dtype=dtype)
   self.evaluate(agent.initialize())
Example #9
def main(unused_argv):
    tf.compat.v1.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                      MU_HIGH, STD_HIGH,
                                                      BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_reward,
            delta=DELTA,
            mu_inside=MU_BASE[0],
            mu_high=MU_HIGH)
        optimal_action_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_action,
            delta=DELTA)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=LAYERS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #10
 def testInitializeAgentEmptyObservationSpec(self):
   dtype = tf.float32
   num_actions = 5
   observation_spec = tensor_spec.TensorSpec((), tf.float32)
   time_step_spec = time_step.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   agent = lin_ucb_agent.LinearUCBAgent(
       time_step_spec=time_step_spec,
       action_spec=action_spec,
       dtype=dtype)
   self.evaluate(agent.initialize())
Example #11
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994

        mushroom_reward_distribution = (
            dataset_utilities.mushroom_reward_distribution(
                r_noeat=0.0,
                r_eat_safe=5.0,
                r_eat_poison_bad=-35.0,
                r_eat_poison_good=5.0,
                prob_poison_bad=0.5))
        mushroom_dataset = (
            dataset_utilities.convert_mushroom_csv_to_tf_dataset(
                FLAGS.mushroom_csv))
        environment = ce.ClassificationBanditEnvironment(
            mushroom_dataset, mushroom_reward_distribution, BATCH_SIZE)

        optimal_reward_fn = functools.partial(
            env_util.compute_optimal_reward_with_classification_environment,
            environment=environment)

        optimal_action_fn = functools.partial(
            env_util.compute_optimal_action_with_classification_environment,
            environment=environment)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,
                emit_log_probability=False,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,
                dtype=tf.float32)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #12
 def testInitializeAgent(self, batch_size, context_dim, num_agents,
                         emit_policy_info):
   num_actions = 7
   observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
   time_step_spec = time_step.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   agents = [
       lin_ucb_agent.LinearUCBAgent(time_step_spec, action_spec)
       for _ in range(num_agents)
   ]
   mixed_agent = exp3_mixture_agent.Exp3MixtureAgent(agents)
   self.evaluate(mixed_agent.initialize())
Example #13
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        observation_shape = [CONTEXT_DIM]
        overall_shape = [BATCH_SIZE] + observation_shape
        observation_distribution = tfd.Normal(loc=tf.zeros(overall_shape),
                                              scale=tf.ones(overall_shape))
        action_shape = [NUM_ACTIONS]
        observation_to_reward_shape = observation_shape + action_shape
        observation_to_reward_distribution = tfd.Normal(
            loc=tf.zeros(observation_to_reward_shape),
            scale=tf.ones(observation_to_reward_shape))
        drift_distribution = tfd.Normal(loc=DRIFT_MEAN, scale=DRIFT_VARIANCE)
        additive_reward_distribution = tfd.Normal(
            loc=tf.zeros(action_shape),
            scale=(REWARD_NOISE_VARIANCE * tf.ones(action_shape)))
        environment_dynamics = dle.DriftingLinearDynamics(
            observation_distribution, observation_to_reward_distribution,
            drift_distribution, additive_reward_distribution)
        environment = nse.NonStationaryStochasticEnvironment(
            environment_dynamics)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,
                emit_log_probability=False,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,
                dtype=tf.float32)

        regret_metric = tf_bandit_metrics.RegretMetric(
            environment.environment_dynamics.compute_optimal_reward)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            environment.environment_dynamics.compute_optimal_action)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #14
 def testInitializeAgent(self, batch_size, context_dim, num_agents):
   num_actions = 7
   observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
   time_step_spec = time_step.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   agents = [
       lin_ucb_agent.LinearUCBAgent(time_step_spec, action_spec)
       for _ in range(num_agents)
   ]
   dist = tfd.Categorical(
       probs=tf.Variable(tf.range(num_agents, dtype=tf.float32)))
   mixed_agent = WeightRotatingMixtureAgent(dist, agents)
   self.evaluate(mixed_agent.initialize())
Example #15
def main(unused_argv):
    tf.compat.v1.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.sliding_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
                                                     action_reward_fns,
                                                     batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #16
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                      MU_HIGH, STD_HIGH,
                                                      BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_reward,
            delta=DELTA,
            mu_inside=MU_BASE[0],
            mu_high=MU_HIGH)
        optimal_action_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_action,
            delta=DELTA)
        network = q_network.QNetwork(
            input_tensor_spec=environment.time_step_spec().observation,
            action_spec=environment.action_spec(),
            fc_layer_params=LAYERS)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)
        elif FLAGS.agent == 'random':
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=1.)
        elif FLAGS.agent == 'Mix':
            emit_policy_info = (
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, )
            agent_epsgreedy = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=EPSILON)
            agent_linucb = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                emit_policy_info=emit_policy_info,
                dtype=tf.float32)
            agent_random = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=1.)
            agent_halfrandom = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=0.5)
            agent = exp3_mixture_agent.Exp3MixtureAgent(
                (agent_epsgreedy, agent_linucb, agent_random,
                 agent_halfrandom))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #17
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.structured_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
                                                     action_reward_fns,
                                                     batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
                environment.action_spec())

            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=REWARD_NETWORK_LAYER_PARAMS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=NN_LEARNING_RATE),
                epsilon=EPSILON,
                laplacian_matrix=laplacian_matrix,
                laplacian_smoothing_weight=0.01)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #18
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    class LinearNormalReward(object):
        def __init__(self, theta):
            self.theta = theta

        def __call__(self, x):
            mu = np.dot(x, self.theta)
            return np.random.normal(mu, 1)

    def _global_context_sampling_fn():
        return np.random.randint(-10, 10, [4]).astype(np.float32)

    def _arm_context_sampling_fn():
        return np.random.randint(-2, 3, [5]).astype(np.float32)

    reward_fn = LinearNormalReward(HIDDEN_PARAM)

    observation_and_action_constraint_splitter = None
    num_actions_fn = None
    variable_action_method = bandit_spec_utils.VariableActionMethod.FIXED
    if FLAGS.add_num_actions_feature:
        num_actions_fn = lambda: NUM_ACTIONS
        variable_action_method = (
            bandit_spec_utils.VariableActionMethod.NUM_ACTIONS_FEATURE)

    env = sspe.StationaryStochasticPerArmPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        reward_fn,
        num_actions_fn,
        batch_size=BATCH_SIZE,
        variable_action_method=variable_action_method)
    environment = tf_py_environment.TFPyEnvironment(env)

    if FLAGS.agent == 'LinUCB':
        agent = lin_ucb_agent.LinearUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            accepts_per_arm_features=True,
            dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
        agent = lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=True,
            dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
        obs_spec = environment.observation_spec()
        if FLAGS.network == 'commontower':
            network = (global_and_arm_feature_network.
                       create_feed_forward_common_tower_network(
                           obs_spec, (40, 30), (30, 40), (40, 20)))
        elif FLAGS.network == 'dotproduct':
            network = (global_and_arm_feature_network.
                       create_feed_forward_dot_product_network(
                           obs_spec, (4, 3, 6), (3, 4, 6)))
        agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )
    elif FLAGS.agent == 'NeuralLinUCB':
        obs_spec = environment.observation_spec()
        network = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (40, 30), (30, 40), (40, 20), ENCODING_DIM))
        agent = neural_linucb_agent.NeuralLinUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            encoding_network=network,
            encoding_network_num_train_steps=EPS_PHASE_STEPS,
            encoding_dim=ENCODING_DIM,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            alpha=1.0,
            gamma=1.0,
            epsilon_greedy=EPSILON,
            accepts_per_arm_features=True,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )

    def _all_rewards(observation, hidden_param):
        """Outputs rewards for all actions, given an observation."""
        hidden_param = tf.cast(hidden_param, dtype=tf.float32)
        global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
        per_arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
        num_actions = tf.shape(per_arm_obs)[1]
        tiled_global = tf.tile(tf.expand_dims(global_obs, axis=1),
                               [1, num_actions, 1])
        concatenated = tf.concat([tiled_global, per_arm_obs], axis=-1)
        rewards = tf.linalg.matvec(concatenated, hidden_param)
        return rewards

    def optimal_reward(observation, hidden_param):
        return tf.reduce_max(_all_rewards(observation, hidden_param), axis=1)

    def optimal_action(observation, hidden_param):
        return tf.argmax(_all_rewards(observation, hidden_param),
                         axis=1,
                         output_type=tf.int32)

    optimal_reward_fn = functools.partial(optimal_reward,
                                          hidden_param=HIDDEN_PARAM)
    optimal_action_fn = functools.partial(optimal_action,
                                          hidden_param=HIDDEN_PARAM)
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    if FLAGS.drop_arm_obs:
        drop_arm_feature_fn = functools.partial(
            bandit_spec_utils.drop_arm_observation)
    else:
        drop_arm_feature_fn = None
    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  additional_metrics=[regret_metric, suboptimal_arms_metric],
                  training_data_spec_transformation_fn=drop_arm_feature_fn)
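A small shape sketch for the per-arm observation consumed by `_all_rewards` in the example above; the batch size, arm count, and the exact import path for `bandit_spec_utils` are assumptions for illustration.

import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils  # import path assumed

batch_size, num_actions = 2, 3
observation = {
    # Global context: a 4-dim vector per batch element, matching
    # `_global_context_sampling_fn` above.
    bandit_spec_utils.GLOBAL_FEATURE_KEY: tf.zeros([batch_size, 4]),
    # Per-arm context: a 5-dim vector for each arm, matching
    # `_arm_context_sampling_fn` above.
    bandit_spec_utils.PER_ARM_FEATURE_KEY: tf.zeros([batch_size, num_actions, 5]),
}
# `_all_rewards(observation, HIDDEN_PARAM)` would return a [batch_size, num_actions]
# tensor of linear reward estimates, one per arm.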
Example #19
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    if FLAGS.normalize_reward_fns:
      action_reward_fns = (
          environment_utilities.normalized_sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))
    else:
      action_reward_fns = (
          environment_utilities.sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    mask_split_fn = None
    if FLAGS.num_disabled_actions > 0:
      mask_split_fn = lambda x: (x[0], x[1])
      env = wrappers.ExtraDisabledActionsWrapper(env,
                                                 FLAGS.num_disabled_actions)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)

    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    network_input_spec = environment.time_step_spec().observation
    if FLAGS.num_disabled_actions > 0:

      def _apply_only_to_observation(fn):
        def result_fn(obs):
          return fn(obs[0])
        return result_fn

      optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
      optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
      network_input_spec = network_input_spec[0]

    network = q_network.QNetwork(
        input_tensor_spec=network_input_spec,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'epsGreedy':
      agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'Mix':
      assert FLAGS.num_disabled_actions == 0, (
          'Extra actions with mixture agent not supported.')

      emit_policy_info = policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_linucb, agent_lints, agent_epsgreedy))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #20
  def testLinearUCBUpdateWithForgetting(
      self, batch_size, context_dim, dtype, use_eigendecomp=False):
    """Check LinearUCB agent updates for specified actions and rewards."""
    gamma = 0.9

    # Construct a `Trajectory` for the given action, observation, reward.
    num_actions = 5
    initial_step, final_step = _get_initial_and_final_steps(
        batch_size, context_dim)
    action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
    action_step = _get_action_step(action)
    experience = _get_experience(initial_step, action_step, final_step)

    # Construct an agent and perform the update.
    observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
    time_step_spec = time_step.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        gamma=gamma,
        dtype=dtype,
        use_eigendecomp=use_eigendecomp)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_info = agent.train(experience)
    self.evaluate(loss_info)
    final_a = self.evaluate(agent.cov_matrix)
    final_b = self.evaluate(agent.data_vector)
    final_eig_vals = self.evaluate(agent.eig_vals)

    # Compute the expected updated estimates.
    observations_list = tf.dynamic_partition(
        data=tf.reshape(experience.observation,
                        [batch_size, context_dim]),
        partitions=tf.convert_to_tensor(action),
        num_partitions=num_actions)
    rewards_list = tf.dynamic_partition(
        data=tf.reshape(experience.reward, [batch_size]),
        partitions=tf.convert_to_tensor(action),
        num_partitions=num_actions)
    expected_a_updated_list = []
    expected_b_updated_list = []
    expected_eigvals_updated_list = []
    for _, (observations_for_arm, rewards_for_arm) in enumerate(zip(
        observations_list, rewards_list)):
      num_samples_for_arm_current = tf.cast(
          tf.shape(rewards_for_arm)[0], tf.float32)
      num_samples_for_arm_total = num_samples_for_arm_current

      # pylint: disable=cell-var-from-loop
      def true_fn():
        a_new = gamma * tf.eye(context_dim) + tf.matmul(
            observations_for_arm, observations_for_arm, transpose_a=True)
        b_new = bandit_utils.sum_reward_weighted_observations(
            rewards_for_arm, observations_for_arm)
        eigmatrix_new = tf.constant([], dtype=dtype)
        eigvals_new = tf.constant([], dtype=dtype)
        if use_eigendecomp:
          eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new)
        return a_new, b_new, eigvals_new, eigmatrix_new
      def false_fn():
        if use_eigendecomp:
          return (tf.eye(context_dim), tf.zeros([context_dim]),
                  tf.ones([context_dim]), tf.eye(context_dim))
        else:
          return (tf.eye(context_dim), tf.zeros([context_dim]),
                  tf.constant([], dtype=dtype), tf.constant([], dtype=dtype))
      a_new, b_new, eig_vals_new, _ = tf.cond(
          tf.squeeze(num_samples_for_arm_total) > 0,
          true_fn,
          false_fn)

      expected_a_updated_list.append(self.evaluate(a_new))
      expected_b_updated_list.append(self.evaluate(b_new))
      expected_eigvals_updated_list.append(self.evaluate(eig_vals_new))

    # Check that the actual updated estimates match the expectations.
    self.assertAllClose(expected_a_updated_list, final_a)
    self.assertAllClose(expected_b_updated_list, final_b)
    self.assertAllClose(
        expected_eigvals_updated_list, final_eig_vals, atol=1e-4, rtol=1e-4)
Example #21
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        if FLAGS.normalize_reward_fns:
            action_reward_fns = (environment_utilities.
                                 normalized_sliding_linear_reward_fn_generator(
                                     CONTEXT_DIM, NUM_ACTIONS,
                                     REWARD_NOISE_VARIANCE))
        else:
            action_reward_fns = (
                environment_utilities.sliding_linear_reward_fn_generator(
                    CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
                                                     action_reward_fns,
                                                     batch_size=BATCH_SIZE)
        mask_split_fn = None
        if FLAGS.num_disabled_actions > 0:
            mask_split_fn = lambda x: (x[0], x[1])
            env = wrappers.ExtraDisabledActionsWrapper(
                env, FLAGS.num_disabled_actions)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        network_input_spec = environment.time_step_spec().observation
        if FLAGS.num_disabled_actions > 0:

            def _apply_only_to_observation(fn):
                def result_fn(obs):
                    return fn(obs[0])

                return result_fn

            optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
            optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
            network_input_spec = network_input_spec[0]

        network = q_network.QNetwork(input_tensor_spec=network_input_spec,
                                     action_spec=environment.action_spec(),
                                     fc_layer_params=LAYERS)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32,
                observation_and_action_constraint_splitter=mask_split_fn)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32,
                observation_and_action_constraint_splitter=mask_split_fn)
        elif FLAGS.agent == 'epsGreedy':
            agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON,
                observation_and_action_constraint_splitter=mask_split_fn)
        elif FLAGS.agent == 'Boltzmann':
            train_step_counter = tf.compat.v1.train.get_or_create_global_step()
            boundaries = [500]
            temp_values = [1000.0, TEMPERATURE]
            temp_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
                boundaries, temp_values)

            def _temperature_fn():
                # Any variable used in the function needs to be saved in the policy.
                # This is true by default for the `train_step_counter`.
                return temp_schedule(train_step_counter)

            agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                temperature=_temperature_fn,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                observation_and_action_constraint_splitter=mask_split_fn,
                train_step_counter=train_step_counter)
            # This is needed, otherwise the PolicySaver complains.
            agent.policy.step = train_step_counter
        elif FLAGS.agent == 'BoltzmannGumbel':
            num_samples_list = [
                tf.compat.v2.Variable(0,
                                      dtype=tf.int32,
                                      name='num_samples_{}'.format(k))
                for k in range(NUM_ACTIONS)
            ]
            agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                boltzmann_gumbel_exploration_constant=250.0,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                observation_and_action_constraint_splitter=mask_split_fn,
                num_samples_list=num_samples_list)
        elif FLAGS.agent == 'Mix':
            assert FLAGS.num_disabled_actions == 0, (
                'Extra actions with mixture agent not supported.')

            emit_policy_info = policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
            agent_linucb = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                emit_policy_info=emit_policy_info,
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
            agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                emit_policy_info=emit_policy_info,
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
            agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                emit_policy_info=emit_policy_info,
                epsilon=EPSILON)
            agent = exp3_mixture_agent.Exp3MixtureAgent(
                (agent_linucb, agent_lints, agent_epsgreedy))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
Example #22
  def testLinearUCBUpdateWithMaskedActions(self,
                                           batch_size,
                                           context_dim,
                                           dtype,
                                           use_eigendecomp=False):
    """Check LinearUCB agent updates for specified actions and rewards."""

    # Construct a `Trajectory` for the given action, observation, reward.
    num_actions = 5
    initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
        batch_size, context_dim, num_actions=num_actions)
    action = np.random.randint(num_actions, size=batch_size, dtype=np.int32)
    action_step = _get_action_step(action)
    experience = _get_experience(initial_step, action_step, final_step)

    # Construct an agent and perform the update.
    observation_spec = (tensor_spec.TensorSpec([context_dim], tf.float32),
                        tensor_spec.TensorSpec([num_actions], tf.int32))
    time_step_spec = time_step.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

    def observation_and_action_constraint_splitter(obs):
      return obs[0], obs[1]

    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        dtype=dtype)
    self.evaluate(agent.initialize())
    loss_info = agent.train(experience)
    self.evaluate(loss_info)
    final_a = self.evaluate(agent.cov_matrix)
    final_b = self.evaluate(agent.data_vector)

    # Compute the expected updated estimates.
    observations_list = tf.dynamic_partition(
        data=tf.reshape(
            observation_and_action_constraint_splitter(
                experience.observation)[0], [batch_size, -1]),
        partitions=tf.convert_to_tensor(action),
        num_partitions=num_actions)
    rewards_list = tf.dynamic_partition(
        data=tf.reshape(experience.reward, [batch_size]),
        partitions=tf.convert_to_tensor(action),
        num_partitions=num_actions)
    expected_a_updated_list = []
    expected_b_updated_list = []
    for _, (observations_for_arm,
            rewards_for_arm) in enumerate(zip(observations_list, rewards_list)):
      num_samples_for_arm_current = tf.cast(
          tf.shape(rewards_for_arm)[0], tf.float32)
      num_samples_for_arm_total = num_samples_for_arm_current

      # pylint: disable=cell-var-from-loop
      def true_fn():
        a_new = tf.eye(context_dim) + tf.matmul(
            observations_for_arm, observations_for_arm, transpose_a=True)
        b_new = bandit_utils.sum_reward_weighted_observations(
            rewards_for_arm, observations_for_arm)
        return a_new, b_new
      def false_fn():
        return tf.eye(context_dim), tf.zeros([context_dim])
      a_new, b_new = tf.cond(
          tf.squeeze(num_samples_for_arm_total) > 0,
          true_fn,
          false_fn)

      expected_a_updated_list.append(self.evaluate(a_new))
      expected_b_updated_list.append(self.evaluate(b_new))

    # Check that the actual updated estimates match the expectations.
    self.assertAllClose(expected_a_updated_list, final_a)
    self.assertAllClose(expected_b_updated_list, final_b)
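Outside the test harness, the (context, mask) observation tuple that the splitter above unpacks would be fed to the agent's policy roughly as follows. This is a minimal usage sketch with illustrative specs and mask values, not part of the original test:

# Minimal action-masking sketch for LinearUCBAgent; shapes and mask values
# are illustrative assumptions.
import tensorflow as tf
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

context_dim, num_actions, batch_size = 9, 5, 2
observation_spec = (tensor_spec.TensorSpec([context_dim], tf.float32),
                    tensor_spec.TensorSpec([num_actions], tf.int32))
action_spec = tensor_spec.BoundedTensorSpec(
    dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

agent = lin_ucb_agent.LinearUCBAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    observation_and_action_constraint_splitter=lambda obs: (obs[0], obs[1]))

context = tf.random.normal([batch_size, context_dim])
mask = tf.constant([[1, 1, 0, 1, 0],
                    [0, 1, 1, 1, 1]], dtype=tf.int32)  # 1 = action allowed
step = ts.restart((context, mask), batch_size=batch_size)
action = agent.policy.action(step).action  # only unmasked actions are chosen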
Example #23
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  data_path = FLAGS.data_path
  if not data_path:
    raise ValueError('Please specify the location of the data file.')
  if FLAGS.per_arm:
    env = movielens_per_arm_py_environment.MovieLensPerArmPyEnvironment(
        data_path,
        RANK_K,
        BATCH_SIZE,
        num_actions=NUM_ACTIONS,
        csv_delimiter='\t')
  else:
    env = movielens_py_environment.MovieLensPyEnvironment(
        data_path,
        RANK_K,
        BATCH_SIZE,
        num_movies=NUM_ACTIONS,
        csv_delimiter='\t')
  environment = tf_py_environment.TFPyEnvironment(env)

  optimal_reward_fn = functools.partial(
      environment_utilities.compute_optimal_reward_with_movielens_environment,
      environment=environment)

  optimal_action_fn = functools.partial(
      environment_utilities.compute_optimal_action_with_movielens_environment,
      environment=environment)

  if FLAGS.agent == 'LinUCB':
    agent = lin_ucb_agent.LinearUCBAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        tikhonov_weight=0.001,
        alpha=AGENT_ALPHA,
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'LinTS':
    agent = lin_ts_agent.LinearThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dtype=tf.float32,
        accepts_per_arm_features=FLAGS.per_arm)
  elif FLAGS.agent == 'epsGreedy':
    if FLAGS.per_arm:
      network = (
          global_and_arm_feature_network
          .create_feed_forward_dot_product_network(
              environment.time_step_spec().observation,
              global_layers=LAYERS,
              arm_layers=LAYERS))
    else:
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
    agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        reward_network=network,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
        epsilon=EPSILON,
        emit_policy_info='predicted_rewards_mean',
        info_fields_to_inherit_from_greedy=['predicted_rewards_mean'])
  elif FLAGS.agent == 'DropoutTS':
    train_step_counter = tf.compat.v1.train.get_or_create_global_step()

    def dropout_fn():
      return tf.math.maximum(
          tf.math.reciprocal_no_nan(1.01 +
                                    tf.cast(train_step_counter, tf.float32)),
          0.0003)

    agent = dropout_ts_agent.DropoutThompsonSamplingAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        dropout_rate=dropout_fn,
        network_layers=LAYERS,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR))

  regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
  suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
      optimal_action_fn)

  trainer.train(
      root_dir=FLAGS.root_dir,
      agent=agent,
      environment=environment,
      training_loops=TRAINING_LOOPS,
      steps_per_loop=STEPS_PER_LOOP,
      additional_metrics=[regret_metric, suboptimal_arms_metric])
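This MovieLens script references flags and module-level constants (RANK_K, BATCH_SIZE, NUM_ACTIONS, AGENT_ALPHA, LAYERS, LR, EPSILON, TRAINING_LOOPS, STEPS_PER_LOOP) that the snippet does not show. A plausible sketch of those definitions follows; the concrete values are assumptions chosen for illustration, not the originals:

# Flag and constant definitions assumed by the MovieLens example above;
# all values here are illustrative assumptions.
from absl import flags

flags.DEFINE_string('root_dir', '/tmp/movielens_bandit',
                    'Root directory for checkpoints and summaries.')
flags.DEFINE_string('data_path', '', 'Path to the MovieLens ratings data.')
flags.DEFINE_string('agent', 'LinUCB',
                    'Agent to use: LinUCB, LinTS, epsGreedy or DropoutTS.')
flags.DEFINE_bool('per_arm', False,
                  'Whether to use the per-arm MovieLens environment.')
FLAGS = flags.FLAGS

BATCH_SIZE = 8
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2
RANK_K = 20          # Rank of the low-rank factorization of the ratings matrix.
NUM_ACTIONS = 20     # Number of movies (arms) offered at each step.
AGENT_ALPHA = 10.0   # LinUCB exploration coefficient.
EPSILON = 0.05       # Exploration probability for epsilon-greedy.
LAYERS = (300, 200, 100)
LR = 0.002           # Adam learning rate.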
Example #24
    def testDistributedLinearUCBUpdate(self,
                                       batch_size,
                                       context_dim,
                                       dtype,
                                       use_eigendecomp=False):
        """Same as above, but uses the distributed train function of LinUCB."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)

        agent = lin_ucb_agent.LinearUCBAgent(time_step_spec=time_step_spec,
                                             action_spec=action_spec,
                                             dtype=dtype)
        self.evaluate(agent.initialize())
        train_fn = common.function_in_tf1()(agent._distributed_train_step)
        loss_info = train_fn(experience=experience)
        self.evaluate(loss_info)

        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_theta_updated_list = []
        for observations_for_arm, rewards_for_arm in zip(
                observations_list, rewards_list):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim,
                                 context_dim]), tf.zeros([context_dim])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
            theta_new = tf.squeeze(
                tf.linalg.solve(a_new + tf.eye(context_dim),
                                tf.expand_dims(b_new, axis=-1)),
                axis=-1)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_theta_updated_list.append(self.evaluate(theta_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
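For reference, the per-arm statistics reconstructed in the loop above are the standard LinUCB least-squares quantities; assuming the default Tikhonov weight of 1 (the identity term added inside tf.linalg.solve above), they are:

A_k = \sum_{t\,:\,a_t = k} x_t x_t^{\top}, \qquad
b_k = \sum_{t\,:\,a_t = k} r_t x_t, \qquad
\hat{\theta}_k = \left(A_k + I\right)^{-1} b_k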
Example #25
def get_agent_by_name(agent_name, time_step_spec, action_spec):
    """Helper function that outputs an agent.

  Args:
    agent_name: The name (string) of the desired agent.
    time_step_spec: The time step spec of the environment on which the agent
      acts.
    action_spec: The action spec on which the agent acts.

  Returns:
    The desired agent.
  """
    accepts_per_arm_features = isinstance(
        time_step_spec.observation,
        dict) and 'per_arm' in time_step_spec.observation
    if agent_name == 'LinUCB':
        return lin_ucb_agent.LinearUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dtype=tf.float32,
            accepts_per_arm_features=accepts_per_arm_features)
    elif agent_name == 'LinTS':
        return lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dtype=tf.float32,
            accepts_per_arm_features=accepts_per_arm_features)
    elif agent_name == 'epsGreedy':
        if accepts_per_arm_features:
            network = (global_and_arm_feature_network.
                       create_feed_forward_common_tower_network(
                           time_step_spec.observation, (20, 20), (20, 20),
                           (20, 20)))
        else:
            network = q_network.QNetwork(
                input_tensor_spec=time_step_spec.observation,
                action_spec=action_spec,
                fc_layer_params=(50, 50, 50))
        return neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            epsilon=0.1,
            accepts_per_arm_features=accepts_per_arm_features)
    elif agent_name == 'mix':
        assert not accepts_per_arm_features, 'Per-arm mixture agent not supported.'
        emit_policy_info = (
            policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, )
        network = q_network.QNetwork(
            input_tensor_spec=time_step_spec.observation,
            action_spec=action_spec,
            fc_layer_params=(50, 50, 50))
        agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            emit_policy_info=emit_policy_info,
            epsilon=0.1)
        agent_linucb = lin_ucb_agent.LinearUCBAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            emit_policy_info=emit_policy_info,
            dtype=tf.float32)
        agent_random = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            emit_policy_info=emit_policy_info,
            epsilon=1.)
        agent_halfrandom = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.05),
            emit_policy_info=emit_policy_info,
            epsilon=0.5)
        return exp3_mixture_agent.Exp3MixtureAgent(
            (agent_epsgreedy, agent_linucb, agent_random, agent_halfrandom))
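A minimal sketch of how this helper could be invoked with a plain (non-per-arm) observation spec; the spec dimensions are illustrative assumptions:

# Usage sketch for get_agent_by_name; specs are illustrative assumptions.
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

context_dim, num_actions = 9, 4
observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec(
    dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

# Builds the epsilon-greedy branch; 'LinUCB', 'LinTS' and 'mix' work the same
# way with these specs.
agent = get_agent_by_name('epsGreedy', time_step_spec, action_spec)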
Example #26
def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    class LinearNormalReward(object):
        def __init__(self, theta):
            self.theta = theta

        def __call__(self, x):
            mu = np.dot(x, self.theta)
            return np.random.normal(mu, 1)

    def _global_context_sampling_fn():
        return np.random.randint(-10, 10, [4]).astype(np.float32)

    def _arm_context_sampling_fn():
        return np.random.randint(-2, 3, [5]).astype(np.float32)

    reward_fn = LinearNormalReward(HIDDEN_PARAM)

    env = sspe.StationaryStochasticPerArmPyEnvironment(
        _global_context_sampling_fn,
        _arm_context_sampling_fn,
        NUM_ACTIONS,
        reward_fn,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    obs_spec = environment.observation_spec()
    if FLAGS.drop_arm_obs:
        drop_arm_feature_fn = bandit_spec_utils.drop_arm_observation
    else:
        drop_arm_feature_fn = None
    if FLAGS.agent == 'LinUCB':
        agent = lin_ucb_agent.LinearUCBAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            accepts_per_arm_features=True,
            drop_arm_features=FLAGS.drop_arm_obs,
            dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
        agent = lin_ts_agent.LinearThompsonSamplingAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            alpha=AGENT_ALPHA,
            accepts_per_arm_features=True,
            drop_arm_features=FLAGS.drop_arm_obs,
            dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
        if FLAGS.network == 'commontower':
            network = (global_and_arm_feature_network.
                       create_feed_forward_common_tower_network(
                           obs_spec, (4, 3), (3, 4), (4, 2)))
        elif FLAGS.network == 'dotproduct':
            network = (global_and_arm_feature_network.
                       create_feed_forward_dot_product_network(
                           obs_spec, (4, 3, 6), (3, 4, 6)))
        agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec(),
            reward_network=network,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
            epsilon=EPSILON,
            accepts_per_arm_features=True,
            training_data_spec_transformation_fn=drop_arm_feature_fn,
            emit_policy_info=policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
        )

    optimal_reward_fn = functools.partial(optimal_reward,
                                          hidden_param=HIDDEN_PARAM)
    optimal_action_fn = functools.partial(optimal_action,
                                          hidden_param=HIDDEN_PARAM)
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(root_dir=FLAGS.root_dir,
                  agent=agent,
                  environment=environment,
                  training_loops=TRAINING_LOOPS,
                  steps_per_loop=STEPS_PER_LOOP,
                  additional_metrics=[regret_metric, suboptimal_arms_metric],
                  training_data_spec_transformation_fn=drop_arm_feature_fn)
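The snippet above references HIDDEN_PARAM, optimal_reward and optimal_action without defining them. A plausible sketch consistent with the 4 global and 5 per-arm features sampled above follows; it is an assumption for illustration, not the original code:

# Hypothetical definitions of HIDDEN_PARAM, optimal_reward and optimal_action;
# assumptions consistent with the sampling functions above, not the originals.
import numpy as np
import tensorflow as tf
from tf_agents.bandits.specs import utils as bandit_spec_utils

HIDDEN_PARAM = list(np.random.uniform(-1.0, 1.0, 4 + 5))  # global + arm dims

def _expected_rewards(observation, hidden_param):
  """Expected reward of every arm: dot([global, arm], hidden_param)."""
  global_obs = observation[bandit_spec_utils.GLOBAL_FEATURE_KEY]
  arm_obs = observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
  num_arms = tf.shape(arm_obs)[1]
  tiled_global = tf.tile(tf.expand_dims(global_obs, axis=1), [1, num_arms, 1])
  concatenated = tf.concat([tiled_global, arm_obs], axis=-1)
  return tf.linalg.matvec(concatenated,
                          tf.constant(hidden_param, dtype=tf.float32))

def optimal_reward(observation, hidden_param):
  return tf.reduce_max(_expected_rewards(observation, hidden_param), axis=1)

def optimal_action(observation, hidden_param):
  return tf.argmax(_expected_rewards(observation, hidden_param),
                   axis=1, output_type=tf.int32)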