def main(unused_argv):
  """Trains a bandit agent on the wheel bandit environment."""
  tf.compat.v1.enable_resource_variables()
  with tf.device('/CPU:0'):  # due to b/128333994
    env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                  MU_HIGH, STD_HIGH,
                                                  BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_reward,
        delta=DELTA,
        mu_inside=MU_BASE[0],
        mu_high=MU_HIGH)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_wheel_bandit_compute_optimal_action,
        delta=DELTA)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      # The epsilon-greedy agent estimates per-arm rewards with a neural
      # network instead of a linear model.
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON)

    # Regret and suboptimal-arm counts are measured against the known optimum.
    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
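# The example above assumes module-level flags and hyperparameters. A minimal
# sketch of that setup follows; the flag and constant names match the snippet,
# but every concrete value below is an illustrative assumption, not taken from
# the original example.
from absl import app
from absl import flags

flags.DEFINE_string('root_dir', '/tmp/wheel_bandit/', 'Root output directory.')
flags.DEFINE_enum('agent', 'LinUCB', ['LinUCB', 'LinTS', 'epsGreedy'],
                  'Which agent to train.')
FLAGS = flags.FLAGS

BATCH_SIZE = 8                        # environments run in parallel
TRAINING_LOOPS = 200
STEPS_PER_LOOP = 2
DELTA = 0.5                           # wheel bandit exploration parameter
MU_BASE = [1.2, 1.0, 1.0, 1.0, 1.0]   # per-arm base mean rewards
STD_BASE = [0.01] * 5
MU_HIGH = 50.0                        # mean reward of the optimal outer arm
STD_HIGH = 0.01
AGENT_ALPHA = 10.0                    # UCB / TS exploration coefficient
EPSILON = 0.05                        # exploration rate for epsGreedy
LAYERS = (50, 50)                     # hidden layers of the reward QNetwork
LR = 0.005

if __name__ == '__main__':
  app.run(main)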
def main(unused_argv):
  """Trains a linear bandit agent on a drifting (non-stationary) environment."""
  tf.compat.v1.enable_resource_variables()
  with tf.device('/CPU:0'):  # due to b/128333994
    observation_shape = [CONTEXT_DIM]
    overall_shape = [BATCH_SIZE] + observation_shape
    observation_distribution = tfd.Normal(
        loc=tf.zeros(overall_shape), scale=tf.ones(overall_shape))
    action_shape = [NUM_ACTIONS]
    observation_to_reward_shape = observation_shape + action_shape
    observation_to_reward_distribution = tfd.Normal(
        loc=tf.zeros(observation_to_reward_shape),
        scale=tf.ones(observation_to_reward_shape))
    drift_distribution = tfd.Normal(loc=DRIFT_MEAN, scale=DRIFT_VARIANCE)
    # Note: `tfd.Normal` takes a standard deviation as `scale`; the
    # *_VARIANCE constants are passed through as scales here.
    additive_reward_distribution = tfd.Normal(
        loc=tf.zeros(action_shape),
        scale=(REWARD_NOISE_VARIANCE * tf.ones(action_shape)))
    environment_dynamics = dle.DriftingLinearDynamics(
        observation_distribution,
        observation_to_reward_distribution,
        drift_distribution,
        additive_reward_distribution)
    environment = nse.NonStationaryStochasticEnvironment(environment_dynamics)

    # gamma < 1 makes the linear agents exponentially forget old observations,
    # which lets them track the drifting reward parameter.
    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          gamma=0.95,
          emit_log_probability=False,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          gamma=0.95,
          dtype=tf.float32)

    # The dynamics expose the time-varying optimum, so regret is computed
    # against the current optimal reward and action.
    regret_metric = tf_bandit_metrics.RegretMetric(
        environment.environment_dynamics.compute_optimal_reward)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        environment.environment_dynamics.compute_optimal_action)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
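# Imports assumed by the drifting-linear example above. The module paths below
# follow the tf_agents and tensorflow_probability layout; treat this as a
# sketch of the expected header rather than a verbatim copy of the original.
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.environments import drifting_linear_environment as dle
from tf_agents.bandits.environments import non_stationary_stochastic_environment as nse
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics

tfd = tfp.distributions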
def main(unused_argv):
  tf.compat.v1.enable_resource_variables()
  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
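# For intuition: each entry of `action_reward_fns` maps a context vector to a
# scalar reward for one arm. A hypothetical NumPy-only stand-in (not the
# tf_agents implementation) could look like this, with each arm k holding a
# parameter vector theta_k and returning dot(theta_k, x) plus Gaussian noise:
import numpy as np

def example_linear_reward_fns(context_dim, num_actions, noise_variance,
                              seed=0):
  """Returns a list of per-arm linear reward functions (illustrative only)."""
  rng = np.random.default_rng(seed)
  thetas = [rng.normal(size=context_dim) for _ in range(num_actions)]

  def make_fn(theta):
    def reward_fn(context):
      noise = rng.normal(scale=np.sqrt(noise_variance))
      return float(np.dot(theta, context)) + noise
    return reward_fn

  return [make_fn(theta) for theta in thetas]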
def testTrainerTF1ExportsCheckpoints(self, num_actions, observation_shape,
                                     action_shape, batch_size, training_loops,
                                     steps_per_loop, learning_rate):
  """Tests TF1 trainer code, checks that expected checkpoints are exported."""
  root_dir = tempfile.mkdtemp(dir=os.getenv('TEST_TMPDIR'))
  environment = get_bounded_reward_random_environment(
      observation_shape, action_shape, batch_size, num_actions)
  agent = exp3_agent.Exp3Agent(
      learning_rate=learning_rate,
      time_step_spec=environment.time_step_spec(),
      action_spec=environment.action_spec())
  trainer.train(root_dir, agent, environment, training_loops, steps_per_loop)
  latest_checkpoint = tf.train.latest_checkpoint(
      os.path.join(root_dir, 'train'))
  expected_checkpoint_regex = '.*ckpt-{}'.format(
      training_loops * batch_size * steps_per_loop)
  self.assertRegex(latest_checkpoint, expected_checkpoint_regex)
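# `get_bounded_reward_random_environment` is defined elsewhere in the test
# module. A sketch of what such a helper could look like, assuming tf_agents'
# RandomBanditEnvironment and tfp distributions (an illustration under those
# assumptions, not the original definition):
import tensorflow as tf
import tensorflow_probability as tfp
from tf_agents.bandits.environments import random_bandit_environment
from tf_agents.specs import tensor_spec

tfd = tfp.distributions

def get_bounded_reward_random_environment(
    observation_shape, action_shape, batch_size, num_actions):
  """Returns a bandit environment with random observations, U(0, 1) rewards."""
  overall_shape = [batch_size] + observation_shape
  observation_distribution = tfd.Independent(
      tfd.Normal(loc=tf.zeros(overall_shape), scale=tf.ones(overall_shape)))
  reward_distribution = tfd.Uniform(
      low=tf.zeros([batch_size]), high=tf.ones([batch_size]))
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=action_shape, dtype=tf.int32,
      minimum=0, maximum=num_actions - 1)
  return random_bandit_environment.RandomBanditEnvironment(
      observation_distribution, reward_distribution, action_spec)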