# NOTE: These test methods assume the following imports; module paths follow
# the TF-Agents package layout (a plausible reconstruction, since the
# original import block is not shown in this snippet):
import numpy as np
import tensorflow as tf
from tf_agents.bandits.environments import stationary_stochastic_py_environment as sspe
from tf_agents.policies import random_py_policy


def test_with_uniform_context_and_normal_mu_reward(self):

  def _context_sampling_fn():
    return np.random.randint(-10, 10, [1, 4])

  reward_fns = [
      LinearNormalReward(theta)
      for theta in ([0, 1, 2, 3], [3, 2, 1, 0], [-1, -2, -3, -4])
  ]
  env = sspe.StationaryStochasticPyEnvironment(_context_sampling_fn,
                                               reward_fns)
  time_step_spec = env.time_step_spec()
  action_spec = env.action_spec()
  random_policy = random_py_policy.RandomPyPolicy(
      time_step_spec=time_step_spec, action_spec=action_spec)
  for _ in range(5):
    time_step = env.reset()
    self.assertTrue(
        check_unbatched_time_step_spec(
            time_step=time_step,
            time_step_spec=time_step_spec,
            batch_size=env.batch_size))
    action = random_policy.action(time_step).action
    time_step = env.step(action)
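# `LinearNormalReward` is used above but not defined in this snippet. A
# minimal sketch consistent with that use (a reward drawn from a normal
# distribution whose mean is linear in the context); the unit standard
# deviation is an assumption:
class LinearNormalReward(object):

  def __init__(self, theta):
    self.theta = theta

  def __call__(self, x):
    mu = np.dot(x, self.theta)
    return np.random.normal(mu, 1)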
def test_non_scalar_rewards(self):

  def _context_sampling_fn():
    return np.array([[4, 3], [4, 3], [5, 6]])

  # Build a case with 4 arms, 2-dimensional rewards, and batch size 3.
  reward_fns = [
      LinearDeterministicMultipleRewards(theta)  # pylint: disable=g-complex-comprehension
      for theta in [
          np.array([[0, 1], [1, 0]]),
          np.array([[1, 2], [2, 1]]),
          np.array([[2, 3], [3, 2]]),
          np.array([[3, 4], [4, 3]])
      ]
  ]
  env = sspe.StationaryStochasticPyEnvironment(
      _context_sampling_fn, reward_fns, batch_size=3)
  time_step = env.reset()
  self.assertAllEqual(time_step.observation, [[4, 3], [4, 3], [5, 6]])
  time_step = env.step([0, 1, 2])
  self.assertAllEqual(time_step.reward, [[3., 4.], [10., 11.], [28., 27.]])
  env.reset()
  time_step = env.step([2, 3, 0])
  self.assertAllEqual(time_step.reward, [[17., 18.], [24., 25.], [6., 5.]])
  # Check that the reward vectors in the reward spec are 2-dimensional.
  time_step_spec = env.time_step_spec()
  self.assertEqual(time_step_spec.reward.shape[0], 2)
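# `LinearDeterministicMultipleRewards` is not defined here either. A sketch
# that reproduces the expected rewards above (each arm's 2x2 parameter matrix
# multiplies the 2-dimensional context, e.g. [[0, 1], [1, 0]] @ [4, 3] =
# [3, 4]):
class LinearDeterministicMultipleRewards(object):

  def __init__(self, theta):
    self.theta = theta

  def __call__(self, x):
    return np.matmul(self.theta, x)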
def test_with_normal_context_and_normal_reward(self):

  def _context_sampling_fn():
    return np.random.normal(0, 3, [1, 2])

  def _reward_fn(x):
    return np.random.normal(2 * x[0], abs(x[1]) + 1)

  env = sspe.StationaryStochasticPyEnvironment(_context_sampling_fn,
                                               [_reward_fn])
  time_step_spec = env.time_step_spec()
  action_spec = env.action_spec()
  random_policy = random_py_policy.RandomPyPolicy(
      time_step_spec=time_step_spec, action_spec=action_spec)
  for _ in range(5):
    time_step = env.reset()
    self.assertTrue(
        check_unbatched_time_step_spec(
            time_step=time_step,
            time_step_spec=time_step_spec,
            batch_size=env.batch_size))
    action = random_policy.action(time_step).action
    time_step = env.step(action)
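# `check_unbatched_time_step_spec` is a test helper not shown in this
# snippet. A plausible reconstruction: validate the time step against the
# spec, adding an outer batch dimension to the spec when the environment is
# batched. The exact helper used by the original tests may differ.
from tf_agents.specs import array_spec


def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
  """Checks that a (possibly batched) time step conforms to the given spec."""
  if batch_size is None:
    return array_spec.check_arrays_nest(time_step, time_step_spec)
  return array_spec.check_arrays_nest(
      time_step, array_spec.add_outer_dims_nest(time_step_spec,
                                                (batch_size,)))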
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    q_net = q_network.QNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=(50, 50))
    agent = dqn_agent.DqnAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        q_network=q_net,
        epsilon_greedy=0.1,
        target_update_tau=0.05,
        target_update_period=5,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-2),
        td_errors_loss_fn=common.element_wise_squared_loss)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
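# The example binaries in this section assume a module prologue that is not
# part of the snippet. A representative sketch follows; flag defaults and
# constant values are illustrative stand-ins, not the originals:
import functools
from absl import app
from absl import flags
import tensorflow as tf
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import environment_utilities
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network

FLAGS = flags.FLAGS
flags.DEFINE_string('root_dir', '/tmp/stationary_stochastic', 'Output dir.')
flags.DEFINE_string('agent', 'LinUCB', 'Which agent to train.')
flags.DEFINE_bool('normalize_reward_fns', False,
                  'Whether to normalize the reward functions.')
flags.DEFINE_integer('num_disabled_actions', 0,
                     'Number of extra disabled actions.')

BATCH_SIZE = 8
CONTEXT_DIM = 15
NUM_ACTIONS = 5
REWARD_NOISE_VARIANCE = 0.01
TRAINING_LOOPS = 2000
STEPS_PER_LOOP = 2
AGENT_ALPHA = 10.0
EPSILON = 0.05
LR = 0.05
LAYERS = (50, 50, 50)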
def main(unused_argv):
  tf.compat.v1.enable_resource_variables()

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.sliding_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    else:
      # Fail early on an unknown agent flag instead of raising
      # UnboundLocalError below.
      raise ValueError('Unknown agent: {}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
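# Each of these binaries comes from a separate example script and would
# typically close with the standard absl entry point (a sketch; the original
# boilerplate is not shown):
if __name__ == '__main__':
  app.run(main)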
def test_deterministic_with_batch_2(self):

  def _context_sampling_fn():
    return np.array([[4, 3], [4, 3]])

  reward_fns = [
      LinearDeterministicReward(theta)
      for theta in ([0, 1], [1, 2], [2, 3], [3, 4])
  ]
  env = sspe.StationaryStochasticPyEnvironment(
      _context_sampling_fn, reward_fns, batch_size=2)
  time_step = env.reset()
  self.assertAllEqual(time_step.observation, [[4, 3], [4, 3]])
  time_step = env.step([0, 1])
  self.assertAllEqual(time_step.reward, [3, 10])
  env.reset()
  time_step = env.step([2, 3])
  self.assertAllEqual(time_step.reward, [17, 24])
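# `LinearDeterministicReward` is the scalar counterpart of the helpers
# sketched above; a version matching the expected rewards (a plain dot
# product, e.g. [1, 2] . [4, 3] = 10):
class LinearDeterministicReward(object):

  def __init__(self, theta):
    self.theta = theta

  def __call__(self, x):
    return np.dot(x, self.theta)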
# Additional imports assumed by this test (again following the TF-Agents
# package layout):
from tf_agents.bandits.environments import environment_utilities
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer


def testBanditEnvironment(self):

  def _context_sampling_fn():
    return np.array([[5, -5], [2, -2]])

  reward_fns = [
      environment_utilities.LinearNormalReward(theta, sigma=0.0)
      for theta in ([1, 0], [0, 1])
  ]
  batch_size = 2
  py_env = sspe.StationaryStochasticPyEnvironment(
      _context_sampling_fn, reward_fns, batch_size=batch_size)
  env = tf_py_environment.TFPyEnvironment(py_env)
  policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(),
                                           env.action_spec())
  steps_per_loop = 4
  replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=policy.trajectory_spec,
      batch_size=batch_size,
      max_length=steps_per_loop)
  driver = dynamic_episode_driver.DynamicEpisodeDriver(
      env,
      policy,
      num_episodes=steps_per_loop * batch_size,
      observers=[replay_buffer.add_batch])
  run_driver = driver.run()
  rb_gather_all = replay_buffer.gather_all()

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(run_driver)
  trajectories = self.evaluate(rb_gather_all)

  # Bandit episodes are single steps, so every step is FIRST (0) and every
  # next step is LAST (2).
  self.assertAllEqual(trajectories.step_type, [[0, 0, 0, 0], [0, 0, 0, 0]])
  self.assertAllEqual(trajectories.next_step_type,
                      [[2, 2, 2, 2], [2, 2, 2, 2]])
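# The test methods in this section would normally live in a
# `tf.test.TestCase` subclass and be run via the standard entry point:
if __name__ == '__main__':
  tf.test.main()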
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    if FLAGS.normalize_reward_fns:
      action_reward_fns = (
          environment_utilities.normalized_sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))
    else:
      action_reward_fns = (
          environment_utilities.sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    mask_split_fn = None
    if FLAGS.num_disabled_actions > 0:
      mask_split_fn = lambda x: (x[0], x[1])
      env = wrappers.ExtraDisabledActionsWrapper(env,
                                                 FLAGS.num_disabled_actions)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)
    network_input_spec = environment.time_step_spec().observation
    if FLAGS.num_disabled_actions > 0:

      def _apply_only_to_observation(fn):

        def result_fn(obs):
          return fn(obs[0])

        return result_fn

      optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
      optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
      network_input_spec = network_input_spec[0]

    network = q_network.QNetwork(
        input_tensor_spec=network_input_spec,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'epsGreedy':
      agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'Mix':
      assert FLAGS.num_disabled_actions == 0, (
          'Extra actions with mixture agent not supported.')
      emit_policy_info = policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_linucb, agent_lints, agent_epsgreedy))
    else:
      # Fail early on an unknown agent flag instead of raising
      # UnboundLocalError below.
      raise ValueError('Unknown agent: {}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
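# A minimal, self-contained illustration of the observation/mask contract
# assumed above (names and values are hypothetical): with extra disabled
# actions, the wrapped environment emits (context, mask) pairs; the splitter
# routes the context to the reward network and the mask to the policy, which
# only samples actions whose mask entry is 1.
import numpy as np

example_split_fn = lambda x: (x[0], x[1])
example_context = np.array([[0.5, -1.2]])   # a batch of one context
example_mask = np.array([[1, 1, 1, 0, 0]])  # the last two actions disabled
example_input, example_actions = example_split_fn(
    (example_context, example_mask))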
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    if FLAGS.normalize_reward_fns:
      action_reward_fns = (
          environment_utilities.normalized_sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))
    else:
      action_reward_fns = (
          environment_utilities.sliding_linear_reward_fn_generator(
              CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    mask_split_fn = None
    if FLAGS.num_disabled_actions > 0:
      mask_split_fn = lambda x: (x[0], x[1])
      env = wrappers.ExtraDisabledActionsWrapper(env,
                                                 FLAGS.num_disabled_actions)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)
    network_input_spec = environment.time_step_spec().observation
    if FLAGS.num_disabled_actions > 0:

      def _apply_only_to_observation(fn):

        def result_fn(obs):
          return fn(obs[0])

        return result_fn

      optimal_action_fn = _apply_only_to_observation(optimal_action_fn)
      optimal_reward_fn = _apply_only_to_observation(optimal_reward_fn)
      network_input_spec = network_input_spec[0]

    network = q_network.QNetwork(
        input_tensor_spec=network_input_spec,
        action_spec=environment.action_spec(),
        fc_layer_params=LAYERS)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'epsGreedy':
      agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          epsilon=EPSILON,
          observation_and_action_constraint_splitter=mask_split_fn)
    elif FLAGS.agent == 'Boltzmann':
      train_step_counter = tf.compat.v1.train.get_or_create_global_step()
      boundaries = [500]
      temp_values = [1000.0, TEMPERATURE]
      temp_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
          boundaries, temp_values)

      def _temperature_fn():
        # Any variable used in the function needs to be saved in the policy.
        # This is true by default for the `train_step_counter`.
        return temp_schedule(train_step_counter)

      agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          temperature=_temperature_fn,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          observation_and_action_constraint_splitter=mask_split_fn,
          train_step_counter=train_step_counter)
      # This is needed, otherwise the PolicySaver complains.
      agent.policy.step = train_step_counter
    elif FLAGS.agent == 'BoltzmannGumbel':
      num_samples_list = [
          tf.compat.v2.Variable(
              0, dtype=tf.int32, name='num_samples_{}'.format(k))
          for k in range(NUM_ACTIONS)
      ]
      agent = neural_boltzmann_agent.NeuralBoltzmannAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          boltzmann_gumbel_exploration_constant=250.0,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          observation_and_action_constraint_splitter=mask_split_fn,
          num_samples_list=num_samples_list)
    elif FLAGS.agent == 'Mix':
      assert FLAGS.num_disabled_actions == 0, (
          'Extra actions with mixture agent not supported.')
      emit_policy_info = policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN
      agent_linucb = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_lints = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          emit_policy_info=emit_policy_info,
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
      agent_epsgreedy = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
          emit_policy_info=emit_policy_info,
          epsilon=EPSILON)
      agent = exp3_mixture_agent.Exp3MixtureAgent(
          (agent_linucb, agent_lints, agent_epsgreedy))
    else:
      # Fail early on an unknown agent flag instead of raising
      # UnboundLocalError below.
      raise ValueError('Unknown agent: {}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
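# A standalone sketch of the Boltzmann temperature schedule used above:
# `PiecewiseConstantDecay([500], [1000.0, TEMPERATURE])` returns 1000.0
# (near-uniform exploration) for train steps up to and including 500, and
# TEMPERATURE afterwards. The 10.0 below stands in for TEMPERATURE and is
# illustrative only.
example_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    [500], [1000.0, 10.0])
print(example_schedule(0).numpy())    # 1000.0
print(example_schedule(500).numpy())  # 1000.0 (the boundary is inclusive)
print(example_schedule(501).numpy())  # 10.0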
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.structured_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
          environment.action_spec())
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=REWARD_NETWORK_LAYER_PARAMS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(
              learning_rate=NN_LEARNING_RATE),
          epsilon=EPSILON,
          laplacian_matrix=laplacian_matrix,
          laplacian_smoothing_weight=0.01)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    else:
      # Fail early on an unknown agent flag instead of raising
      # UnboundLocalError below.
      raise ValueError('Unknown agent: {}'.format(FLAGS.agent))

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
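# `utils.build_laplacian_over_ordinal_integer_actions` treats the integer
# actions as nodes of the path graph 0-1-...-(k-1); the Laplacian smoothing
# term then penalizes reward estimates that differ between neighboring
# actions. A hypothetical standalone construction of such a matrix, for
# illustration only:
import numpy as np


def path_graph_laplacian(num_actions):
  """Degree matrix minus adjacency matrix of the path graph."""
  adjacency = (np.diag(np.ones(num_actions - 1), 1) +
               np.diag(np.ones(num_actions - 1), -1))
  return np.diag(adjacency.sum(axis=1)) - adjacency

# path_graph_laplacian(4) ->
# [[ 1., -1.,  0.,  0.],
#  [-1.,  2., -1.,  0.],
#  [ 0., -1.,  2., -1.],
#  [ 0.,  0., -1.,  1.]]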