def testLaplacian1D(self):
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=4)
  laplacian_matrix = tf.convert_to_tensor(
      utils.build_laplacian_over_ordinal_integer_actions(action_spec),
      dtype=tf.float32)

  # The vector of ones is in the null space of the Laplacian matrix.
  res = tf.matmul(laplacian_matrix, tf.ones([5, 1], dtype=tf.float32))
  self.assertAllClose(0.0, self.evaluate(tf.norm(res)))

  # The row sums are zero.
  row_sum = tf.reduce_sum(laplacian_matrix, 1)
  self.assertAllClose(0.0, self.evaluate(tf.norm(row_sum)))

  # The column sums are zero.
  column_sum = tf.reduce_sum(laplacian_matrix, 0)
  self.assertAllClose(0.0, self.evaluate(tf.norm(column_sum)))

  # The interior diagonal elements are 2.0; the two endpoints are 1.0.
  self.assertAllClose(2.0, laplacian_matrix[1, 1])

  laplacian_matrix_expected = np.array(
      [[1.0, -1.0, 0.0, 0.0, 0.0],
       [-1.0, 2.0, -1.0, 0.0, 0.0],
       [0.0, -1.0, 2.0, -1.0, 0.0],
       [0.0, 0.0, -1.0, 2.0, -1.0],
       [0.0, 0.0, 0.0, -1.0, 1.0]])
  self.assertAllClose(
      laplacian_matrix_expected, self.evaluate(laplacian_matrix))
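# For reference, the matrix checked above is the unnormalized Laplacian
# L = D - A of the path graph over the ordinal actions 0..4, where each
# action is adjacent to its immediate ordinal neighbors. A minimal NumPy
# sketch of how such a matrix can be built (an illustration only, not the
# tf_agents implementation):


def path_graph_laplacian(num_actions):
  """Returns L = D - A for a path graph over `num_actions` ordinal actions."""
  adjacency = np.zeros((num_actions, num_actions), dtype=np.float32)
  for i in range(num_actions - 1):
    # Connect consecutive ordinal actions in both directions.
    adjacency[i, i + 1] = 1.0
    adjacency[i + 1, i] = 1.0
  degree = np.diag(adjacency.sum(axis=1))
  return degree - adjacency


# path_graph_laplacian(5) matches laplacian_matrix_expected in the test above.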
def build_laplacian_over_ordinal_integer_actions_from_env(env):
  """Builds the ordinal-action Laplacian from an environment's action spec."""
  return utils.build_laplacian_over_ordinal_integer_actions(env.action_spec())
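# Usage sketch for the helper above. `_FakeEnv` is hypothetical: any object
# exposing an action_spec() that returns a scalar integer BoundedTensorSpec
# (e.g. a TFPyEnvironment) works the same way.


class _FakeEnv(object):

  def action_spec(self):
    return tensor_spec.BoundedTensorSpec(
        dtype=tf.int32, shape=(), minimum=0, maximum=4)


# Yields the same 5x5 matrix checked in testLaplacian1D above.
laplacian = build_laplacian_over_ordinal_integer_actions_from_env(_FakeEnv())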
def main(unused_argv):
  tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

  with tf.device('/CPU:0'):  # due to b/128333994
    action_reward_fns = (
        environment_utilities.structured_linear_reward_fn_generator(
            CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

    env = sspe.StationaryStochasticPyEnvironment(
        functools.partial(
            environment_utilities.context_sampling_fn,
            batch_size=BATCH_SIZE,
            context_dim=CONTEXT_DIM),
        action_reward_fns,
        batch_size=BATCH_SIZE)
    environment = tf_py_environment.TFPyEnvironment(env)

    optimal_reward_fn = functools.partial(
        environment_utilities.tf_compute_optimal_reward,
        per_action_reward_fns=action_reward_fns)
    optimal_action_fn = functools.partial(
        environment_utilities.tf_compute_optimal_action,
        per_action_reward_fns=action_reward_fns)

    if FLAGS.agent == 'LinUCB':
      agent = lin_ucb_agent.LinearUCBAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)
    elif FLAGS.agent == 'epsGreedy':
      laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
          environment.action_spec())
      network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=REWARD_NETWORK_LAYER_PARAMS)
      agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          reward_network=network,
          optimizer=tf.compat.v1.train.AdamOptimizer(
              learning_rate=NN_LEARNING_RATE),
          epsilon=EPSILON,
          laplacian_matrix=laplacian_matrix,
          laplacian_smoothing_weight=0.01)
    elif FLAGS.agent == 'LinTS':
      agent = lin_ts_agent.LinearThompsonSamplingAgent(
          time_step_spec=environment.time_step_spec(),
          action_spec=environment.action_spec(),
          alpha=AGENT_ALPHA,
          dtype=tf.float32)

    regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
    suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
        optimal_action_fn)

    trainer.train(
        root_dir=FLAGS.root_dir,
        agent=agent,
        environment=environment,
        training_loops=TRAINING_LOOPS,
        steps_per_loop=STEPS_PER_LOOP,
        additional_metrics=[regret_metric, suboptimal_arms_metric])
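# Why a Laplacian is passed to the epsilon-greedy agent above: with ordinal
# actions, rewards of neighboring actions are expected to be close, so the
# agent penalizes sharp differences between the predicted rewards of adjacent
# actions. A minimal sketch of the standard smoothing penalty,
#   weight * r^T L r = weight * sum over edges (i, j) of (r_i - r_j)^2;
# the agent's actual loss implementation may differ in detail.


def laplacian_smoothing_penalty(predicted_rewards, laplacian_matrix, weight):
  """Sketch: smoothing term for a [num_actions] predicted-reward vector."""
  r = tf.reshape(predicted_rewards, [-1, 1])
  quadratic_form = tf.matmul(r, tf.matmul(laplacian_matrix, r),
                             transpose_a=True)
  return weight * tf.squeeze(quadratic_form)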