  def testTrainAgentWithLaplacianSmoothingInvalidMatrix(self):
    if tf.executing_eagerly():
      return

    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)

    with self.assertRaisesRegex(errors.InvalidArgumentError, ''):
      reward_net = DummyNet(self._observation_spec, self._action_spec)
      optimizer = tf.compat.v1.train.GradientDescentOptimizer(
          learning_rate=0.1)
      # Set the Laplacian matrix to be the identity, which is not a valid
      # Laplacian: every row of a graph Laplacian must sum to zero.
      laplacian_matrix = tf.eye(3)
      agent = greedy_agent.GreedyRewardPredictionAgent(
          self._time_step_spec,
          self._action_spec,
          reward_network=reward_net,
          optimizer=optimizer,
          laplacian_matrix=laplacian_matrix,
          laplacian_smoothing_weight=1.0)
      self.evaluate(tf.compat.v1.initialize_all_variables())
      loss_before, _ = agent.train(experience, None)
      self.evaluate(loss_before)
  def testTrainPerArmAgent(self):
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
    time_step_spec = ts.time_step_spec(obs_spec)
    reward_net = (
        global_and_arm_feature_network.create_feed_forward_per_arm_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        accepts_per_arm_features=True,
        optimizer=optimizer)
    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(
                tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
    }
    actions = np.array([0, 3], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = policy_step.PolicyStep(
        action=tf.convert_to_tensor(actions),
        info=policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                         dtype=np.float32)))
    experience = _get_experience(initial_step, action_step, final_step)
    agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
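  # For reference (a reading aid, not part of the checked behavior):
  # create_per_arm_observation_spec(2, 3, 4) above describes a dict
  # observation with a [batch_size, 2] global-feature tensor and a
  # [batch_size, 4, 3] per-arm feature tensor (4 arms, 3 features each);
  # the [2, 4, 3] reshape of tf.range(24) follows that layout. For training,
  # PerArmPolicyInfo.chosen_arm_features carries the features of the arm
  # that each logged action selected.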
  def testTrainAgentWithMask(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    time_step_spec = ts.time_step_spec(
        (tensor_spec.TensorSpec([2], tf.float32),
         tensor_spec.TensorSpec([3], tf.int32)))
    agent = greedy_agent.GreedyRewardPredictionAgent(
        time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=optimizer,
        observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
    observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                    np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    loss_after, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertAllClose(self.evaluate(loss_before), 42.25)
    self.assertAllClose(self.evaluate(loss_after), 93.46)
  def testTrainAgentWithConstraint(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)

    constraint_net = DummyNet(self._observation_spec, self._action_spec)
    neural_constraint = constraints.NeuralConstraint(
        self._time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)

    reward_spec = tensor_spec.TensorSpec(
        shape=(2,), dtype=tf.float32, name='reward')
    self._time_step_spec = ts.time_step_spec(self._obs_spec, reward_spec)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=optimizer,
        constraints=[neural_constraint])
    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([[0.5, 6.0], [3.0, 4.0]], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    # The loss is the sum of the reward loss and the constraint loss.
    self.assertAllClose(self.evaluate(loss_before), 42.25 + 30.125)
  def testInitializeRestoreAgent(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=None)
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    policy = agent.policy
    action_step = policy.action(time_steps)
    self.evaluate(tf.compat.v1.initialize_all_variables())

    checkpoint = tf.train.Checkpoint(agent=agent)

    # No checkpoint has been saved to the temp dir, so latest_checkpoint is
    # None and initialize_or_restore falls back to initialization.
    latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
    checkpoint_load_status = checkpoint.restore(latest_checkpoint)

    if tf.executing_eagerly():
      self.evaluate(checkpoint_load_status.initialize_or_restore())
      self.assertAllEqual(self.evaluate(action_step.action), [1, 2])
    else:
      with self.cached_session() as sess:
        checkpoint_load_status.initialize_or_restore(sess)
        self.assertAllEqual(sess.run(action_step.action), [1, 2])
  def testNumSamplesList(self):
    if not tf.executing_eagerly():
      return
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    num_samples_list = []
    for k in range(3):
      num_samples_list.append(
          tf.compat.v2.Variable(
              tf.zeros([], dtype=tf.int64), name='num_samples_{}'.format(k)))
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=optimizer,
        num_samples_list=num_samples_list)
    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    _, _ = agent.train(experience, None)
    _, _ = agent.train(experience, None)
    # Actions 0 and 1 have 2 samples each; action 2 has 0 samples.
    self.assertEqual(self.evaluate(num_samples_list[0].read_value()), 2)
    self.assertEqual(self.evaluate(num_samples_list[1].read_value()), 2)
    self.assertEqual(self.evaluate(num_samples_list[2].read_value()), 0)
  def testCreateAgent(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=None)
    self.assertIsNotNone(agent.policy)
  def testInitializeAgent(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=None)
    init_op = agent.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))
  def testPolicy(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=None)
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    policy = agent.policy
    action_step = policy.action(time_steps)
    # Batch size 2.
    self.assertAllEqual([2], action_step.action.shape)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    actions = self.evaluate(action_step.action)
    self.assertAllEqual(actions, [1, 2])
  def testTrainAgentWithMaskAndConstraint(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    reward_spec = {
        'reward':
            tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name='reward'),
        'constraint':
            tensor_spec.TensorSpec(
                shape=(), dtype=tf.float32, name='constraint')
    }
    observation_and_mask_spec = (tensor_spec.TensorSpec([2], tf.float32),
                                 tensor_spec.TensorSpec([3], tf.int32))
    time_step_spec = ts.time_step_spec(observation_and_mask_spec, reward_spec)

    constraint_net = DummyNet(self._observation_spec, self._action_spec)
    neural_constraint = constraints.NeuralConstraint(
        self._time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)

    agent = greedy_agent.GreedyRewardPredictionAgent(
        time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=optimizer,
        observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
        constraints=[neural_constraint])
    observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                    np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
    actions = np.array([0, 1], dtype=np.int32)
    rewards = {
        'reward': np.array([0.5, 3.0], dtype=np.float32),
        'constraint': np.array([6.0, 4.0], dtype=np.float32)
    }
    initial_step, final_step = (
        _get_initial_and_final_steps_action_mask_nested_rewards(
            observations, rewards))
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    # The loss is the sum of the reward loss and the constraint loss.
    self.assertAllClose(self.evaluate(loss_before), 42.25 + 30.125)
  def testLoss(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    actions = tf.constant([0, 1], dtype=tf.int32)
    rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=None)
    init_op = agent.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))
    loss, _ = agent.loss(observations, actions, rewards)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertAllClose(self.evaluate(loss), 42.25)
  def testTrainAgent(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=optimizer)
    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    loss_after, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertAllClose(self.evaluate(loss_before), 42.25)
    self.assertAllClose(self.evaluate(loss_after), 93.46)
  def testTrainPerArmAgentWithMask(self):
    num_actions = 4
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        2, 3, num_actions)
    mask_obs_spec = (obs_spec,
                     tensor_spec.BoundedTensorSpec(
                         shape=[num_actions],
                         minimum=0,
                         maximum=1,
                         dtype=tf.float32))
    time_step_spec = ts.time_step_spec(mask_obs_spec)
    reward_net = (
        global_and_arm_feature_network
        .create_feed_forward_common_tower_network(
            obs_spec, (4, 3), (3, 4), (4, 2)))
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    agent = greedy_agent.GreedyRewardPredictionAgent(
        time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
        accepts_per_arm_features=True,
        optimizer=optimizer)
    observations = ({
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(
                tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
    }, tf.ones([2, num_actions]))
    actions = np.array([0, 3], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
        observations, rewards)
    action_step = policy_step.PolicyStep(
        action=tf.convert_to_tensor(actions),
        info=policy_utilities.PerArmPolicyInfo(
            chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                         dtype=np.float32)))
    experience = _get_experience(initial_step, action_step, final_step)
    agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
  def testLossNestedRewards(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps_nested_rewards(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)

    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=None)
    init_op = agent.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))
    loss, _ = agent._loss(experience)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertAllClose(self.evaluate(loss), 42.25)
  def testTrainAgentWithLaplacianSmoothing(self):
    reward_net = DummyNet(self._observation_spec, self._action_spec)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.1)
    laplacian_matrix = tf.constant([[1.0, -1.0, 0.0],
                                    [-1.0, 2.0, -1.0],
                                    [0.0, -1.0, 1.0]])
    agent = greedy_agent.GreedyRewardPredictionAgent(
        self._time_step_spec,
        self._action_spec,
        reward_network=reward_net,
        optimizer=optimizer,
        laplacian_matrix=laplacian_matrix,
        laplacian_smoothing_weight=1.0)
    observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
    actions = np.array([0, 1], dtype=np.int32)
    rewards = np.array([0.5, 3.0], dtype=np.float32)
    initial_step, final_step = _get_initial_and_final_steps(
        observations, rewards)
    action_step = _get_action_step(actions)
    experience = _get_experience(initial_step, action_step, final_step)
    loss_before, _ = agent.train(experience, None)
    self.evaluate(tf.compat.v1.initialize_all_variables())
    # The Laplacian smoothing term ends up adding 22.5 to the loss.
    self.assertAllClose(self.evaluate(loss_before), 42.25 + 22.5)
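  # Background on the smoothing penalty (a sketch of the math, not the
  # agent's exact code path): for a combinatorial graph Laplacian L and a
  # vector p of predicted rewards, the quadratic form p^T L p equals the sum
  # of squared differences of p across the graph's edges, so minimizing it
  # pulls neighboring actions toward similar predicted rewards. The matrix
  # used above is the Laplacian of the path graph 0 - 1 - 2.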