def testPolicyWithConstraints(self):
  constraint_net = DummyNet(self._obs_spec)
  neural_constraint = constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  tf.compat.v1.set_random_seed(1)
  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      self._time_step_spec,
      self._action_spec,
      reward_network=DummyNet(self._obs_spec),
      constraints=[neural_constraint],
      emit_policy_info=('predicted_rewards_mean',))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(action_step.action), [1, 2])
  # The expected values are obtained by passing the observation through the
  # Keras dense layer of the DummyNet (defined above).
  predicted_rewards_expected_array = np.array([[4.0, 5.5, 0.0],
                                               [8.0, 11.5, 12.0]])
  p_info = self.evaluate(action_step.info)
  self.assertAllClose(p_info.predicted_rewards_mean,
                      predicted_rewards_expected_array)
def testComputeLossWithArmFeatures(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      global_dim=2, per_arm_dim=3, max_num_actions=3)
  time_step_spec = ts.time_step_spec(obs_spec)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec,
          global_layers=(4,),
          arm_layers=(4,),
          common_layers=(4,)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
  }
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

  init_op = neural_constraint.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss = neural_constraint.compute_loss(
      observations,
      actions,
      rewards)
  self.assertGreater(self.evaluate(loss), 0.0)
def testInitializeConstraint(self):
  constraint_net = DummyNet(self._observation_spec, self._action_spec)
  neural_constraint = constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)
  init_op = neural_constraint.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
def testTrainPerArmAgentWithConstraint(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
  reward_spec = {
      'reward': tensor_spec.TensorSpec(
          shape=(), dtype=tf.float32, name='reward'),
      'constraint': tensor_spec.TensorSpec(
          shape=(), dtype=tf.float32, name='constraint')
  }
  time_step_spec = ts.time_step_spec(obs_spec, reward_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)

  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      accepts_per_arm_features=True,
      optimizer=optimizer,
      constraints=[neural_constraint])
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = {
      'reward': np.array([0.5, 3.0], dtype=np.float32),
      'constraint': np.array([6.0, 4.0], dtype=np.float32)
  }
  initial_step, final_step = _get_initial_and_final_steps_nested_rewards(
      observations, rewards)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testComputeActionFeasibility(self):
  constraint_net = DummyNet(self._observation_spec, self._action_spec)
  neural_constraint = constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)
  init_op = neural_constraint.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))

  observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  feasibility_prob = neural_constraint(observation)
  self.assertAllClose(self.evaluate(feasibility_prob), np.ones([2, 3]))
def testComputeFeasibilityMaskWithActionMask(self):
  observation_spec = tensor_spec.TensorSpec([2], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)

  constraint_net = DummyNet(observation_spec, action_spec)
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      action_spec,
      constraint_network=constraint_net)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  action_mask = tf.constant([[0, 0, 1], [0, 1, 0]], dtype=tf.int32)
  feasibility_prob = constraints.compute_feasibility_probability(
      observations, [neural_constraint],
      batch_size=2,
      num_actions=3,
      action_mask=action_mask)
  self.assertAllEqual(
      self.evaluate(tf.cast(action_mask, tf.float32)),
      self.evaluate(feasibility_prob))
def testComputeLoss(self):
  constraint_net = DummyNet(self._observation_spec, self._action_spec)
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

  neural_constraint = constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)
  init_op = neural_constraint.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss = neural_constraint.compute_loss(observations, actions, rewards)
  self.assertAllClose(self.evaluate(loss), 42.25)
def testTrainAgentWithMaskAndConstraint(self):
  reward_net = DummyNet(self._observation_spec, self._action_spec)
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  reward_spec = {
      'reward': tensor_spec.TensorSpec(
          shape=(), dtype=tf.float32, name='reward'),
      'constraint': tensor_spec.TensorSpec(
          shape=(), dtype=tf.float32, name='constraint')
  }
  observation_and_mask_spec = (tensor_spec.TensorSpec([2], tf.float32),
                               tensor_spec.TensorSpec([3], tf.int32))
  time_step_spec = ts.time_step_spec(observation_and_mask_spec, reward_spec)

  constraint_net = DummyNet(self._observation_spec, self._action_spec)
  neural_constraint = constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      optimizer=optimizer,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
      constraints=[neural_constraint])
  observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                  np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
  actions = np.array([0, 1], dtype=np.int32)
  rewards = {
      'reward': np.array([0.5, 3.0], dtype=np.float32),
      'constraint': np.array([6.0, 4.0], dtype=np.float32)
  }
  initial_step, final_step = (
      _get_initial_and_final_steps_action_mask_nested_rewards(
          observations, rewards))
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  loss_before, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  # The loss is the sum of the reward loss and the constraint loss.
  self.assertAllClose(self.evaluate(loss_before), 42.25 + 30.125)
def testComputeMaskFromMultipleSourcesMask(self):
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      4, 5, 6)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec, (3, 4), (4, 3), (2, 3)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      action_spec,
      constraint_network=constraint_net)
  original_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
  observations = ({
      'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
      'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
  }, original_mask)
  mask = constraints.construct_mask_from_multiple_sources(
      observations, lambda x: (x[0], x[1]), [neural_constraint], 6)
  # The combined mask may only disable actions relative to the original
  # mask; it must never enable actions the original mask forbids.
  self.assertAllGreaterEqual(original_mask - mask, 0)
def testComputeMaskFromMultipleSourcesNumActionsFeature(self):
  observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
      4, 5, 6, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
  constraint_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec, (3, 4), (4, 3), (2, 3)))
  neural_constraint = constraints.NeuralConstraint(
      time_step_spec,
      action_spec,
      constraint_network=constraint_net)
  observations = {
      'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
      'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
      'num_actions': tf.constant([4, 3], dtype=tf.int32)
  }
  mask = constraints.construct_mask_from_multiple_sources(
      observations, None, [neural_constraint], 6)
  # The `num_actions` feature ([4, 3]) implies the mask below; the combined
  # mask must not allow any action outside of it.
  implied_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
  self.assertAllGreaterEqual(implied_mask - mask, 0)
def testPolicyWithConstraints(self):
  constraint_net = DummyNet(self._obs_spec)
  neural_constraint = constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)

  tf.compat.v1.set_random_seed(1)
  policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
      self._time_step_spec,
      self._action_spec,
      reward_network=DummyNet(self._obs_spec),
      constraints=[neural_constraint],
      emit_policy_info=('predicted_rewards_mean',))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllInSet(self.evaluate(action_step.action), [1, 2])
def testCreateConstraint(self):
  constraint_net = DummyNet(self._observation_spec, self._action_spec)
  constraints.NeuralConstraint(
      self._time_step_spec,
      self._action_spec,
      constraint_network=constraint_net)