def testPolicyWithConstraints(self):
    constraint_net = DummyNet(self._obs_spec)
    neural_constraint = constraints.NeuralConstraint(
        self._time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)

    tf.compat.v1.set_random_seed(1)
    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        self._time_step_spec,
        self._action_spec,
        reward_network=DummyNet(self._obs_spec),
        constraints=[neural_constraint],
        emit_policy_info=('predicted_rewards_mean',))
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllEqual(self.evaluate(action_step.action), [1, 2])
    # The expected values are obtained by passing the observation through the
    # Keras dense layer of the DummyNet (defined in the original test file).
    predicted_rewards_expected_array = np.array([[4.0, 5.5, 0.0],
                                                 [8.0, 11.5, 12.0]])
    p_info = self.evaluate(action_step.info)
    self.assertAllClose(p_info.predicted_rewards_mean,
                        predicted_rewards_expected_array)
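
These snippets are test methods excerpted from the TF-Agents bandits test suite. They rely on imports and helpers (DummyNet, _get_action_step, _get_experience, and the _get_initial_and_final_steps_* functions) that are defined in the surrounding test files and are not reproduced here. A typical import preamble for running them would look roughly like the sketch below; the module paths and aliases are assumptions inferred from the names used in the examples.

import numpy as np
import tensorflow as tf

from tf_agents.bandits.agents import greedy_reward_prediction_agent as greedy_agent
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.bandits.policies import boltzmann_reward_prediction_policy as boltzmann_reward_policy
from tf_agents.bandits.policies import constraints
from tf_agents.bandits.policies import greedy_reward_prediction_policy as greedy_reward_policy
from tf_agents.bandits.policies import policy_utilities
from tf_agents.bandits.specs import utils as bandit_spec_utils
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import common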
Example #2
  def testComputeLossWithArmFeatures(self):
    obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
        global_dim=2, per_arm_dim=3, max_num_actions=3)
    time_step_spec = ts.time_step_spec(obs_spec)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            obs_spec,
            global_layers=(4,),
            arm_layers=(4,),
            common_layers=(4,)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)

    observations = {
        bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
        bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(
                tf.reshape(tf.range(18), shape=[2, 3, 3]), dtype=tf.float32)
    }
    actions = tf.constant([0, 1], dtype=tf.int32)
    rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

    init_op = neural_constraint.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))
    loss = neural_constraint.compute_loss(
        observations,
        actions,
        rewards)
    self.assertGreater(self.evaluate(loss), 0.0)
Example #3
  def testInitializeConstraint(self):
    constraint_net = DummyNet(self._observation_spec, self._action_spec)
    neural_constraint = constraints.NeuralConstraint(
        self._time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)
    init_op = neural_constraint.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))
Example #4
    def testTrainPerArmAgentWithConstraint(self):
        obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
        reward_spec = {
            'reward':
            tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name='reward'),
            'constraint':
            tensor_spec.TensorSpec(shape=(),
                                   dtype=tf.float32,
                                   name='constraint')
        }
        time_step_spec = ts.time_step_spec(obs_spec, reward_spec)
        reward_net = (global_and_arm_feature_network.
                      create_feed_forward_common_tower_network(
                          obs_spec, (4, 3), (3, 4), (4, 2)))
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=0.1)
        constraint_net = (global_and_arm_feature_network.
                          create_feed_forward_common_tower_network(
                              obs_spec, (4, 3), (3, 4), (4, 2)))
        neural_constraint = constraints.NeuralConstraint(
            time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            accepts_per_arm_features=True,
            optimizer=optimizer,
            constraints=[neural_constraint])
        observations = {
            bandit_spec_utils.GLOBAL_FEATURE_KEY:
            tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
            bandit_spec_utils.PER_ARM_FEATURE_KEY:
            tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                    dtype=tf.float32)
        }
        actions = np.array([0, 3], dtype=np.int32)
        rewards = {
            'reward': np.array([0.5, 3.0], dtype=np.float32),
            'constraint': np.array([6.0, 4.0], dtype=np.float32)
        }
        initial_step, final_step = _get_initial_and_final_steps_nested_rewards(
            observations, rewards)
        action_step = policy_step.PolicyStep(
            action=tf.convert_to_tensor(actions),
            info=policy_utilities.PerArmPolicyInfo(
                chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                             dtype=np.float32)))
        experience = _get_experience(initial_step, action_step, final_step)
        agent.train(experience, None)
        self.evaluate(tf.compat.v1.initialize_all_variables())
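
The _get_initial_and_final_steps_*, _get_experience, and _get_action_step helpers used in this and the later agent examples are defined in the original test file. As a rough sketch only (assuming they follow the usual TF-Agents bandit-agent test pattern of building a single-step batched Trajectory), the simpler two look approximately like this:

from tf_agents.trajectories import trajectory

def _get_action_step(action):
  # Wraps the chosen actions in a PolicyStep, as the policy would emit them.
  return policy_step.PolicyStep(action=tf.convert_to_tensor(action))

def _get_experience(initial_step, action_step, final_step):
  # Builds a single transition and adds a time dimension so that the
  # resulting Trajectory has shape [batch_size, 1, ...].
  single_experience = trajectory.from_transition(
      initial_step, action_step, final_step)
  return tf.nest.map_structure(
      lambda x: tf.expand_dims(tf.convert_to_tensor(x), 1), single_experience)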
Example #5
  def testComputeActionFeasibility(self):
    constraint_net = DummyNet(self._observation_spec, self._action_spec)

    neural_constraint = constraints.NeuralConstraint(
        self._time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)
    init_op = neural_constraint.initialize()
    if not tf.executing_eagerly():
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
        self.assertIsNone(sess.run(init_op))

    observation = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    feasibility_prob = neural_constraint(observation)
    self.assertAllClose(self.evaluate(feasibility_prob), np.ones([2, 3]))
Example #6
  def testComputeFeasibilityMaskWithActionMask(self):
    observation_spec = tensor_spec.TensorSpec([2], tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
    constraint_net = DummyNet(observation_spec, action_spec)
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        action_spec,
        constraint_network=constraint_net)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    action_mask = tf.constant([[0, 0, 1], [0, 1, 0]], dtype=tf.int32)
    feasibility_prob = constraints.compute_feasibility_probability(
        observations, [neural_constraint], batch_size=2, num_actions=3,
        action_mask=action_mask)
    self.assertAllEqual(self.evaluate(tf.cast(action_mask, tf.float32)),
                        self.evaluate(feasibility_prob))
Example #7
    def testComputeLoss(self):
        constraint_net = DummyNet(self._observation_spec, self._action_spec)
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        actions = tf.constant([0, 1], dtype=tf.int32)
        rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

        neural_constraint = constraints.NeuralConstraint(
            self._time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)
        init_op = neural_constraint.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss = neural_constraint.compute_loss(observations, actions, rewards)
        self.assertAllClose(self.evaluate(loss), 42.25)
Example #8
    def testTrainAgentWithMaskAndConstraint(self):
        reward_net = DummyNet(self._observation_spec, self._action_spec)
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=0.1)
        reward_spec = {
            'reward':
            tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name='reward'),
            'constraint':
            tensor_spec.TensorSpec(shape=(),
                                   dtype=tf.float32,
                                   name='constraint')
        }
        observation_and_mask_spec = (tensor_spec.TensorSpec([2], tf.float32),
                                     tensor_spec.TensorSpec([3], tf.int32))
        time_step_spec = ts.time_step_spec(observation_and_mask_spec,
                                           reward_spec)

        constraint_net = DummyNet(self._observation_spec, self._action_spec)
        neural_constraint = constraints.NeuralConstraint(
            self._time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            optimizer=optimizer,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
            constraints=[neural_constraint])
        observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                        np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
        actions = np.array([0, 1], dtype=np.int32)
        rewards = {
            'reward': np.array([0.5, 3.0], dtype=np.float32),
            'constraint': np.array([6.0, 4.0], dtype=np.float32)
        }
        initial_step, final_step = (
            _get_initial_and_final_steps_action_mask_nested_rewards(
                observations, rewards))
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)
        loss_before, _ = agent.train(experience, None)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        # The loss is the sum of the reward loss and the constraint loss.
        self.assertAllClose(self.evaluate(loss_before), 42.25 + 30.125)
Example #9
  def testComputeMaskFromMultipleSourcesMask(self):
    observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
        4, 5, 6)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec, (3, 4), (4, 3), (2, 3)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        action_spec,
        constraint_network=constraint_net)
    original_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
    observations = ({
        'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
        'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
    }, original_mask)
    mask = constraints.construct_mask_from_multiple_sources(
        observations, lambda x: (x[0], x[1]), [neural_constraint], 6)
    self.assertAllGreaterEqual(original_mask - mask, 0)
Example #10
  def testComputeMaskFromMultipleSourcesNumActionsFeature(self):
    observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
        4, 5, 6, add_num_actions_feature=True)
    time_step_spec = ts.time_step_spec(observation_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 5)
    constraint_net = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec, (3, 4), (4, 3), (2, 3)))
    neural_constraint = constraints.NeuralConstraint(
        time_step_spec,
        action_spec,
        constraint_network=constraint_net)

    observations = {
        'global': tf.constant([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=tf.float32),
        'per_arm': tf.reshape(tf.range(60, dtype=tf.float32), shape=[2, 6, 5]),
        'num_actions': tf.constant([4, 3], dtype=tf.int32)
    }
    mask = constraints.construct_mask_from_multiple_sources(
        observations, None, [neural_constraint], 6)
    implied_mask = [[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]]
    self.assertAllGreaterEqual(implied_mask - mask, 0)
Example #11
    def testPolicyWithConstraints(self):
        constraint_net = DummyNet(self._obs_spec)
        neural_constraint = constraints.NeuralConstraint(
            self._time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        tf.compat.v1.set_random_seed(1)
        policy = boltzmann_reward_policy.BoltzmannRewardPredictionPolicy(
            self._time_step_spec,
            self._action_spec,
            reward_network=DummyNet(self._obs_spec),
            constraints=[neural_constraint],
            emit_policy_info=('predicted_rewards_mean', ))
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_step = ts.restart(observations, batch_size=2)
        action_step = policy.action(time_step, seed=1)
        self.assertEqual(action_step.action.shape.as_list(), [2])
        self.assertEqual(action_step.action.dtype, tf.int32)
        # Initialize all variables
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllInSet(self.evaluate(action_step.action), [1, 2])
Example #12
  def testCreateConstraint(self):
    constraint_net = DummyNet(self._observation_spec, self._action_spec)
    constraints.NeuralConstraint(
        self._time_step_spec,
        self._action_spec,
        constraint_network=constraint_net)
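
Every example above also depends on a DummyNet helper defined in the original test files and not shown here. The DummyNet in those tests uses fixed, hand-picked weights, which is why hard-coded assertions such as the 42.25 loss and the [[4.0, 5.5, 0.0], [8.0, 11.5, 12.0]] predicted-reward matrix are deterministic. Purely as an illustration of its shape contract (a batch of observations in, one value per action out), a minimal stand-in might look like the following sketch; the initializer values here are arbitrary and would not reproduce those exact numbers.

from tf_agents.networks import network

class DummyNet(network.Network):
  """Illustrative stand-in: a single dense layer emitting one value per action."""

  def __init__(self, observation_spec, action_spec=None, name='DummyNet'):
    super(DummyNet, self).__init__(
        input_tensor_spec=observation_spec, state_spec=(), name=name)
    # The examples above use 3-action specs; fall back to 3 when no
    # action_spec is passed (as in the policy tests).
    num_actions = 3 if action_spec is None else int(
        action_spec.maximum - action_spec.minimum + 1)
    self._layer = tf.keras.layers.Dense(
        num_actions,
        kernel_initializer=tf.constant_initializer([[1, 2, 3], [4, 5, 6]]),
        bias_initializer=tf.constant_initializer([0.0, 0.0, 0.0]))

  def call(self, observations, step_type=None, network_state=()):
    del step_type  # Unused.
    return self._layer(tf.cast(observations, tf.float32)), network_state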