def testTrainAgentWithLaplacianSmoothingInvalidMatrix(self):
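        """An identity matrix is not a valid Laplacian; using it to build and
        train the agent should raise InvalidArgumentError. Graph mode only."""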
        if tf.executing_eagerly():
            return

        observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
        actions = np.array([0, 1], dtype=np.int32)
        rewards = np.array([0.5, 3.0], dtype=np.float32)
        initial_step, final_step = _get_initial_and_final_steps(
            observations, rewards)
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)

        with self.assertRaisesRegexp(errors.InvalidArgumentError, ''):
            reward_net = DummyNet(self._observation_spec, self._action_spec)
            optimizer = tf.compat.v1.train.GradientDescentOptimizer(
                learning_rate=0.1)
            # Set the Laplacian matrix to be the identity, which is not a valid
            # Laplacian.
            laplacian_matrix = tf.eye(3)
            agent = greedy_agent.GreedyRewardPredictionAgent(
                self._time_step_spec,
                self._action_spec,
                reward_network=reward_net,
                optimizer=optimizer,
                laplacian_matrix=laplacian_matrix,
                laplacian_smoothing_weight=1.0)
            self.evaluate(tf.compat.v1.initialize_all_variables())
            loss_before, _ = agent.train(experience, None)
            self.evaluate(loss_before)
 def testTrainPerArmAgent(self):
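     """Trains an agent with accepts_per_arm_features=True on a batch of two
     global/per-arm observations."""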
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(2, 3, 4)
     time_step_spec = ts.time_step_spec(obs_spec)
     reward_net = (
         global_and_arm_feature_network.create_feed_forward_per_arm_network(
             obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32)
     }
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, rewards)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
 def testTrainAgentWithMask(self):
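     """Trains an agent whose observation is an (observation, mask) tuple split
     by observation_and_action_constraint_splitter, and checks the loss after
     each of two train calls."""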
     reward_net = DummyNet(self._observation_spec, self._action_spec)
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     time_step_spec = ts.time_step_spec(
         (tensor_spec.TensorSpec([2], tf.float32),
          tensor_spec.TensorSpec([3], tf.int32)))
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=optimizer,
         observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
     observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                     np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
     actions = np.array([0, 1], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
         observations, rewards)
     action_step = _get_action_step(actions)
     experience = _get_experience(initial_step, action_step, final_step)
     loss_before, _ = agent.train(experience, None)
     loss_after, _ = agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     self.assertAllClose(self.evaluate(loss_before), 42.25)
     self.assertAllClose(self.evaluate(loss_after), 93.46)
Example #4
    def testTrainAgentWithConstraint(self):
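        """Trains an agent with a NeuralConstraint; the expected loss is the
        reward loss plus the constraint loss."""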
        reward_net = DummyNet(self._observation_spec, self._action_spec)
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=0.1)

        constraint_net = DummyNet(self._observation_spec, self._action_spec)
        neural_constraint = constraints.NeuralConstraint(
            self._time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        reward_spec = tensor_spec.TensorSpec(shape=(2, ),
                                             dtype=tf.float32,
                                             name='reward')
        self._time_step_spec = ts.time_step_spec(self._obs_spec, reward_spec)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            self._time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            optimizer=optimizer,
            constraints=[neural_constraint])
        observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
        actions = np.array([0, 1], dtype=np.int32)
        rewards = np.array([[0.5, 6.0], [3.0, 4.0]], dtype=np.float32)
        initial_step, final_step = _get_initial_and_final_steps(
            observations, rewards)
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)
        loss_before, _ = agent.train(experience, None)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        # The loss is the sum of the reward loss and the constraint loss.
        self.assertAllClose(self.evaluate(loss_before), 42.25 + 30.125)
    def testInitializeRestoreAgent(self):
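        """Restores the agent from the latest checkpoint in the temp directory
        (falling back to initialization when none exists) and checks the
        policy's actions."""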
        reward_net = DummyNet(self._observation_spec, self._action_spec)
        agent = greedy_agent.GreedyRewardPredictionAgent(
            self._time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            optimizer=None)
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        policy = agent.policy
        action_step = policy.action(time_steps)
        self.evaluate(tf.compat.v1.initialize_all_variables())

        checkpoint = tf.train.Checkpoint(agent=agent)

        latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
        checkpoint_load_status = checkpoint.restore(latest_checkpoint)

        if tf.executing_eagerly():
            self.evaluate(checkpoint_load_status.initialize_or_restore())
            self.assertAllEqual(self.evaluate(action_step.action), [1, 2])
        else:
            with self.cached_session() as sess:
                checkpoint_load_status.initialize_or_restore(sess)
                self.assertAllEqual(sess.run(action_step.action), [1, 2])
 def testNumSamplesList(self):
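     """Two train steps on actions [0, 1] should leave both their counters in
     num_samples_list at 2 and the counter for action 2 at 0. Eager only."""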
     if not tf.executing_eagerly():
         return
     reward_net = DummyNet(self._observation_spec, self._action_spec)
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     num_samples_list = []
     for k in range(3):
         num_samples_list.append(
             tf.compat.v2.Variable(tf.zeros([], dtype=tf.int64),
                                   name='num_samples_{}'.format(k)))
     agent = greedy_agent.GreedyRewardPredictionAgent(
         self._time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=optimizer,
         num_samples_list=num_samples_list)
     observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
     actions = np.array([0, 1], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, rewards)
     action_step = _get_action_step(actions)
     experience = _get_experience(initial_step, action_step, final_step)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     _, _ = agent.train(experience, None)
     _, _ = agent.train(experience, None)
     # Action 0 and 1 have 2 samples. Action 2 has 0 samples.
     self.assertEqual(self.evaluate(num_samples_list[0].read_value()), 2)
     self.assertEqual(self.evaluate(num_samples_list[1].read_value()), 2)
     self.assertEqual(self.evaluate(num_samples_list[2].read_value()), 0)
 def testCreateAgent(self):
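     """Smoke test: the constructed agent exposes a policy."""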
     reward_net = DummyNet(self._observation_spec, self._action_spec)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         self._time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=None)
     self.assertIsNotNone(agent.policy)
 def testInitializeAgent(self):
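     """agent.initialize() yields an op that can be run once the variables
     have been initialized (graph mode)."""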
     reward_net = DummyNet(self._observation_spec, self._action_spec)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         self._time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=None)
     init_op = agent.initialize()
     if not tf.executing_eagerly():
         with self.cached_session() as sess:
             common.initialize_uninitialized_variables(sess)
             self.assertIsNone(sess.run(init_op))
 def testPolicy(self):
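     """The greedy policy returns one action per batch element and picks the
     actions expected for DummyNet's fixed weights."""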
     reward_net = DummyNet(self._observation_spec, self._action_spec)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         self._time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=None)
     observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
     time_steps = ts.restart(observations, batch_size=2)
     policy = agent.policy
     action_step = policy.action(time_steps)
     # Batch size 2.
     self.assertAllEqual([2], action_step.action.shape)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     actions = self.evaluate(action_step.action)
     self.assertAllEqual(actions, [1, 2])
    def testTrainAgentWithMaskAndConstraint(self):
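        """Combines an action mask with a NeuralConstraint; the loss is the sum
        of the reward and constraint losses."""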
        reward_net = DummyNet(self._observation_spec, self._action_spec)
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=0.1)
        reward_spec = {
            'reward':
            tensor_spec.TensorSpec(shape=(), dtype=tf.float32, name='reward'),
            'constraint':
            tensor_spec.TensorSpec(shape=(),
                                   dtype=tf.float32,
                                   name='constraint')
        }
        observation_and_mask_spec = (tensor_spec.TensorSpec([2], tf.float32),
                                     tensor_spec.TensorSpec([3], tf.int32))
        time_step_spec = ts.time_step_spec(observation_and_mask_spec,
                                           reward_spec)

        constraint_net = DummyNet(self._observation_spec, self._action_spec)
        neural_constraint = constraints.NeuralConstraint(
            self._time_step_spec,
            self._action_spec,
            constraint_network=constraint_net)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            optimizer=optimizer,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]),
            constraints=[neural_constraint])
        observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                        np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
        actions = np.array([0, 1], dtype=np.int32)
        rewards = {
            'reward': np.array([0.5, 3.0], dtype=np.float32),
            'constraint': np.array([6.0, 4.0], dtype=np.float32)
        }
        initial_step, final_step = (
            _get_initial_and_final_steps_action_mask_nested_rewards(
                observations, rewards))
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)
        loss_before, _ = agent.train(experience, None)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        # The loss is the sum of the reward loss and the constraint loss.
        self.assertAllClose(self.evaluate(loss_before), 42.25 + 30.125)
    def testLoss(self):
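        """Computes the loss directly from observations, actions, and rewards
        and checks its value."""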
        reward_net = DummyNet(self._observation_spec, self._action_spec)
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        actions = tf.constant([0, 1], dtype=tf.int32)
        rewards = tf.constant([0.5, 3.0], dtype=tf.float32)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            self._time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            optimizer=None)
        init_op = agent.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss, _ = agent.loss(observations, actions, rewards)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), 42.25)
Example #12
 def testTrainAgent(self):
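   """Trains the agent twice on a fixed batch and checks both loss values."""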
   reward_net = DummyNet(self._observation_spec, self._action_spec)
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
   agent = greedy_agent.GreedyRewardPredictionAgent(
       self._time_step_spec,
       self._action_spec,
       reward_network=reward_net,
       optimizer=optimizer)
   observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
   actions = np.array([0, 1], dtype=np.int32)
   rewards = np.array([0.5, 3.0], dtype=np.float32)
   initial_step, final_step = _get_initial_and_final_steps(
       observations, rewards)
   action_step = _get_action_step(actions)
   experience = _get_experience(initial_step, action_step, final_step)
   loss_before, _ = agent.train(experience, None)
   loss_after, _ = agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   self.assertAllClose(self.evaluate(loss_before), 42.25)
   self.assertAllClose(self.evaluate(loss_after), 93.46)
 def testTrainPerArmAgentWithMask(self):
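     """Trains a per-arm-features agent whose observation also carries an
     all-ones action mask."""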
     num_actions = 4
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, num_actions)
     mask_obs_spec = (obs_spec,
                      tensor_spec.BoundedTensorSpec(shape=[num_actions],
                                                    minimum=0,
                                                    maximum=1,
                                                    dtype=tf.float32))
     time_step_spec = ts.time_step_spec(mask_obs_spec)
     reward_net = (global_and_arm_feature_network.
                   create_feed_forward_common_tower_network(
                       obs_spec, (4, 3), (3, 4), (4, 2)))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     agent = greedy_agent.GreedyRewardPredictionAgent(
         time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         observation_and_action_constraint_splitter=lambda x: [x[0], x[1]],
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = ({
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32)
     }, tf.ones([2, num_actions]))
     actions = np.array([0, 3], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
         observations, rewards)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
Example #14
    def testLoss(self):
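        """Computes the loss via agent._loss on a trajectory with nested
        rewards and checks its value."""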
        reward_net = DummyNet(self._observation_spec, self._action_spec)
        observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
        actions = np.array([0, 1], dtype=np.int32)
        rewards = np.array([0.5, 3.0], dtype=np.float32)
        initial_step, final_step = _get_initial_and_final_steps_nested_rewards(
            observations, rewards)
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)

        agent = greedy_agent.GreedyRewardPredictionAgent(
            self._time_step_spec,
            self._action_spec,
            reward_network=reward_net,
            optimizer=None)
        init_op = agent.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss, _ = agent._loss(experience)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), 42.25)
 def testTrainAgentWithLaplacianSmoothing(self):
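     """Trains with a valid path-graph Laplacian; the smoothing term adds 22.5
     to the base loss of 42.25."""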
     reward_net = DummyNet(self._observation_spec, self._action_spec)
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.1)
     laplacian_matrix = tf.constant([[1.0, -1.0, 0.0], [-1.0, 2.0, -1.0],
                                     [0.0, -1.0, 1.0]])
     agent = greedy_agent.GreedyRewardPredictionAgent(
         self._time_step_spec,
         self._action_spec,
         reward_network=reward_net,
         optimizer=optimizer,
         laplacian_matrix=laplacian_matrix,
         laplacian_smoothing_weight=1.0)
     observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
     actions = np.array([0, 1], dtype=np.int32)
     rewards = np.array([0.5, 3.0], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, rewards)
     action_step = _get_action_step(actions)
     experience = _get_experience(initial_step, action_step, final_step)
     loss_before, _ = agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     # The Laplacian smoothing term ends up adding 22.5 to the loss.
     self.assertAllClose(self.evaluate(loss_before), 42.25 + 22.5)
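
# The snippets above rely on helpers defined elsewhere in the test module
# (DummyNet, _get_initial_and_final_steps*, _get_action_step, _get_experience).
# The sketches below only illustrate the shapes involved; they are assumptions
# about plausible implementations, not the actual definitions from the
# TF-Agents test file.

import tensorflow as tf
from tf_agents.networks import network
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import trajectory


def _get_action_step(action):
  # Wrap the chosen actions in a PolicyStep, which is what agent.train()
  # expects inside the trajectory.
  return policy_step.PolicyStep(action=tf.convert_to_tensor(action))


def _get_experience(initial_step, action_step, final_step):
  # Build a single-transition Trajectory and add an outer dimension so the
  # nested tensors have the extra axis the bandit agent expects.
  single_experience = trajectory.from_transition(
      initial_step, action_step, final_step)
  return tf.nest.map_structure(
      lambda x: tf.expand_dims(tf.convert_to_tensor(x), 0), single_experience)


class DummyNet(network.Network):
  """Structural sketch of a deterministic reward network.

  The real DummyNet uses fixed constant initializers chosen so that the
  expected losses in the tests (42.25, 93.46, ...) are exact; the placeholder
  constants below do NOT reproduce those values.
  """

  def __init__(self, observation_spec, action_spec, name=None):
    super(DummyNet, self).__init__(observation_spec, state_spec=(), name=name)
    num_actions = action_spec.maximum - action_spec.minimum + 1
    # Fixed weights make the predicted rewards, and hence the loss,
    # deterministic across runs.
    self._layer = tf.keras.layers.Dense(
        num_actions,
        kernel_initializer=tf.constant_initializer(1.0),
        bias_initializer=tf.constant_initializer(0.0))

  def call(self, observations, step_type=None, network_state=()):
    del step_type  # Unused.
    predictions = self._layer(tf.cast(observations, tf.float32))
    return predictions, network_state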