Example #1
 def testInitializeAgent(self,
                         batch_size,
                         context_dim,
                         exploration_policy,
                         dtype,
                         use_eigendecomp=False,
                         set_example_weights=False):
   del batch_size, use_eigendecomp, set_example_weights  # Unused in this test.
   num_actions = 5
   observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
   time_step_spec = time_step.time_step_spec(observation_spec)
   action_spec = tensor_spec.BoundedTensorSpec(
       dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
   agent = linear_agent.LinearBanditAgent(
       exploration_policy=exploration_policy,
       time_step_spec=time_step_spec,
       action_spec=action_spec,
       dtype=dtype)
   self.evaluate(agent.initialize())
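Each example takes parameters such as `batch_size`, `context_dim`, `exploration_policy`, and `dtype`, which suggests the test methods are driven by a parameterized test harness defined elsewhere in the module. Below is a minimal sketch of how such a harness might be wired up, assuming absl's `parameterized` helper; the class name, test name, and parameter values are illustrative and omit the bandit-specific arguments.

from absl.testing import parameterized
import tensorflow as tf


class ExampleParameterizedTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      dict(testcase_name='_batch1_dim10_float32',
           batch_size=1, context_dim=10, dtype=tf.float32),
      dict(testcase_name='_batch4_dim5_float64',
           batch_size=4, context_dim=5, dtype=tf.float64),
  )
  def testShapes(self, batch_size, context_dim, dtype):
    # Each named parameter set becomes its own test case.
    observations = tf.zeros([batch_size, context_dim], dtype=dtype)
    self.assertEqual(observations.shape, (batch_size, context_dim))


if __name__ == '__main__':
  tf.test.main()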
Example #2
    def testDistributedLinearAgentUpdate(self,
                                         batch_size,
                                         context_dim,
                                         exploration_policy,
                                         dtype,
                                         use_eigendecomp=False):
        """Same as above, but uses the distributed train function of the agent."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)

        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            dtype=dtype)
        self.evaluate(agent.initialize())
        train_fn = common.function_in_tf1()(agent._distributed_train_step)
        loss_info = train_fn(experience=experience)
        self.evaluate(loss_info)

        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_theta_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim,
                                 context_dim]), tf.zeros([context_dim])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
            theta_new = tf.squeeze(tf.linalg.solve(
                a_new + tf.eye(context_dim), tf.expand_dims(b_new, axis=-1)),
                                   axis=-1)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_theta_updated_list.append(self.evaluate(theta_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
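Per arm, the expectations above reduce to the standard least-squares statistics A = X^T X, b = X^T r, and theta = (A + I)^{-1} b. The following NumPy restatement of that computation is illustrative only and not part of the test:

import numpy as np


def expected_arm_update(observations, rewards, context_dim):
  """Per-arm statistics the test checks: A = X^T X, b = X^T r."""
  if observations.shape[0] == 0:
    # Arms with no samples keep zero statistics, mirroring `false_fn` above.
    a_new = np.zeros((context_dim, context_dim), dtype=np.float32)
    b_new = np.zeros(context_dim, dtype=np.float32)
  else:
    a_new = observations.T @ observations
    b_new = observations.T @ rewards
  # Regularized solve, matching `a_new + tf.eye(context_dim)` above.
  theta_new = np.linalg.solve(a_new + np.eye(context_dim, dtype=np.float32), b_new)
  return a_new, b_new, theta_new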
Example #3
    def testLinearAgentUpdateWithForgetting(self,
                                            batch_size,
                                            context_dim,
                                            exploration_policy,
                                            dtype,
                                            use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""
        # We should rewrite this test as it currently does not depend on
        # the value of `gamma`. To properly test the forgetting factor, we need to
        # call `train` twice.
        gamma = 0.9

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            gamma=gamma,
            dtype=dtype,
            use_eigendecomp=use_eigendecomp)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)
        final_eig_vals = self.evaluate(agent.eig_vals)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_eigvals_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                eigmatrix_new = tf.constant([], dtype=dtype)
                eigvals_new = tf.constant([], dtype=dtype)
                if use_eigendecomp:
                    eigvals_new, eigmatrix_new = tf.linalg.eigh(a_new)
                return a_new, b_new, eigvals_new, eigmatrix_new

            def false_fn():
                if use_eigendecomp:
                    return (tf.zeros([context_dim,
                                      context_dim]), tf.zeros([context_dim]),
                            tf.ones([context_dim]), tf.eye(context_dim))
                else:
                    return (tf.zeros([context_dim,
                                      context_dim]), tf.zeros([context_dim]),
                            tf.constant([], dtype=dtype),
                            tf.constant([], dtype=dtype))

            a_new, b_new, eig_vals_new, _ = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_eigvals_updated_list.append(self.evaluate(eig_vals_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
        self.assertAllClose(expected_eigvals_updated_list,
                            final_eig_vals,
                            atol=1e-4,
                            rtol=1e-4)
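As the comment at the top of this example notes, a single `train` call cannot exercise the forgetting factor. The sketch below shows how `gamma` would typically enter a second update, with the previous statistics discounted before the new batch is added; it is an illustration under that assumption, not the agent's implementation.

import numpy as np


def discounted_update(a_old, b_old, observations, rewards, gamma=0.9):
  # Old sufficient statistics decay by `gamma`; the new batch enters at full weight.
  a_new = gamma * a_old + observations.T @ observations
  b_new = gamma * b_old + observations.T @ rewards
  return a_new, b_new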
Example #4
    def testLinearAgentUpdateWithMaskedActions(self,
                                               batch_size,
                                               context_dim,
                                               exploration_policy,
                                               dtype,
                                               use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
            batch_size, context_dim, num_actions=num_actions)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = (tensor_spec.TensorSpec([context_dim], tf.float32),
                            tensor_spec.TensorSpec([num_actions], tf.int32))
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)

        def observation_and_action_constraint_splitter(obs):
            return obs[0], obs[1]

        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(
                observation_and_action_constraint_splitter(
                    experience.observation)[0], [batch_size, -1]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim,
                                 context_dim]), tf.zeros([context_dim])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
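The splitter above routes the two halves of the tuple observation: the agent trains on the context, while the mask only constrains action selection. A small self-contained illustration of that contract (shapes chosen arbitrarily for the example):

import numpy as np


def observation_and_action_constraint_splitter(obs):
  context, mask = obs
  return context, mask


batch_size, context_dim, num_actions = 2, 7, 5
obs = (np.zeros((batch_size, context_dim), dtype=np.float32),  # per-step context
       np.ones((batch_size, num_actions), dtype=np.int32))     # 1 = action allowed
context, mask = observation_and_action_constraint_splitter(obs)
assert context.shape == (batch_size, context_dim)
assert mask.shape == (batch_size, num_actions)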
Example #5
    def testLinearAgentUpdateWithBias(self,
                                      batch_size,
                                      context_dim,
                                      exploration_policy,
                                      dtype,
                                      use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        initial_step, final_step = _get_initial_and_final_steps(
            batch_size, context_dim)
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = _get_action_step(action)
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        variable_collection = linear_agent.LinearBanditVariableCollection(
            context_dim + 1, num_actions, use_eigendecomp, dtype)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            variable_collection=variable_collection,
            use_eigendecomp=use_eigendecomp,
            add_bias=True,
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)
        final_theta = self.evaluate(agent.theta)

        # Compute the expected updated estimates.
        observations_list = tf.dynamic_partition(
            data=tf.reshape(experience.observation, [batch_size, context_dim]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        rewards_list = tf.dynamic_partition(
            data=tf.reshape(experience.reward, [batch_size]),
            partitions=tf.convert_to_tensor(action),
            num_partitions=num_actions)
        expected_a_updated_list = []
        expected_b_updated_list = []
        expected_theta_updated_list = []
        for _, (observations_for_arm, rewards_for_arm) in enumerate(
                zip(observations_list, rewards_list)):
            observations_for_arm = tf.concat([
                observations_for_arm,
                tf.ones_like(observations_for_arm[:, 0:1])
            ],
                                             axis=1)
            num_samples_for_arm_current = tf.cast(
                tf.shape(rewards_for_arm)[0], tf.float32)
            num_samples_for_arm_total = num_samples_for_arm_current

            # pylint: disable=cell-var-from-loop
            def true_fn():
                a_new = tf.matmul(observations_for_arm,
                                  observations_for_arm,
                                  transpose_a=True)
                b_new = bandit_utils.sum_reward_weighted_observations(
                    rewards_for_arm, observations_for_arm)
                return a_new, b_new

            def false_fn():
                return tf.zeros([context_dim + 1,
                                 context_dim + 1]), tf.zeros([context_dim + 1])

            a_new, b_new = tf.cond(
                tf.squeeze(num_samples_for_arm_total) > 0, true_fn, false_fn)
            theta_new = tf.squeeze(tf.linalg.solve(
                a_new + tf.eye(context_dim + 1), tf.expand_dims(b_new,
                                                                axis=-1)),
                                   axis=-1)

            expected_a_updated_list.append(self.evaluate(a_new))
            expected_b_updated_list.append(self.evaluate(b_new))
            expected_theta_updated_list.append(self.evaluate(theta_new))

        # Check that the actual updated estimates match the expectations.
        self.assertAllClose(expected_a_updated_list, final_a)
        self.assertAllClose(expected_b_updated_list, final_b)
        self.assertAllClose(self.evaluate(
            tf.stack(expected_theta_updated_list)),
                            final_theta,
                            atol=0.1,
                            rtol=0.05)
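With `add_bias=True`, the expectations above append a constant 1 to every context, so the statistics live in `context_dim + 1` dimensions and the last entry of `theta` acts as an intercept. An illustrative NumPy sketch of that augmentation (not the agent's code):

import numpy as np

batch_size, context_dim = 4, 3
contexts = np.random.rand(batch_size, context_dim).astype(np.float32)
# Append a column of ones so the linear model can learn an intercept term.
contexts_with_bias = np.concatenate(
    [contexts, np.ones((batch_size, 1), dtype=np.float32)], axis=1)
assert contexts_with_bias.shape == (batch_size, context_dim + 1)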
Example #6
    def testLinearAgentUpdatePerArmFeatures(self,
                                            batch_size,
                                            context_dim,
                                            exploration_policy,
                                            dtype,
                                            use_eigendecomp=False):
        """Check that the agent updates for specified actions and rewards."""

        # Construct a `Trajectory` for the given action, observation, reward.
        num_actions = 5
        global_context_dim = context_dim
        arm_context_dim = 3
        initial_step, final_step = (
            _get_initial_and_final_steps_with_per_arm_features(
                batch_size, global_context_dim, num_actions, arm_context_dim))
        action = np.random.randint(num_actions,
                                   size=batch_size,
                                   dtype=np.int32)
        action_step = policy_step.PolicyStep(
            action=tf.convert_to_tensor(action),
            info=policy_utilities.PerArmPolicyInfo(
                chosen_arm_features=np.arange(
                    batch_size * arm_context_dim, dtype=np.float32).reshape(
                        [batch_size, arm_context_dim])))
        experience = _get_experience(initial_step, action_step, final_step)

        # Construct an agent and perform the update.
        observation_spec = bandit_spec_utils.create_per_arm_observation_spec(
            context_dim, arm_context_dim, num_actions)
        time_step_spec = time_step.time_step_spec(observation_spec)
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=num_actions - 1)
        agent = linear_agent.LinearBanditAgent(
            exploration_policy=exploration_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            use_eigendecomp=use_eigendecomp,
            accepts_per_arm_features=True,
            dtype=dtype)
        self.evaluate(agent.initialize())
        loss_info = agent.train(experience)
        self.evaluate(loss_info)
        final_a = self.evaluate(agent.cov_matrix)
        final_b = self.evaluate(agent.data_vector)

        # Compute the expected updated estimates.
        global_observation = experience.observation[
            bandit_spec_utils.GLOBAL_FEATURE_KEY]
        arm_observation = experience.policy_info.chosen_arm_features
        overall_observation = tf.squeeze(tf.concat(
            [global_observation, arm_observation], axis=-1),
                                         axis=1)
        rewards = tf.squeeze(experience.reward, axis=1)

        expected_a_new = tf.matmul(overall_observation,
                                   overall_observation,
                                   transpose_a=True)
        expected_b_new = bandit_utils.sum_reward_weighted_observations(
            rewards, overall_observation)
        self.assertAllClose(expected_a_new, final_a[0])
        self.assertAllClose(expected_b_new, final_b[0])
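With per-arm features the statistics are shared across actions: the global context is concatenated with the chosen arm's features and a single (A, b) pair is updated, which is why the test compares against `final_a[0]` and `final_b[0]`. A short NumPy illustration of that bookkeeping (dimensions arbitrary, not the agent's implementation):

import numpy as np

batch_size, global_dim, arm_dim = 2, 4, 3
global_obs = np.random.rand(batch_size, global_dim).astype(np.float32)
chosen_arm_obs = np.random.rand(batch_size, arm_dim).astype(np.float32)
rewards = np.random.rand(batch_size).astype(np.float32)

# Concatenate global and chosen-arm features into one feature vector per step.
overall = np.concatenate([global_obs, chosen_arm_obs], axis=-1)
a_new = overall.T @ overall   # [global_dim + arm_dim, global_dim + arm_dim]
b_new = overall.T @ rewards   # [global_dim + arm_dim]
assert a_new.shape == (global_dim + arm_dim, global_dim + arm_dim)
assert b_new.shape == (global_dim + arm_dim,)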