Example #1
    def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size,
                                                    exploration_strategy):
        a_list = []
        a_new_list = []
        b_list = []
        b_new_list = []
        num_samples_list = []
        num_samples_new_list = []
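        # Each arm keeps three variables: a covariance matrix `a`, a data
        # vector `b`, and a sample count, mirroring the policy's
        # `cov_matrix`/`data_vector`/`num_samples` arguments.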
        for k in range(1, self._num_actions + 1):
            a_initial_value = tf.constant(
                [[2 * k + 1, k + 1], [k + 1, 2 * k + 1]], dtype=tf.float32)
            a_for_one_arm = tf.compat.v2.Variable(a_initial_value)
            a_list.append(a_for_one_arm)
            b_initial_value = tf.constant([k, k], dtype=tf.float32)
            b_for_one_arm = tf.compat.v2.Variable(b_initial_value)
            b_list.append(b_for_one_arm)
            num_samples_initial_value = tf.constant([1], dtype=tf.float32)
            num_samples_for_one_arm = tf.compat.v2.Variable(
                num_samples_initial_value)
            num_samples_list.append(num_samples_for_one_arm)

            # Variables for the new policy (they differ by an offset).
            a_new_for_one_arm = tf.compat.v2.Variable(a_initial_value +
                                                      _POLICY_VARIABLES_OFFSET)
            a_new_list.append(a_new_for_one_arm)
            b_new_for_one_arm = tf.compat.v2.Variable(b_initial_value +
                                                      _POLICY_VARIABLES_OFFSET)
            b_new_list.append(b_new_for_one_arm)
            num_samples_for_one_arm_new = tf.compat.v2.Variable(
                num_samples_initial_value + _POLICY_VARIABLES_OFFSET)
            num_samples_new_list.append(num_samples_for_one_arm_new)

        self.evaluate(tf.compat.v1.global_variables_initializer())

        policy = linear_policy.LinearBanditPolicy(self._action_spec, a_list,
                                                  b_list, num_samples_list,
                                                  self._time_step_spec,
                                                  exploration_strategy)
        self.assertLen(policy.variables(), 3 * self._num_actions)

        new_policy = linear_policy.LinearBanditPolicy(self._action_spec,
                                                      a_new_list, b_new_list,
                                                      num_samples_new_list,
                                                      self._time_step_spec,
                                                      exploration_strategy)
        self.assertLen(new_policy.variables(), 3 * self._num_actions)

        self.evaluate(new_policy.update(policy))

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        new_action_step = new_policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape,
                         new_action_step.action.shape)
        self.assertEqual(action_step.action.dtype,
                         new_action_step.action.dtype)
        actions_, new_actions_ = self.evaluate(
            [action_step.action, new_action_step.action])
        self.assertAllEqual(actions_, new_actions_)
Example #2
    def testPerArmActionBatchWithVariablesAndPolicyUpdate(
            self, batch_size, exploration_strategy):
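        # With `accepts_per_arm_features=True` a single model is kept; its
        # dimension is the combined global + per-arm context size (6 here).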
        a_value = tf.reshape(tf.range(36, dtype=tf.float32), shape=[6, 6])
        a_list = [tf.compat.v2.Variable(a_value)]
        a_new_list = [
            tf.compat.v2.Variable(a_value + _POLICY_VARIABLES_OFFSET)
        ]
        b_value = tf.constant([2, 2, 2, 2, 2, 2], dtype=tf.float32)
        b_list = [tf.compat.v2.Variable(b_value)]
        b_new_list = [
            tf.compat.v2.Variable(b_value + _POLICY_VARIABLES_OFFSET)
        ]
        num_samples_list = [
            tf.compat.v2.Variable(tf.constant([1], dtype=tf.float32))
        ]
        num_samples_new_list = [
            tf.compat.v2.Variable(
                tf.constant([1 + _POLICY_VARIABLES_OFFSET], dtype=tf.float32))
        ]
        self.evaluate(tf.compat.v1.global_variables_initializer())
        policy = linear_policy.LinearBanditPolicy(
            self._action_spec,
            a_list,
            b_list,
            num_samples_list,
            self._per_arm_time_step_spec,
            exploration_strategy,
            accepts_per_arm_features=True)
        self.assertLen(policy.variables(), 3)

        new_policy = linear_policy.LinearBanditPolicy(
            self._action_spec,
            a_new_list,
            b_new_list,
            num_samples_new_list,
            self._per_arm_time_step_spec,
            exploration_strategy,
            accepts_per_arm_features=True)
        self.assertLen(new_policy.variables(), 3)

        self.evaluate(new_policy.update(policy))

        step_batch = self._per_arm_time_step_batch(batch_size=batch_size)
        action_step = policy.action(step_batch)
        new_action_step = new_policy.action(step_batch)
        self.assertEqual(action_step.action.shape,
                         new_action_step.action.shape)
        self.assertEqual(action_step.action.dtype,
                         new_action_step.action.dtype)
        actions_, new_actions_, info = self.evaluate(
            [action_step.action, new_action_step.action, action_step.info])
        self.assertAllEqual(actions_, new_actions_)
        arm_obs = step_batch.observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
        first_action = actions_[0]
        first_arm_features = arm_obs[0]
        self.assertAllEqual(info.chosen_arm_features[0],
                            first_arm_features[first_action])
Example #3
    def testObservationShapeMismatch(self, batch_size, exploration_strategy):
        policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                                  self._b,
                                                  self._num_samples_per_arm,
                                                  self._time_step_spec,
                                                  exploration_strategy)

        current_time_step = ts.TimeStep(
            tf.constant(ts.StepType.FIRST,
                        dtype=tf.int32,
                        shape=[batch_size],
                        name='step_type'),
            tf.constant(0.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='reward'),
            tf.constant(1.0,
                        dtype=tf.float32,
                        shape=[batch_size],
                        name='discount'),
            tf.constant(np.array(range(batch_size * (self._obs_dim + 1))),
                        dtype=tf.float32,
                        shape=[batch_size, self._obs_dim + 1],
                        name='observation'))
        with self.assertRaisesRegex(
                ValueError, r'Observation shape is expected to be \[None, 2\].'
                r' Got \[%d, 3\].' % batch_size):
            policy.action(current_time_step)
Example #4
    def testBuild(self, exploration_strategy):
        policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                                  self._b,
                                                  self._num_samples_per_arm,
                                                  self._time_step_spec,
                                                  exploration_strategy)

        self.assertEqual(policy.time_step_spec, self._time_step_spec)
Example #5
    def testActionBatch(self, batch_size, exploration_strategy):
        policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                                  self._b,
                                                  self._num_samples_per_arm,
                                                  self._time_step_spec,
                                                  exploration_strategy)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)
        self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
        self.assertAllLessEqual(actions_, self._action_spec.maximum)
Example #6
    def testPredictedRewards(self, batch_size, exploration_strategy):
        policy = linear_policy.LinearBanditPolicy(
            self._action_spec,
            self._a,
            self._b,
            self._num_samples_per_arm,
            self._time_step_spec,
            exploration_strategy,
            emit_policy_info=(
                policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN, ))

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)

        observation_numpy = np.array(range(batch_size * self._obs_dim),
                                     dtype=np.float32).reshape(
                                         [batch_size, self._obs_dim])

        p_values = []
        predicted_rewards_expected = []
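        # Recompute the policy's per-arm estimates in NumPy: for each arm k,
        # theta_k = (A_k + I)^-1 b_k, the predicted mean reward is x . theta_k,
        # and the exploration bonus is alpha * sqrt(x^T (A_k + I)^-1 x).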
        for k in range(self._num_actions):
            a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
            theta = np.matmul(a_inv,
                              self._b_numpy[k].reshape([self._obs_dim, 1]))
            confidence_intervals = np.sqrt(
                np.diag(
                    np.matmul(
                        observation_numpy,
                        np.matmul(a_inv, np.transpose(observation_numpy)))))
            est_mean_reward = np.matmul(observation_numpy, theta)
            predicted_rewards_expected.append(est_mean_reward)
            p_value = (est_mean_reward +
                       self._alpha * confidence_intervals.reshape([-1, 1]))
            p_values.append(p_value)

        predicted_rewards_expected_array = np.stack(predicted_rewards_expected,
                                                    axis=-1).reshape(
                                                        batch_size,
                                                        self._num_actions)
        p_info = self.evaluate(action_step.info)
        self.assertAllClose(p_info.predicted_rewards_mean,
                            predicted_rewards_expected_array)
Example #7
    def testComparisonWithNumpy(self, batch_size, use_decomposition=False):
        eig_matrix_list = ()
        eig_vals_list = ()
        if use_decomposition:
            eig_vals_one_arm, eig_matrix_one_arm = tf.linalg.eigh(self._a[0])
            eig_vals_list = [eig_vals_one_arm] * self._num_actions
            eig_matrix_list = [eig_matrix_one_arm] * self._num_actions

        policy = linear_policy.LinearBanditPolicy(self._action_spec,
                                                  self._a,
                                                  self._b,
                                                  self._num_samples_per_arm,
                                                  self._time_step_spec,
                                                  eig_vals=eig_vals_list,
                                                  eig_matrix=eig_matrix_list)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)

        observation_numpy = np.array(range(batch_size * self._obs_dim),
                                     dtype=np.float32).reshape(
                                         [batch_size, self._obs_dim])

        p_values = []
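        # Same arithmetic as the policy, done in NumPy:
        # p_k(x) = x . (A_k + I)^-1 b_k + alpha * sqrt(x^T (A_k + I)^-1 x).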
        for k in range(self._num_actions):
            a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
            theta = np.matmul(a_inv,
                              self._b_numpy[k].reshape([self._obs_dim, 1]))
            confidence_intervals = np.sqrt(
                np.diag(
                    np.matmul(
                        observation_numpy,
                        np.matmul(a_inv, np.transpose(observation_numpy)))))
            p_value = (np.matmul(observation_numpy, theta) +
                       self._alpha * confidence_intervals.reshape([-1, 1]))
            p_values.append(p_value)

        actions_numpy = np.argmax(np.stack(p_values, axis=-1),
                                  axis=-1).reshape([batch_size])
        self.assertAllEqual(actions_.reshape([batch_size]), actions_numpy)
Example #8
    def testActionBatchWithMask(self, batch_size, exploration_strategy):
        def split_fn(obs):
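            # The splitter maps the full observation to a (context, boolean
            # action mask) pair.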
            return obs[0], obs[1]

        policy = linear_policy.LinearBanditPolicy(
            self._action_spec,
            self._a,
            self._b,
            self._num_samples_per_arm,
            self._time_step_spec_with_mask,
            exploration_strategy,
            observation_and_action_constraint_splitter=split_fn)

        action_step = policy.action(
            self._time_step_batch_with_mask(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)
        self.assertAllEqual(actions_, range(batch_size))
Example #9
    def testActionBatchWithBias(self, batch_size, exploration_strategy):
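        # `add_bias=True` appends a constant-1 feature to the context, so the
        # per-arm statistics are sized obs_dim + 1 (3x3 `a`, length-3 `b`).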
        a = [tf.constant([[4, 1, 2], [1, 5, 3], [2, 3, 6]], dtype=tf.float32)
             ] * self._num_actions
        b = [
            tf.constant([r, r, r], dtype=tf.float32)
            for r in range(self._num_actions)
        ]
        policy = linear_policy.LinearBanditPolicy(self._action_spec,
                                                  a,
                                                  b,
                                                  self._num_samples_per_arm,
                                                  self._time_step_spec,
                                                  exploration_strategy,
                                                  add_bias=True)

        action_step = policy.action(
            self._time_step_batch(batch_size=batch_size))
        self.assertEqual(action_step.action.shape.as_list(), [batch_size])
        self.assertEqual(action_step.action.dtype, tf.int32)
        actions_ = self.evaluate(action_step.action)
        self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
        self.assertAllLessEqual(actions_, self._action_spec.maximum)
Example #10
    def __init__(self,
                 exploration_policy,
                 time_step_spec: types.TimeStep,
                 action_spec: types.BoundedTensorSpec,
                 variable_collection: Optional[
                     LinearBanditVariableCollection] = None,
                 alpha: float = 1.0,
                 gamma: float = 1.0,
                 use_eigendecomp: bool = False,
                 tikhonov_weight: float = 1.0,
                 add_bias: bool = False,
                 emit_policy_info: Sequence[Text] = (),
                 emit_log_probability: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 accepts_per_arm_features: bool = False,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 enable_summaries: bool = True,
                 dtype: tf.DType = tf.float32,
                 name: Optional[Text] = None):
        """Initialize an instance of `LinearBanditAgent`.

    Args:
      exploration_policy: An Enum of type `ExplorationPolicy`. The kind of
        policy we use for exploration. Currently supported policies are
        `LinUCBPolicy` and `LinearThompsonSamplingPolicy`.
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      variable_collection: Instance of `LinearBanditVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `LinearBanditVariableCollection` will be created.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
        algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) Tikhonov regularization term.
      add_bias: If true, a bias term will be added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the policy emits log-probabilities or not.
        Since the policy is deterministic, the probability is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the agent accepts per-arm
        features.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearBanditAgent`.

    Raises:
      ValueError if dtype is not one of `tf.float32` or `tf.float64`.
      TypeError if variable_collection is not an instance of
        `LinearBanditVariableCollection`.
    """
        tf.Module.__init__(self, name=name)
        common.tf_agents_gauge.get_cell('TFABandit').set(True)
        self._num_actions = policy_utilities.get_num_actions_from_tensor_spec(
            action_spec)
        self._num_models = 1 if accepts_per_arm_features else self._num_actions
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._time_step_spec = time_step_spec
        self._accepts_per_arm_features = accepts_per_arm_features
        self._add_bias = add_bias
        if observation_and_action_constraint_splitter is not None:
            context_spec, _ = observation_and_action_constraint_splitter(
                time_step_spec.observation)
        else:
            context_spec = time_step_spec.observation

        (self._global_context_dim,
         self._arm_context_dim) = bandit_spec_utils.get_context_dims_from_spec(
             context_spec, accepts_per_arm_features)
        if self._add_bias:
            # The bias is added via a constant 1 feature.
            self._global_context_dim += 1
        self._overall_context_dim = self._global_context_dim + self._arm_context_dim

        self._alpha = alpha
        if variable_collection is None:
            variable_collection = LinearBanditVariableCollection(
                context_dim=self._overall_context_dim,
                num_models=self._num_models,
                use_eigendecomp=use_eigendecomp,
                dtype=dtype)
        elif not isinstance(variable_collection,
                            LinearBanditVariableCollection):
            raise TypeError('Parameter `variable_collection` should be '
                            'of type `LinearBanditVariableCollection`.')
        self._variable_collection = variable_collection
        self._cov_matrix_list = variable_collection.cov_matrix_list
        self._data_vector_list = variable_collection.data_vector_list
        self._eig_matrix_list = variable_collection.eig_matrix_list
        self._eig_vals_list = variable_collection.eig_vals_list
        # We keep track of the number of samples per arm.
        self._num_samples_list = variable_collection.num_samples_list
        self._gamma = gamma
        if self._gamma < 0.0 or self._gamma > 1.0:
            raise ValueError(
                'Forgetting factor `gamma` must be in [0.0, 1.0].')
        self._dtype = dtype
        if dtype not in (tf.float32, tf.float64):
            raise ValueError(
                'Agent dtype should be either `tf.float32` or `tf.float64`.')
        self._use_eigendecomp = use_eigendecomp
        self._tikhonov_weight = tikhonov_weight

        if exploration_policy == ExplorationPolicy.linear_ucb_policy:
            exploration_strategy = lin_policy.ExplorationStrategy.optimistic
        elif exploration_policy == (
                ExplorationPolicy.linear_thompson_sampling_policy):
            exploration_strategy = lin_policy.ExplorationStrategy.sampling
        else:
            raise ValueError(
                'Linear bandit agent with policy %s not implemented' %
                exploration_policy)
        policy = lin_policy.LinearBanditPolicy(
            action_spec=action_spec,
            cov_matrix=self._cov_matrix_list,
            data_vector=self._data_vector_list,
            num_samples=self._num_samples_list,
            time_step_spec=time_step_spec,
            exploration_strategy=exploration_strategy,
            alpha=alpha,
            eig_vals=self._eig_vals_list if self._use_eigendecomp else (),
            eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (),
            tikhonov_weight=self._tikhonov_weight,
            add_bias=add_bias,
            emit_policy_info=emit_policy_info,
            emit_log_probability=emit_log_probability,
            accepts_per_arm_features=accepts_per_arm_features,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter))

        training_data_spec = None
        if accepts_per_arm_features:
            training_data_spec = bandit_spec_utils.drop_arm_observation(
                policy.trajectory_spec)
        super(LinearBanditAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             policy=policy,
                             collect_policy=policy,
                             training_data_spec=training_data_spec,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             train_sequence_length=None)
        self._as_trajectory = data_converter.AsTrajectory(self.data_context,
                                                          sequence_length=None)