  def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size,
                                                  exploration_strategy):
    a_list = []
    a_new_list = []
    b_list = []
    b_new_list = []
    num_samples_list = []
    num_samples_new_list = []
    for k in range(1, self._num_actions + 1):
      a_initial_value = tf.constant(
          [[2 * k + 1, k + 1], [k + 1, 2 * k + 1]], dtype=tf.float32)
      a_for_one_arm = tf.compat.v2.Variable(a_initial_value)
      a_list.append(a_for_one_arm)
      b_initial_value = tf.constant([k, k], dtype=tf.float32)
      b_for_one_arm = tf.compat.v2.Variable(b_initial_value)
      b_list.append(b_for_one_arm)
      num_samples_initial_value = tf.constant([1], dtype=tf.float32)
      num_samples_for_one_arm = tf.compat.v2.Variable(num_samples_initial_value)
      num_samples_list.append(num_samples_for_one_arm)

      # Variables for the new policy (they differ by an offset).
      a_new_for_one_arm = tf.compat.v2.Variable(a_initial_value +
                                                _POLICY_VARIABLES_OFFSET)
      a_new_list.append(a_new_for_one_arm)
      b_new_for_one_arm = tf.compat.v2.Variable(b_initial_value +
                                                _POLICY_VARIABLES_OFFSET)
      b_new_list.append(b_new_for_one_arm)
      num_samples_for_one_arm_new = tf.compat.v2.Variable(
          num_samples_initial_value + _POLICY_VARIABLES_OFFSET)
      num_samples_new_list.append(num_samples_for_one_arm_new)

    self.evaluate(tf.compat.v1.global_variables_initializer())

    policy = linear_policy.LinearBanditPolicy(self._action_spec, a_list,
                                              b_list, num_samples_list,
                                              self._time_step_spec,
                                              exploration_strategy)
    self.assertLen(policy.variables(), 3 * self._num_actions)

    new_policy = linear_policy.LinearBanditPolicy(self._action_spec,
                                                  a_new_list, b_new_list,
                                                  num_samples_new_list,
                                                  self._time_step_spec,
                                                  exploration_strategy)
    self.assertLen(new_policy.variables(), 3 * self._num_actions)

    self.evaluate(new_policy.update(policy))

    action_step = policy.action(self._time_step_batch(batch_size=batch_size))
    new_action_step = new_policy.action(
        self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape, new_action_step.action.shape)
    self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)
    actions_, new_actions_ = self.evaluate(
        [action_step.action, new_action_step.action])
    self.assertAllEqual(actions_, new_actions_)
  def testPerArmActionBatchWithVariablesAndPolicyUpdate(
      self, batch_size, exploration_strategy):
    a_value = tf.reshape(tf.range(36, dtype=tf.float32), shape=[6, 6])
    a_list = [tf.compat.v2.Variable(a_value)]
    a_new_list = [tf.compat.v2.Variable(a_value + _POLICY_VARIABLES_OFFSET)]
    b_value = tf.constant([2, 2, 2, 2, 2, 2], dtype=tf.float32)
    b_list = [tf.compat.v2.Variable(b_value)]
    b_new_list = [tf.compat.v2.Variable(b_value + _POLICY_VARIABLES_OFFSET)]
    num_samples_list = [
        tf.compat.v2.Variable(tf.constant([1], dtype=tf.float32))
    ]
    num_samples_new_list = [
        tf.compat.v2.Variable(
            tf.constant([1 + _POLICY_VARIABLES_OFFSET], dtype=tf.float32))
    ]
    self.evaluate(tf.compat.v1.global_variables_initializer())

    policy = linear_policy.LinearBanditPolicy(
        self._action_spec,
        a_list,
        b_list,
        num_samples_list,
        self._per_arm_time_step_spec,
        exploration_strategy,
        accepts_per_arm_features=True)
    self.assertLen(policy.variables(), 3)

    new_policy = linear_policy.LinearBanditPolicy(
        self._action_spec,
        a_new_list,
        b_new_list,
        num_samples_new_list,
        self._per_arm_time_step_spec,
        exploration_strategy,
        accepts_per_arm_features=True)
    self.assertLen(new_policy.variables(), 3)

    self.evaluate(new_policy.update(policy))

    step_batch = self._per_arm_time_step_batch(batch_size=batch_size)
    action_step = policy.action(step_batch)
    new_action_step = new_policy.action(step_batch)
    self.assertEqual(action_step.action.shape, new_action_step.action.shape)
    self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)
    actions_, new_actions_, info = self.evaluate(
        [action_step.action, new_action_step.action, action_step.info])
    self.assertAllEqual(actions_, new_actions_)

    # The policy info should carry the features of the arm that was chosen.
    arm_obs = step_batch.observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]
    first_action = actions_[0]
    first_arm_features = arm_obs[0]
    self.assertAllEqual(info.chosen_arm_features[0],
                        first_arm_features[first_action])
  def testObservationShapeMismatch(self, batch_size, exploration_strategy):
    policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                              self._b,
                                              self._num_samples_per_arm,
                                              self._time_step_spec,
                                              exploration_strategy)
    # Build a time step whose observation has one extra feature dimension.
    current_time_step = ts.TimeStep(
        tf.constant(
            ts.StepType.FIRST,
            dtype=tf.int32,
            shape=[batch_size],
            name='step_type'),
        tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
        tf.constant(
            1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
        tf.constant(
            np.array(range(batch_size * (self._obs_dim + 1))),
            dtype=tf.float32,
            shape=[batch_size, self._obs_dim + 1],
            name='observation'))
    with self.assertRaisesRegexp(
        ValueError, r'Observation shape is expected to be \[None, 2\].'
        r' Got \[%d, 3\].' % batch_size):
      policy.action(current_time_step)
  def testBuild(self, exploration_strategy):
    policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                              self._b,
                                              self._num_samples_per_arm,
                                              self._time_step_spec,
                                              exploration_strategy)
    self.assertEqual(policy.time_step_spec, self._time_step_spec)
  def testActionBatch(self, batch_size, exploration_strategy):
    policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                              self._b,
                                              self._num_samples_per_arm,
                                              self._time_step_spec,
                                              exploration_strategy)
    action_step = policy.action(self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape.as_list(), [batch_size])
    self.assertEqual(action_step.action.dtype, tf.int32)
    actions_ = self.evaluate(action_step.action)
    self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
    self.assertAllLessEqual(actions_, self._action_spec.maximum)
  def testPredictedRewards(self, batch_size, exploration_strategy):
    policy = linear_policy.LinearBanditPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec,
        exploration_strategy,
        emit_policy_info=(policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,))
    action_step = policy.action(self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape.as_list(), [batch_size])
    self.assertEqual(action_step.action.dtype, tf.int32)

    observation_numpy = np.array(
        range(batch_size * self._obs_dim),
        dtype=np.float32).reshape([batch_size, self._obs_dim])

    # Compute the expected mean rewards per arm with numpy.
    p_values = []
    predicted_rewards_expected = []
    for k in range(self._num_actions):
      a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
      theta = np.matmul(a_inv, self._b_numpy[k].reshape([self._obs_dim, 1]))
      confidence_intervals = np.sqrt(
          np.diag(
              np.matmul(observation_numpy,
                        np.matmul(a_inv, np.transpose(observation_numpy)))))
      est_mean_reward = np.matmul(observation_numpy, theta)
      predicted_rewards_expected.append(est_mean_reward)
      p_value = (
          est_mean_reward + self._alpha * confidence_intervals.reshape([-1, 1]))
      p_values.append(p_value)

    predicted_rewards_expected_array = np.stack(
        predicted_rewards_expected, axis=-1).reshape(
            batch_size, self._num_actions)
    p_info = self.evaluate(action_step.info)
    self.assertAllClose(p_info.predicted_rewards_mean,
                        predicted_rewards_expected_array)
  def testComparisonWithNumpy(self, batch_size, use_decomposition=False):
    eig_matrix_list = ()
    eig_vals_list = ()
    if use_decomposition:
      eig_vals_one_arm, eig_matrix_one_arm = tf.linalg.eigh(self._a[0])
      eig_vals_list = [eig_vals_one_arm] * self._num_actions
      eig_matrix_list = [eig_matrix_one_arm] * self._num_actions

    policy = linear_policy.LinearBanditPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec,
        eig_vals=eig_vals_list,
        eig_matrix=eig_matrix_list)

    action_step = policy.action(self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape.as_list(), [batch_size])
    self.assertEqual(action_step.action.dtype, tf.int32)
    actions_ = self.evaluate(action_step.action)

    observation_numpy = np.array(
        range(batch_size * self._obs_dim),
        dtype=np.float32).reshape([batch_size, self._obs_dim])

    # Compute the UCB scores per arm with numpy and compare the argmax.
    p_values = []
    for k in range(self._num_actions):
      a_inv = np.linalg.inv(self._a_numpy[k] + np.eye(self._obs_dim))
      theta = np.matmul(a_inv, self._b_numpy[k].reshape([self._obs_dim, 1]))
      confidence_intervals = np.sqrt(
          np.diag(
              np.matmul(observation_numpy,
                        np.matmul(a_inv, np.transpose(observation_numpy)))))
      p_value = (
          np.matmul(observation_numpy, theta) +
          self._alpha * confidence_intervals.reshape([-1, 1]))
      p_values.append(p_value)

    actions_numpy = np.argmax(
        np.stack(p_values, axis=-1), axis=-1).reshape([batch_size])
    self.assertAllEqual(actions_.reshape([batch_size]), actions_numpy)
  def testActionBatchWithMask(self, batch_size, exploration_strategy):

    def split_fn(obs):
      return obs[0], obs[1]

    policy = linear_policy.LinearBanditPolicy(
        self._action_spec,
        self._a,
        self._b,
        self._num_samples_per_arm,
        self._time_step_spec_with_mask,
        exploration_strategy,
        observation_and_action_constraint_splitter=split_fn)

    action_step = policy.action(
        self._time_step_batch_with_mask(batch_size=batch_size))
    self.assertEqual(action_step.action.shape.as_list(), [batch_size])
    self.assertEqual(action_step.action.dtype, tf.int32)
    actions_ = self.evaluate(action_step.action)
    self.assertAllEqual(actions_, range(batch_size))
  def testActionBatchWithBias(self, batch_size, exploration_strategy):
    a = [tf.constant([[4, 1, 2], [1, 5, 3], [2, 3, 6]], dtype=tf.float32)
        ] * self._num_actions
    b = [
        tf.constant([r, r, r], dtype=tf.float32)
        for r in range(self._num_actions)
    ]
    policy = linear_policy.LinearBanditPolicy(
        self._action_spec,
        a,
        b,
        self._num_samples_per_arm,
        self._time_step_spec,
        exploration_strategy,
        add_bias=True)
    action_step = policy.action(self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape.as_list(), [batch_size])
    self.assertEqual(action_step.action.dtype, tf.int32)
    actions_ = self.evaluate(action_step.action)
    self.assertAllGreaterEqual(actions_, self._action_spec.minimum)
    self.assertAllLessEqual(actions_, self._action_spec.maximum)
  def __init__(self,
               exploration_policy,
               time_step_spec: types.TimeStep,
               action_spec: types.BoundedTensorSpec,
               variable_collection: Optional[
                   LinearBanditVariableCollection] = None,
               alpha: float = 1.0,
               gamma: float = 1.0,
               use_eigendecomp: bool = False,
               tikhonov_weight: float = 1.0,
               add_bias: bool = False,
               emit_policy_info: Sequence[Text] = (),
               emit_log_probability: bool = False,
               observation_and_action_constraint_splitter: Optional[
                   types.Splitter] = None,
               accepts_per_arm_features: bool = False,
               debug_summaries: bool = False,
               summarize_grads_and_vars: bool = False,
               enable_summaries: bool = True,
               dtype: tf.DType = tf.float32,
               name: Optional[Text] = None):
    """Initialize an instance of `LinearBanditAgent`.

    Args:
      exploration_policy: An Enum of type `ExplorationPolicy`. The kind of
        policy we use for exploration. Currently supported policies are
        `LinUCBPolicy` and `LinearThompsonSamplingPolicy`.
      time_step_spec: A `TimeStep` spec describing the expected `TimeStep`s.
      action_spec: A scalar `BoundedTensorSpec` with `int32` or `int64` dtype
        describing the number of actions for this agent.
      variable_collection: Instance of `LinearBanditVariableCollection`.
        Collection of variables to be updated by the agent. If `None`, a new
        instance of `LinearBanditVariableCollection` will be created.
      alpha: (float) positive scalar. This is the exploration parameter that
        multiplies the confidence intervals.
      gamma: a float forgetting factor in [0.0, 1.0]. When set to 1.0, the
        algorithm does not forget.
      use_eigendecomp: whether to use eigen-decomposition or not. The default
        solver is Conjugate Gradient.
      tikhonov_weight: (float) Tikhonov regularization term.
      add_bias: If true, a bias term will be added to the linear reward
        estimation.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      emit_log_probability: Whether the policy emits log-probabilities or not.
        Since the policy is deterministic, the probability is just 1.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the agent accepts per-arm
        features.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      dtype: The type of the parameters stored and updated by the agent. Should
        be one of `tf.float32` and `tf.float64`. Defaults to `tf.float32`.
      name: a name for this instance of `LinearBanditAgent`.

    Raises:
      ValueError: if `dtype` is not one of `tf.float32` or `tf.float64`.
      TypeError: if `variable_collection` is not an instance of
        `LinearBanditVariableCollection`.
""" tf.Module.__init__(self, name=name) common.tf_agents_gauge.get_cell('TFABandit').set(True) self._num_actions = policy_utilities.get_num_actions_from_tensor_spec( action_spec) self._num_models = 1 if accepts_per_arm_features else self._num_actions self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._time_step_spec = time_step_spec self._accepts_per_arm_features = accepts_per_arm_features self._add_bias = add_bias if observation_and_action_constraint_splitter is not None: context_spec, _ = observation_and_action_constraint_splitter( time_step_spec.observation) else: context_spec = time_step_spec.observation (self._global_context_dim, self._arm_context_dim) = bandit_spec_utils.get_context_dims_from_spec( context_spec, accepts_per_arm_features) if self._add_bias: # The bias is added via a constant 1 feature. self._global_context_dim += 1 self._overall_context_dim = self._global_context_dim + self._arm_context_dim self._alpha = alpha if variable_collection is None: variable_collection = LinearBanditVariableCollection( context_dim=self._overall_context_dim, num_models=self._num_models, use_eigendecomp=use_eigendecomp, dtype=dtype) elif not isinstance(variable_collection, LinearBanditVariableCollection): raise TypeError('Parameter `variable_collection` should be ' 'of type `LinearBanditVariableCollection`.') self._variable_collection = variable_collection self._cov_matrix_list = variable_collection.cov_matrix_list self._data_vector_list = variable_collection.data_vector_list self._eig_matrix_list = variable_collection.eig_matrix_list self._eig_vals_list = variable_collection.eig_vals_list # We keep track of the number of samples per arm. self._num_samples_list = variable_collection.num_samples_list self._gamma = gamma if self._gamma < 0.0 or self._gamma > 1.0: raise ValueError( 'Forgetting factor `gamma` must be in [0.0, 1.0].') self._dtype = dtype if dtype not in (tf.float32, tf.float64): raise ValueError( 'Agent dtype should be either `tf.float32 or `tf.float64`.') self._use_eigendecomp = use_eigendecomp self._tikhonov_weight = tikhonov_weight if exploration_policy == ExplorationPolicy.linear_ucb_policy: exploration_strategy = lin_policy.ExplorationStrategy.optimistic elif exploration_policy == ( ExplorationPolicy.linear_thompson_sampling_policy): exploration_strategy = lin_policy.ExplorationStrategy.sampling else: raise ValueError( 'Linear bandit agent with policy %s not implemented' % exploration_policy) policy = lin_policy.LinearBanditPolicy( action_spec=action_spec, cov_matrix=self._cov_matrix_list, data_vector=self._data_vector_list, num_samples=self._num_samples_list, time_step_spec=time_step_spec, exploration_strategy=exploration_strategy, alpha=alpha, eig_vals=self._eig_vals_list if self._use_eigendecomp else (), eig_matrix=self._eig_matrix_list if self._use_eigendecomp else (), tikhonov_weight=self._tikhonov_weight, add_bias=add_bias, emit_policy_info=emit_policy_info, emit_log_probability=emit_log_probability, accepts_per_arm_features=accepts_per_arm_features, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter)) training_data_spec = None if accepts_per_arm_features: training_data_spec = bandit_spec_utils.drop_arm_observation( policy.trajectory_spec) super(LinearBanditAgent, self).__init__(time_step_spec=time_step_spec, action_spec=action_spec, policy=policy, collect_policy=policy, training_data_spec=training_data_spec, debug_summaries=debug_summaries, 
summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, train_sequence_length=None) self._as_trajectory = data_converter.AsTrajectory(self.data_context, sequence_length=None)
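
  # A minimal construction sketch for the agent described above (illustrative
  # only, not part of the library code). `context_dim` and `num_actions` are
  # hypothetical placeholder values, and the specs mirror the ones the
  # docstring expects: a flat float observation and a scalar integer action.
  #
  #   import tensorflow as tf
  #   from tf_agents.specs import tensor_spec
  #   from tf_agents.trajectories import time_step as ts
  #
  #   obs_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  #   time_step_spec = ts.time_step_spec(obs_spec)
  #   action_spec = tensor_spec.BoundedTensorSpec(
  #       (), tf.int32, minimum=0, maximum=num_actions - 1)
  #   agent = LinearBanditAgent(
  #       exploration_policy=ExplorationPolicy.linear_ucb_policy,
  #       time_step_spec=time_step_spec,
  #       action_spec=action_spec,
  #       alpha=1.0,
  #       gamma=1.0,
  #       dtype=tf.float32)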