def testAddPreprocessingLayers(self):
  batch_size = 3
  num_actions = 2
  states = (tf.random.uniform([batch_size, 1]),
            tf.random.uniform([batch_size]))
  preprocessing_layers = (
      tf.keras.layers.Dense(4),
      tf.keras.Sequential([
          tf.keras.layers.Reshape((1,)),
          tf.keras.layers.Dense(4)
      ]))
  network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=(tensor_spec.TensorSpec([1], tf.float32),
                         tensor_spec.TensorSpec([], tf.float32)),
      preprocessing_layers=preprocessing_layers,
      preprocessing_combiner=tf.keras.layers.Add(),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                                num_actions - 1))
  preds, _ = network(states)
  q_values, log_variances = preds.q_value_logits, preds.log_variance
  self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
  self.assertAllEqual(log_variances.shape.as_list(),
                      [batch_size, num_actions])
  # At least 2 variables each for the preprocessing layers.
  self.assertGreater(len(network.trainable_variables), 6)
def testCombinedFeatureColumnInput(self):
  columns = {}
  state_tensors = {}
  state_specs = {}
  expected_dim = 0

  indicator_key = 'indicator_key'
  vocab_list = [2, 3, 4]
  column1 = tf.feature_column.categorical_column_with_vocabulary_list(
      indicator_key, vocab_list)
  columns[indicator_key] = tf.feature_column.indicator_column(column1)
  state_tensors[indicator_key] = tf.expand_dims([3, 2, 2, 4, 3], -1)
  state_specs[indicator_key] = tensor_spec.TensorSpec([1], tf.int32)
  expected_dim += len(vocab_list)

  embedding_key = 'embedding_key'
  embedding_dim = 3
  vocab_list = [2, 3, 4]
  column2 = tf.feature_column.categorical_column_with_vocabulary_list(
      embedding_key, vocab_list)
  columns[embedding_key] = tf.feature_column.embedding_column(
      column2, embedding_dim)
  state_tensors[embedding_key] = tf.expand_dims([3, 2, 2, 4, 3], -1)
  state_specs[embedding_key] = tensor_spec.TensorSpec([1], tf.int32)
  expected_dim += embedding_dim

  numeric_key = 'numeric_key'
  batch_size = 5
  state_dims = 3
  input_shape = (batch_size, state_dims)
  columns[numeric_key] = tf.feature_column.numeric_column(
      numeric_key, [state_dims])
  state_tensors[numeric_key] = tf.ones(input_shape, tf.int32)
  state_specs[numeric_key] = tensor_spec.TensorSpec([state_dims], tf.int32)
  expected_dim += state_dims

  num_actions = 4
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                              num_actions - 1)
  dense_features = tf.compat.v2.keras.layers.DenseFeatures(columns.values())
  online_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      state_specs, action_spec, preprocessing_combiner=dense_features)
  target_network = online_network.copy(name='TargetNetwork')
  q_online = online_network(state_tensors)[0].q_value_logits
  q_target = target_network(state_tensors)[0].q_value_logits
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(tf.compat.v1.initializers.tables_initializer())

  expected_shape = (batch_size, num_actions)
  self.assertEqual(expected_shape, q_online.shape)
  self.assertEqual(expected_shape, q_target.shape)
  self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
def testPreprocessingLayersSingleObservations(self):
  """Tests using preprocessing_layers without preprocessing_combiner."""
  num_state_dims = 5
  network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      preprocessing_layers=tf.keras.layers.Lambda(lambda x: x),
      preprocessing_combiner=None)
  preds, _ = network(tf.ones((3, num_state_dims)))
  q_logits, log_variances = preds.q_value_logits, preds.log_variance
  self.assertAllEqual(q_logits.shape.as_list(), [3, 2])
  self.assertAllEqual(log_variances.shape.as_list(), [3, 2])
def testCorrectOutputShape(self):
  batch_size = 3
  num_state_dims = 5
  num_actions = 2
  states = tf.random.uniform([batch_size, num_state_dims])
  network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
  preds, _ = network(states)
  q_values, log_variances = preds.q_value_logits, preds.log_variance
  self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
  self.assertAllEqual(log_variances.shape.as_list(),
                      [batch_size, num_actions])
def testChangeHiddenLayers(self):
  batch_size = 3
  num_state_dims = 5
  num_actions = 2
  states = tf.random.uniform([batch_size, num_state_dims])
  network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      fc_layer_params=(40,))
  preds, _ = network(states)
  q_values, log_variances = preds.q_value_logits, preds.log_variance
  self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
  self.assertAllEqual(log_variances.shape.as_list(),
                      [batch_size, num_actions])
  self.assertEqual(len(network.trainable_variables), 6)
def testNetworkVariablesAreReused(self):
  batch_size = 3
  num_state_dims = 5
  states = tf.ones([batch_size, num_state_dims])
  next_states = tf.ones([batch_size, num_state_dims])
  network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
  preds, _ = network(states)
  q_values, log_variances = preds.q_value_logits, preds.log_variance
  preds, _ = network(next_states)
  next_q_values, next_log_variances = (preds.q_value_logits,
                                       preds.log_variance)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(q_values, next_q_values)
  self.assertAllClose(log_variances, next_log_variances)
def testVarianceBoundaryConditions(self):
  """Tests that min/max variance conditions are satisfied."""
  batch_size = 3
  num_state_dims = 5
  min_variance = 1.0
  max_variance = 2.0
  eps = 0.0001
  states = tf.random.uniform([batch_size, num_state_dims])
  network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      min_variance=min_variance,
      max_variance=max_variance)
  log_variances = network(states)[0].log_variance
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllGreater(log_variances, math.log(min_variance) - eps)
  self.assertAllLess(log_variances, math.log(max_variance) + eps)
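# A minimal sketch of the property the test above asserts (an assumption for
# illustration, not necessarily how HeteroscedasticQNetwork implements it):
# one common way to keep a predicted log-variance inside
# [log(min_variance), log(max_variance)] is to squash an unconstrained head
# output through a sigmoid and rescale. The helper name is hypothetical.
def _clamp_log_variance(raw, min_variance=1.0, max_variance=2.0):
  # Map an unconstrained tensor `raw` strictly into (log_min, log_max).
  log_min = math.log(min_variance)
  log_max = math.log(max_variance)
  return log_min + (log_max - log_min) * tf.sigmoid(raw)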
def testNumericFeatureColumnInput(self):
  key = 'feature_key'
  batch_size = 3
  state_dims = 5
  column = tf.feature_column.numeric_column(key, [state_dims])
  state = {key: tf.ones([batch_size, state_dims], tf.int32)}
  state_spec = {key: tensor_spec.TensorSpec([state_dims], tf.int32)}
  dense_features = tf.compat.v2.keras.layers.DenseFeatures([column])
  online_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=state_spec,
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      preprocessing_combiner=dense_features)
  target_network = online_network.copy(name='TargetNetwork')
  q_online = online_network(state)[0].q_value_logits
  q_target = target_network(state)[0].q_value_logits
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
def testEmbeddingFeatureColumnInput(self):
  key = 'feature_key'
  vocab_list = ['a', 'b']
  column = tf.feature_column.categorical_column_with_vocabulary_list(
      key, vocab_list)
  column = tf.feature_column.embedding_column(column, 3)
  feature_tensor = tf.convert_to_tensor(['a', 'b', 'c', 'a', 'c'])
  state = {key: tf.expand_dims(feature_tensor, -1)}
  state_spec = {key: tensor_spec.TensorSpec([1], tf.string)}
  dense_features = tf.compat.v2.keras.layers.DenseFeatures([column])
  online_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
      input_tensor_spec=state_spec,
      action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
      preprocessing_combiner=dense_features)
  target_network = online_network.copy(name='TargetNetwork')
  q_online = online_network(state)[0].q_value_logits
  q_target = target_network(state)[0].q_value_logits
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(tf.compat.v1.initializers.tables_initializer())
  self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
def __init__(
    self,
    time_step_spec,
    action_spec,
    optimizer,
    # Network params.
    dropout_rate,
    network_layers,
    dropout_only_top_layer=True,
    observation_and_action_constraint_splitter=None,
    # Params for training.
    error_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gradient_clipping=None,
    heteroscedastic=False,
    # Params for debugging.
    debug_summaries=False,
    summarize_grads_and_vars=False,
    enable_summaries=True,
    emit_policy_info=(),
    train_step_counter=None,
    laplacian_matrix=None,
    laplacian_smoothing_weight=0.001,
    name=None):
  """Creates a Dropout Thompson Sampling Agent.

  For more details about the Laplacian smoothing regularization, please see
  the documentation of the `GreedyRewardPredictionAgent`.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    optimizer: The optimizer to use for training.
    dropout_rate: Float in `(0, 1)`, the dropout rate.
    network_layers: Tuple of ints determining the sizes of the network
      layers.
    dropout_only_top_layer: Boolean parameter determining if dropout should
      be done only in the top layer. True by default.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the bandit agent and
      policy, and 2) the boolean mask. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    error_loss_fn: A function for computing the error loss, taking
      parameters labels, predictions, and weights (any function from
      tf.losses would work). The default is `tf.losses.mean_squared_error`.
    gradient_clipping: A float representing the norm length to clip
      gradients (or None for no clipping).
    heteroscedastic: If True, the variance per action is estimated and the
      losses are weighted appropriately.
    debug_summaries: A Python bool, default False. When True, debug
      summaries are gathered.
    summarize_grads_and_vars: A Python bool, default False. When True,
      gradients and network variable summaries are written during training.
    enable_summaries: A Python bool, default True. When False, all summaries
      (debug or otherwise) should not be written.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    train_step_counter: An optional `tf.Variable` to increment every time
      the train op is run. Defaults to the `global_step`.
    laplacian_matrix: A float `Tensor` shaped `[num_actions, num_actions]`.
      This holds the Laplacian matrix used to regularize the smoothness of
      the estimated expected reward function. This only applies to problems
      where the actions have a graph structure. If `None`, the
      regularization is not applied.
    laplacian_smoothing_weight: A float that determines the weight of the
      regularization term. Note that this has no effect if
      `laplacian_matrix` above is `None`.
    name: Python str name of this agent. All variables in this module will
      fall under that name. Defaults to the class name.

  Raises:
    ValueError: If the action spec contains more than one action or it is
      not a bounded scalar int32 spec with minimum 0.
""" fc_layer_params = network_layers dropout_param = {'rate': dropout_rate, 'permanent': True} if dropout_only_top_layer: dropout_layer_params = [None] * (len(fc_layer_params) - 1) dropout_layer_params.append(dropout_param) else: dropout_layer_params = [dropout_param] * len(fc_layer_params) if observation_and_action_constraint_splitter is not None: input_tensor_spec, _ = observation_and_action_constraint_splitter( time_step_spec.observation) else: input_tensor_spec = time_step_spec.observation if heteroscedastic: reward_network = heteroscedastic_q_network.HeteroscedasticQNetwork( input_tensor_spec=input_tensor_spec, action_spec=action_spec, fc_layer_params=fc_layer_params, dropout_layer_params=dropout_layer_params) else: reward_network = q_network.QNetwork( input_tensor_spec=input_tensor_spec, action_spec=action_spec, fc_layer_params=fc_layer_params, dropout_layer_params=dropout_layer_params) super(DropoutThompsonSamplingAgent, self).__init__( time_step_spec=time_step_spec, action_spec=action_spec, reward_network=reward_network, optimizer=optimizer, observation_and_action_constraint_splitter=( observation_and_action_constraint_splitter), error_loss_fn=error_loss_fn, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, enable_summaries=enable_summaries, emit_policy_info=emit_policy_info, train_step_counter=train_step_counter, laplacian_matrix=laplacian_matrix, laplacian_smoothing_weight=laplacian_smoothing_weight, name=name)