# Shared imports for the snippets below; the module paths are the standard
# TF-Agents ones these examples exercise. The test methods belong to a
# tf.test.TestCase subclass that the excerpt omits.
import math

import tensorflow as tf

from tf_agents.bandits.networks import heteroscedastic_q_network
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec


  def testAddPreprocessingLayers(self):
    batch_size = 3
    num_actions = 2
    states = (tf.random.uniform([batch_size, 1]),
              tf.random.uniform([batch_size]))
    preprocessing_layers = (tf.keras.layers.Dense(4),
                            tf.keras.Sequential([
                                tf.keras.layers.Reshape((1,)),
                                tf.keras.layers.Dense(4)
                            ]))
    network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=(tensor_spec.TensorSpec([1], tf.float32),
                           tensor_spec.TensorSpec([], tf.float32)),
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=tf.keras.layers.Add(),
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                                  num_actions - 1))
    preds, _ = network(states)
    q_values, log_variances = preds.q_value_logits, preds.log_variance
    self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
    self.assertAllEqual(log_variances.shape.as_list(),
                        [batch_size, num_actions])
    # At least 2 variables (kernel and bias) for each preprocessing layer.
    self.assertGreater(len(network.trainable_variables), 6)
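
# Standalone sketch (illustrative, not part of the original test): applying
# the same preprocessing layers and Add combiner by hand shows how the nested
# observation collapses into a single [batch, 4] tensor before the Q-network
# trunk. All names here are ours, not the library's.
def preprocessing_shape_sketch():
  batch_size = 3
  obs = (tf.random.uniform([batch_size, 1]), tf.random.uniform([batch_size]))
  dense = tf.keras.layers.Dense(4)
  reshape_then_dense = tf.keras.Sequential(
      [tf.keras.layers.Reshape((1,)), tf.keras.layers.Dense(4)])
  # Each leaf of the observation tuple gets its own preprocessing layer;
  # the combiner then adds the two [batch, 4] results element-wise.
  combined = tf.keras.layers.Add()([dense(obs[0]),
                                    reshape_then_dense(obs[1])])
  assert combined.shape.as_list() == [batch_size, 4]

preprocessing_shape_sketch()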

  def testCombinedFeatureColumnInput(self):
    columns = {}
    state_tensors = {}
    state_specs = {}
    expected_dim = 0  # Concatenated width of the DenseFeatures output.

    indicator_key = 'indicator_key'
    vocab_list = [2, 3, 4]
    column1 = tf.feature_column.categorical_column_with_vocabulary_list(
        indicator_key, vocab_list)
    columns[indicator_key] = tf.feature_column.indicator_column(column1)
    state_tensors[indicator_key] = tf.expand_dims([3, 2, 2, 4, 3], -1)
    state_specs[indicator_key] = tensor_spec.TensorSpec([1], tf.int32)
    expected_dim += len(vocab_list)

    embedding_key = 'embedding_key'
    embedding_dim = 3
    vocab_list = [2, 3, 4]
    column2 = tf.feature_column.categorical_column_with_vocabulary_list(
        embedding_key, vocab_list)
    columns[embedding_key] = tf.feature_column.embedding_column(
        column2, embedding_dim)
    state_tensors[embedding_key] = tf.expand_dims([3, 2, 2, 4, 3], -1)
    state_specs[embedding_key] = tensor_spec.TensorSpec([1], tf.int32)
    expected_dim += embedding_dim

    numeric_key = 'numeric_key'
    batch_size = 5
    state_dims = 3
    input_shape = (batch_size, state_dims)
    columns[numeric_key] = tf.feature_column.numeric_column(
        numeric_key, [state_dims])
    state_tensors[numeric_key] = tf.ones(input_shape, tf.int32)
    state_specs[numeric_key] = tensor_spec.TensorSpec([state_dims], tf.int32)
    expected_dim += state_dims

    num_actions = 4
    action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0,
                                                num_actions - 1)
    dense_features = tf.compat.v2.keras.layers.DenseFeatures(columns.values())
    online_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        state_specs, action_spec, preprocessing_combiner=dense_features)
    target_network = online_network.copy(name='TargetNetwork')
    q_online = online_network(state_tensors)[0].q_value_logits
    q_target = target_network(state_tensors)[0].q_value_logits
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(tf.compat.v1.initializers.tables_initializer())

    expected_shape = (batch_size, num_actions)
    self.assertEqual(expected_shape, q_online.shape)
    self.assertEqual(expected_shape, q_target.shape)
    self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)
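
# Standalone sketch (illustrative; TF 2.x eager): DenseFeatures concatenates
# the indicator (3), embedding (3) and numeric (3) columns into a single
# [batch, 9] tensor, which is why it can serve as preprocessing_combiner for
# a dict observation. Keys here are ours, not the test's.
def dense_features_width_sketch():
  cat = tf.feature_column.categorical_column_with_vocabulary_list
  cols = [
      tf.feature_column.indicator_column(cat('ind', [2, 3, 4])),
      tf.feature_column.embedding_column(cat('emb', [2, 3, 4]), 3),
      tf.feature_column.numeric_column('num', [3]),
  ]
  features = {
      'ind': tf.expand_dims([3, 2, 2, 4, 3], -1),
      'emb': tf.expand_dims([3, 2, 2, 4, 3], -1),
      'num': tf.ones((5, 3), tf.float32),
  }
  out = tf.compat.v2.keras.layers.DenseFeatures(cols)(features)
  assert out.shape.as_list() == [5, 3 + 3 + 3]

dense_features_width_sketch()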

  def testPreprocessingLayersSingleObservations(self):
    """Tests using preprocessing_layers without preprocessing_combiner."""
    num_state_dims = 5
    network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
        preprocessing_layers=tf.keras.layers.Lambda(lambda x: x),
        preprocessing_combiner=None)
    preds, _ = network(tf.ones((3, num_state_dims)))
    q_logits, log_variances = preds.q_value_logits, preds.log_variance
    self.assertAllEqual(q_logits.shape.as_list(), [3, 2])
    self.assertAllEqual(log_variances.shape.as_list(), [3, 2])

  def testCorrectOutputShape(self):
    batch_size = 3
    num_state_dims = 5
    num_actions = 2
    states = tf.random.uniform([batch_size, num_state_dims])
    network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
    preds, _ = network(states)
    q_values, log_variances = preds.q_value_logits, preds.log_variance
    self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
    self.assertAllEqual(log_variances.shape.as_list(),
                        [batch_size, num_actions])

  def testChangeHiddenLayers(self):
    batch_size = 3
    num_state_dims = 5
    num_actions = 2
    states = tf.random.uniform([batch_size, num_state_dims])
    network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
        fc_layer_params=(40,))
    preds, _ = network(states)
    q_values, log_variances = preds.q_value_logits, preds.log_variance
    self.assertAllEqual(q_values.shape.as_list(), [batch_size, num_actions])
    self.assertAllEqual(log_variances.shape.as_list(),
                        [batch_size, num_actions])
    # One hidden layer plus the Q-value and log-variance heads: three
    # kernel/bias pairs in total.
    self.assertEqual(len(network.trainable_variables), 6)

  def testNetworkVariablesAreReused(self):
    batch_size = 3
    num_state_dims = 5
    states = tf.ones([batch_size, num_state_dims])
    next_states = tf.ones([batch_size, num_state_dims])
    network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1))
    preds, _ = network(states)
    q_values, log_variances = preds.q_value_logits, preds.log_variance
    preds, _ = network(next_states)
    next_q_values, next_log_variances = (preds.q_value_logits,
                                         preds.log_variance)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(q_values, next_q_values)
    self.assertAllClose(log_variances, next_log_variances)

  def testVarianceBoundaryConditions(self):
    """Tests that min/max variance conditions are satisfied."""
    batch_size = 3
    num_state_dims = 5
    min_variance = 1.0
    max_variance = 2.0
    eps = 0.0001
    states = tf.random.uniform([batch_size, num_state_dims])
    network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=tensor_spec.TensorSpec([num_state_dims], tf.float32),
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
        min_variance=min_variance,
        max_variance=max_variance)
    log_variances = network(states)[0].log_variance
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllGreater(log_variances, math.log(min_variance) - eps)
    self.assertAllLess(log_variances, math.log(max_variance) + eps)
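
# Standalone sketch of one common way to get a bounded variance head. Hedged:
# this reproduces the property the test checks, not necessarily the library's
# exact formula. An unconstrained activation is squashed into
# [min_variance, max_variance] before taking the log.
def bounded_log_variance(raw, min_variance=1.0, max_variance=2.0):
  variance = min_variance + (max_variance - min_variance) * tf.sigmoid(raw)
  return tf.math.log(variance)

# For any real input, the output stays within [log(1.0), log(2.0)].
_lv = bounded_log_variance(tf.random.normal([3, 2]))
assert bool(tf.reduce_all(_lv >= math.log(1.0)))
assert bool(tf.reduce_all(_lv <= math.log(2.0)))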

  def testNumericFeatureColumnInput(self):
    key = 'feature_key'
    batch_size = 3
    state_dims = 5
    column = tf.feature_column.numeric_column(key, [state_dims])
    state = {key: tf.ones([batch_size, state_dims], tf.int32)}
    state_spec = {key: tensor_spec.TensorSpec([state_dims], tf.int32)}

    dense_features = tf.compat.v2.keras.layers.DenseFeatures([column])
    online_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=state_spec,
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
        preprocessing_combiner=dense_features)
    target_network = online_network.copy(name='TargetNetwork')
    q_online = online_network(state)[0].q_value_logits
    q_target = target_network(state)[0].q_value_logits
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)

  def testEmbeddingFeatureColumnInput(self):
    key = 'feature_key'
    vocab_list = ['a', 'b']
    column = tf.feature_column.categorical_column_with_vocabulary_list(
        key, vocab_list)
    column = tf.feature_column.embedding_column(column, 3)
    feature_tensor = tf.convert_to_tensor(['a', 'b', 'c', 'a', 'c'])
    state = {key: tf.expand_dims(feature_tensor, -1)}
    state_spec = {key: tensor_spec.TensorSpec([1], tf.string)}

    dense_features = tf.compat.v2.keras.layers.DenseFeatures([column])
    online_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=state_spec,
        action_spec=tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
        preprocessing_combiner=dense_features)
    target_network = online_network.copy(name='TargetNetwork')
    q_online = online_network(state)[0].q_value_logits
    q_target = target_network(state)[0].q_value_logits
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(tf.compat.v1.initializers.tables_initializer())
    self.assertAllClose(q_online, q_target, rtol=1.0, atol=1.0)

  # Constructor of DropoutThompsonSamplingAgent, a different class from the
  # test case above; it wraps one of the two reward networks built below.
  def __init__(
      self,
      time_step_spec,
      action_spec,
      optimizer,
      # Network params.
      dropout_rate,
      network_layers,
      dropout_only_top_layer=True,
      observation_and_action_constraint_splitter=None,
      # Params for training.
      error_loss_fn=tf.compat.v1.losses.mean_squared_error,
      gradient_clipping=None,
      heteroscedastic=False,
      # Params for debugging.
      debug_summaries=False,
      summarize_grads_and_vars=False,
      enable_summaries=True,
      emit_policy_info=(),
      train_step_counter=None,
      laplacian_matrix=None,
      laplacian_smoothing_weight=0.001,
      name=None):
    """Creates a Dropout Thompson Sampling Agent.

    For more details about the Laplacian smoothing regularization, please see
    the documentation of the `GreedyRewardPredictionAgent`.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      optimizer: The optimizer to use for training.
      dropout_rate: Float in `(0, 1)`, the dropout rate.
      network_layers: Tuple of ints determining the sizes of the network layers.
      dropout_only_top_layer: Boolean parameter determining if dropout should be
        done only in the top layer. True by default.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip
        gradients, or `None` for no clipping.
      heteroscedastic: If True, the variance per action is estimated and the
        losses are weighted appropriately.
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, no summaries
        (debug or otherwise) are written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      laplacian_matrix: A float `Tensor` shaped `[num_actions, num_actions]`.
        This holds the Laplacian matrix used to regularize the smoothness of the
        estimated expected reward function. This only applies to problems where
        the actions have a graph structure. If `None`, the regularization is not
        applied.
      laplacian_smoothing_weight: A float that determines the weight of the
        regularization term. Note that this has no effect if `laplacian_matrix`
        above is `None`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action, or if it
        is not a bounded scalar int32 spec with minimum 0.
    """
    fc_layer_params = network_layers
    # `permanent=True` keeps dropout active at inference time as well, which
    # is what turns the reward network into a Monte Carlo dropout sampler for
    # Thompson sampling.
    dropout_param = {'rate': dropout_rate, 'permanent': True}
    if dropout_only_top_layer:
      dropout_layer_params = [None] * (len(fc_layer_params) - 1)
      dropout_layer_params.append(dropout_param)
    else:
      dropout_layer_params = [dropout_param] * len(fc_layer_params)
    if observation_and_action_constraint_splitter is not None:
      input_tensor_spec, _ = observation_and_action_constraint_splitter(
          time_step_spec.observation)
    else:
      input_tensor_spec = time_step_spec.observation

    if heteroscedastic:
      # A per-action variance is estimated alongside the mean reward, so the
      # training loss can be weighted by predicted uncertainty (see the
      # docstring above).
      reward_network = heteroscedastic_q_network.HeteroscedasticQNetwork(
          input_tensor_spec=input_tensor_spec,
          action_spec=action_spec,
          fc_layer_params=fc_layer_params,
          dropout_layer_params=dropout_layer_params)
    else:
      reward_network = q_network.QNetwork(
          input_tensor_spec=input_tensor_spec,
          action_spec=action_spec,
          fc_layer_params=fc_layer_params,
          dropout_layer_params=dropout_layer_params)

    super(DropoutThompsonSamplingAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=reward_network,
        optimizer=optimizer,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter),
        error_loss_fn=error_loss_fn,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        enable_summaries=enable_summaries,
        emit_policy_info=emit_policy_info,
        train_step_counter=train_step_counter,
        laplacian_matrix=laplacian_matrix,
        laplacian_smoothing_weight=laplacian_smoothing_weight,
        name=name)
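
# Standalone usage sketch (all names and hyperparameters here are
# illustrative): constructing the agent for a 5-dimensional observation and
# 3 arms. `heteroscedastic=True` selects the HeteroscedasticQNetwork
# exercised by the tests above.
import tensorflow as tf
from tf_agents.bandits.agents import dropout_thompson_sampling_agent
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([5], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0,
                                            maximum=2)
agent = dropout_thompson_sampling_agent.DropoutThompsonSamplingAgent(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    dropout_rate=0.1,
    network_layers=(50, 50),
    dropout_only_top_layer=True,
    heteroscedastic=True)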