Example #1
    def _embed_hash_buckets(self, ids: tf.Tensor, embedding_size: int,
                            num_hashes: int, num_buckets: int,
                            initializer_range: float) -> tf.Tensor:
        """Converts IDs (e.g. codepoints) into embeddings via multiple hashing.

        Args:
          ids: The codepoints or other IDs to be hashed.
          embedding_size: The dimensionality of the returned embeddings.
          num_hashes: The number of hash functions to use.
          num_buckets: The number of hash buckets (i.e. embeddings in each table).
          initializer_range: Maximum absolute value for initial weights.

        Returns:
          The codepoint embeddings.
        """

        if embedding_size % num_hashes != 0:
            raise ValueError(f"Expected `embedding_size` ({embedding_size}) % "
                             f"`num_hashes` ({num_hashes}) == 0")

        shard_embedding_size = embedding_size // num_hashes

        hash_bucket_tensors = self._hash_bucket_tensors(
            ids, num_hashes=num_hashes, num_buckets=num_buckets)
        embedding_shards = []
        for i, hash_bucket_ids in enumerate(hash_bucket_tensors):
            embedding_table = tf.get_variable(
                name=f"embeddings/HashBucketCodepointEmbedder_{i}",
                shape=[num_buckets, shard_embedding_size],
                initializer=bert_modeling.create_initializer(
                    initializer_range))
            shard_embeddings = tf.nn.embedding_lookup(embedding_table,
                                                      hash_bucket_ids)
            embedding_shards.append(shard_embeddings)
        return tf.concat(embedding_shards, axis=-1)
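
To make the multi-hash scheme above easier to follow, here is a minimal, self-contained NumPy sketch of the same idea. The hash function and helper below are hypothetical stand-ins for `_hash_bucket_tensors` (not shown here): `num_hashes` independent hashes each index their own table of `embedding_size // num_hashes` columns, and the per-hash lookups are concatenated. In the method above the tables are trainable `tf.get_variable` weights; the NumPy tables here are just random stand-ins.

import numpy as np


def hash_bucket_ids(ids, num_hashes, num_buckets):
    # Hypothetical multiplicative hashes: one distinct prime per hash function.
    primes = [31, 43, 59, 61, 73, 97, 101, 103][:num_hashes]
    return [((ids + 1) * p) % num_buckets for p in primes]


def embed_hash_buckets(ids, embedding_size=16, num_hashes=4, num_buckets=1000):
    assert embedding_size % num_hashes == 0
    shard_size = embedding_size // num_hashes
    rng = np.random.default_rng(0)
    shards = []
    for bucket_ids in hash_bucket_ids(ids, num_hashes, num_buckets):
        # One (randomly initialized) table per hash function: [num_buckets, shard_size].
        table = rng.normal(scale=0.02, size=(num_buckets, shard_size))
        shards.append(table[bucket_ids])        # [seq_len, shard_size]
    return np.concatenate(shards, axis=-1)      # [seq_len, embedding_size]


codepoints = np.array([ord(c) for c in "héllo"])
print(embed_hash_buckets(codepoints).shape)     # (5, 16)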
Example #2
    def _chars_to_molecules(
            self, char_encoding: tf.Tensor,
            expected_molecule_seq_length: tf.Tensor) -> tf.Tensor:
        """Convert char seq to initial molecule seq."""

        del expected_molecule_seq_length  # Used by contract only.

        with tf.variable_scope("initial_char_encoder/chars_to_molecules"):
            downsampled = tf.layers.conv1d(
                inputs=char_encoding,
                filters=self.config.hidden_size,
                kernel_size=self.config.downsampling_rate,
                strides=self.config.downsampling_rate,
                padding="valid",
                activation=bert_modeling.get_activation(
                    self.config.hidden_act),
                name="conv")

            # `char_dim_cls_position`: [batch, 1, char_dim]
            # And below: `cls_position`: [batch, 1, molecule_dim]
            char_dim_cls_position = char_encoding[:, 0:1, :]
            # `molecule_dim_seq`: [batch, char_seq_len, molecule_dim]
            char_dim = bert_modeling.get_shape_list(
                char_encoding, expected_rank=3)[2]
            if char_dim == self.config.hidden_size:
                cls_position = char_dim_cls_position
            else:
                # Project the [CLS] position from `char_dim` up to `molecule_dim`.
                cls_position = bert_modeling.dense_layer_2d(
                    char_dim_cls_position, self.config.hidden_size,
                    bert_modeling.create_initializer(
                        self.config.initializer_range), None,
                    "cls_position_dense")
                if self._is_training:
                    cls_position = bert_modeling.dropout(
                        cls_position, self.config.hidden_dropout_prob)

            # Truncate the last molecule in order to reserve a position for [CLS].
            # Often, the last position is never used (unless we completely fill the
            # text buffer). This is important in order to maintain alignment on TPUs
            # (i.e. a multiple of 128).
            downsampled_truncated = downsampled[:, 0:-1, :]

            # We also keep [CLS] as a separate sequence position since we always
            # want to reserve a position (and the model capacity that goes along
            # with that) in the deep BERT stack.
            # `result`: [batch, molecule_seq, molecule_dim]
            result = tf.concat([cls_position, downsampled_truncated], axis=1)

            return bert_modeling.layer_norm(result)
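
A quick shape check of the downsampling above, with hypothetical sizes and the TF2 Keras equivalent of `tf.layers.conv1d`: since `kernel_size == strides == downsampling_rate` and padding is "valid", a character sequence of length 2048 downsamples to 2048 // 4 = 512 molecules; dropping the last molecule and prepending the [CLS] position keeps the molecule sequence length at 512.

import tensorflow as tf

chars = tf.zeros([2, 2048, 768])            # [batch, char_seq_len, char_dim]
conv = tf.keras.layers.Conv1D(
    filters=768, kernel_size=4, strides=4, padding="valid")
molecules = conv(chars)                     # [2, 512, 768]
cls_position = chars[:, 0:1, :]             # [2, 1, 768]
result = tf.concat([cls_position, molecules[:, 0:-1, :]], axis=1)
print(result.shape)                         # (2, 512, 768)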
Example #3
    def _pool(self, seq_to_pool: tf.Tensor) -> tf.Tensor:
        """Grab the [CLS] molecule for use in classification tasks."""
        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            # This snippet is taken from vanilla BERT.
            first_token_tensor = tf.squeeze(seq_to_pool[:, 0:1, :], axis=1)
            return tf.layers.dense(
                first_token_tensor,
                self.config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=bert_modeling.create_initializer(
                    self.config.initializer_range))
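
For reference, a minimal shape sketch of the pooling above (hypothetical sizes, with a Keras layer standing in for `tf.layers.dense`): the encoded sequence [batch_size, seq_length, hidden_size] collapses to [batch_size, hidden_size] by taking the first ([CLS]) position and passing it through a dense layer with tanh.

import tensorflow as tf

encoded = tf.zeros([8, 512, 768])                      # [batch, seq_length, hidden_size]
first_token = tf.squeeze(encoded[:, 0:1, :], axis=1)   # [8, 768]
pooled = tf.keras.layers.Dense(768, activation="tanh")(first_token)
print(pooled.shape)                                    # (8, 768)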
Example #4
    def _molecules_to_chars(self, molecules: tf.Tensor,
                            molecule_seq_length: tf.Tensor,
                            expected_char_seq_length: tf.Tensor,
                            expected_char_dim: int) -> tf.Tensor:
        """Converts molecule seq back to a char seq."""

        with tf.variable_scope("molecules_to_chars"):
            repeated = self._repeat_molecules(
                molecules,
                char_seq_length=expected_char_seq_length,
                molecule_seq_length=molecule_seq_length)

            if self.config.hidden_size == expected_char_dim:
                # If the dimensionality matches, just directly add a residual (not the
                # typical case).
                return repeated

            # Use a *slice* of the original features in order to create a residual
            # connection despite having different dimensions. This is a fairly
            # unusual (novel?) way of performing residual connections since they
            # typically assume uniform dimensionality.
            orig_features_for_residual = (
                repeated[:, :, :expected_char_dim])

            # Project molecules back to `char_dim`.
            result = bert_modeling.dense_layer_2d(
                repeated, expected_char_dim,
                bert_modeling.create_initializer(
                    self.config.initializer_range), None, "dense")
            if self._is_training:
                result = bert_modeling.dropout(result,
                                               self.config.hidden_dropout_prob)
            # Add a resnet connection from the final character stack back through
            # the molecule transformer stack for a *slice* of the features.
            return bert_modeling.layer_norm(
                _safe_add(result, orig_features_for_residual))
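
A shape sketch of the upsample-then-project path above, with hypothetical sizes; `tf.repeat` is a simplified stand-in for `_repeat_molecules` (not shown), which also has to handle the [CLS] position and the final partial block. Each molecule is repeated `downsampling_rate` times to recover the character sequence length, projected down to the character dimension, and a same-width slice of the repeated molecule features is added back as the residual.

import tensorflow as tf

batch, molecule_seq, molecule_dim, char_dim, rate = 2, 512, 768, 256, 4
molecules = tf.zeros([batch, molecule_seq, molecule_dim])
repeated = tf.repeat(molecules, repeats=rate, axis=1)    # [2, 2048, 768]
projected = tf.keras.layers.Dense(char_dim)(repeated)    # [2, 2048, 256]
residual = repeated[:, :, :char_dim]                     # first `char_dim` features
chars = projected + residual                             # [2, 2048, 256]
print(chars.shape)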
Example #5
def local_transformer_model(input_tensor: tf.Tensor,
                            attention_mask: tf.Tensor,
                            input_kv_tensor: Optional[tf.Tensor] = None,
                            init_kv_attention_mask: Optional[tf.Tensor] = None,
                            hidden_size: int = 768,
                            num_hidden_layers: int = 12,
                            num_attention_heads: int = 12,
                            intermediate_size: int = 3072,
                            intermediate_act_fn: Optional[Text] = None,
                            hidden_dropout_prob: float = 0.1,
                            attention_probs_dropout_prob: float = 0.1,
                            initializer_range: float = 0.02,
                            do_return_all_layers: bool = False,
                            num_layers_to_update: Optional[int] = None,
                            always_attend_to_first_position: bool = True,
                            first_position_attends_to_all: bool = True,
                            attend_from_chunk_width: int = 128,
                            attend_from_chunk_stride: int = 128,
                            attend_to_chunk_width: int = 128,
                            attend_to_chunk_stride: int = 128,
                            init_attend_to_chunk_width: int = 128,
                            init_attend_to_chunk_stride: int = 128):
  """Fork of BERT's `transformer_model` that performs local attention.

  This attention is local in that attention happens only within each block
  (as defined by the chunk widths and strides).

  Function parameters specific to local attention (i.e. added from BERT's
  `attention_layer`) are at the bottom of the argument list.

  IMPORTANT: Both `input_tensor` and `input_kv_tensor` must have a static
  sequence length dimension, such that it can be extracted as a python integer
  at graph-building time. Dynamic sequence lengths are not supported by
  `local_transformer_model` and `local_attention_layer` (because doing so would
  greatly limit XLA's ability to create a highly optimized program on TPU).

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    input_kv_tensor: float Tensor of shape
      [batch_size, seq_length_kv, seq_dim_kv]. If specified, this will be used
      for the initial layer of keys and values for self-attention.
      `input_tensor` will still be used for queries and resnet connections.
    init_kv_attention_mask: (optional) int32 Tensor of shape [batch_size,
      seq_length, seq_length_kv], with 1 for positions that can be attended to
      and 0 in positions that should not be. i.e. It indicates which items we
      can attend *from* in `input_tensor` (`seq_length`) and *to* in
      `input_kv_tensor` (`seq_length_kv`).
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: (optional) The non-linear activation to apply to the
      output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.
    num_layers_to_update: (optional) Number of layers to update during
      training (a `tf.stop_gradient` is applied beyond this point). This is
      useful for gradual layer unfreezing during fine-tuning to prevent
      catastrophic forgetting.
    always_attend_to_first_position: Should all blocks be able to attend to the
      `to_tensor`'s first position (e.g. a [CLS] position)?
    first_position_attends_to_all: Should the query ("from") tensor's first
      position be able to attend to all positions within the key-value tensor?
    attend_from_chunk_width: The width of each block-wise chunk in
      the query ("from") tensor.
    attend_from_chunk_stride: The number of elements to skip when moving to the
      next block in the query ("from") tensor.
    attend_to_chunk_width: The width of each block-wise chunk in the key-value
      ("to") tensor.
    attend_to_chunk_stride: The number of elements to skip when moving to the
      next block in the key-value ("to") tensor.
    init_attend_to_chunk_width: `attend_to_chunk_width` for the first layer
      when `input_kv_tensor` is specified.
    init_attend_to_chunk_stride: `attend_to_chunk_stride` for the first layer
      when `input_kv_tensor` is specified.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size] containing the
    final hidden layer of the Transformer, or a list of all layers' outputs if
    `do_return_all_layers` is True.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """

  if num_hidden_layers == 0:
    return input_tensor

  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = bert_modeling.get_shape_list(input_tensor, expected_rank=3)
  input_width = input_shape[2]

  # This is enforced as a static int in the contract above.
  from_seq_length: int = input_shape[1]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  prev_output = input_tensor
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output
      if layer_idx == 0 and input_kv_tensor is not None:
        layer_kv_input = input_kv_tensor
        layer_attention_mask = init_kv_attention_mask
        layer_attend_to_chunk_width = init_attend_to_chunk_width
        layer_attend_to_chunk_stride = init_attend_to_chunk_stride
        if init_kv_attention_mask is None:
          raise ValueError("`init_kv_attention_mask` must be specified when "
                           "`input_kv_tensor` is specified.")
      else:
        layer_kv_input = layer_input
        layer_attention_mask = attention_mask
        layer_attend_to_chunk_width = attend_to_chunk_width
        layer_attend_to_chunk_stride = attend_to_chunk_stride

      to_shape = bert_modeling.get_shape_list(layer_kv_input, expected_rank=3)
      to_seq_length: int = to_shape[1]
      assert isinstance(to_seq_length, int)

      with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("self"):
          attention_output = local_attention_layer(
              from_tensor=layer_input,  # Queries.
              to_tensor=layer_kv_input,
              from_seq_length=from_seq_length,
              to_seq_length=to_seq_length,
              attention_mask=layer_attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              # Parameters specific to local attention:
              always_attend_to_first_position=always_attend_to_first_position,
              first_position_attends_to_all=first_position_attends_to_all,
              attend_from_chunk_width=attend_from_chunk_width,
              attend_from_chunk_stride=attend_from_chunk_stride,
              attend_to_chunk_width=layer_attend_to_chunk_width,
              attend_to_chunk_stride=layer_attend_to_chunk_stride)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = bert_modeling.dense_layer_3d_proj(
              attention_output, hidden_size, num_attention_heads,
              attention_head_size,
              bert_modeling.create_initializer(initializer_range), None,
              "dense")
          attention_output = bert_modeling.dropout(attention_output,
                                                   hidden_dropout_prob)
          attention_output = bert_modeling.layer_norm(
              _safe_add(attention_output, layer_input))

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = bert_modeling.dense_layer_2d(
            attention_output, intermediate_size,
            bert_modeling.create_initializer(initializer_range),
            intermediate_act_fn, "dense")

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = bert_modeling.dense_layer_2d(
            intermediate_output, hidden_size,
            bert_modeling.create_initializer(initializer_range), None, "dense")
        layer_output = bert_modeling.dropout(layer_output, hidden_dropout_prob)
        layer_output = bert_modeling.layer_norm(
            _safe_add(layer_output, attention_output))

        if num_layers_to_update is not None:
          num_layers_remaining = num_hidden_layers - layer_idx - 1
          if num_layers_remaining == num_layers_to_update:
            layer_output = tf.stop_gradient(layer_output)

        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    return all_layer_outputs
  else:
    return all_layer_outputs[-1]
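
To make the chunking arguments concrete, here is a small standalone sketch (helper name hypothetical) of how the query ("from") and key/value ("to") sequences are carved into blocks: the from-sequence is split into chunks of `attend_from_chunk_width` every `attend_from_chunk_stride` positions, the to-sequence into chunks of `attend_to_chunk_width` every `attend_to_chunk_stride` positions, and attention is computed only between corresponding chunk pairs (plus, optionally, position 0 when `always_attend_to_first_position` is True).

def chunk_spans(seq_length, width, stride):
    # [start, end) spans; the final chunk may be narrower than `width`.
    return [(start, min(start + width, seq_length))
            for start in range(0, seq_length, stride)]


from_chunks = chunk_spans(seq_length=512, width=128, stride=128)
to_chunks = chunk_spans(seq_length=512, width=128, stride=128)
for (f0, f1), (t0, t1) in zip(from_chunks, to_chunks):
    print(f"queries [{f0}, {f1}) attend to keys/values [{t0}, {t1})")

With the default width equal to the stride, the blocks tile the sequence without overlap; a stride smaller than the width would make consecutive blocks overlap.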