Example #1
    def create_classification_model(self, encoder_model):
        """Creates a classification model."""
        # Get the logits for the start and end predictions.
        final_hidden = encoder_model.get_sequence_output()

        batch_size, seq_length, hidden_size = (bert_modeling.get_shape_list(
            final_hidden, expected_rank=3))

        output_weights = tf.get_variable(
            "cls/tydi/output_weights", [2, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable("cls/tydi/output_bias", [2],
                                      initializer=tf.zeros_initializer())

        final_hidden_matrix = tf.reshape(
            final_hidden, [batch_size * seq_length, hidden_size])
        logits = tf.matmul(final_hidden_matrix,
                           output_weights,
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        logits = tf.reshape(logits, [batch_size, seq_length, 2])
        logits = tf.transpose(logits, [2, 0, 1])

        start_logits, end_logits = tf.unstack(logits, axis=0)

        # Get the logits for the answer type prediction.
        answer_type_output_layer = encoder_model.get_pooled_output()
        answer_type_hidden_size = bert_modeling.get_shape_list(
            answer_type_output_layer)[-1]

        num_answer_types = len(data.AnswerType)
        answer_type_output_weights = tf.get_variable(
            "answer_type_output_weights",
            [num_answer_types, answer_type_hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        answer_type_output_bias = tf.get_variable(
            "answer_type_output_bias", [num_answer_types],
            initializer=tf.zeros_initializer())

        answer_type_logits = tf.matmul(answer_type_output_layer,
                                       answer_type_output_weights,
                                       transpose_b=True)
        answer_type_logits = tf.nn.bias_add(answer_type_logits,
                                            answer_type_output_bias)

        return start_logits, end_logits, answer_type_logits
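
A minimal standalone sketch of the reshape-then-matmul scoring trick used above (assuming the TF1-style API via `tensorflow.compat.v1`; shapes are illustrative):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

batch_size, seq_length, hidden_size = 2, 8, 16
final_hidden = tf.random.normal([batch_size, seq_length, hidden_size])
output_weights = tf.get_variable(
    "demo/output_weights", [2, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))

# Flattening batch and sequence dims lets a single 2-D matmul score every
# position at once.
flat = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size])
logits = tf.reshape(tf.matmul(flat, output_weights, transpose_b=True),
                    [batch_size, seq_length, 2])
# Moving the 2-way start/end dim to the front allows unstacking along axis 0.
start_logits, end_logits = tf.unstack(tf.transpose(logits, [2, 0, 1]), axis=0)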
Example #2
    def _upsample_molecules_to_chars(self, final_char_input_seq: tf.Tensor,
                                     full_molecules: tf.Tensor) -> tf.Tensor:
        """Run a shallow/low-dim transformer to get a final character encoding."""
        _, char_seq_length, _ = bert_modeling.get_shape_list(
            final_char_input_seq)

        # `repeated_molecules`: [batch_size, char_seq_len, molecule_hidden_size]
        repeated_molecules = self._repeat_molecules(
            full_molecules, char_seq_length=char_seq_length)
        # `concat`:
        #     [batch_size, char_seq_len, molecule_hidden_size+char_hidden_final]
        concat = tf.concat([final_char_input_seq, repeated_molecules], axis=-1)

        # `upsampled`: [batch_size, char_seq_len, hidden_size]
        upsampled = tf.layers.conv1d(
            inputs=concat,
            filters=self.config.hidden_size,
            kernel_size=self.config.upsampling_kernel_size,
            strides=1,
            padding="same",
            activation=bert_modeling.get_activation(self.config.hidden_act),
            name="conv")
        upsampled = bert_modeling.layer_norm(upsampled)
        if self._is_training:
            upsampled = bert_modeling.dropout(upsampled,
                                              self.config.hidden_dropout_prob)
        return upsampled
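
`_repeat_molecules` is not shown in this snippet. A hypothetical sketch of such an upsampling repeat, assuming static shapes and that the character length is an exact multiple of the downsampling rate (the real helper may treat remainder positions differently):

import tensorflow.compat.v1 as tf

def repeat_molecules(molecules: tf.Tensor, rate: int) -> tf.Tensor:
    """Repeats each molecule `rate` times along the sequence axis (sketch)."""
    _, molecule_seq_length, dim = molecules.shape.as_list()
    # [batch, mol_seq, dim] -> [batch, mol_seq, rate, dim]
    tiled = tf.tile(tf.expand_dims(molecules, axis=2), [1, 1, rate, 1])
    # Collapse so each character position receives its molecule's encoding:
    # [batch, mol_seq * rate, dim]
    return tf.reshape(tiled, [-1, molecule_seq_length * rate, dim])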
Example #3
def local_transformer_model(input_tensor: tf.Tensor,
                            attention_mask: tf.Tensor,
                            input_kv_tensor: Optional[tf.Tensor] = None,
                            init_kv_attention_mask: Optional[tf.Tensor] = None,
                            hidden_size: int = 768,
                            num_hidden_layers: int = 12,
                            num_attention_heads: int = 12,
                            intermediate_size: int = 3072,
                            intermediate_act_fn: Optional[Text] = None,
                            hidden_dropout_prob: float = 0.1,
                            attention_probs_dropout_prob: float = 0.1,
                            initializer_range: float = 0.02,
                            do_return_all_layers: bool = False,
                            num_layers_to_update: Optional[int] = None,
                            always_attend_to_first_position: bool = True,
                            first_position_attends_to_all: bool = True,
                            attend_from_chunk_width: int = 128,
                            attend_from_chunk_stride: int = 128,
                            attend_to_chunk_width: int = 128,
                            attend_to_chunk_stride: int = 128,
                            init_attend_to_chunk_width: int = 128,
                            init_attend_to_chunk_stride: int = 128):
  """Fork of BERT's `transformer_model` that performs local attention.

  This attention is local in that attention happens only within each block
  (as defined by the chunk widths and strides).

  Function parameters specific to local attention (i.e., those added on top of
  BERT's `transformer_model` and forwarded to `local_attention_layer`) are at
  the bottom of the argument list.

  IMPORTANT: Both `input_tensor` and `input_kv_tensor` must have a static
  sequence length dimension, such that it can be extracted as a python integer
  at graph-building time. Dynamic sequence lengths are not supported by
  `local_transformer_model` and `local_attention_layer` (because doing so would
  greatly limit XLA's ability to create a highly optimized program on TPU).

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 for
      positions that should not be.
    input_kv_tensor: float Tensor of shape
      [batch_size, seq_length_kv, seq_dim_kv]. If specified, this will be used
      for the initial layer of keys and values for self-attention.
      `input_tensor` will still be used for queries and resnet connections.
    init_kv_attention_mask: (optional) int32 Tensor of shape [batch_size,
      seq_length, seq_length_kv], with 1 for positions that can be attended to
      and 0 for positions that should not be; i.e., it indicates which items
      we can attend *from* in `input_tensor` (`seq_length`) and *to* in
      `input_kv_tensor` (`seq_length_kv`).
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to return all layers or just the final
      layer.
    num_layers_to_update: (optional) Number of layers to update during
      training (a `tf.stop_gradient` is applied beyond this point). This is
      useful for gradual layer unfreezing during fine-tuning to prevent
      catastrophic forgetting.
    always_attend_to_first_position: Should all blocks be able to attend to the
      `to_tensor`'s first position (e.g. a [CLS] position)?
    first_position_attends_to_all: Should the query ("from") tensor's first
      position be able to attend to all positions within the key-value tensor?
    attend_from_chunk_width: The width of each block-wise chunk in
      the query ("from") tensor.
    attend_from_chunk_stride: The number of elements to skip when moving to the
      next block in the query ("from") tensor.
    attend_to_chunk_width: The width of each block-wise chunk in the key-value
      ("to") tensor.
    attend_to_chunk_stride: The number of elements to skip when moving to the
      next block in the key-value ("to") tensor.
    init_attend_to_chunk_width: `attend_to_chunk_width` for the first layer
      when `input_kv_tensor` is specified.
    init_attend_to_chunk_stride: `attend_to_chunk_stride` for the first layer
      when `input_kv_tensor` is specified.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer; or, if `do_return_all_layers` is True, a
    list of all layers' outputs (each of that shape).

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """

  if num_hidden_layers == 0:
    return input_tensor

  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = bert_modeling.get_shape_list(input_tensor, expected_rank=3)
  input_width = input_shape[2]

  # This is enforced as a static int in the contract above.
  from_seq_length: int = input_shape[1]

  # The Transformer adds residual connections on all layers, so the input
  # width needs to match the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  prev_output = input_tensor
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output
      if layer_idx == 0 and input_kv_tensor is not None:
        layer_kv_input = input_kv_tensor
        layer_attention_mask = init_kv_attention_mask
        layer_attend_to_chunk_width = init_attend_to_chunk_width
        layer_attend_to_chunk_stride = init_attend_to_chunk_stride
        if init_kv_attention_mask is None:
          raise ValueError("`init_kv_attention_mask` must be specified when "
                           "`input_kv_tensor` is specified.")
      else:
        layer_kv_input = layer_input
        layer_attention_mask = attention_mask
        layer_attend_to_chunk_width = attend_to_chunk_width
        layer_attend_to_chunk_stride = attend_to_chunk_stride

      to_shape = bert_modeling.get_shape_list(layer_kv_input, expected_rank=3)
      to_seq_length: int = to_shape[1]
      assert isinstance(to_seq_length, int)

      with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("self"):
          attention_output = local_attention_layer(
              from_tensor=layer_input,  # Queries.
              to_tensor=layer_kv_input,
              from_seq_length=from_seq_length,
              to_seq_length=to_seq_length,
              attention_mask=layer_attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              # Parameters specific to local attention:
              always_attend_to_first_position=always_attend_to_first_position,
              first_position_attends_to_all=first_position_attends_to_all,
              attend_from_chunk_width=attend_from_chunk_width,
              attend_from_chunk_stride=attend_from_chunk_stride,
              attend_to_chunk_width=layer_attend_to_chunk_width,
              attend_to_chunk_stride=layer_attend_to_chunk_stride)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = bert_modeling.dense_layer_3d_proj(
              attention_output, hidden_size, num_attention_heads,
              attention_head_size,
              bert_modeling.create_initializer(initializer_range), None,
              "dense")
          attention_output = bert_modeling.dropout(attention_output,
                                                   hidden_dropout_prob)
          attention_output = bert_modeling.layer_norm(
              _safe_add(attention_output, layer_input))

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = bert_modeling.dense_layer_2d(
            attention_output, intermediate_size,
            bert_modeling.create_initializer(initializer_range),
            intermediate_act_fn, "dense")

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = bert_modeling.dense_layer_2d(
            intermediate_output, hidden_size,
            bert_modeling.create_initializer(initializer_range), None, "dense")
        layer_output = bert_modeling.dropout(layer_output, hidden_dropout_prob)
        layer_output = bert_modeling.layer_norm(
            _safe_add(layer_output, attention_output))

        if num_layers_to_update is not None:
          num_layers_remaining = num_hidden_layers - layer_idx - 1
          if num_layers_remaining == num_layers_to_update:
            layer_output = tf.stop_gradient(layer_output)

        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    return all_layer_outputs
  else:
    return all_layer_outputs[-1]
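
To make the chunking parameters concrete, here is a pure-Python sketch of how block spans could be enumerated from a width and stride (illustrative only; the actual chunking is done inside `local_attention_layer`, which is not shown here):

def chunk_spans(seq_length: int, chunk_width: int, chunk_stride: int):
  """Enumerates (start, end) spans for block-wise local attention (sketch)."""
  return [(start, min(start + chunk_width, seq_length))
          for start in range(0, seq_length, chunk_stride)]

# With the defaults (width == stride == 128) and seq_length == 512, both the
# query ("from") and key-value ("to") sequences split into four disjoint
# blocks, and query block i attends only within key-value block i:
print(chunk_spans(512, 128, 128))  # [(0, 128), (128, 256), (256, 384), (384, 512)]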
Example #4
        def model_fn(features, labels, mode, params):
            """The `model_fn` for TPUEstimator."""

            del labels, params  # Unused.

            logging.info("*** Features ***")
            for name in sorted(features.keys()):
                logging.info("  name = %s, shape = %s", name,
                             features[name].shape)

            unique_ids = features["unique_ids"]
            input_ids = features["input_ids"]
            input_mask = features["input_mask"]
            segment_ids = features["segment_ids"]

            is_training = (mode == tf.estimator.ModeKeys.TRAIN)

            encoder_model = self.create_encoder_model(is_training, input_ids,
                                                      input_mask, segment_ids)
            start_logits, end_logits, answer_type_logits = (
                self.create_classification_model(encoder_model))

            tvars = tf.trainable_variables()

            initialized_variable_names = {}
            scaffold_fn = None
            if init_checkpoint:
                assignment_map, initialized_variable_names = (
                    bert_modeling.get_assignment_map_from_checkpoint(
                        tvars, init_checkpoint))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(init_checkpoint,
                                                      assignment_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)

            logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                             init_string)

            output_spec = None
            if mode == tf.estimator.ModeKeys.TRAIN:
                seq_length = bert_modeling.get_shape_list(input_ids)[1]

                # Computes the loss for positions.
                def compute_loss(logits, positions):
                    one_hot_positions = (tf.one_hot(positions,
                                                    depth=seq_length,
                                                    dtype=tf.float32))
                    log_probs = tf.nn.log_softmax(logits, axis=-1)
                    loss = -tf.reduce_mean(
                        tf.reduce_sum(one_hot_positions * log_probs, axis=-1))
                    return loss

                # Computes the loss for labels.
                def compute_label_loss(logits, labels):
                    one_hot_labels = (tf.one_hot(labels,
                                                 depth=len(data.AnswerType),
                                                 dtype=tf.float32))
                    log_probs = tf.nn.log_softmax(logits, axis=-1)
                    loss = -tf.reduce_mean(
                        tf.reduce_sum(one_hot_labels * log_probs, axis=-1))
                    return loss

                start_positions = features["start_positions"]
                end_positions = features["end_positions"]
                answer_types = features["answer_types"]

                start_loss = compute_loss(start_logits, start_positions)
                end_loss = compute_loss(end_logits, end_positions)

                answer_type_loss = compute_label_loss(answer_type_logits,
                                                      answer_types)

                total_loss = (start_loss + end_loss + answer_type_loss) / 3.0

                train_op = bert_optimization.create_optimizer(
                    total_loss, learning_rate, num_train_steps,
                    num_warmup_steps, use_tpu)

                output_spec = tf.estimator.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn)
            elif mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    "unique_ids": unique_ids,
                    "start_logits": start_logits,
                    "end_logits": end_logits,
                    "answer_type_logits": answer_type_logits,
                }
                output_spec = tf.estimator.tpu.TPUEstimatorSpec(
                    mode=mode,
                    predictions=predictions,
                    scaffold_fn=scaffold_fn)
            else:
                raise ValueError(
                    f"Only TRAIN and PREDICT modes are supported: {mode}")
            return output_spec
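
For context, a minimal sketch of how a `model_fn` like this is typically handed to `TPUEstimator` (assuming `tensorflow.compat.v1 as tf`; the model directory and batch sizes are placeholders, and with `use_tpu=False` the same spec runs on CPU/GPU):

import tensorflow.compat.v1 as tf

run_config = tf.estimator.tpu.RunConfig(
    model_dir="/tmp/tydi_model",  # Placeholder output directory.
    tpu_config=tf.estimator.tpu.TPUConfig(iterations_per_loop=1000))

estimator = tf.estimator.tpu.TPUEstimator(
    model_fn=model_fn,  # The closure defined above.
    config=run_config,
    use_tpu=False,
    train_batch_size=32,
    predict_batch_size=8)
# estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)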
Example #5
    def __init__(self,
                 config: CanineModelConfig,
                 atom_input_ids: tf.Tensor,
                 atom_input_mask: tf.Tensor,
                 atom_segment_ids: tf.Tensor,
                 is_training: bool,
                 final_seq_char_positions: Optional[tf.Tensor] = None):
        """Creates a `CanineModel`.

    This interface mirrors the `BertModel` class from the public BERT code, but
    abstracts away what type of input is passed (tokens, characters, etc.).

    A config file can be loaded like so:
    ```
    config = CanineModelConfig.from_json_file("/path/to.json")
    ```

    Args:
      config: Instance of `CanineModelConfig`.
      atom_input_ids: <int32>[batch_size, atom_seq_len] Vocabulary ids of the
        inputs.
      atom_input_mask: <int32>[batch_size, atom_seq_len] Indicates which input
        ids are non-padding.
      atom_segment_ids: <int32>[batch_size, atom_seq_len] Indicates the type of
        each feature. For a traditional BERT model with two segments, this would
        contain segment ids (0 and 1).
      is_training: Are we training? If not, disable dropout.
      final_seq_char_positions: Optional indices within each character sequence
        to be predicted by MLM. If specified, causes `get_sequence_output` to
        return only those positions, and, more importantly, when using a
        transformer for the `final_char_encoding`, only those sequence positions
        will be used as query positions for the transformer, giving a
        substantial boost in pre-training speed.
        <int32>[batch_size, max_predictions_per_seq]
    """

        self.config: CanineModelConfig = config
        self._is_training: bool = is_training

        if final_seq_char_positions is not None:
            batch_size, predictions_len = bert_modeling.get_shape_list(
                final_seq_char_positions)
            self._final_char_seq_length: tf.Tensor = predictions_len
        else:
            batch_size, char_seq_length = bert_modeling.get_shape_list(
                atom_input_ids)
            self._final_char_seq_length: tf.Tensor = char_seq_length
        self._batch_size = batch_size

        config.validate()

        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        batch_size, char_seq_length = bert_modeling.get_shape_list(
            atom_input_ids)
        del batch_size  # Unused.

        # `molecule_seq_length`: scalar int.
        molecule_seq_length = char_seq_length // config.downsampling_rate

        # Create attention masks...
        # `char_attention_mask`: <float>[batch, char_seq, char_seq]
        char_attention_mask = bert_modeling.create_attention_mask_from_input_mask(
            atom_input_ids, atom_input_mask)

        # ...for attending from deep BERT molecule stack back to initial characters:
        # `molecule_to_char_attention_mask`: <float>[batch, molecule_seq, char_seq]
        molecule_to_char_attention_mask = self.downsample_attention_mask(
            char_attention_mask, config.downsampling_rate, dim=-2)

        # ...for attending from final character encoder to deep BERT stack:
        # `char_to_molecule_attention_mask`: <float>[batch, char_seq, molecule_seq]
        char_to_molecule_attention_mask = self.downsample_attention_mask(
            char_attention_mask, config.downsampling_rate, dim=-1)

        # ...for self-attention within deep BERT molecule stack:
        # `molecule_attention_mask`: <float>[batch, molecule_seq, molecule_seq]
        molecule_attention_mask = self.downsample_attention_mask(
            molecule_to_char_attention_mask, config.downsampling_rate, dim=-1)

        # The following lines have dimensions: <float>[batch, char_seq, char_dim].
        input_char_embeddings = self._embed_chars(
            codepoints=atom_input_ids, segment_ids=atom_segment_ids)

        # Contextualize character embeddings.
        input_char_encoding = self._encode_initial_chars(
            input_char_embeddings, char_attention_mask)

        # Downsample chars to molecules.
        # The following lines have dimensions: [batch, molecule_seq, molecule_dim].
        # In this transformation, we change the dimensionality from `char_dim` to
        # `molecule_dim`, but do *NOT* add a resnet connection. Instead, we rely on
        # the resnet connections (a) from the final char transformer stack back into
        # the original char transformer stack and (b) the resnet connections from
        # the final char transformer stack back into the deep BERT stack of
        # molecules.
        #
        # Empirically, it is critical to use a powerful enough transformation here:
        # mean pooling causes training to diverge with huge gradient norms in this
        # region of the model; using a convolution here resolves this issue. From
        # this, it seems that molecules and characters require a very different
        # feature space; intuitively, this makes sense.
        with tf.variable_scope("initial_char_encoder"):
            init_molecule_encoding = self._chars_to_molecules(
                input_char_encoding,
                expected_molecule_seq_length=molecule_seq_length)

        bert_layers: Sequence[tf.Tensor] = self._bert_stack(
            molecules_in=init_molecule_encoding,
            attention_mask=molecule_attention_mask)
        bert_molecule_encoding = bert_layers[-1]

        init_output_char_encoding = input_char_encoding

        self.final_char_encoding = self._encode_final_chars(
            init_output_char_encoding,
            char_attention_mask=char_attention_mask,
            full_molecules=bert_molecule_encoding,
            char_to_molecule_attention_mask=char_to_molecule_attention_mask,
            molecule_seq_length=molecule_seq_length,
            final_seq_char_positions=final_seq_char_positions)

        # For pooling (sequence-level tasks), we use only the output of the deep
        # BERT stack since we would end up with reduced dimensionality at each
        # character position.
        self.pooled = self._pool(bert_molecule_encoding)

        self.molecule_seq_length = molecule_seq_length
        self.downsampled_layers = bert_layers
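
`downsample_attention_mask` is referenced above but not defined in this snippet. A hypothetical sketch of one way to shrink a mask along one dimension by the downsampling rate (simple subsampling; the real implementation may pool instead):

import tensorflow.compat.v1 as tf

def downsample_attention_mask(mask: tf.Tensor, rate: int, dim: int) -> tf.Tensor:
    """Keeps every `rate`-th position of a [batch, from, to] mask (sketch)."""
    if dim == -1:
        # e.g. [batch, char_seq, char_seq] -> [batch, char_seq, molecule_seq]
        return mask[:, :, ::rate]
    if dim == -2:
        # e.g. [batch, char_seq, char_seq] -> [batch, molecule_seq, char_seq]
        return mask[:, ::rate, :]
    raise ValueError(f"Unsupported dim: {dim}")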
Example #6
    def _encode_final_chars(
            self, final_char_input_seq: tf.Tensor,
            char_attention_mask: tf.Tensor, full_molecules: tf.Tensor,
            char_to_molecule_attention_mask: tf.Tensor,
            molecule_seq_length: tf.Tensor,
            final_seq_char_positions: Optional[tf.Tensor]) -> tf.Tensor:
        """Run a shallow/low-dim transformer to get a final character encoding."""

        _, char_seq_length, _ = bert_modeling.get_shape_list(
            final_char_input_seq)

        # `final_char_input_seq` is a projected version of the deep molecule BERT
        # stack with slice-wise resnet connections.
        with tf.variable_scope("final_char_encoder"):
            # `repeated_molecules`: [batch_size, char_seq_len, molecule_hidden_size]
            repeated_molecules = self._repeat_molecules(
                full_molecules,
                char_seq_length=char_seq_length,
                molecule_seq_length=molecule_seq_length)
            layers = [final_char_input_seq, repeated_molecules]
            # `concat`:
            #     [batch_size, char_seq_len, molecule_hidden_size+char_hidden_final]
            concat = tf.concat(layers, axis=-1)

            # `result`: [batch_size, char_seq_len, hidden_size]
            result = tf.layers.conv1d(
                inputs=concat,
                filters=self.config.hidden_size,
                kernel_size=self.config.upsampling_kernel_size,
                strides=1,
                padding="same",
                activation=bert_modeling.get_activation(
                    self.config.hidden_act),
                name="conv")
            result = bert_modeling.layer_norm(result)
            if self._is_training:
                result = bert_modeling.dropout(result,
                                               self.config.hidden_dropout_prob)
            final_char_seq = result

            if final_seq_char_positions is not None:
                # Limit transformer query seq and attention mask to these character
                # positions to greatly reduce the compute cost. Typically, this is just
                # done for the MLM training task.

                # `query_seq`: [batch, final_char_seq, char_dim]
                query_seq = tf.gather(final_char_seq,
                                      final_seq_char_positions,
                                      batch_dims=1)
                # `char_to_molecule_attention_mask`:
                #   [batch, final_len, molecule_seq]
                char_to_molecule_attention_mask = tf.gather(
                    char_to_molecule_attention_mask,
                    final_seq_char_positions,
                    batch_dims=1)
                char_attention_mask = tf.gather(char_attention_mask,
                                                final_seq_char_positions,
                                                batch_dims=1)
            else:
                query_seq = final_char_seq
                # `char_to_molecule_attention_mask` remains unmodified.

            return bert_modeling.transformer_model(
                input_tensor=query_seq,
                input_kv_tensor=final_char_seq,
                attention_mask=char_attention_mask,
                hidden_size=self.config.hidden_size,
                num_hidden_layers=1,
                num_attention_heads=self.config.num_attention_heads,
                intermediate_size=self.config.intermediate_size,
                intermediate_act_fn=bert_modeling.get_activation(
                    self.config.hidden_act),
                hidden_dropout_prob=self.config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    self.config.attention_probs_dropout_prob),
                initializer_range=self.config.initializer_range)
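
The `batch_dims=1` gathers above select per-example positions: row `i` of `final_seq_char_positions` indexes into sequence `i`. A small sketch of those semantics (shapes are illustrative):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

seq = tf.reshape(tf.range(2 * 4 * 3), [2, 4, 3])  # [batch=2, seq=4, dim=3]
positions = tf.constant([[0, 2], [1, 3]])         # Two positions per example.
# With `batch_dims=1`, positions[i] gathers from seq[i]; result: [2, 2, 3].
gathered = tf.gather(seq, positions, batch_dims=1)

with tf.Session() as sess:
    print(sess.run(gathered))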