Example #1
  def test_serialize_deserialize(self):
    """Validate that the ELECTRA trainer can be serialized and deserialized."""
    # Build a transformer network to use within the ELECTRA trainer. (Here, we
    # use a short sequence_length for convenience.)
    test_generator_network = networks.TransformerEncoder(
        vocab_size=100, num_layers=4, sequence_length=3)
    test_discriminator_network = networks.TransformerEncoder(
        vocab_size=100, num_layers=4, sequence_length=3)

    # Create an ELECTRA trainer with the created network. (Note that all the args
    # are different, so we can catch any serialization mismatches.)
    electra_trainer_model = electra_pretrainer.ElectraPretrainer(
        generator_network=test_generator_network,
        discriminator_network=test_discriminator_network,
        vocab_size=100,
        num_classes=2,
        sequence_length=3,
        num_token_predictions=2)

    # Create another ELECTRA trainer via serialization and deserialization.
    config = electra_trainer_model.get_config()
    new_electra_trainer_model = electra_pretrainer.ElectraPretrainer.from_config(
        config)

    # Validate that the config can be forced to JSON.
    _ = new_electra_trainer_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(electra_trainer_model.get_config(),
                        new_electra_trainer_model.get_config())
Example #2
    def test_electra_pretrainer(self):
        """Validate that the Keras object can be created."""
        # Build a transformer network to use within the ELECTRA trainer.
        vocab_size = 100
        sequence_length = 512
        test_generator_network = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=2,
            max_sequence_length=sequence_length)
        test_discriminator_network = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=2,
            max_sequence_length=sequence_length)

        # Create an ELECTRA trainer with the created network.
        num_classes = 3
        num_token_predictions = 2
        electra_trainer_model = electra_pretrainer.ElectraPretrainer(
            generator_network=test_generator_network,
            discriminator_network=test_discriminator_network,
            vocab_size=vocab_size,
            num_classes=num_classes,
            num_token_predictions=num_token_predictions,
            disallow_correct=True)

        # Create a set of 2-dimensional inputs (the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        lm_positions = tf.keras.Input(shape=(num_token_predictions, ),
                                      dtype=tf.int32)
        lm_ids = tf.keras.Input(shape=(num_token_predictions, ),
                                dtype=tf.int32)
        inputs = {
            'input_word_ids': word_ids,
            'input_mask': mask,
            'input_type_ids': type_ids,
            'masked_lm_positions': lm_positions,
            'masked_lm_ids': lm_ids
        }

        # Invoke the trainer model on the inputs. This causes the layer to be built.
        outputs = electra_trainer_model(inputs)
        lm_outs = outputs['lm_outputs']
        cls_outs = outputs['sentence_outputs']
        disc_logits = outputs['disc_logits']
        disc_label = outputs['disc_label']

        # Validate that the outputs are of the expected shape.
        expected_lm_shape = [None, num_token_predictions, vocab_size]
        expected_classification_shape = [None, num_classes]
        expected_disc_logits_shape = [None, sequence_length]
        expected_disc_label_shape = [None, sequence_length]
        self.assertAllEqual(expected_lm_shape, lm_outs.shape.as_list())
        self.assertAllEqual(expected_classification_shape,
                            cls_outs.shape.as_list())
        self.assertAllEqual(expected_disc_logits_shape,
                            disc_logits.shape.as_list())
        self.assertAllEqual(expected_disc_label_shape,
                            disc_label.shape.as_list())
Example #3
    def test_bert_trainer(self):
        """Validate that the Keras object can be created."""
        # Build a transformer network to use within the BERT trainer.
        vocab_size = 100
        sequence_length = 512
        test_network = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=2,
            sequence_length=sequence_length)

        # Create a BERT trainer with the created network.
        num_classes = 3
        bert_trainer_model = bert_token_classifier.BertTokenClassifier(
            test_network, num_classes=num_classes)

        # Create a set of 2-dimensional inputs (the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)

        # Invoke the trainer model on the inputs. This causes the layer to be built.
        sequence_outs = bert_trainer_model([word_ids, mask, type_ids])

        # Validate that the outputs are of the expected shape.
        expected_classification_shape = [None, sequence_length, num_classes]
        self.assertAllEqual(expected_classification_shape,
                            sequence_outs.shape.as_list())
    def create_lm_model(self,
                        vocab_size,
                        sequence_length,
                        hidden_size,
                        num_predictions,
                        output="predictions"):
        # First, create a transformer stack that we can use to get the LM's
        # vocabulary weight.
        xformer_stack = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=1,
            sequence_length=sequence_length,
            hidden_size=hidden_size,
            num_attention_heads=4,
        )
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        _ = xformer_stack([word_ids, mask, type_ids])

        # Create a maskedLM from the transformer stack.
        test_layer = layers.MaskedLM(
            embedding_table=xformer_stack.get_embedding_table(), output=output)

        # Create a model from the masked LM layer.
        lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
        masked_lm_positions = tf.keras.Input(shape=(num_predictions, ),
                                             dtype=tf.int32)
        output = test_layer(lm_input_tensor,
                            masked_positions=masked_lm_positions)
        return tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
    def test_bert_trainer(self):
        """Validate that the Keras object can be created."""
        # Build a transformer network to use within the BERT trainer.
        vocab_size = 100
        sequence_length = 512
        test_network = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=2,
            sequence_length=sequence_length)

        # Create a BERT trainer with the created network.
        bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network)

        # Create a set of 2-dimensional inputs (the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)

        # Invoke the trainer model on the inputs. This causes the layer to be built.
        cls_outs = bert_trainer_model([word_ids, mask, type_ids])

        # Validate that there are 2 outputs, each of the expected shape.
        self.assertEqual(2, len(cls_outs))
        expected_shape = [None, sequence_length]
        for out in cls_outs:
            self.assertAllEqual(expected_shape, out.shape.as_list())
Example #6
def _create_bert_model(cfg):
  """Creates a BERT keras core model from BERT configuration.

  Args:
    cfg: A `BertConfig` to create the core model.
  Returns:
    A TransformerEncoder network.
  """
  bert_encoder = networks.TransformerEncoder(
      vocab_size=cfg.vocab_size,
      hidden_size=cfg.hidden_size,
      num_layers=cfg.num_hidden_layers,
      num_attention_heads=cfg.num_attention_heads,
      intermediate_size=cfg.intermediate_size,
      activation=activations.gelu,
      dropout_rate=cfg.hidden_dropout_prob,
      attention_dropout_rate=cfg.attention_probs_dropout_prob,
      max_sequence_length=cfg.max_position_embeddings,
      type_vocab_size=cfg.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=cfg.initializer_range),
      embedding_width=cfg.embedding_size)

  return bert_encoder
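
A minimal usage sketch for the helper above (not part of the original source): it assumes `cfg` is an already-constructed `BertConfig` and simply wires the returned encoder to the standard (word_ids, mask, type_ids) inputs used throughout these examples.

# Hedged usage sketch, not from the original source.
# `cfg` is assumed to be a valid BertConfig; any sequence length up to
# cfg.max_position_embeddings should work for the Keras inputs.
encoder = _create_bert_model(cfg)
seq_len = cfg.max_position_embeddings
word_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
mask = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
sequence_output, pooled_output = encoder([word_ids, mask, type_ids])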
Example #7
    def test_serialize_deserialize(self):
        """Validate that the BERT trainer can be serialized and deserialized."""
        # Build a transformer network to use within the BERT trainer. (Here, we use
        # a short sequence_length for convenience.)
        test_network = networks.TransformerEncoder(vocab_size=100,
                                                   num_layers=2,
                                                   sequence_length=5)

        # Create a BERT trainer with the created network. (Note that all the args
        # are different, so we can catch any serialization mismatches.)
        bert_trainer_model = bert_token_classifier.BertTokenClassifier(
            test_network,
            num_classes=4,
            initializer='zeros',
            output='predictions')

        # Create another BERT trainer via serialization and deserialization.
        config = bert_trainer_model.get_config()
        new_bert_trainer_model = (
            bert_token_classifier.BertTokenClassifier.from_config(config))

        # Validate that the config can be forced to JSON.
        _ = new_bert_trainer_model.to_json()

        # If the serialization was successful, the new config should match the old.
        self.assertAllEqual(bert_trainer_model.get_config(),
                            new_bert_trainer_model.get_config())
def get_transformer_encoder(bert_config,
                            sequence_length):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
    sequence_length: Maximum sequence length of the training data.

  Returns:
    A networks.TransformerEncoder object.
  """
  kwargs = dict(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    kwargs['embedding_width'] = bert_config.embedding_size
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    return networks.TransformerEncoder(**kwargs)
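
A hedged usage sketch for the dispatch above (not in the original source). It reuses configs.BertConfig.from_json_file, which appears elsewhere in these examples; the JSON path is a placeholder.

# Hedged usage sketch: the helper returns an AlbertTransformerEncoder when
# given an AlbertConfig and a TransformerEncoder otherwise.
# The config path below is a placeholder, not a real file.
bert_config = configs.BertConfig.from_json_file("/path/to/bert_config.json")
encoder = get_transformer_encoder(bert_config, sequence_length=128)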
Example #9
def instantiate_from_cfg(config: BertPretrainerConfig,
                         encoder_network: Optional[tf.keras.Model] = None):
    """Instantiates a BertPretrainer from the config."""
    encoder_cfg = config.encoder
    if encoder_network is None:
        encoder_network = networks.TransformerEncoder(
            vocab_size=encoder_cfg.vocab_size,
            hidden_size=encoder_cfg.hidden_size,
            num_layers=encoder_cfg.num_layers,
            num_attention_heads=encoder_cfg.num_attention_heads,
            intermediate_size=encoder_cfg.intermediate_size,
            activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
            dropout_rate=encoder_cfg.dropout_rate,
            attention_dropout_rate=encoder_cfg.attention_dropout_rate,
            max_sequence_length=encoder_cfg.max_position_embeddings,
            type_vocab_size=encoder_cfg.type_vocab_size,
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=encoder_cfg.initializer_range))
    if config.cls_heads:
        classification_heads = [
            layers.ClassificationHead(**cfg.as_dict())
            for cfg in config.cls_heads
        ]
    else:
        classification_heads = []
    return bert_pretrainer.BertPretrainerV2(
        config.num_masked_tokens,
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=encoder_cfg.initializer_range),
        encoder_network=encoder_network,
        classification_heads=classification_heads)
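
A heavily hedged sketch of calling the factory above with a pre-built encoder network, which the encoder_network is None check allows. It assumes BertPretrainerConfig can be constructed with its default field values (num_masked_tokens, encoder, and cls_heads are all read in the function body); if the config class requires explicit values, they would need to be supplied.

# Hedged sketch, not from the original source. BertPretrainerConfig() with
# defaults is an assumption; the encoder arguments mirror the test snippets.
pretrainer_cfg = BertPretrainerConfig()
encoder = networks.TransformerEncoder(vocab_size=100, num_layers=2)
pretrainer = instantiate_from_cfg(pretrainer_cfg, encoder_network=encoder)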
Example #10
    def test_dual_encoder_tensor_call(self, hidden_size, output):
        """Validate that the Keras object can be invoked."""
        # Build a transformer network to use within the dual encoder model. (Here,
        # we use a short sequence_length for convenience.)
        sequence_length = 2
        test_network = networks.TransformerEncoder(
            vocab_size=100, num_layers=2, sequence_length=sequence_length)

        # Create a dual encoder model with the created network.
        dual_encoder_model = dual_encoder.DualEncoder(
            test_network, max_seq_length=sequence_length, output=output)

        # Create a set of 2-dimensional data tensors to feed into the model.
        word_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32)
        mask = tf.constant([[1, 1], [1, 0]], dtype=tf.int32)
        type_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32)

        # Invoke the model on the tensors. In Eager mode, this does the
        # actual calculation. (We can't validate the outputs, since the network is
        # too complex: this simply ensures we're not hitting runtime errors.)
        if output == 'logits':
            _ = dual_encoder_model(
                [word_ids, mask, type_ids, word_ids, mask, type_ids])
        elif output == 'predictions':
            _ = dual_encoder_model([word_ids, mask, type_ids])
Example #11
def _get_transformer_encoder(bert_config,
                             sequence_length,
                             float_dtype=tf.float32):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' object.
    sequence_length: Maximum sequence length of the training data.
    float_dtype: tf.dtype, tf.float32 or tf.float16.

  Returns:
    A networks.TransformerEncoder object.
  """
  return networks.TransformerEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      float_dtype=float_dtype.name)
    def test_bert_pretrainerv2(self):
        """Validate that the Keras object can be created."""
        # Build a transformer network to use within the BERT trainer.
        vocab_size = 100
        sequence_length = 512
        test_network = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=2,
            sequence_length=sequence_length)

        # Create a BERT trainer with the created network.
        bert_trainer_model = bert_pretrainer.BertPretrainerV2(
            encoder_network=test_network)
        num_token_predictions = 20
        # Create a set of 2-dimensional inputs (the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        lm_mask = tf.keras.Input(shape=(num_token_predictions, ),
                                 dtype=tf.int32)

        # Invoke the trainer model on the inputs. This causes the layer to be built.
        outputs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])

        # Validate that the outputs are of the expected shape.
        expected_lm_shape = [None, num_token_predictions, vocab_size]
        self.assertAllEqual(expected_lm_shape,
                            outputs['lm_output'].shape.as_list())
def get_transformer_encoder(bert_config,
                            sequence_length,
                            transformer_encoder_cls=None):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
    sequence_length: Maximum sequence length of the training data.
    transformer_encoder_cls: An EncoderScaffold class. If it is None, uses the
      default BERT encoder implementation.

  Returns:
    A networks.TransformerEncoder object.
  """
  if transformer_encoder_cls is not None:
    # TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
    embedding_cfg = dict(
        vocab_size=bert_config.vocab_size,
        type_vocab_size=bert_config.type_vocab_size,
        hidden_size=bert_config.hidden_size,
        seq_length=sequence_length,
        max_seq_length=bert_config.max_position_embeddings,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range),
        dropout_rate=bert_config.hidden_dropout_prob,
    )
    hidden_cfg = dict(
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_activation=tf_utils.get_activation(bert_config.hidden_act),
        dropout_rate=bert_config.hidden_dropout_prob,
        attention_dropout_rate=bert_config.attention_probs_dropout_prob,
    )
    kwargs = dict(embedding_cfg=embedding_cfg, hidden_cfg=hidden_cfg,
                  num_hidden_instances=bert_config.num_hidden_layers,)

    # Relies on gin configuration to define the Transformer encoder arguments.
    return transformer_encoder_cls(**kwargs)

  kwargs = dict(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    kwargs['embedding_width'] = bert_config.embedding_size
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    return networks.TransformerEncoder(**kwargs)
Example #14
def get_nhnet_layers(params: configs.NHNetConfig):
  """Creates a Mult-doc encoder/decoder.

  Args:
    params: ParamsDict.

  Returns:
    two keras Layers, bert_model_layer and decoder_layer
  """
  input_ids = tf.keras.layers.Input(
      shape=(None,), name="input_ids", dtype=tf.int32)
  input_mask = tf.keras.layers.Input(
      shape=(None,), name="input_mask", dtype=tf.int32)
  segment_ids = tf.keras.layers.Input(
      shape=(None,), name="segment_ids", dtype=tf.int32)
  bert_config = utils.get_bert_config_from_params(params)
  bert_model_layer = networks.TransformerEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=None,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      return_all_encoder_outputs=True,
      name="bert_encoder")
  bert_model_layer([input_ids, input_mask, segment_ids])

  input_ids = tf.keras.layers.Input(
      shape=(None, None), name="input_ids", dtype=tf.int32)
  all_encoder_outputs = tf.keras.layers.Input((None, None, params.hidden_size),
                                              dtype=tf.float32)
  target_ids = tf.keras.layers.Input(
      shape=(None,), name="target_ids", dtype=tf.int32)
  doc_attention_probs = tf.keras.layers.Input(
      (params.num_decoder_attn_heads, None, None), dtype=tf.float32)
  # pylint: disable=protected-access
  decoder_layer = decoder.Decoder(params, bert_model_layer._embedding_layer)
  # pylint: enable=protected-access
  cross_attention_bias = decoder.AttentionBias(bias_type="multi_cross")(
      input_ids)
  self_attention_bias = decoder.AttentionBias(bias_type="decoder_self")(
      target_ids)
  decoder_inputs = dict(
      attention_bias=cross_attention_bias,
      self_attention_bias=self_attention_bias,
      target_ids=target_ids,
      all_encoder_outputs=all_encoder_outputs,
      doc_attention_probs=doc_attention_probs)
  _ = decoder_layer(decoder_inputs)

  return bert_model_layer, decoder_layer
Example #15
def get_bert2bert_layers(params: configs.BERT2BERTConfig):
  """Creates a Bert2Bert stem model and returns Bert encoder/decoder.

  We use a functional style to create the stem model because we need all layers
  to be built in order to restore variables in a customized way. The layers are
  called with placeholder inputs so that they are fully built.

  Args:
    params: ParamsDict.

  Returns:
    two keras Layers, bert_model_layer and decoder_layer
  """
  input_ids = tf.keras.layers.Input(
      shape=(None,), name="input_ids", dtype=tf.int32)
  input_mask = tf.keras.layers.Input(
      shape=(None,), name="input_mask", dtype=tf.int32)
  segment_ids = tf.keras.layers.Input(
      shape=(None,), name="segment_ids", dtype=tf.int32)
  target_ids = tf.keras.layers.Input(
      shape=(None,), name="target_ids", dtype=tf.int32)
  bert_config = utils.get_bert_config_from_params(params)
  bert_model_layer = networks.TransformerEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      return_all_encoder_outputs=True,
      name="bert_encoder")
  all_encoder_outputs, _ = bert_model_layer(
      [input_ids, input_mask, segment_ids])
  # pylint: disable=protected-access
  decoder_layer = decoder.Decoder(params, bert_model_layer._embedding_layer)
  # pylint: enable=protected-access
  cross_attention_bias = decoder.AttentionBias(bias_type="single_cross")(
      input_ids)
  self_attention_bias = decoder.AttentionBias(bias_type="decoder_self")(
      target_ids)
  decoder_inputs = dict(
      attention_bias=cross_attention_bias,
      self_attention_bias=self_attention_bias,
      target_ids=target_ids,
      all_encoder_outputs=all_encoder_outputs)
  _ = decoder_layer(decoder_inputs)

  return bert_model_layer, decoder_layer
Example #16
File: tfrbert.py  Project: zanwenok/ranking
    def __init__(self,
                 context_feature_columns,
                 example_feature_columns,
                 bert_config_file,
                 bert_max_seq_length,
                 bert_output_dropout,
                 name="tfrbert",
                 **kwargs):
        """Initializes an instance of TFRBertRankingNetwork.

    Args:
      context_feature_columns: A dict containing all the context feature columns
        used by the network. Keys are feature names, and values are instances of
        classes derived from `_FeatureColumn`.
      example_feature_columns: A dict containing all the example feature columns
        used by the network. Keys are feature names, and values are instances of
        classes derived from `_FeatureColumn`.
      bert_config_file: (string) path to Bert configuration file.
      bert_max_seq_length: (int) maximum input sequence length (#words) after
        WordPiece tokenization. Sequences longer than this will be truncated,
        and sequences shorter than this will be padded.
      bert_output_dropout: When not `None`, the probability will be used as the
        dropout probability for BERT output.
      name: name of Keras network.
      **kwargs: keyword arguments.
    """
        super(TFRBertRankingNetwork,
              self).__init__(context_feature_columns=context_feature_columns,
                             example_feature_columns=example_feature_columns,
                             name=name,
                             **kwargs)

        self._bert_config_file = bert_config_file
        self._bert_max_seq_length = bert_max_seq_length
        self._bert_output_dropout = bert_output_dropout

        bert_config = configs.BertConfig.from_json_file(self._bert_config_file)
        self._bert_encoder = tfmodel_networks.TransformerEncoder(
            vocab_size=bert_config.vocab_size,
            hidden_size=bert_config.hidden_size,
            num_layers=bert_config.num_hidden_layers,
            num_attention_heads=bert_config.num_attention_heads,
            intermediate_size=bert_config.intermediate_size,
            activation=activations.gelu,
            dropout_rate=bert_config.hidden_dropout_prob,
            attention_dropout_rate=bert_config.attention_probs_dropout_prob,
            sequence_length=self._bert_max_seq_length,
            max_sequence_length=bert_config.max_position_embeddings,
            type_vocab_size=bert_config.type_vocab_size,
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=bert_config.initializer_range))

        self._dropout_layer = tf.keras.layers.Dropout(
            rate=self._bert_output_dropout)

        self._score_layer = tf.keras.layers.Dense(units=1, name="score")
Example #17
  def test_electra_pretrainer(self):
    """Validate that the Keras object can be created."""
    # Build a transformer network to use within the ELECTRA trainer.
    vocab_size = 100
    sequence_length = 512
    test_generator_network = networks.TransformerEncoder(
        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)
    test_discriminator_network = networks.TransformerEncoder(
        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)
Example #18
    def test_electra_trainer_tensor_call(self):
        """Validate that the Keras object can be invoked."""
        # Build a transformer network to use within the ELECTRA trainer. (Here, we
        # use a short sequence_length for convenience.)
        test_generator_network = networks.TransformerEncoder(vocab_size=100,
                                                             num_layers=4,
                                                             sequence_length=3)
        test_discriminator_network = networks.TransformerEncoder(
            vocab_size=100, num_layers=4, sequence_length=3)

        # Create an ELECTRA trainer with the created network.
        electra_trainer_model = electra_pretrainer.ElectraPretrainer(
            generator_network=test_generator_network,
            discriminator_network=test_discriminator_network,
            vocab_size=100,
            num_classes=2,
            sequence_length=3,
            last_hidden_dim=768,
            num_token_predictions=2)

        # Create a set of 2-dimensional data tensors to feed into the model.
        word_ids = tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.int32)
        mask = tf.constant([[1, 1, 1], [1, 0, 0]], dtype=tf.int32)
        type_ids = tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.int32)
        lm_positions = tf.constant([[0, 1], [0, 2]], dtype=tf.int32)
        lm_ids = tf.constant([[10, 20], [20, 30]], dtype=tf.int32)
        inputs = {
            'input_word_ids': word_ids,
            'input_mask': mask,
            'input_type_ids': type_ids,
            'masked_lm_positions': lm_positions,
            'masked_lm_ids': lm_ids
        }

        # Invoke the trainer model on the tensors. In Eager mode, this does the
        # actual calculation. (We can't validate the outputs, since the network is
        # too complex: this simply ensures we're not hitting runtime errors.)
        _, _, _, _ = electra_trainer_model(inputs)
Example #19
def instantiate_encoder_from_cfg(
        config: TransformerEncoderConfig) -> networks.TransformerEncoder:
    """Instantiate a Transformer encoder network from TransformerEncoderConfig."""
    encoder_network = networks.TransformerEncoder(
        vocab_size=config.vocab_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        activation=tf_utils.get_activation(config.hidden_activation),
        dropout_rate=config.dropout_rate,
        attention_dropout_rate=config.attention_dropout_rate,
        max_sequence_length=config.max_position_embeddings,
        type_vocab_size=config.type_vocab_size,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range))
    return encoder_network
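
A minimal, hedged sketch of using the factory above (not from the original source). It assumes TransformerEncoderConfig is a config object whose fields, all of which are read in the function body, have defaults and can be set as keyword arguments.

# Hedged sketch: build a tiny encoder config and instantiate the network.
# Passing these fields as constructor keywords is an assumption.
small_cfg = TransformerEncoderConfig(
    vocab_size=100,
    hidden_size=64,
    num_layers=2,
    num_attention_heads=2,
    intermediate_size=256)
small_encoder = instantiate_encoder_from_cfg(small_cfg)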
    def test_bert_trainer_named_compilation(self):
        """Validate compilation using explicit output names."""
        # Build a transformer network to use within the BERT trainer.
        vocab_size = 100
        test_network = networks.TransformerEncoder(vocab_size=vocab_size,
                                                   num_layers=2)

        # Create a BERT trainer with the created network.
        bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network)

        # Attempt to compile the model using a string-keyed dict of output names to
        # loss functions. This will validate that the outputs are named as we
        # expect.
        bert_trainer_model.compile(optimizer='sgd',
                                   loss={
                                       'start_positions': 'mse',
                                       'end_positions': 'mse'
                                   })
Example #21
    def test_dual_encoder(self, hidden_size, output):
        """Validate that the Keras object can be created."""
        # Build a transformer network to use within the dual encoder model.
        vocab_size = 100
        sequence_length = 512
        test_network = networks.TransformerEncoder(
            vocab_size=vocab_size,
            num_layers=2,
            hidden_size=hidden_size,
            sequence_length=sequence_length)

        # Create a dual encoder model with the created network.
        dual_encoder_model = dual_encoder.DualEncoder(
            test_network, max_seq_length=sequence_length, output=output)

        # Create a set of 2-dimensional inputs (the first dimension is implicit).
        left_word_ids = tf.keras.Input(shape=(sequence_length, ),
                                       dtype=tf.int32)
        left_mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        left_type_ids = tf.keras.Input(shape=(sequence_length, ),
                                       dtype=tf.int32)

        right_word_ids = tf.keras.Input(shape=(sequence_length, ),
                                        dtype=tf.int32)
        right_mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        right_type_ids = tf.keras.Input(shape=(sequence_length, ),
                                        dtype=tf.int32)

        if output == 'logits':
            outputs = dual_encoder_model([
                left_word_ids, left_mask, left_type_ids, right_word_ids,
                right_mask, right_type_ids
            ])

            left_encoded, _ = outputs
        elif output == 'predictions':
            left_encoded = dual_encoder_model(
                [left_word_ids, left_mask, left_type_ids])

            # Validate that the outputs are of the expected shape.
            expected_encoding_shape = [None, 768]
            self.assertAllEqual(expected_encoding_shape,
                                left_encoded.shape.as_list())
    def test_bert_trainer_tensor_call(self):
        """Validate that the Keras object can be invoked."""
        # Build a transformer network to use within the BERT trainer. (Here, we use
        # a short sequence_length for convenience.)
        test_network = networks.TransformerEncoder(vocab_size=100,
                                                   num_layers=2)

        # Create a BERT trainer with the created network.
        bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network)

        # Create a set of 2-dimensional data tensors to feed into the model.
        word_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32)
        mask = tf.constant([[1, 1], [1, 0]], dtype=tf.int32)
        type_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32)

        # Invoke the trainer model on the tensors. In Eager mode, this does the
        # actual calculation. (We can't validate the outputs, since the network is
        # too complex: this simply ensures we're not hitting runtime errors.)
        _ = bert_trainer_model([word_ids, mask, type_ids])
Example #23
    def test_serialize_deserialize(self):
        """Validate that the dual encoder model can be serialized / deserialized."""
        # Build a transformer network to use within the dual encoder model. (Here,
        # we use a short sequence_length for convenience.)
        sequence_length = 32
        test_network = networks.TransformerEncoder(
            vocab_size=100, num_layers=2, sequence_length=sequence_length)

        # Create a dual encoder model with the created network. (Note that all the
        # args are different, so we can catch any serialization mismatches.)
        dual_encoder_model = dual_encoder.DualEncoder(
            test_network, max_seq_length=sequence_length, output='predictions')

        # Create another dual encoder model via serialization and deserialization.
        config = dual_encoder_model.get_config()
        new_dual_encoder = dual_encoder.DualEncoder.from_config(config)

        # Validate that the config can be forced to JSON.
        _ = new_dual_encoder.to_json()

        # If the serialization was successful, the new config should match the old.
        self.assertAllEqual(dual_encoder_model.get_config(),
                            new_dual_encoder.get_config())
Example #24
def _create_bert_model(cfg):
    """Creates a BERT keras core model from BERT configuration.

  Args:
    cfg: A `BertConfig` to create the core model.
  Returns:
    A keras model.
  """
    bert_encoder = networks.TransformerEncoder(
        vocab_size=cfg.vocab_size,
        hidden_size=cfg.hidden_size,
        num_layers=cfg.num_hidden_layers,
        num_attention_heads=cfg.num_attention_heads,
        intermediate_size=cfg.intermediate_size,
        activation=activations.gelu,
        dropout_rate=cfg.hidden_dropout_prob,
        attention_dropout_rate=cfg.attention_probs_dropout_prob,
        sequence_length=cfg.max_position_embeddings,
        type_vocab_size=cfg.type_vocab_size,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=cfg.initializer_range))

    return bert_encoder
Example #25
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      embedding_width=bert_config.embedding_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    kwargs['output_range'] = output_range
    return networks.TransformerEncoder(**kwargs)


def pretrain_model(bert_config,
                   seq_length,
                   max_predictions_per_seq,
                   initializer=None,
                   use_next_sentence_label=True,
                   return_core_pretrainer_model=False):
  """Returns model to be used for pre-training.

  Args:
      bert_config: Configuration that defines the core BERT model.
      seq_length: Maximum sequence length of the training data.
      max_predictions_per_seq: Maximum number of tokens in sequence to mask out
        and use for pretraining.
Example #26
def classifier_model(bert_config,
                     float_type,
                     num_labels,
                     max_seq_length,
                     final_layer_initializer=None,
                     hub_module_url=None):
    """BERT classifier model in functional API style.

  Construct a Keras model for predicting `num_labels` outputs from an input with
  maximum sequence length `max_seq_length`.

  Args:
    bert_config: BertConfig, the config defines the core BERT model.
    float_type: dtype, tf.float32 or tf.bfloat16.
    num_labels: integer, the number of classes.
    max_seq_length: integer, the maximum input sequence length.
    final_layer_initializer: Initializer for final dense layer. Defaults to a
      TruncatedNormal initializer.
    hub_module_url: TF-Hub path/url to Bert module.

  Returns:
    Combined prediction model (words, mask, type) -> (one-hot labels)
    BERT sub-model (words, mask, type) -> (bert_outputs)
  """
    if final_layer_initializer is not None:
        initializer = final_layer_initializer
    else:
        initializer = tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range)

    if not hub_module_url:
        bert_encoder = networks.TransformerEncoder(
            vocab_size=bert_config.vocab_size,
            hidden_size=bert_config.hidden_size,
            num_layers=bert_config.num_hidden_layers,
            num_attention_heads=bert_config.num_attention_heads,
            intermediate_size=bert_config.intermediate_size,
            activation=tf_utils.get_activation('gelu'),
            dropout_rate=bert_config.hidden_dropout_prob,
            attention_dropout_rate=bert_config.attention_probs_dropout_prob,
            sequence_length=max_seq_length,
            max_sequence_length=bert_config.max_position_embeddings,
            type_vocab_size=bert_config.type_vocab_size,
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=bert_config.initializer_range))
        return bert_classifier.BertClassifier(
            bert_encoder,
            num_classes=num_labels,
            dropout_rate=bert_config.hidden_dropout_prob,
            initializer=initializer), bert_encoder

    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name='input_type_ids')
    bert_model = hub.KerasLayer(hub_module_url, trainable=True)
    pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids])
    output = tf.keras.layers.Dropout(
        rate=bert_config.hidden_dropout_prob)(pooled_output)

    output = tf.keras.layers.Dense(num_labels,
                                   kernel_initializer=initializer,
                                   name='output',
                                   dtype=float_type)(output)
    return tf.keras.Model(inputs={
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    },
                          outputs=output), bert_model
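
A hedged usage sketch for classifier_model (not from the original source). The config path is a placeholder, and the label count and sequence length are arbitrary values chosen for illustration.

# Hedged usage sketch: build the classifier without a TF-Hub module.
# The config path is a placeholder; num_labels and max_seq_length are arbitrary.
bert_config = configs.BertConfig.from_json_file("/path/to/bert_config.json")
classifier, encoder = classifier_model(
    bert_config,
    float_type=tf.float32,
    num_labels=3,
    max_seq_length=128)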