Example #1
def instantiate_from_cfg(config: BertPretrainerConfig,
                         encoder_network: Optional[tf.keras.Model] = None):
    """Instantiates a BertPretrainer from the config."""
    encoder_cfg = config.encoder
    if encoder_network is None:
        encoder_network = networks.TransformerEncoder(
            vocab_size=encoder_cfg.vocab_size,
            hidden_size=encoder_cfg.hidden_size,
            num_layers=encoder_cfg.num_layers,
            num_attention_heads=encoder_cfg.num_attention_heads,
            intermediate_size=encoder_cfg.intermediate_size,
            activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
            dropout_rate=encoder_cfg.dropout_rate,
            attention_dropout_rate=encoder_cfg.attention_dropout_rate,
            max_sequence_length=encoder_cfg.max_position_embeddings,
            type_vocab_size=encoder_cfg.type_vocab_size,
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=encoder_cfg.initializer_range))
    if config.cls_heads:
        classification_heads = [
            layers.ClassificationHead(**cfg.as_dict())
            for cfg in config.cls_heads
        ]
    else:
        classification_heads = []
    return bert_pretrainer.BertPretrainerV2(
        config.num_masked_tokens,
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=encoder_cfg.initializer_range),
        encoder_network=encoder_network,
        classification_heads=classification_heads)
Example #2
def _build_pretrainer(
        config: electra.ElectraPretrainerConfig) -> models.ElectraPretrainer:
    """Instantiates ElectraPretrainer from the config."""
    generator_encoder_cfg = config.generator_encoder
    discriminator_encoder_cfg = config.discriminator_encoder
    # Copy discriminator's embeddings to generator for easier model serialization.
    discriminator_network = encoders.build_encoder(discriminator_encoder_cfg)
    if config.tie_embeddings:
        embedding_layer = discriminator_network.get_embedding_layer()
        generator_network = encoders.build_encoder(
            generator_encoder_cfg, embedding_layer=embedding_layer)
    else:
        generator_network = encoders.build_encoder(generator_encoder_cfg)

    generator_encoder_cfg = generator_encoder_cfg.get()
    return models.ElectraPretrainer(
        generator_network=generator_network,
        discriminator_network=discriminator_network,
        vocab_size=generator_encoder_cfg.vocab_size,
        num_classes=config.num_classes,
        sequence_length=config.sequence_length,
        num_token_predictions=config.num_masked_tokens,
        mlm_activation=tf_utils.get_activation(
            generator_encoder_cfg.hidden_activation),
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=generator_encoder_cfg.initializer_range),
        classification_heads=[
            layers.ClassificationHead(**cfg.as_dict())
            for cfg in config.cls_heads
        ],
        disallow_correct=config.disallow_correct)
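A minimal usage sketch for the builder above. Only the ElectraPretrainerConfig fields that _build_pretrainer reads are assumed, and the encoder sizes, head settings, and the bert/encoders config classes (which appear in the later examples) are illustrative placeholders rather than values from the original source.

# Hedged sketch: field names mirror what _build_pretrainer reads above;
# the concrete values and encoder sizes are illustrative only.
electra_cfg = electra.ElectraPretrainerConfig(
    num_masked_tokens=76,
    sequence_length=512,
    num_classes=2,
    tie_embeddings=True,
    disallow_correct=False,
    generator_encoder=encoders.EncoderConfig(
        type='bert',
        bert=encoders.BertEncoderConfig(hidden_size=128, num_layers=3)),
    discriminator_encoder=encoders.EncoderConfig(
        type='bert',
        bert=encoders.BertEncoderConfig(hidden_size=128, num_layers=12)),
    cls_heads=[
        bert.ClsHeadConfig(inner_dim=128, num_classes=2, name='next_sentence')
    ])
electra_pretrainer = _build_pretrainer(electra_cfg)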
Example #3
  def __init__(
      self,
      network: Union[tf.keras.layers.Layer, tf.keras.Model],
      num_classes: int,
      initializer: tf.keras.initializers.Initializer = 'random_normal',
      summary_type: str = 'last',
      dropout_rate: float = 0.1,
      **kwargs):
    super().__init__(**kwargs)
    self._network = network
    self._initializer = initializer
    self._summary_type = summary_type
    self._num_classes = num_classes
    self._config = {
        'network': network,
        'initializer': initializer,
        'num_classes': num_classes,
        'summary_type': summary_type,
        'dropout_rate': dropout_rate,
    }

    if summary_type == 'last':
      cls_token_idx = -1
    elif summary_type == 'first':
      cls_token_idx = 0
    else:
      raise ValueError('Invalid summary type provided: %s.' % summary_type)

    self.classifier = layers.ClassificationHead(
        inner_dim=network.get_config()['inner_size'],
        num_classes=num_classes,
        initializer=initializer,
        dropout_rate=dropout_rate,
        cls_token_idx=cls_token_idx,
        name='sentence_prediction')
Example #4
    def build_small_model(self, model_cfg):
        encoder_cfg = model_cfg['encoder']['bert']
        dataconf = self.task_config.train_data
        encoder_network = small_encoder_lib.TransformerEncoder(
            vocab_size=encoder_cfg['vocab_size'],
            hidden_size=encoder_cfg['hidden_size'],
            num_layers=encoder_cfg['num_layers'],
            num_attention_heads=encoder_cfg['num_attention_heads'],
            intermediate_size=encoder_cfg['intermediate_size'],
            activation=tf_utils.get_activation(
                encoder_cfg['hidden_activation']),
            dropout_rate=encoder_cfg['dropout_rate'],
            attention_dropout_rate=encoder_cfg['attention_dropout_rate'],
            max_sequence_length=encoder_cfg['max_position_embeddings'],
            type_vocab_size=encoder_cfg['type_vocab_size'],
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=encoder_cfg['initializer_range']),
            net2net_ratio=encoder_cfg['net2net_ratio'],
            net2net_layers=encoder_cfg['net2net_layers'],
            lightatt_layers=encoder_cfg['lightatt_layers'],
            input_pool_name=encoder_cfg['input_pool_name'],
            input_pool_size=encoder_cfg['input_pool_size'])
        sequence_length = dataconf.seq_length
        predict_length = dataconf.max_predictions_per_seq
        dummy_inputs = dict(
            input_mask=tf.zeros((1, sequence_length), dtype=tf.int32),
            input_positions=tf.zeros((1, sequence_length), dtype=tf.int32),
            input_type_ids=tf.zeros((1, sequence_length), dtype=tf.int32),
            input_word_ids=tf.zeros((1, sequence_length), dtype=tf.int32),
            masked_lm_positions=tf.zeros((1, predict_length), dtype=tf.int32),
            masked_input_ids=tf.zeros((1, predict_length), dtype=tf.int32),
            masked_segment_ids=tf.zeros((1, predict_length), dtype=tf.int32),
            masked_lm_weights=tf.zeros((1, predict_length), dtype=tf.float32))
        _ = encoder_network(dummy_inputs)

        if 'cls_heads' in model_cfg:
            classification_heads = [
                layers.ClassificationHead(**cfg)
                for cfg in model_cfg['cls_heads']
            ]
        else:
            classification_heads = []
        model = small_pretrainer.BertPretrainModel(
            mlm_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=encoder_cfg['initializer_range']),
            mlm_activation=tf_utils.get_activation(
                encoder_cfg['hidden_activation']),
            encoder_network=encoder_network,
            classification_heads=classification_heads)
        _ = model(dummy_inputs)
        return model
Example #5
def instantiate_classification_heads_from_cfgs(
    cls_head_configs: List[bert.ClsHeadConfig]
) -> List[layers.ClassificationHead]:
  if cls_head_configs:
    return [
        layers.ClassificationHead(**cfg.as_dict()) for cfg in cls_head_configs
    ]
  else:
    return []
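A short usage sketch for the helper above, reusing the bert.ClsHeadConfig fields that appear elsewhere in these examples; the values are illustrative, not from the original source.

# Illustrative head config; field names follow bert.ClsHeadConfig as used in
# the other examples in this listing.
cls_head_cfgs = [
    bert.ClsHeadConfig(
        inner_dim=256, num_classes=2, dropout_rate=0.1, name='next_sentence')
]
heads = instantiate_classification_heads_from_cfgs(cls_head_cfgs)
# An empty (or None) config list takes the else branch and yields no heads.
assert instantiate_classification_heads_from_cfgs([]) == []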
Example #6
    def __init__(self,
                 generator_network,
                 discriminator_network,
                 vocab_size,
                 num_classes,
                 sequence_length,
                 num_token_predictions,
                 mlm_activation=None,
                 mlm_initializer='glorot_uniform',
                 output_type='logits',
                 disallow_correct=False,
                 **kwargs):
        super(ElectraPretrainer, self).__init__()
        self._config = {
            'generator_network': generator_network,
            'discriminator_network': discriminator_network,
            'vocab_size': vocab_size,
            'num_classes': num_classes,
            'sequence_length': sequence_length,
            'num_token_predictions': num_token_predictions,
            'mlm_activation': mlm_activation,
            'mlm_initializer': mlm_initializer,
            'output_type': output_type,
            'disallow_correct': disallow_correct,
        }
        for k, v in kwargs.items():
            self._config[k] = v

        self.generator_network = generator_network
        self.discriminator_network = discriminator_network
        self.vocab_size = vocab_size
        self.num_classes = num_classes
        self.sequence_length = sequence_length
        self.num_token_predictions = num_token_predictions
        self.mlm_activation = mlm_activation
        self.mlm_initializer = mlm_initializer
        self.output_type = output_type
        self.disallow_correct = disallow_correct
        self.masked_lm = layers.MaskedLM(
            embedding_table=generator_network.get_embedding_table(),
            activation=mlm_activation,
            initializer=mlm_initializer,
            output=output_type,
            name='generator_masked_lm')
        self.classification = layers.ClassificationHead(
            inner_dim=generator_network._config_dict['hidden_size'],
            num_classes=num_classes,
            initializer=mlm_initializer,
            name='generator_classification_head')
        self.discriminator_projection = tf.keras.layers.Dense(
            units=discriminator_network._config_dict['hidden_size'],
            activation=mlm_activation,
            kernel_initializer=mlm_initializer,
            name='discriminator_projection_head')
        self.discriminator_head = tf.keras.layers.Dense(
            units=1, kernel_initializer=mlm_initializer)
Example #7
    def __init__(self,
                 network,
                 num_classes,
                 initializer='glorot_uniform',
                 dropout_rate=0.1,
                 use_encoder_pooler=True,
                 **kwargs):
        self._self_setattr_tracking = False
        self._network = network
        self._config = {
            'network': network,
            'num_classes': num_classes,
            'initializer': initializer,
            'use_encoder_pooler': use_encoder_pooler,
        }

        # We want to use the inputs of the passed network as the inputs to this
        # Model. To do this, we need to keep a handle to the network inputs for use
        # when we construct the Model object at the end of init.
        inputs = network.inputs

        if use_encoder_pooler:
            # Because we have a copy of inputs to create this Model object, we can
            # invoke the Network object with its own input tensors to start the Model.
            outputs = network(inputs)
            if isinstance(outputs, list):
                cls_output = outputs[1]
            else:
                cls_output = outputs['pooled_output']
            cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)

            self.classifier = networks.Classification(
                input_width=cls_output.shape[-1],
                num_classes=num_classes,
                initializer=initializer,
                output='logits',
                name='sentence_prediction')
            predictions = self.classifier(cls_output)
        else:
            outputs = network(inputs)
            if isinstance(outputs, list):
                sequence_output = outputs[0]
            else:
                sequence_output = outputs['sequence_output']
            self.classifier = layers.ClassificationHead(
                inner_dim=sequence_output.shape[-1],
                num_classes=num_classes,
                initializer=initializer,
                dropout_rate=dropout_rate,
                name='sentence_prediction')
            predictions = self.classifier(sequence_output)

        super(BertClassifier, self).__init__(inputs=inputs,
                                             outputs=predictions,
                                             **kwargs)
Example #8
    def test_copy_pooler_dense_to_encoder(self):
        encoder_config = encoders.EncoderConfig(
            type="bert",
            bert=encoders.BertEncoderConfig(hidden_size=24,
                                            intermediate_size=48,
                                            num_layers=2))
        cls_heads = [
            layers.ClassificationHead(inner_dim=24,
                                      num_classes=2,
                                      name="next_sentence")
        ]
        encoder = encoders.build_encoder(encoder_config)
        pretrainer = models.BertPretrainerV2(
            encoder_network=encoder,
            classification_heads=cls_heads,
            mlm_activation=tf_utils.get_activation(
                encoder_config.get().hidden_activation))
        # Makes sure the pretrainer variables are created.
        _ = pretrainer(pretrainer.inputs)
        checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
        model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
        checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

        vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
            self.get_temp_dir(), use_sp_model=True)
        export_path = os.path.join(self.get_temp_dir(), "hub")
        export_tfhub_lib.export_model(
            export_path=export_path,
            encoder_config=encoder_config,
            model_checkpoint_path=tf.train.latest_checkpoint(
                model_checkpoint_dir),
            with_mlm=True,
            copy_pooler_dense_to_encoder=True,
            vocab_file=vocab_file,
            sp_model_file=sp_model_file,
            do_lower_case=True)
        # Restores a hub KerasLayer.
        hub_layer = hub.KerasLayer(export_path, trainable=True)
        dummy_ids = np.zeros((2, 10), dtype=np.int32)
        input_dict = dict(input_word_ids=dummy_ids,
                          input_mask=dummy_ids,
                          input_type_ids=dummy_ids)
        hub_pooled_output = hub_layer(input_dict)["pooled_output"]
        encoder_outputs = encoder(input_dict)
        # Verify that hub_layer's pooled_output is the same as the output of next
        # sentence prediction's dense layer.
        pretrained_pooled_output = cls_heads[0].dense(
            (encoder_outputs["sequence_output"][:, 0, :]))
        self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
        # But the pooled_output between encoder and hub_layer are not the same.
        encoder_pooled_output = encoder_outputs["pooled_output"]
        self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
Example #9
  def build_model(self, params=None):
    config = params or self.task_config.model
    encoder_cfg = config.encoder
    encoder_network = self._build_encoder(encoder_cfg)
    cls_heads = [
        layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads
    ] if config.cls_heads else []
    return models.BertPretrainerV2(
        mlm_activation=tf_utils.get_activation(config.mlm_activation),
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.mlm_initializer_range),
        encoder_network=encoder_network,
        classification_heads=cls_heads)
Example #10
  def _build_pretrainer(self, pretrainer_cfg: bert.PretrainerConfig, name: str):
    """Builds pretrainer from config and encoder."""
    encoder = encoders.build_encoder(pretrainer_cfg.encoder)
    if pretrainer_cfg.cls_heads:
      cls_heads = [
          layers.ClassificationHead(**cfg.as_dict())
          for cfg in pretrainer_cfg.cls_heads
      ]
    else:
      cls_heads = []

    masked_lm = layers.MobileBertMaskedLM(
        embedding_table=encoder.get_embedding_table(),
        activation=tf_utils.get_activation(pretrainer_cfg.mlm_activation),
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=pretrainer_cfg.mlm_initializer_range),
        name='cls/predictions')

    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        classification_heads=cls_heads,
        customized_masked_lm=masked_lm,
        name=name)
    return pretrainer
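A hedged call sketch for _build_pretrainer above, mirroring the bert.PretrainerConfig shape used in Example #12; `task` stands in for an instance of the class that defines the method, and all field values are placeholders.

# Hypothetical config and task instance; values mirror Example #12 and are
# not taken from the original source.
student_cfg = bert.PretrainerConfig(
    encoder=encoders.EncoderConfig(
        type='mobilebert',
        mobilebert=encoders.MobileBertEncoderConfig(num_blocks=4)),
    cls_heads=[
        bert.ClsHeadConfig(inner_dim=256, num_classes=2, name='next_sentence')
    ],
    mlm_activation='gelu')
student_pretrainer = task._build_pretrainer(student_cfg, name='student')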
Example #11
  def __init__(self,
               network,
               num_classes,
               initializer='glorot_uniform',
               dropout_rate=0.1,
               use_encoder_pooler=True,
               cls_head=None,
               **kwargs):
    self.num_classes = num_classes
    self.initializer = initializer
    self.use_encoder_pooler = use_encoder_pooler
    self.cls_head = cls_head

    # We want to use the inputs of the passed network as the inputs to this
    # Model. To do this, we need to keep a handle to the network inputs for use
    # when we construct the Model object at the end of init.
    inputs = network.inputs

    if use_encoder_pooler:
      # Because we have a copy of inputs to create this Model object, we can
      # invoke the Network object with its own input tensors to start the Model.
      outputs = network(inputs)
      if isinstance(outputs, list):
        cls_inputs = outputs[1]
      else:
        cls_inputs = outputs['pooled_output']
      cls_inputs = tf.keras.layers.Dropout(rate=dropout_rate)(cls_inputs)
    else:
      outputs = network(inputs)
      if isinstance(outputs, list):
        cls_inputs = outputs[0]
      else:
        cls_inputs = outputs['sequence_output']

    if cls_head:
      classifier = cls_head
    else:
      classifier = layers.ClassificationHead(
          inner_dim=0 if use_encoder_pooler else cls_inputs.shape[-1],
          num_classes=num_classes,
          initializer=initializer,
          dropout_rate=dropout_rate,
          name='sentence_prediction')

    predictions = classifier(cls_inputs)

    # b/164516224
    # Once we've created the network using the Functional API, we call
    # super().__init__ as though we were invoking the Functional API Model
    # constructor, resulting in this object having all the properties of a model
    # created using the Functional API. Once super().__init__ is called, we
    # can assign attributes to `self` - note that all `self` assignments are
    # below this line.
    super(BertClassifier, self).__init__(
        inputs=inputs, outputs=predictions, **kwargs)
    self._network = network
    config_dict = self._make_config_dict()
    # We are storing the config dict as a namedtuple here to ensure checkpoint
    # compatibility with an earlier version of this model which did not track
    # the config dict attribute. TF does not track immutable attrs which
    # do not contain Trackables, so by creating a config namedtuple instead of
    # a dict we avoid tracking it.
    config_cls = collections.namedtuple('Config', config_dict.keys())
    self._config = config_cls(**config_dict)
    self.classifier = classifier
Example #12
    def prepare_config(self, teacher_block_num, student_block_num,
                       transfer_teacher_layers):
        # using small model for testing
        task_config = distillation.BertDistillationTaskConfig(
            teacher_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=teacher_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='gelu'),
            student_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=student_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='relu'),
            train_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10))

        # set only 1 step for each stage
        progressive_config = distillation.BertDistillationProgressiveConfig()
        progressive_config.layer_wise_distill_config.transfer_teacher_layers = (
            transfer_teacher_layers)
        progressive_config.layer_wise_distill_config.num_steps = 1
        progressive_config.pretrain_distill_config.num_steps = 1

        optimization_config = optimization.OptimizationConfig(
            optimizer=optimization.OptimizerConfig(
                type='lamb',
                lamb=optimization.LAMBConfig(weight_decay_rate=0.0001,
                                             exclude_from_weight_decay=[
                                                 'LayerNorm', 'layer_norm',
                                                 'bias', 'no_norm'
                                             ])),
            learning_rate=optimization.LrConfig(
                type='polynomial',
                polynomial=optimization.PolynomialLrConfig(
                    initial_learning_rate=1.5e-3,
                    decay_steps=10000,
                    end_learning_rate=1.5e-3)),
            warmup=optimization.WarmupConfig(
                type='linear',
                linear=optimization.LinearWarmupConfig(
                    warmup_learning_rate=0)))

        exp_config = cfg.ExperimentConfig(
            task=task_config,
            trainer=prog_trainer_lib.ProgressiveTrainerConfig(
                progressive=progressive_config,
                optimizer_config=optimization_config))

        # Create a teacher model checkpoint.
        teacher_encoder = encoders.build_encoder(
            task_config.teacher_model.encoder)
        pretrainer_config = task_config.teacher_model
        if pretrainer_config.cls_heads:
            teacher_cls_heads = [
                layers.ClassificationHead(**cfg.as_dict())
                for cfg in pretrainer_config.cls_heads
            ]
        else:
            teacher_cls_heads = []

        masked_lm = layers.MobileBertMaskedLM(
            embedding_table=teacher_encoder.get_embedding_table(),
            activation=tf_utils.get_activation(
                pretrainer_config.mlm_activation),
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=pretrainer_config.mlm_initializer_range),
            name='cls/predictions')
        teacher_pretrainer = models.BertPretrainerV2(
            encoder_network=teacher_encoder,
            classification_heads=teacher_cls_heads,
            customized_masked_lm=masked_lm)

        # The model variables will be created after the forward call.
        _ = teacher_pretrainer(teacher_pretrainer.inputs)
        teacher_pretrainer_ckpt = tf.train.Checkpoint(
            **teacher_pretrainer.checkpoint_items)
        teacher_ckpt_path = os.path.join(self.get_temp_dir(),
                                         'teacher_model.ckpt')
        teacher_pretrainer_ckpt.save(teacher_ckpt_path)
        exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir()

        return exp_config
Example #13
    self.num_token_predictions = num_token_predictions
    self.mlm_activation = mlm_activation
    self.mlm_initializer = mlm_initializer
    self.output_type = output_type
    self.disallow_correct = disallow_correct
    self.masked_lm = layers.MaskedLM(
        embedding_table=generator_network.get_embedding_table(),
        activation=mlm_activation,
        initializer=mlm_initializer,
        output=output_type,
        name='generator_masked_lm')
    self.classification = layers.ClassificationHead(
        inner_dim=generator_network._config_dict['hidden_size'],
        num_classes=num_classes,
        initializer=mlm_initializer,
        name='generator_classification_head')
    self.discriminator_projection = tf.keras.layers.Dense(
        units=discriminator_network._config_dict['hidden_size'],
        activation=mlm_activation,
        kernel_initializer=mlm_initializer,
        name='discriminator_projection_head')
    self.discriminator_head = tf.keras.layers.Dense(
        units=1, kernel_initializer=mlm_initializer)

  def call(self, inputs):
    """ELECTRA forward pass.

    Args:
      inputs: A dict of all inputs, same as the standard BERT model.