def roformer_pretraining() -> cfg.ExperimentConfig:
  """BERT pretraining experiment."""
  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(enable_xla=True),
      task=masked_lm.MaskedLMConfig(
          model=bert.PretrainerConfig(
              encoder=encoders.EncoderConfig(
                  type='any', any=roformer.RoformerEncoderConfig()),
              cls_heads=[
                  bert.ClsHeadConfig(
                      inner_dim=768,
                      num_classes=2,
                      dropout_rate=0.1,
                      name='next_sentence')
              ]),
          train_data=pretrain_dataloader.BertPretrainDataConfig(
              use_v2_feature_names=True),
          validation_data=pretrain_dataloader.BertPretrainDataConfig(
              use_v2_feature_names=True, is_training=False)),
      trainer=cfg.TrainerConfig(
          optimizer_config=RoformerOptimizationConfig(), train_steps=1000000),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
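Experiment factories like the one above are normally registered with the Model Garden experiment registry so a training driver can select them by name. A minimal sketch, assuming the standard official.core.exp_factory helpers; the experiment name string here is illustrative:

from official.core import exp_factory

# Register the factory under a name (in the real modules this is usually done
# with the @exp_factory.register_config_factory(...) decorator).
exp_factory.register_config_factory('roformer/pretraining')(roformer_pretraining)

# A training driver can then recover the config by name.
experiment_config = exp_factory.get_exp_config('roformer/pretraining')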
Example #2
def bert_pretraining() -> cfg.ExperimentConfig:
    """BERT pretraining experiment."""
    config = cfg.ExperimentConfig(
        task=masked_lm.MaskedLMConfig(
            train_data=pretrain_dataloader.BertPretrainDataConfig(),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                is_training=False)),
        trainer=cfg.TrainerConfig(
            train_steps=1000000,
            optimizer_config=optimization.OptimizationConfig({
                'optimizer': {
                    'type': 'adamw',
                    'adamw': {
                        'weight_decay_rate': 0.01,
                        'exclude_from_weight_decay':
                            ['LayerNorm', 'layer_norm', 'bias'],
                    }
                },
                'learning_rate': {
                    'type': 'polynomial',
                    'polynomial': {
                        'initial_learning_rate': 1e-4,
                        'end_learning_rate': 0.0,
                    }
                },
                'warmup': {
                    'type': 'polynomial'
                }
            })),
        restrictions=[
            'task.train_data.is_training != None',
            'task.validation_data.is_training != None'
        ])
    return config
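The restrictions list is enforced when the config is validated: both data configs must have is_training explicitly set (non-None). A minimal sketch of overriding and validating the returned config, assuming the override() and validate() helpers inherited from the Model Garden base Config class; paths are placeholders:

config = bert_pretraining()

config.override({
    'task': {
        'train_data': {'input_path': '/path/to/train*.tf_record'},
        'validation_data': {'input_path': '/path/to/eval*.tf_record'},
    },
    'trainer': {'train_steps': 100000},
}, is_strict=True)

# Raises if a restriction such as 'task.train_data.is_training != None' fails.
config.validate()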
 def setUp(self):
   super(ProgressiveMaskedLMTest, self).setUp()
   self.task_config = progressive_masked_lm.ProgMaskedLMConfig(
       model=bert.PretrainerConfig(
           encoder=encoders.EncoderConfig(
               bert=encoders.BertEncoderConfig(vocab_size=30522,
                                               num_layers=2)),
           cls_heads=[
               bert.ClsHeadConfig(
                   inner_dim=10, num_classes=2, name="next_sentence")
           ]),
       train_data=pretrain_dataloader.BertPretrainDataConfig(
           input_path="dummy",
           max_predictions_per_seq=20,
           seq_length=128,
           global_batch_size=1),
       validation_data=pretrain_dataloader.BertPretrainDataConfig(
           input_path="dummy",
           max_predictions_per_seq=20,
           seq_length=128,
           global_batch_size=1),
       stage_list=[
           progressive_masked_lm.StackingStageConfig(
               num_layers=1, num_steps=4),
           progressive_masked_lm.StackingStageConfig(
               num_layers=2, num_steps=8),
           ],
       )
   self.exp_config = cfg.ExperimentConfig(
       task=self.task_config,
       trainer=prog_trainer_lib.ProgressiveTrainerConfig())
Example #4
def bert_pretraining() -> cfg.ExperimentConfig:
    """BERT pretraining experiment."""
    config = cfg.ExperimentConfig(
        task=masked_lm.MaskedLMConfig(
            train_data=pretrain_dataloader.BertPretrainDataConfig(),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                is_training=False)),
        trainer=_TRAINER,
        restrictions=[
            'task.train_data.is_training != None',
            'task.validation_data.is_training != None'
        ])
    return config
Example #5
def get_exp_config():
    """Get ExperimentConfig."""
    params = cfg.ExperimentConfig(
        task=distillation.BertDistillationTaskConfig(
            train_data=pretrain_dataloader.BertPretrainDataConfig(),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                is_training=False)),
        trainer=prog_trainer_lib.ProgressiveTrainerConfig(
            progressive=distillation.BertDistillationProgressiveConfig(),
            optimizer_config=optimization_config,
            train_steps=740000,
            checkpoint_interval=20000))

    return config_override(params, FLAGS)
Example #6
def teams_pretrain() -> cfg.ExperimentConfig:
  """TEAMS pretraining."""
  config = cfg.ExperimentConfig(
      task=teams_task.TeamsPretrainTaskConfig(
          train_data=pretrain_dataloader.BertPretrainDataConfig(),
          validation_data=pretrain_dataloader.BertPretrainDataConfig(
              is_training=False)),
      trainer=cfg.TrainerConfig(
          optimizer_config=TeamsOptimizationConfig(), train_steps=1000000),
      restrictions=[
          "task.train_data.is_training != None",
          "task.validation_data.is_training != None"
      ])
  return config
  def test_task(self, num_shared_hidden_layers,
                num_task_agnostic_layers):
    config = teams_task.TeamsPretrainTaskConfig(
        model=teams.TeamsPretrainerConfig(
            generator=encoders.BertEncoderConfig(
                vocab_size=30522, num_layers=2),
            discriminator=encoders.BertEncoderConfig(
                vocab_size=30522, num_layers=2),
            num_shared_generator_hidden_layers=num_shared_hidden_layers,
            num_discriminator_task_agnostic_layers=num_task_agnostic_layers,
        ),
        train_data=pretrain_dataloader.BertPretrainDataConfig(
            input_path="dummy",
            max_predictions_per_seq=20,
            seq_length=128,
            global_batch_size=1))
    task = teams_task.TeamsPretrainTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
    dataset = task.build_inputs(config.train_data)

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)
    def test_task(self):
        config = electra_task.ElectraPretrainConfig(
            model=electra.ElectraPretrainerConfig(
                generator_encoder=encoders.EncoderConfig(
                    bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                    num_layers=1)),
                discriminator_encoder=encoders.EncoderConfig(
                    bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                    num_layers=1)),
                num_masked_tokens=20,
                sequence_length=128,
                cls_heads=[
                    bert.ClsHeadConfig(inner_dim=10,
                                       num_classes=2,
                                       name="next_sentence")
                ]),
            train_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path="dummy",
                max_predictions_per_seq=20,
                seq_length=128,
                global_batch_size=1))
        task = electra_task.ElectraPretrainTask(config)
        model = task.build_model()
        metrics = task.build_metrics()
        dataset = task.build_inputs(config.train_data)

        iterator = iter(dataset)
        optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
        task.train_step(next(iterator), model, optimizer, metrics=metrics)
        task.validation_step(next(iterator), model, metrics=metrics)
  def test_v2_feature_names(self):
    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
    seq_length = 128
    max_predictions_per_seq = 20
    _create_fake_bert_dataset(
        train_data_path,
        seq_length,
        max_predictions_per_seq,
        use_next_sentence_label=True,
        use_position_id=False,
        use_v2_feature_names=True)
    data_config = pretrain_dataloader.BertPretrainDataConfig(
        input_path=train_data_path,
        max_predictions_per_seq=max_predictions_per_seq,
        seq_length=seq_length,
        global_batch_size=10,
        is_training=True,
        use_next_sentence_label=True,
        use_position_id=False,
        use_v2_feature_names=True)

    dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load()
    features = next(iter(dataset))
    self.assertIn("input_word_ids", features)
    self.assertIn("input_mask", features)
    self.assertIn("input_type_ids", features)
    self.assertIn("masked_lm_positions", features)
    self.assertIn("masked_lm_ids", features)
    self.assertIn("masked_lm_weights", features)
Example #10
class StackingStageConfig(base_config.Config):
    num_steps: int = 0
    warmup_steps: int = 10000
    initial_learning_rate: float = 1e-4
    end_learning_rate: float = 0.0
    decay_steps: int = 1000000
    override_num_layers: Optional[int] = None

    small_encoder_config: Optional[
        ecfg.SmallEncoderConfig] = ecfg.SmallEncoderConfig()
    override_train_data: Optional[
        pretrain_dataloader.BertPretrainDataConfig] = (
            pretrain_dataloader.BertPretrainDataConfig())
    override_valid_data: Optional[
        pretrain_dataloader.BertPretrainDataConfig] = (
            pretrain_dataloader.BertPretrainDataConfig())
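Each StackingStageConfig describes one stage of progressive stacking: how many steps to run, the stage's warmup and polynomial-decay schedule, and optional per-stage overrides for the encoder depth and the train/eval data. A small usage sketch with illustrative values; the field semantics are inferred from the names above:

stage_list = [
    # Early stage: train a shallower encoder for the first chunk of steps.
    StackingStageConfig(
        num_steps=100000,
        warmup_steps=10000,
        initial_learning_rate=1e-4,
        override_num_layers=6),
    # Final stage: no layer override, continue with the full-depth encoder.
    StackingStageConfig(
        num_steps=900000,
        warmup_steps=10000,
        initial_learning_rate=1e-4),
]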
Example #11
    def test_load_data(self, use_next_sentence_label, use_position_id):
        train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
        seq_length = 128
        max_predictions_per_seq = 20
        _create_fake_dataset(train_data_path,
                             seq_length,
                             max_predictions_per_seq,
                             use_next_sentence_label=use_next_sentence_label,
                             use_position_id=use_position_id)
        data_config = pretrain_dataloader.BertPretrainDataConfig(
            input_path=train_data_path,
            max_predictions_per_seq=max_predictions_per_seq,
            seq_length=seq_length,
            global_batch_size=10,
            is_training=True,
            use_next_sentence_label=use_next_sentence_label,
            use_position_id=use_position_id)

        dataset = pretrain_dataloader.BertPretrainDataLoader(
            data_config).load()
        features = next(iter(dataset))
        self.assertLen(features,
                       6 + int(use_next_sentence_label) + int(use_position_id))
        self.assertIn("input_word_ids", features)
        self.assertIn("input_mask", features)
        self.assertIn("input_type_ids", features)
        self.assertIn("masked_lm_positions", features)
        self.assertIn("masked_lm_ids", features)
        self.assertIn("masked_lm_weights", features)

        self.assertEqual("next_sentence_labels" in features,
                         use_next_sentence_label)
        self.assertEqual("position_ids" in features, use_position_id)
  def test_task(self):
    config = masked_lm.MaskedLMConfig(
        init_checkpoint=self.get_temp_dir(),
        scale_loss=True,
        model=bert.PretrainerConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                num_layers=1)),
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name="next_sentence")
            ]),
        train_data=pretrain_dataloader.BertPretrainDataConfig(
            input_path="dummy",
            max_predictions_per_seq=20,
            seq_length=128,
            global_batch_size=1))
    task = masked_lm.MaskedLMTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
    dataset = task.build_inputs(config.train_data)

    iterator = iter(dataset)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

    # Saves a checkpoint.
    ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
    ckpt.save(config.init_checkpoint)
    task.initialize(model)
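The last three lines exercise warm-starting: a checkpoint is written from model.checkpoint_items and task.initialize(model) then reloads it from config.init_checkpoint. A minimal sketch of the same save/restore round trip with plain TF APIs; the prefix path is a placeholder:

import tensorflow as tf

ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
save_path = ckpt.save('/tmp/mlm_pretrain_ckpt')  # placeholder prefix

# Restore into a (re)built model; objects missing from the checkpoint are
# tolerated via expect_partial().
status = tf.train.Checkpoint(
    model=model, **model.checkpoint_items).restore(save_path)
status.expect_partial()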
    def test_task_determinism(self):
        config = masked_lm.MaskedLMConfig(
            init_checkpoint=self.get_temp_dir(),
            scale_loss=True,
            model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(bert=encoders.BertEncoderConfig(
                    vocab_size=30522, num_layers=1)),
                cls_heads=[
                    bert.ClsHeadConfig(inner_dim=10,
                                       num_classes=2,
                                       name="next_sentence")
                ]),
            train_data=pretrain_dataloader.BertPretrainDataConfig(
                max_predictions_per_seq=20,
                seq_length=128,
                global_batch_size=1))

        tf.keras.utils.set_random_seed(1)
        logs1, validation_logs1, weights1 = self._build_and_run_model(config)
        tf.keras.utils.set_random_seed(1)
        logs2, validation_logs2, weights2 = self._build_and_run_model(config)

        self.assertEqual(logs1["loss"], logs2["loss"])
        self.assertEqual(validation_logs1["loss"], validation_logs2["loss"])
        for weight1, weight2 in zip(weights1, weights2):
            self.assertAllEqual(weight1, weight2)
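tf.keras.utils.set_random_seed pins the Python, NumPy and TensorFlow seeds, which is what makes the two runs above comparable. For bit-exact results across all ops, one would typically also enable deterministic kernels, as sketched below (available in TF 2.8+):

import tensorflow as tf

tf.keras.utils.set_random_seed(1)
tf.config.experimental.enable_op_determinism()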
Example #14
class MaskedLMConfig(cfg.TaskConfig):
    """The model config."""
    init_checkpoint: str = ''
    model: bert.PretrainerConfig = bert.PretrainerConfig(
        cls_heads=[
            bert.ClsHeadConfig(inner_dim=768,
                               num_classes=2,
                               dropout_rate=0.1,
                               name='next_sentence')
        ],
        encoder=encoders.EncoderConfig(bert=encoders.BertEncoderConfig()))
    scale_loss: bool = False
    train_data: pretrain_dataloader.BertPretrainDataConfig = (
        pretrain_dataloader.BertPretrainDataConfig())
    small_train_data: pretrain_dataloader.BertPretrainDataConfig = (
        pretrain_dataloader.BertPretrainDataConfig())
    validation_data: pretrain_dataloader.BertPretrainDataConfig = (
        pretrain_dataloader.BertPretrainDataConfig())
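Because every field above has a default, a usable task config mostly needs input paths. A small sketch, with placeholder paths and sequence/batch settings borrowed from other examples in this listing:

task_config = MaskedLMConfig(
    train_data=pretrain_dataloader.BertPretrainDataConfig(
        input_path='/path/to/train*.tf_record',  # placeholder
        seq_length=512,
        max_predictions_per_seq=76,
        global_batch_size=512),
    validation_data=pretrain_dataloader.BertPretrainDataConfig(
        input_path='/path/to/eval*.tf_record',  # placeholder
        is_training=False,
        seq_length=512,
        max_predictions_per_seq=76,
        global_batch_size=512))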
Example #15
def get_exp_config():
    """Get ExperimentConfig."""

    params = cfg.ExperimentConfig(
        task=masked_lm.MaskedLMConfig(
            train_data=pretrain_dataloader.BertPretrainDataConfig(),
            small_train_data=pretrain_dataloader.BertPretrainDataConfig(),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                is_training=False)),
        trainer=prog_trainer_lib.ProgressiveTrainerConfig(
            progressive=masked_lm.ProgStackingConfig(),
            optimizer_config=BertOptimizationConfig(),
            train_steps=1000000),
        restrictions=[
            'task.train_data.is_training != None',
            'task.validation_data.is_training != None'
        ])

    return utils.config_override(params, FLAGS)
def token_drop_bert_pretraining() -> cfg.ExperimentConfig:
    """BERT pretraining with token dropping."""
    config = cfg.ExperimentConfig(
        runtime=cfg.RuntimeConfig(enable_xla=True),
        task=masked_lm.TokenDropMaskedLMConfig(
            model=bert.PretrainerConfig(encoder=encoders.EncoderConfig(
                any=encoder_config.TokenDropBertEncoderConfig(
                    vocab_size=30522, num_layers=1, token_keep_k=64),
                type='any')),
            train_data=pretrain_dataloader.BertPretrainDataConfig(),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                is_training=False)),
        trainer=cfg.TrainerConfig(
            train_steps=1000000,
            optimizer_config=optimization.OptimizationConfig({
                'optimizer': {
                    'type': 'adamw',
                    'adamw': {
                        'weight_decay_rate': 0.01,
                        'exclude_from_weight_decay':
                            ['LayerNorm', 'layer_norm', 'bias'],
                    }
                },
                'learning_rate': {
                    'type': 'polynomial',
                    'polynomial': {
                        'initial_learning_rate': 1e-4,
                        'end_learning_rate': 0.0,
                    }
                },
                'warmup': {
                    'type': 'polynomial'
                }
            })),
        restrictions=[
            'task.train_data.is_training != None',
            'task.validation_data.is_training != None'
        ])
    return config
Example #17
def bert_dynamic() -> cfg.ExperimentConfig:
    """BERT base with dynamic input sequences.

  TPU needs to run with tf.data service with round-robin behavior.
  """
    config = cfg.ExperimentConfig(
        task=masked_lm.MaskedLMConfig(
            train_data=pretrain_dynamic_dataloader.BertPretrainDataConfig(),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                is_training=False)),
        trainer=_TRAINER,
        restrictions=[
            'task.train_data.is_training != None',
            'task.validation_data.is_training != None'
        ])
    return config
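The dynamic-sequence variant pairs this experiment with the bucketing dataloader exercised in the last example below. A minimal data-config sketch using the seq_bucket_lengths field shown there; the path and bucket values are illustrative:

train_data = pretrain_dynamic_dataloader.BertPretrainDataConfig(
    input_path='/path/to/train*.tf_record',  # placeholder
    is_training=True,
    global_batch_size=512,
    seq_bucket_lengths=[64, 128, 256, 512])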
Example #18
    def prepare_config(self, teacher_block_num, student_block_num,
                       transfer_teacher_layers):
        # using small model for testing
        task_config = distillation.BertDistillationTaskConfig(
            teacher_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=teacher_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='gelu'),
            student_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=student_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='relu'),
            train_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10))

        # set only 1 step for each stage
        progressive_config = distillation.BertDistillationProgressiveConfig()
        progressive_config.layer_wise_distill_config.transfer_teacher_layers = (
            transfer_teacher_layers)
        progressive_config.layer_wise_distill_config.num_steps = 1
        progressive_config.pretrain_distill_config.num_steps = 1

        optimization_config = optimization.OptimizationConfig(
            optimizer=optimization.OptimizerConfig(
                type='lamb',
                lamb=optimization.LAMBConfig(weight_decay_rate=0.0001,
                                             exclude_from_weight_decay=[
                                                 'LayerNorm', 'layer_norm',
                                                 'bias', 'no_norm'
                                             ])),
            learning_rate=optimization.LrConfig(
                type='polynomial',
                polynomial=optimization.PolynomialLrConfig(
                    initial_learning_rate=1.5e-3,
                    decay_steps=10000,
                    end_learning_rate=1.5e-3)),
            warmup=optimization.WarmupConfig(
                type='linear',
                linear=optimization.LinearWarmupConfig(
                    warmup_learning_rate=0)))

        exp_config = cfg.ExperimentConfig(
            task=task_config,
            trainer=prog_trainer_lib.ProgressiveTrainerConfig(
                progressive=progressive_config,
                optimizer_config=optimization_config))

        # Create a teacher model checkpoint.
        teacher_encoder = encoders.build_encoder(
            task_config.teacher_model.encoder)
        pretrainer_config = task_config.teacher_model
        if pretrainer_config.cls_heads:
            teacher_cls_heads = [
                layers.ClassificationHead(**cfg.as_dict())
                for cfg in pretrainer_config.cls_heads
            ]
        else:
            teacher_cls_heads = []

        masked_lm = layers.MobileBertMaskedLM(
            embedding_table=teacher_encoder.get_embedding_table(),
            activation=tf_utils.get_activation(
                pretrainer_config.mlm_activation),
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=pretrainer_config.mlm_initializer_range),
            name='cls/predictions')
        teacher_pretrainer = models.BertPretrainerV2(
            encoder_network=teacher_encoder,
            classification_heads=teacher_cls_heads,
            customized_masked_lm=masked_lm)

        # The model variables will be created after the forward call.
        _ = teacher_pretrainer(teacher_pretrainer.inputs)
        teacher_pretrainer_ckpt = tf.train.Checkpoint(
            **teacher_pretrainer.checkpoint_items)
        teacher_ckpt_path = os.path.join(self.get_temp_dir(),
                                         'teacher_model.ckpt')
        teacher_pretrainer_ckpt.save(teacher_ckpt_path)
        exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir()

        return exp_config
    def test_distribution_strategy(self, distribution_strategy):
        max_seq_length = 128
        batch_size = 8
        input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
        _create_fake_dataset(input_path,
                             seq_length=60,
                             num_masked_tokens=20,
                             max_seq_length=max_seq_length,
                             num_examples=batch_size)
        data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
            is_training=False,
            input_path=input_path,
            seq_bucket_lengths=[64, 128],
            global_batch_size=batch_size)
        dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
            data_config)
        distributed_ds = orbit.utils.make_distributed_dataset(
            distribution_strategy, dataloader.load)
        train_iter = iter(distributed_ds)
        with distribution_strategy.scope():
            config = masked_lm.MaskedLMConfig(
                init_checkpoint=self.get_temp_dir(),
                model=bert.PretrainerConfig(
                    encoders.EncoderConfig(bert=encoders.BertEncoderConfig(
                        vocab_size=30522, num_layers=1)),
                    cls_heads=[
                        bert.ClsHeadConfig(inner_dim=10,
                                           num_classes=2,
                                           name='next_sentence')
                    ]),
                train_data=data_config)
            task = masked_lm.MaskedLMTask(config)
            model = task.build_model()
            metrics = task.build_metrics()

        @tf.function
        def step_fn(features):
            return task.validation_step(features, model, metrics=metrics)

        distributed_outputs = distribution_strategy.run(
            step_fn, args=(next(train_iter), ))
        local_results = tf.nest.map_structure(
            distribution_strategy.experimental_local_results,
            distributed_outputs)
        logging.info('Dynamic padding:  local_results= %s', str(local_results))
        dynamic_metrics = {}
        for metric in metrics:
            dynamic_metrics[metric.name] = metric.result()

        data_config = pretrain_dataloader.BertPretrainDataConfig(
            is_training=False,
            input_path=input_path,
            seq_length=max_seq_length,
            max_predictions_per_seq=20,
            global_batch_size=batch_size)
        dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config)
        distributed_ds = orbit.utils.make_distributed_dataset(
            distribution_strategy, dataloader.load)
        train_iter = iter(distributed_ds)
        with distribution_strategy.scope():
            metrics = task.build_metrics()

        @tf.function
        def step_fn_b(features):
            return task.validation_step(features, model, metrics=metrics)

        distributed_outputs = distribution_strategy.run(
            step_fn_b, args=(next(train_iter), ))
        local_results = tf.nest.map_structure(
            distribution_strategy.experimental_local_results,
            distributed_outputs)
        logging.info('Static padding:  local_results= %s', str(local_results))
        static_metrics = {}
        for metric in metrics:
            static_metrics[metric.name] = metric.result()
        for key in static_metrics:
            # We need to investigate the differences in the losses.
            if key != 'next_sentence_loss':
                self.assertEqual(dynamic_metrics[key], static_metrics[key])