Example #1
@dataclasses.dataclass
class ProgMaskedLMConfig(masked_lm.MaskedLMConfig):
    """The progressive model config."""
    optimizer_config: optimization.OptimizationConfig = (
        optimization.OptimizationConfig(
            optimizer=optimization.OptimizerConfig(type='adamw'),
            learning_rate=optimization.LrConfig(type='polynomial'),
            warmup=optimization.WarmupConfig(type='polynomial'),
        ))
    stage_list: List[StackingStageConfig] = dataclasses.field(
        default_factory=lambda: [  # pylint: disable=g-long-lambda
            StackingStageConfig(num_layers=3,
                                num_steps=112500,
                                warmup_steps=10000,
                                initial_learning_rate=1e-4,
                                end_learning_rate=1e-4,
                                decay_steps=112500),
            StackingStageConfig(num_layers=6,
                                num_steps=112500,
                                warmup_steps=10000,
                                initial_learning_rate=1e-4,
                                end_learning_rate=1e-4,
                                decay_steps=112500),
            StackingStageConfig(num_layers=12,
                                num_steps=450000,
                                warmup_steps=10000,
                                initial_learning_rate=1e-4,
                                end_learning_rate=0.0,
                                decay_steps=450000)
        ])
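These configs are hyperparams dataclasses, so they can be instantiated with their defaults and serialized for inspection or overrides. A minimal sketch (not part of the original snippet), assuming the ProgMaskedLMConfig class above:

# Instantiate with defaults and walk the three stacking stages (3 -> 6 -> 12 layers).
config = ProgMaskedLMConfig()
for stage in config.stage_list:
    print(stage.num_layers, stage.num_steps, stage.initial_learning_rate)
# The whole config can be dumped to a plain dict (e.g. for YAML-style overrides).
print(config.as_dict())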
Example #2
@dataclasses.dataclass
class LaBSEOptimizationConfig(optimization.OptimizationConfig):
    """LaBSE optimization config."""
    # AdamWeightDecay, PolynomialLr and PolynomialWarmupConfig below are
    # module-level aliases for the corresponding optimization.*Config classes
    # in the original source file.
    optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
        type="adamw", adamw=AdamWeightDecay())
    learning_rate: optimization.LrConfig = optimization.LrConfig(
        type="polynomial",
        polynomial=PolynomialLr(initial_learning_rate=1e-4,
                                decay_steps=1000000,
                                end_learning_rate=0.0))
    warmup: optimization.WarmupConfig = optimization.WarmupConfig(
        type="polynomial",
        polynomial=PolynomialWarmupConfig(warmup_steps=10000))
Example #3
import dataclasses

from official.modeling import optimization


@dataclasses.dataclass
class OptimizerParams(optimization.OptimizationConfig):
    """Optimizer parameters for MobileBERT-EdgeTPU."""
    optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
        type='adamw',
        adamw=optimization.AdamWeightDecayConfig(
            weight_decay_rate=0.01,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']))
    learning_rate: optimization.LrConfig = optimization.LrConfig(
        type='polynomial',
        polynomial=optimization.PolynomialLrConfig(initial_learning_rate=1e-4,
                                                   decay_steps=1000000,
                                                   end_learning_rate=0.0))
    warmup: optimization.WarmupConfig = optimization.WarmupConfig(
        type='polynomial',
        polynomial=optimization.PolynomialWarmupConfig(warmup_steps=10000))
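In the Model Garden, an OptimizationConfig such as the ones above is normally consumed through optimization.OptimizerFactory, which builds the learning-rate schedule and the Keras optimizer from it. A minimal sketch, assuming the OptimizerParams class from Example #3:

from official.modeling import optimization

opt_config = OptimizerParams()
opt_factory = optimization.OptimizerFactory(opt_config)
# Polynomial decay wrapped with the configured warmup, then the AdamW optimizer.
lr_schedule = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(lr_schedule)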
Example #4
@dataclasses.dataclass
class BertOptimizationConfig(optimization.OptimizationConfig):
    """Bert optimization config."""
    optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
        type='adamw',
        adamw=AdamWeightDecay(
            weight_decay_rate=0.01,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']))
    learning_rate: optimization.LrConfig = optimization.LrConfig(
        type='polynomial',
        polynomial=PolynomialLr(initial_learning_rate=1e-4,
                                decay_steps=1000000,
                                end_learning_rate=0.0))
    warmup: optimization.WarmupConfig = optimization.WarmupConfig(
        type='polynomial',
        polynomial=PolynomialWarmupConfig(warmup_steps=10000))
Example #5
@dataclasses.dataclass
class TeamsOptimizationConfig(optimization.OptimizationConfig):
  """TEAMS optimization config."""
  optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
      type="adamw",
      adamw=AdamWeightDecay(
          weight_decay_rate=0.01,
          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
          epsilon=1e-6))
  learning_rate: optimization.LrConfig = optimization.LrConfig(
      type="polynomial",
      polynomial=PolynomialLr(
          initial_learning_rate=1e-4,
          decay_steps=1000000,
          end_learning_rate=0.0))
  warmup: optimization.WarmupConfig = optimization.WarmupConfig(
      type="polynomial", polynomial=PolynomialWarmupConfig(warmup_steps=10000))
Example #6
    def prepare_config(self, teacher_block_num, student_block_num,
                       transfer_teacher_layers):
        # using small model for testing
        task_config = distillation.BertDistillationTaskConfig(
            teacher_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=teacher_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='gelu'),
            student_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=student_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='relu'),
            train_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10))

        # set only 1 step for each stage
        progressive_config = distillation.BertDistillationProgressiveConfig()
        progressive_config.layer_wise_distill_config.transfer_teacher_layers = (
            transfer_teacher_layers)
        progressive_config.layer_wise_distill_config.num_steps = 1
        progressive_config.pretrain_distill_config.num_steps = 1

        optimization_config = optimization.OptimizationConfig(
            optimizer=optimization.OptimizerConfig(
                type='lamb',
                lamb=optimization.LAMBConfig(weight_decay_rate=0.0001,
                                             exclude_from_weight_decay=[
                                                 'LayerNorm', 'layer_norm',
                                                 'bias', 'no_norm'
                                             ])),
            learning_rate=optimization.LrConfig(
                type='polynomial',
                polynomial=optimization.PolynomialLrConfig(
                    initial_learning_rate=1.5e-3,
                    decay_steps=10000,
                    end_learning_rate=1.5e-3)),
            warmup=optimization.WarmupConfig(
                type='linear',
                linear=optimization.LinearWarmupConfig(
                    warmup_learning_rate=0)))

        exp_config = cfg.ExperimentConfig(
            task=task_config,
            trainer=prog_trainer_lib.ProgressiveTrainerConfig(
                progressive=progressive_config,
                optimizer_config=optimization_config))

        # Create a teacher model checkpoint.
        teacher_encoder = encoders.build_encoder(
            task_config.teacher_model.encoder)
        pretrainer_config = task_config.teacher_model
        if pretrainer_config.cls_heads:
            teacher_cls_heads = [
                layers.ClassificationHead(**cfg.as_dict())
                for cfg in pretrainer_config.cls_heads
            ]
        else:
            teacher_cls_heads = []

        masked_lm = layers.MobileBertMaskedLM(
            embedding_table=teacher_encoder.get_embedding_table(),
            activation=tf_utils.get_activation(
                pretrainer_config.mlm_activation),
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=pretrainer_config.mlm_initializer_range),
            name='cls/predictions')
        teacher_pretrainer = models.BertPretrainerV2(
            encoder_network=teacher_encoder,
            classification_heads=teacher_cls_heads,
            customized_masked_lm=masked_lm)

        # The model variables will be created after the forward call.
        _ = teacher_pretrainer(teacher_pretrainer.inputs)
        teacher_pretrainer_ckpt = tf.train.Checkpoint(
            **teacher_pretrainer.checkpoint_items)
        teacher_ckpt_path = os.path.join(self.get_temp_dir(),
                                         'teacher_model.ckpt')
        teacher_pretrainer_ckpt.save(teacher_ckpt_path)
        exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir()

        return exp_config
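A hypothetical call site for prepare_config (the argument values are made up for illustration; the actual test body is not part of this snippet):

# Hypothetical usage inside the test case; the chosen layer counts are illustrative.
exp_config = self.prepare_config(
    teacher_block_num=3,
    student_block_num=3,
    transfer_teacher_layers=(0, 1, 2))
self.assertEqual(
    exp_config.task.teacher_model.encoder.mobilebert.num_blocks, 3)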
Example #7
from official.core import config_definitions as cfg
from official.core import train_utils
from official.modeling import hyperparams
from official.modeling import optimization
from official.modeling import performance
from official.modeling.progressive import train_lib
from official.modeling.progressive import trainer as prog_trainer_lib
from official.nlp.data import pretrain_dataloader
from official.nlp.projects.mobilebert import distillation

FLAGS = flags.FLAGS

optimization_config = optimization.OptimizationConfig(
    optimizer=optimization.OptimizerConfig(
        type='lamb',
        lamb=optimization.LAMBConfig(
            weight_decay_rate=0.01,
            exclude_from_weight_decay=['LayerNorm', 'bias', 'norm'],
            clipnorm=1.0)),
    learning_rate=optimization.LrConfig(
        type='polynomial',
        polynomial=optimization.PolynomialLrConfig(
            initial_learning_rate=1.5e-3,
            decay_steps=10000,
            end_learning_rate=1.5e-3)),
    warmup=optimization.WarmupConfig(
        type='linear',
        linear=optimization.LinearWarmupConfig(warmup_learning_rate=0)))


# Copied from progressive/utils.py due to the private visibility issue.
def config_override(params, flags_obj):