class BertDistillationTaskConfig(cfg.TaskConfig): """Defines the teacher/student model architecture and training data.""" teacher_model: bert.PretrainerConfig = bert.PretrainerConfig( encoder=encoders.EncoderConfig(type='mobilebert')) student_model: bert.PretrainerConfig = bert.PretrainerConfig( encoder=encoders.EncoderConfig(type='mobilebert')) # The path to the teacher model checkpoint or its directory. teacher_model_init_checkpoint: str = '' train_data: cfg.DataConfig = cfg.DataConfig() validation_data: cfg.DataConfig = cfg.DataConfig()
def test_task(self): config = masked_lm.MaskedLMConfig( init_checkpoint=self.get_temp_dir(), scale_loss=True, model=bert.PretrainerConfig( encoder=encoders.EncoderConfig( bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig( inner_dim=10, num_classes=2, name="next_sentence") ]), train_data=pretrain_dataloader.BertPretrainDataConfig( input_path="dummy", max_predictions_per_seq=20, seq_length=128, global_batch_size=1)) task = masked_lm.MaskedLMTask(config) model = task.build_model() metrics = task.build_metrics() dataset = task.build_inputs(config.train_data) iterator = iter(dataset) optimizer = tf.keras.optimizers.SGD(lr=0.1) task.train_step(next(iterator), model, optimizer, metrics=metrics) task.validation_step(next(iterator), model, metrics=metrics) # Saves a checkpoint. ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items) ckpt.save(config.init_checkpoint) task.initialize(model)
def get_exp_config(): return cfg.ExperimentConfig( task=cfg.TaskConfig(model=bert.PretrainerConfig()), trainer=trainer_lib.ProgressiveTrainerConfig( export_checkpoint=True, export_checkpoint_interval=1, export_only_final_stage_ckpt=False))
def setUp(self): super(ProgressiveMaskedLMTest, self).setUp() self.task_config = progressive_masked_lm.ProgMaskedLMConfig( model=bert.PretrainerConfig( encoder=encoders.EncoderConfig( bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=2)), cls_heads=[ bert.ClsHeadConfig( inner_dim=10, num_classes=2, name="next_sentence") ]), train_data=pretrain_dataloader.BertPretrainDataConfig( input_path="dummy", max_predictions_per_seq=20, seq_length=128, global_batch_size=1), validation_data=pretrain_dataloader.BertPretrainDataConfig( input_path="dummy", max_predictions_per_seq=20, seq_length=128, global_batch_size=1), stage_list=[ progressive_masked_lm.StackingStageConfig( num_layers=1, num_steps=4), progressive_masked_lm.StackingStageConfig( num_layers=2, num_steps=8), ], ) self.exp_config = cfg.ExperimentConfig( task=self.task_config, trainer=prog_trainer_lib.ProgressiveTrainerConfig())
def roformer_pretraining() -> cfg.ExperimentConfig: """BERT pretraining experiment.""" config = cfg.ExperimentConfig( runtime=cfg.RuntimeConfig(enable_xla=True), task=masked_lm.MaskedLMConfig( model=bert.PretrainerConfig( encoder=encoders.EncoderConfig( type='any', any=roformer.RoformerEncoderConfig()), cls_heads=[ bert.ClsHeadConfig( inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') ]), train_data=pretrain_dataloader.BertPretrainDataConfig( use_v2_feature_names=True), validation_data=pretrain_dataloader.BertPretrainDataConfig( use_v2_feature_names=True, is_training=False)), trainer=cfg.TrainerConfig( optimizer_config=RoformerOptimizationConfig(), train_steps=1000000), restrictions=[ 'task.train_data.is_training != None', 'task.validation_data.is_training != None' ]) return config
def test_task(self): config = sentence_prediction.SentencePredictionConfig( init_checkpoint=self.get_temp_dir(), model=self.get_model_config(2), train_data=self._train_data_config) task = sentence_prediction.SentencePredictionTask(config) model = task.build_model() metrics = task.build_metrics() dataset = task.build_inputs(config.train_data) iterator = iter(dataset) optimizer = tf.keras.optimizers.SGD(lr=0.1) task.train_step(next(iterator), model, optimizer, metrics=metrics) task.validation_step(next(iterator), model, metrics=metrics) # Saves a checkpoint. pretrain_cfg = bert.PretrainerConfig( encoder=encoders.EncoderConfig( bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig( inner_dim=10, num_classes=3, name="next_sentence") ]) pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg) ckpt = tf.train.Checkpoint( model=pretrain_model, **pretrain_model.checkpoint_items) ckpt.save(config.init_checkpoint) task.initialize(model)
def test_task_determinism(self): config = masked_lm.MaskedLMConfig( init_checkpoint=self.get_temp_dir(), scale_loss=True, model=bert.PretrainerConfig( encoder=encoders.EncoderConfig(bert=encoders.BertEncoderConfig( vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig(inner_dim=10, num_classes=2, name="next_sentence") ]), train_data=pretrain_dataloader.BertPretrainDataConfig( max_predictions_per_seq=20, seq_length=128, global_batch_size=1)) tf.keras.utils.set_random_seed(1) logs1, validation_logs1, weights1 = self._build_and_run_model(config) tf.keras.utils.set_random_seed(1) logs2, validation_logs2, weights2 = self._build_and_run_model(config) self.assertEqual(logs1["loss"], logs2["loss"]) self.assertEqual(validation_logs1["loss"], validation_logs2["loss"]) for weight1, weight2 in zip(weights1, weights2): self.assertAllEqual(weight1, weight2)
def test_task(self, init_cls_pooler): # Saves a checkpoint. pretrain_cfg = bert.PretrainerConfig( encoder=encoders.EncoderConfig( bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig( inner_dim=768, num_classes=2, name="next_sentence") ]) pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg) # The model variables will be created after the forward call. _ = pretrain_model(pretrain_model.inputs) ckpt = tf.train.Checkpoint( model=pretrain_model, **pretrain_model.checkpoint_items) init_path = ckpt.save(self.get_temp_dir()) # Creates the task. config = sentence_prediction.SentencePredictionConfig( init_checkpoint=init_path, model=self.get_model_config(num_classes=2), train_data=self._train_data_config, init_cls_pooler=init_cls_pooler) task = sentence_prediction.SentencePredictionTask(config) model = task.build_model() metrics = task.build_metrics() dataset = task.build_inputs(config.train_data) iterator = iter(dataset) optimizer = tf.keras.optimizers.SGD(lr=0.1) task.initialize(model) task.train_step(next(iterator), model, optimizer, metrics=metrics) task.validation_step(next(iterator), model, metrics=metrics)
class MaskedLMConfig(cfg.TaskConfig): """The model config.""" model: bert.PretrainerConfig = bert.PretrainerConfig(cls_heads=[ bert.ClsHeadConfig(inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') ]) train_data: cfg.DataConfig = cfg.DataConfig() validation_data: cfg.DataConfig = cfg.DataConfig()
class MaskedLMConfig(cfg.TaskConfig): """The model config.""" model: bert.PretrainerConfig = bert.PretrainerConfig(cls_heads=[ bert.ClsHeadConfig( inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') ]) # TODO(b/154564893): Mathematically, scale_loss should be True. # However, it works better with scale_loss being False. scale_loss: bool = False train_data: cfg.DataConfig = cfg.DataConfig() validation_data: cfg.DataConfig = cfg.DataConfig()
def test_masked_lm(self, use_v2_feature_names): if use_v2_feature_names: input_word_ids_field = "input_word_ids" input_type_ids_field = "input_type_ids" else: input_word_ids_field = "input_ids" input_type_ids_field = "segment_ids" config = masked_lm.MaskedLMConfig(model=bert.PretrainerConfig( encoder=encoders.EncoderConfig(bert=encoders.BertEncoderConfig( vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig( inner_dim=10, num_classes=2, name="next_sentence") ])) task = masked_lm.MaskedLMTask(config) model = task.build_model() params = serving_modules.MaskedLM.Params( parse_sequence_length=10, max_predictions_per_seq=5, use_v2_feature_names=use_v2_feature_names) export_module = serving_modules.MaskedLM(params=params, model=model) functions = export_module.get_inference_signatures({ "serve": "serving_default", "serve_examples": "serving_examples" }) self.assertSameElements(functions.keys(), ["serving_default", "serving_examples"]) dummy_ids = tf.ones((10, 10), dtype=tf.int32) dummy_pos = tf.ones((10, 5), dtype=tf.int32) outputs = functions["serving_default"](input_word_ids=dummy_ids, input_mask=dummy_ids, input_type_ids=dummy_ids, masked_lm_positions=dummy_pos) self.assertEqual(outputs["classification"].shape, (10, 2)) dummy_ids = tf.ones((10, ), dtype=tf.int32) dummy_pos = tf.ones((5, ), dtype=tf.int32) examples = _create_fake_serialized_examples({ input_word_ids_field: dummy_ids, "input_mask": dummy_ids, input_type_ids_field: dummy_ids, "masked_lm_positions": dummy_pos }) outputs = functions["serving_examples"](examples) self.assertEqual(outputs["classification"].shape, (10, 2))
class MaskedLMConfig(cfg.TaskConfig): """The model config.""" init_checkpoint: str = '' model: bert.PretrainerConfig = bert.PretrainerConfig( cls_heads=[ bert.ClsHeadConfig(inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') ], encoder=encoders.EncoderConfig(bert=encoders.BertEncoderConfig())) scale_loss: bool = False train_data: pretrain_dataloader.BertPretrainDataConfig = pretrain_dataloader.BertPretrainDataConfig( ) small_train_data: pretrain_dataloader.BertPretrainDataConfig = pretrain_dataloader.BertPretrainDataConfig( ) validation_data: pretrain_dataloader.BertPretrainDataConfig = pretrain_dataloader.BertPretrainDataConfig( )
def test_configure_optimizer(self, mixed_precision_dtype, loss_scale): config = cfg.ExperimentConfig( task=cfg.TaskConfig( model=bert.PretrainerConfig()), runtime=cfg.RuntimeConfig( mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale), trainer=trainer_lib.ProgressiveTrainerConfig( export_checkpoint=True, export_checkpoint_interval=1, export_only_final_stage_ckpt=False)) task = TestPolicy(None, config.task) trainer = trainer_lib.ProgressiveTrainer(config, task, self.get_temp_dir()) if mixed_precision_dtype != 'float16': self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) elif mixed_precision_dtype == 'float16' and loss_scale is None: self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) self.assertIn('training_loss', metrics)
def test_task(self, version_2_with_negative, tokenization): # Saves a checkpoint. pretrain_cfg = bert.PretrainerConfig( encoder=self._encoder_config, cls_heads=[ bert.ClsHeadConfig( inner_dim=10, num_classes=3, name="next_sentence") ]) pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg) ckpt = tf.train.Checkpoint( model=pretrain_model, **pretrain_model.checkpoint_items) saved_path = ckpt.save(self.get_temp_dir()) config = question_answering.QuestionAnsweringConfig( init_checkpoint=saved_path, model=question_answering.ModelConfig(encoder=self._encoder_config), train_data=self._train_data_config, validation_data=self._get_validation_data_config( version_2_with_negative)) self._run_task(config)
def test_masked_lm(self): config = masked_lm.MaskedLMConfig( model=bert.PretrainerConfig( encoder=encoders.EncoderConfig( bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig(inner_dim=10, num_classes=2, name="foo") ])) task = masked_lm.MaskedLMTask(config) model = task.build_model() ckpt = tf.train.Checkpoint(model=model) ckpt_path = ckpt.save(self.get_temp_dir()) export_module_cls = export_savedmodel.lookup_export_module(task) serving_params = { "cls_head_name": "foo", "parse_sequence_length": 10, "max_predictions_per_seq": 5 } params = export_module_cls.Params(**serving_params) export_module = export_module_cls(params=params, model=model) export_dir = export_savedmodel_util.export( export_module, function_keys={ "serve": "serving_default", "serve_examples": "serving_examples" }, checkpoint_path=ckpt_path, export_savedmodel_dir=self.get_temp_dir()) imported = tf.saved_model.load(export_dir) self.assertSameElements(imported.signatures.keys(), ["serving_default", "serving_examples"]) serving_fn = imported.signatures["serving_default"] dummy_ids = tf.ones((1, 10), dtype=tf.int32) dummy_pos = tf.ones((1, 5), dtype=tf.int32) outputs = serving_fn( input_word_ids=dummy_ids, input_mask=dummy_ids, input_type_ids=dummy_ids, masked_lm_positions=dummy_pos) self.assertEqual(outputs["classification"].shape, (1, 2))
def token_drop_bert_pretraining() -> cfg.ExperimentConfig: """BERT pretraining with token dropping.""" config = cfg.ExperimentConfig( runtime=cfg.RuntimeConfig(enable_xla=True), task=masked_lm.TokenDropMaskedLMConfig( model=bert.PretrainerConfig(encoder=encoders.EncoderConfig( any=encoder_config.TokenDropBertEncoderConfig( vocab_size=30522, num_layers=1, token_keep_k=64), type='any')), train_data=pretrain_dataloader.BertPretrainDataConfig(), validation_data=pretrain_dataloader.BertPretrainDataConfig( is_training=False)), trainer=cfg.TrainerConfig( train_steps=1000000, optimizer_config=optimization.OptimizationConfig({ 'optimizer': { 'type': 'adamw', 'adamw': { 'weight_decay_rate': 0.01, 'exclude_from_weight_decay': ['LayerNorm', 'layer_norm', 'bias'], } }, 'learning_rate': { 'type': 'polynomial', 'polynomial': { 'initial_learning_rate': 1e-4, 'end_learning_rate': 0.0, } }, 'warmup': { 'type': 'polynomial' } })), restrictions=[ 'task.train_data.is_training != None', 'task.validation_data.is_training != None' ]) return config
def test_task(self): config = dual_encoder.DualEncoderConfig( init_checkpoint=self.get_temp_dir(), model=self.get_model_config(), train_data=self._train_data_config) task = dual_encoder.DualEncoderTask(config) model = task.build_model() metrics = task.build_metrics() dataset = task.build_inputs(config.train_data) iterator = iter(dataset) optimizer = tf.keras.optimizers.SGD(lr=0.1) task.train_step(next(iterator), model, optimizer, metrics=metrics) task.validation_step(next(iterator), model, metrics=metrics) # Saves a checkpoint. pretrain_cfg = bert.PretrainerConfig(encoder=encoders.EncoderConfig( bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1))) pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg) ckpt = tf.train.Checkpoint(model=pretrain_model, **pretrain_model.checkpoint_items) ckpt.save(config.init_checkpoint) task.initialize(model)
def prepare_config(self, teacher_block_num, student_block_num, transfer_teacher_layers): # using small model for testing task_config = distillation.BertDistillationTaskConfig( teacher_model=bert.PretrainerConfig(encoder=encoders.EncoderConfig( type='mobilebert', mobilebert=encoders.MobileBertEncoderConfig( num_blocks=teacher_block_num)), cls_heads=[ bert.ClsHeadConfig( inner_dim=256, num_classes=2, dropout_rate=0.1, name='next_sentence') ], mlm_activation='gelu'), student_model=bert.PretrainerConfig(encoder=encoders.EncoderConfig( type='mobilebert', mobilebert=encoders.MobileBertEncoderConfig( num_blocks=student_block_num)), cls_heads=[ bert.ClsHeadConfig( inner_dim=256, num_classes=2, dropout_rate=0.1, name='next_sentence') ], mlm_activation='relu'), train_data=pretrain_dataloader.BertPretrainDataConfig( input_path='dummy', max_predictions_per_seq=76, seq_length=512, global_batch_size=10), validation_data=pretrain_dataloader.BertPretrainDataConfig( input_path='dummy', max_predictions_per_seq=76, seq_length=512, global_batch_size=10)) # set only 1 step for each stage progressive_config = distillation.BertDistillationProgressiveConfig() progressive_config.layer_wise_distill_config.transfer_teacher_layers = ( transfer_teacher_layers) progressive_config.layer_wise_distill_config.num_steps = 1 progressive_config.pretrain_distill_config.num_steps = 1 optimization_config = optimization.OptimizationConfig( optimizer=optimization.OptimizerConfig( type='lamb', lamb=optimization.LAMBConfig(weight_decay_rate=0.0001, exclude_from_weight_decay=[ 'LayerNorm', 'layer_norm', 'bias', 'no_norm' ])), learning_rate=optimization.LrConfig( type='polynomial', polynomial=optimization.PolynomialLrConfig( initial_learning_rate=1.5e-3, decay_steps=10000, end_learning_rate=1.5e-3)), warmup=optimization.WarmupConfig( type='linear', linear=optimization.LinearWarmupConfig( warmup_learning_rate=0))) exp_config = cfg.ExperimentConfig( task=task_config, trainer=prog_trainer_lib.ProgressiveTrainerConfig( progressive=progressive_config, optimizer_config=optimization_config)) # Create a teacher model checkpoint. teacher_encoder = encoders.build_encoder( task_config.teacher_model.encoder) pretrainer_config = task_config.teacher_model if pretrainer_config.cls_heads: teacher_cls_heads = [ layers.ClassificationHead(**cfg.as_dict()) for cfg in pretrainer_config.cls_heads ] else: teacher_cls_heads = [] masked_lm = layers.MobileBertMaskedLM( embedding_table=teacher_encoder.get_embedding_table(), activation=tf_utils.get_activation( pretrainer_config.mlm_activation), initializer=tf.keras.initializers.TruncatedNormal( stddev=pretrainer_config.mlm_initializer_range), name='cls/predictions') teacher_pretrainer = models.BertPretrainerV2( encoder_network=teacher_encoder, classification_heads=teacher_cls_heads, customized_masked_lm=masked_lm) # The model variables will be created after the forward call. _ = teacher_pretrainer(teacher_pretrainer.inputs) teacher_pretrainer_ckpt = tf.train.Checkpoint( **teacher_pretrainer.checkpoint_items) teacher_ckpt_path = os.path.join(self.get_temp_dir(), 'teacher_model.ckpt') teacher_pretrainer_ckpt.save(teacher_ckpt_path) exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir() return exp_config
def test_distribution_strategy(self, distribution_strategy): max_seq_length = 128 batch_size = 8 input_path = os.path.join(self.get_temp_dir(), 'train.tf_record') _create_fake_dataset(input_path, seq_length=60, num_masked_tokens=20, max_seq_length=max_seq_length, num_examples=batch_size) data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig( is_training=False, input_path=input_path, seq_bucket_lengths=[64, 128], global_batch_size=batch_size) dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader( data_config) distributed_ds = orbit.utils.make_distributed_dataset( distribution_strategy, dataloader.load) train_iter = iter(distributed_ds) with distribution_strategy.scope(): config = masked_lm.MaskedLMConfig( init_checkpoint=self.get_temp_dir(), model=bert.PretrainerConfig( encoders.EncoderConfig(bert=encoders.BertEncoderConfig( vocab_size=30522, num_layers=1)), cls_heads=[ bert.ClsHeadConfig(inner_dim=10, num_classes=2, name='next_sentence') ]), train_data=data_config) task = masked_lm.MaskedLMTask(config) model = task.build_model() metrics = task.build_metrics() @tf.function def step_fn(features): return task.validation_step(features, model, metrics=metrics) distributed_outputs = distribution_strategy.run( step_fn, args=(next(train_iter), )) local_results = tf.nest.map_structure( distribution_strategy.experimental_local_results, distributed_outputs) logging.info('Dynamic padding: local_results= %s', str(local_results)) dynamic_metrics = {} for metric in metrics: dynamic_metrics[metric.name] = metric.result() data_config = pretrain_dataloader.BertPretrainDataConfig( is_training=False, input_path=input_path, seq_length=max_seq_length, max_predictions_per_seq=20, global_batch_size=batch_size) dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config) distributed_ds = orbit.utils.make_distributed_dataset( distribution_strategy, dataloader.load) train_iter = iter(distributed_ds) with distribution_strategy.scope(): metrics = task.build_metrics() @tf.function def step_fn_b(features): return task.validation_step(features, model, metrics=metrics) distributed_outputs = distribution_strategy.run( step_fn_b, args=(next(train_iter), )) local_results = tf.nest.map_structure( distribution_strategy.experimental_local_results, distributed_outputs) logging.info('Static padding: local_results= %s', str(local_results)) static_metrics = {} for metric in metrics: static_metrics[metric.name] = metric.result() for key in static_metrics: # We need to investigate the differences on losses. if key != 'next_sentence_loss': self.assertEqual(dynamic_metrics[key], static_metrics[key])