def test_task(self, num_shared_hidden_layers, num_task_agnostic_layers):
  config = teams_task.TeamsPretrainTaskConfig(
      model=teams.TeamsPretrainerConfig(
          generator=encoders.BertEncoderConfig(
              vocab_size=30522, num_layers=2),
          discriminator=encoders.BertEncoderConfig(
              vocab_size=30522, num_layers=2),
          num_shared_generator_hidden_layers=num_shared_hidden_layers,
          num_discriminator_task_agnostic_layers=num_task_agnostic_layers,
      ),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path="dummy",
          max_predictions_per_seq=20,
          seq_length=128,
          global_batch_size=1))
  task = teams_task.TeamsPretrainTask(config)
  model = task.build_model()
  metrics = task.build_metrics()
  dataset = task.build_inputs(config.train_data)
  iterator = iter(dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.train_step(next(iterator), model, optimizer, metrics=metrics)
  task.validation_step(next(iterator), model, metrics=metrics)
def test_task(self):
  config = electra_task.ElectraPretrainConfig(
      model=electra.ElectraPretrainerConfig(
          generator_encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          discriminator_encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          num_masked_tokens=20,
          sequence_length=128,
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=10, num_classes=2, name="next_sentence")
          ]),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path="dummy",
          max_predictions_per_seq=20,
          seq_length=128,
          global_batch_size=1))
  task = electra_task.ElectraPretrainTask(config)
  model = task.build_model()
  metrics = task.build_metrics()
  dataset = task.build_inputs(config.train_data)
  iterator = iter(dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.train_step(next(iterator), model, optimizer, metrics=metrics)
  task.validation_step(next(iterator), model, metrics=metrics)
def test_sentence_prediction_text(self, inputs_only):
  vocab_file_path = os.path.join(self.get_temp_dir(), "vocab.txt")
  _create_fake_vocab_file(vocab_file_path)
  config = sentence_prediction.SentencePredictionConfig(
      model=sentence_prediction.ModelConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          num_classes=2))
  task = sentence_prediction.SentencePredictionTask(config)
  model = task.build_model()
  params = serving_modules.SentencePrediction.Params(
      inputs_only=inputs_only,
      parse_sequence_length=10,
      text_fields=["foo", "bar"],
      vocab_file=vocab_file_path)
  export_module = serving_modules.SentencePrediction(
      params=params, model=model)
  examples = _create_fake_serialized_examples({
      "foo": b"hello world",
      "bar": b"hello world"
  })
  functions = export_module.get_inference_signatures({
      "serve_text_examples": "serving_default",
  })
  outputs = functions["serving_default"](examples)
  self.assertEqual(outputs["outputs"].shape, (10, 2))
def test_task(self, init_cls_pooler):
  # Saves a checkpoint.
  pretrain_cfg = bert.PretrainerConfig(
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
      cls_heads=[
          bert.ClsHeadConfig(
              inner_dim=768, num_classes=2, name="next_sentence")
      ])
  pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
  # The model variables will be created after the forward call.
  _ = pretrain_model(pretrain_model.inputs)
  ckpt = tf.train.Checkpoint(
      model=pretrain_model, **pretrain_model.checkpoint_items)
  init_path = ckpt.save(self.get_temp_dir())

  # Creates the task.
  config = sentence_prediction.SentencePredictionConfig(
      init_checkpoint=init_path,
      model=self.get_model_config(num_classes=2),
      train_data=self._train_data_config,
      init_cls_pooler=init_cls_pooler)
  task = sentence_prediction.SentencePredictionTask(config)
  model = task.build_model()
  metrics = task.build_metrics()
  dataset = task.build_inputs(config.train_data)
  iterator = iter(dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.initialize(model)
  task.train_step(next(iterator), model, optimizer, metrics=metrics)
  task.validation_step(next(iterator), model, metrics=metrics)
def test_tfr_bert_model_builder(self):
  encoder_config = encoders.EncoderConfig(
      bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1))
  encoder_network = encoders.build_encoder(encoder_config)
  preprocess_dict = {}
  scorer = tfrbert_task.TFRBertScorer(
      encoder=encoder_network, bert_output_dropout=0.1)
  example_feature_spec = {
      'input_word_ids': tf.io.FixedLenFeature(shape=(None,), dtype=tf.int64),
      'input_mask': tf.io.FixedLenFeature(shape=(None,), dtype=tf.int64),
      'input_type_ids': tf.io.FixedLenFeature(shape=(None,), dtype=tf.int64)
  }
  context_feature_spec = {}
  model_builder = tfrbert_task.TFRBertModelBuilder(
      input_creator=tfr_model.FeatureSpecInputCreator(
          context_feature_spec, example_feature_spec),
      preprocessor=tfr_model.PreprocessorWithSpec(preprocess_dict),
      scorer=scorer,
      mask_feature_name='example_list_mask',
      name='tfrbert_model')
  model = model_builder.build()
  output = model(self._create_input_data())
  self.assertAllEqual(output.shape.as_list(), [12, 10])
def test_task(self):
  config = masked_lm.MaskedLMConfig(
      init_checkpoint=self.get_temp_dir(),
      scale_loss=True,
      model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=10, num_classes=2, name="next_sentence")
          ]),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path="dummy",
          max_predictions_per_seq=20,
          seq_length=128,
          global_batch_size=1))
  task = masked_lm.MaskedLMTask(config)
  model = task.build_model()
  metrics = task.build_metrics()
  dataset = task.build_inputs(config.train_data)
  iterator = iter(dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.train_step(next(iterator), model, optimizer, metrics=metrics)
  task.validation_step(next(iterator), model, metrics=metrics)

  # Saves a checkpoint.
  ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
  ckpt.save(config.init_checkpoint)
  task.initialize(model)
def test_sentence_prediction(self):
  config = sentence_prediction.SentencePredictionConfig(
      model=sentence_prediction.ModelConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          num_classes=2))
  task = sentence_prediction.SentencePredictionTask(config)
  model = task.build_model()
  ckpt = tf.train.Checkpoint(model=model)
  ckpt_path = ckpt.save(self.get_temp_dir())
  export_module_cls = export_savedmodel.lookup_export_module(task)
  serving_params = {"inputs_only": False}
  params = export_module_cls.Params(**serving_params)
  export_module = export_module_cls(params=params, model=model)
  export_dir = export_savedmodel_util.export(
      export_module,
      function_keys=["serve"],
      checkpoint_path=ckpt_path,
      export_savedmodel_dir=self.get_temp_dir())
  imported = tf.saved_model.load(export_dir)
  serving_fn = imported.signatures["serving_default"]
  dummy_ids = tf.ones((1, 5), dtype=tf.int32)
  inputs = dict(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  ref_outputs = model(inputs)
  outputs = serving_fn(**inputs)
  self.assertAllClose(ref_outputs, outputs["outputs"])
  self.assertEqual(outputs["outputs"].shape, (1, 2))
def _create_bert_ckpt(self):
  config = encoders.EncoderConfig(
      type='bert', bert=encoders.BertEncoderConfig(num_layers=1))
  encoder = encoders.build_encoder(config)
  ckpt = tf.train.Checkpoint(encoder=encoder)
  ckpt_path = ckpt.save(os.path.join(self._logging_dir, 'ckpt'))
  return ckpt_path
def setUp(self):
  super(ProgressiveMaskedLMTest, self).setUp()
  self.task_config = progressive_masked_lm.ProgMaskedLMConfig(
      model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=2)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=10, num_classes=2, name="next_sentence")
          ]),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path="dummy",
          max_predictions_per_seq=20,
          seq_length=128,
          global_batch_size=1),
      validation_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path="dummy",
          max_predictions_per_seq=20,
          seq_length=128,
          global_batch_size=1),
      stage_list=[
          progressive_masked_lm.StackingStageConfig(
              num_layers=1, num_steps=4),
          progressive_masked_lm.StackingStageConfig(
              num_layers=2, num_steps=8),
      ],
  )
  self.exp_config = cfg.ExperimentConfig(
      task=self.task_config,
      trainer=prog_trainer_lib.ProgressiveTrainerConfig())
def test_task(self):
  config = sentence_prediction.SentencePredictionConfig(
      init_checkpoint=self.get_temp_dir(),
      model=self.get_model_config(2),
      train_data=self._train_data_config)
  task = sentence_prediction.SentencePredictionTask(config)
  model = task.build_model()
  metrics = task.build_metrics()
  dataset = task.build_inputs(config.train_data)
  iterator = iter(dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.train_step(next(iterator), model, optimizer, metrics=metrics)
  task.validation_step(next(iterator), model, metrics=metrics)

  # Saves a checkpoint.
  pretrain_cfg = bert.PretrainerConfig(
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
      cls_heads=[
          bert.ClsHeadConfig(
              inner_dim=10, num_classes=3, name="next_sentence")
      ])
  pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
  ckpt = tf.train.Checkpoint(
      model=pretrain_model, **pretrain_model.checkpoint_items)
  ckpt.save(config.init_checkpoint)
  task.initialize(model)
def test_task_determinism(self):
  config = masked_lm.MaskedLMConfig(
      init_checkpoint=self.get_temp_dir(),
      scale_loss=True,
      model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=10, num_classes=2, name="next_sentence")
          ]),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          max_predictions_per_seq=20, seq_length=128, global_batch_size=1))

  tf.keras.utils.set_random_seed(1)
  logs1, validation_logs1, weights1 = self._build_and_run_model(config)
  tf.keras.utils.set_random_seed(1)
  logs2, validation_logs2, weights2 = self._build_and_run_model(config)

  self.assertEqual(logs1["loss"], logs2["loss"])
  self.assertEqual(validation_logs1["loss"], validation_logs2["loss"])
  for weight1, weight2 in zip(weights1, weights2):
    self.assertAllEqual(weight1, weight2)
class TeamsPretrainerConfig(base_config.Config):
  """TEAMS pretrainer configuration."""
  # Candidate size for the multi-word selection task, including the correct
  # word.
  candidate_size: int = 5
  # Weight for the generator masked language model task.
  generator_loss_weight: float = 1.0
  # Weight for the replaced token detection task.
  discriminator_rtd_loss_weight: float = 5.0
  # Weight for the multi-word selection task.
  discriminator_mws_loss_weight: float = 2.0
  # Whether to share the embedding network between generator and
  # discriminator.
  tie_embeddings: bool = True
  # Number of bottom layers shared between generator and discriminator.
  # A non-positive value implies no sharing.
  num_shared_generator_hidden_layers: int = 3
  # Number of bottom layers shared between different discriminator tasks.
  num_discriminator_task_agnostic_layers: int = 11
  generator: encoders.BertEncoderConfig = encoders.BertEncoderConfig()
  discriminator: encoders.BertEncoderConfig = encoders.BertEncoderConfig()
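# Usage sketch (illustrative only; the field values below are arbitrary, not
# recommended settings): the config can be built directly with explicit
# layer-sharing values and nested inside a TeamsPretrainTaskConfig, as the
# test above does.
#
#   pretrainer_config = TeamsPretrainerConfig(
#       generator=encoders.BertEncoderConfig(vocab_size=30522, num_layers=2),
#       discriminator=encoders.BertEncoderConfig(vocab_size=30522,
#                                                num_layers=2),
#       num_shared_generator_hidden_layers=1,
#       num_discriminator_task_agnostic_layers=2)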
def test_tagging(self, use_v2_feature_names, output_encoder_outputs):
  if use_v2_feature_names:
    input_word_ids_field = "input_word_ids"
    input_type_ids_field = "input_type_ids"
  else:
    input_word_ids_field = "input_ids"
    input_type_ids_field = "segment_ids"
  hidden_size = 768
  num_classes = 3
  config = tagging.TaggingConfig(
      model=tagging.ModelConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(
                  hidden_size=hidden_size, num_layers=1))),
      class_names=["class_0", "class_1", "class_2"])
  task = tagging.TaggingTask(config)
  model = task.build_model()
  params = serving_modules.Tagging.Params(
      parse_sequence_length=10,
      use_v2_feature_names=use_v2_feature_names,
      output_encoder_outputs=output_encoder_outputs)
  export_module = serving_modules.Tagging(params=params, model=model)
  functions = export_module.get_inference_signatures({
      "serve": "serving_default",
      "serve_examples": "serving_examples"
  })
  dummy_ids = tf.ones((10, 10), dtype=tf.int32)
  outputs = functions["serving_default"](
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  self.assertEqual(outputs["logits"].shape, (10, 10, num_classes))
  if output_encoder_outputs:
    self.assertEqual(outputs["encoder_outputs"].shape, (10, 10, hidden_size))
  dummy_ids = tf.ones((10,), dtype=tf.int32)
  examples = _create_fake_serialized_examples({
      input_word_ids_field: dummy_ids,
      "input_mask": dummy_ids,
      input_type_ids_field: dummy_ids
  })
  outputs = functions["serving_examples"](examples)
  self.assertEqual(outputs["logits"].shape, (10, 10, num_classes))
  if output_encoder_outputs:
    self.assertEqual(outputs["encoder_outputs"].shape, (10, 10, hidden_size))
  with self.assertRaises(ValueError):
    _ = export_module.get_inference_signatures({"foo": None})
def _export_bert_tfhub(self):
  encoder = encoders.build_encoder(
      encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)))
  encoder_inputs_dict = {x.name: x for x in encoder.inputs}
  encoder_output_dict = encoder(encoder_inputs_dict)
  core_model = tf.keras.Model(
      inputs=encoder_inputs_dict, outputs=encoder_output_dict)
  hub_destination = os.path.join(self.get_temp_dir(), "hub")
  core_model.save(hub_destination, include_optimizer=False, save_format="tf")
  return hub_destination
def test_copy_pooler_dense_to_encoder(self):
  encoder_config = encoders.EncoderConfig(
      type="bert",
      bert=encoders.BertEncoderConfig(
          hidden_size=24, intermediate_size=48, num_layers=2))
  cls_heads = [
      layers.ClassificationHead(
          inner_dim=24, num_classes=2, name="next_sentence")
  ]
  encoder = encoders.build_encoder(encoder_config)
  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder,
      classification_heads=cls_heads,
      mlm_activation=tf_utils.get_activation(
          encoder_config.get().hidden_activation))
  # Makes sure the pretrainer variables are created.
  _ = pretrainer(pretrainer.inputs)
  checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
  model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
  checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

  vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
      self.get_temp_dir(), use_sp_model=True)
  export_path = os.path.join(self.get_temp_dir(), "hub")
  export_tfhub_lib.export_model(
      export_path=export_path,
      encoder_config=encoder_config,
      model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
      with_mlm=True,
      copy_pooler_dense_to_encoder=True,
      vocab_file=vocab_file,
      sp_model_file=sp_model_file,
      do_lower_case=True)

  # Restores a hub KerasLayer.
  hub_layer = hub.KerasLayer(export_path, trainable=True)
  dummy_ids = np.zeros((2, 10), dtype=np.int32)
  input_dict = dict(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  hub_pooled_output = hub_layer(input_dict)["pooled_output"]
  encoder_outputs = encoder(input_dict)
  # Verify that hub_layer's pooled_output is the same as the output of next
  # sentence prediction's dense layer.
  pretrained_pooled_output = cls_heads[0].dense(
      encoder_outputs["sequence_output"][:, 0, :])
  self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
  # But the pooled_output between encoder and hub_layer are not the same.
  encoder_pooled_output = encoder_outputs["pooled_output"]
  self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
def test_encoder_from_yaml(self):
  config = encoders.EncoderConfig(
      type="bert", bert=encoders.BertEncoderConfig(num_layers=1))
  encoder = encoders.build_encoder(config)
  ckpt = tf.train.Checkpoint(encoder=encoder)
  ckpt_path = ckpt.save(self.get_temp_dir() + "/ckpt")
  params_save_path = os.path.join(self.get_temp_dir(), "params.yaml")
  hyperparams.save_params_dict_to_yaml(config, params_save_path)

  restored_cfg = encoders.EncoderConfig.from_yaml(params_save_path)
  restored_encoder = encoders.build_encoder(restored_cfg)
  status = tf.train.Checkpoint(encoder=restored_encoder).restore(ckpt_path)
  status.assert_consumed()
def test_masked_lm(self, use_v2_feature_names):
  if use_v2_feature_names:
    input_word_ids_field = "input_word_ids"
    input_type_ids_field = "input_type_ids"
  else:
    input_word_ids_field = "input_ids"
    input_type_ids_field = "segment_ids"
  config = masked_lm.MaskedLMConfig(
      model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=10, num_classes=2, name="next_sentence")
          ]))
  task = masked_lm.MaskedLMTask(config)
  model = task.build_model()
  params = serving_modules.MaskedLM.Params(
      parse_sequence_length=10,
      max_predictions_per_seq=5,
      use_v2_feature_names=use_v2_feature_names)
  export_module = serving_modules.MaskedLM(params=params, model=model)
  functions = export_module.get_inference_signatures({
      "serve": "serving_default",
      "serve_examples": "serving_examples"
  })
  self.assertSameElements(functions.keys(),
                          ["serving_default", "serving_examples"])
  dummy_ids = tf.ones((10, 10), dtype=tf.int32)
  dummy_pos = tf.ones((10, 5), dtype=tf.int32)
  outputs = functions["serving_default"](
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids,
      masked_lm_positions=dummy_pos)
  self.assertEqual(outputs["classification"].shape, (10, 2))
  dummy_ids = tf.ones((10,), dtype=tf.int32)
  dummy_pos = tf.ones((5,), dtype=tf.int32)
  examples = _create_fake_serialized_examples({
      input_word_ids_field: dummy_ids,
      "input_mask": dummy_ids,
      input_type_ids_field: dummy_ids,
      "masked_lm_positions": dummy_pos
  })
  outputs = functions["serving_examples"](examples)
  self.assertEqual(outputs["classification"].shape, (10, 2))
def test_question_answering(self, use_v2_feature_names):
  if use_v2_feature_names:
    input_word_ids_field = "input_word_ids"
    input_type_ids_field = "input_type_ids"
  else:
    input_word_ids_field = "input_ids"
    input_type_ids_field = "segment_ids"
  config = question_answering.QuestionAnsweringConfig(
      model=question_answering.ModelConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522,
                                              num_layers=1))),
      validation_data=None)
  task = question_answering.QuestionAnsweringTask(config)
  model = task.build_model()
  params = serving_modules.QuestionAnswering.Params(
      parse_sequence_length=10, use_v2_feature_names=use_v2_feature_names)
  export_module = serving_modules.QuestionAnswering(params=params, model=model)
  functions = export_module.get_inference_signatures({
      "serve": "serving_default",
      "serve_examples": "serving_examples"
  })
  self.assertSameElements(functions.keys(),
                          ["serving_default", "serving_examples"])
  dummy_ids = tf.ones((10, 10), dtype=tf.int32)
  outputs = functions["serving_default"](
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  self.assertEqual(outputs["start_logits"].shape, (10, 10))
  self.assertEqual(outputs["end_logits"].shape, (10, 10))
  dummy_ids = tf.ones((10,), dtype=tf.int32)
  examples = _create_fake_serialized_examples({
      input_word_ids_field: dummy_ids,
      "input_mask": dummy_ids,
      input_type_ids_field: dummy_ids
  })
  outputs = functions["serving_examples"](examples)
  self.assertEqual(outputs["start_logits"].shape, (10, 10))
  self.assertEqual(outputs["end_logits"].shape, (10, 10))
class MaskedLMConfig(cfg.TaskConfig):
  """The masked language modeling (MLM) task config."""
  init_checkpoint: str = ''
  model: bert.PretrainerConfig = bert.PretrainerConfig(
      cls_heads=[
          bert.ClsHeadConfig(
              inner_dim=768,
              num_classes=2,
              dropout_rate=0.1,
              name='next_sentence')
      ],
      encoder=encoders.EncoderConfig(bert=encoders.BertEncoderConfig()))
  scale_loss: bool = False
  train_data: pretrain_dataloader.BertPretrainDataConfig = (
      pretrain_dataloader.BertPretrainDataConfig())
  small_train_data: pretrain_dataloader.BertPretrainDataConfig = (
      pretrain_dataloader.BertPretrainDataConfig())
  validation_data: pretrain_dataloader.BertPretrainDataConfig = (
      pretrain_dataloader.BertPretrainDataConfig())
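# Usage sketch (illustrative only; the values below are arbitrary): the
# defaults above are typically overridden when building the task config, as
# the surrounding tests do.
#
#   mlm_config = MaskedLMConfig(
#       scale_loss=True,
#       model=bert.PretrainerConfig(
#           encoder=encoders.EncoderConfig(
#               bert=encoders.BertEncoderConfig(vocab_size=30522,
#                                               num_layers=1))),
#       train_data=pretrain_dataloader.BertPretrainDataConfig(
#           input_path='dummy',
#           max_predictions_per_seq=20,
#           seq_length=128,
#           global_batch_size=1))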
def test_tagging(self, output_encoder_outputs):
  hidden_size = 768
  num_classes = 3
  config = tagging.TaggingConfig(
      model=tagging.ModelConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(
                  hidden_size=hidden_size, num_layers=1))),
      class_names=["class_0", "class_1", "class_2"])
  task = tagging.TaggingTask(config)
  model = task.build_model()
  ckpt = tf.train.Checkpoint(model=model)
  ckpt_path = ckpt.save(self.get_temp_dir())
  export_module_cls = export_savedmodel.lookup_export_module(task)
  serving_params = {
      "parse_sequence_length": 10,
  }
  params = export_module_cls.Params(
      **serving_params, output_encoder_outputs=output_encoder_outputs)
  export_module = export_module_cls(params=params, model=model)
  export_dir = export_savedmodel_util.export(
      export_module,
      function_keys={
          "serve": "serving_default",
          "serve_examples": "serving_examples"
      },
      checkpoint_path=ckpt_path,
      export_savedmodel_dir=self.get_temp_dir())
  imported = tf.saved_model.load(export_dir)
  self.assertCountEqual(imported.signatures.keys(),
                        ["serving_default", "serving_examples"])
  serving_fn = imported.signatures["serving_default"]
  dummy_ids = tf.ones((1, 5), dtype=tf.int32)
  inputs = dict(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  outputs = serving_fn(**inputs)
  self.assertEqual(outputs["logits"].shape, (1, 5, num_classes))
  if output_encoder_outputs:
    self.assertEqual(outputs["encoder_outputs"].shape, (1, 5, hidden_size))
def test_masked_lm(self):
  config = masked_lm.MaskedLMConfig(
      model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
          cls_heads=[
              bert.ClsHeadConfig(inner_dim=10, num_classes=2, name="foo")
          ]))
  task = masked_lm.MaskedLMTask(config)
  model = task.build_model()
  ckpt = tf.train.Checkpoint(model=model)
  ckpt_path = ckpt.save(self.get_temp_dir())
  export_module_cls = export_savedmodel.lookup_export_module(task)
  serving_params = {
      "cls_head_name": "foo",
      "parse_sequence_length": 10,
      "max_predictions_per_seq": 5
  }
  params = export_module_cls.Params(**serving_params)
  export_module = export_module_cls(params=params, model=model)
  export_dir = export_savedmodel_util.export(
      export_module,
      function_keys={
          "serve": "serving_default",
          "serve_examples": "serving_examples"
      },
      checkpoint_path=ckpt_path,
      export_savedmodel_dir=self.get_temp_dir())
  imported = tf.saved_model.load(export_dir)
  self.assertSameElements(imported.signatures.keys(),
                          ["serving_default", "serving_examples"])
  serving_fn = imported.signatures["serving_default"]
  dummy_ids = tf.ones((1, 10), dtype=tf.int32)
  dummy_pos = tf.ones((1, 5), dtype=tf.int32)
  outputs = serving_fn(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids,
      masked_lm_positions=dummy_pos)
  self.assertEqual(outputs["classification"].shape, (1, 2))
def setUp(self):
  super(QuestionAnsweringTaskTest, self).setUp()
  self._encoder_config = encoders.EncoderConfig(
      bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1))
  self._train_data_config = question_answering_dataloader.QADataConfig(
      input_path="dummy", seq_length=128, global_batch_size=1)

  val_data = {
      "version": "1.1",
      "data": [{
          "paragraphs": [{
              "context": "Sky is blue.",
              "qas": [{
                  "question": "What is blue?",
                  "id": "1234",
                  "answers": [{
                      "text": "Sky",
                      "answer_start": 0
                  }, {
                      "text": "Sky",
                      "answer_start": 0
                  }, {
                      "text": "Sky",
                      "answer_start": 0
                  }]
              }]
          }]
      }]
  }
  self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
  with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
    writer.write(json.dumps(val_data, indent=4) + "\n")

  self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
  with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
    writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")
def get_tagging_config(model_config,
                       label_list,
                       train_data_config=None,
                       validation_data_config=None):
  """Returns a TaggingConfig."""
  common_params = {
      "train_data": train_data_config,
      "validation_data": validation_data_config,
      "class_names": label_list
  }
  if model_config.pretrained:
    return TaggingConfig(
        hub_module_url=_get_hub_url(model_config), **common_params)
  else:
    assert model_config.size == ModelSize.TINY
    return TaggingConfig(
        model=ModelConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(
                    num_layers=2,
                    hidden_size=128,
                    num_attention_heads=2,
                    intermediate_size=128 * 4))),
        **common_params)
def test_task(self):
  config = dual_encoder.DualEncoderConfig(
      init_checkpoint=self.get_temp_dir(),
      model=self.get_model_config(),
      train_data=self._train_data_config)
  task = dual_encoder.DualEncoderTask(config)
  model = task.build_model()
  metrics = task.build_metrics()
  dataset = task.build_inputs(config.train_data)
  iterator = iter(dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.train_step(next(iterator), model, optimizer, metrics=metrics)
  task.validation_step(next(iterator), model, metrics=metrics)

  # Saves a checkpoint.
  pretrain_cfg = bert.PretrainerConfig(
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)))
  pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
  ckpt = tf.train.Checkpoint(
      model=pretrain_model, **pretrain_model.checkpoint_items)
  ckpt.save(config.init_checkpoint)
  task.initialize(model)
def get_model_config(self, num_classes):
  return sentence_prediction.ModelConfig(
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
      num_classes=num_classes)
def get_model_config(self):
  return dual_encoder.ModelConfig(
      max_sequence_length=32,
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)))
def setUp(self):
  super(TaggingTest, self).setUp()
  self._encoder_config = encoders.EncoderConfig(
      bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1))
  self._train_data_config = tagging_dataloader.TaggingDataConfig(
      input_path="dummy", seq_length=128, global_batch_size=1)
def get_model_config(self):
  return classification_example.ModelConfig(
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=2)))
def test_distribution_strategy(self, distribution_strategy):
  max_seq_length = 128
  batch_size = 8
  input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
  _create_fake_dataset(
      input_path,
      seq_length=60,
      num_masked_tokens=20,
      max_seq_length=max_seq_length,
      num_examples=batch_size)
  data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
      is_training=False,
      input_path=input_path,
      seq_bucket_lengths=[64, 128],
      global_batch_size=batch_size)
  dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
      data_config)
  distributed_ds = orbit.utils.make_distributed_dataset(
      distribution_strategy, dataloader.load)
  train_iter = iter(distributed_ds)
  with distribution_strategy.scope():
    config = masked_lm.MaskedLMConfig(
        init_checkpoint=self.get_temp_dir(),
        model=bert.PretrainerConfig(
            encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(
                    vocab_size=30522, num_layers=1)),
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name='next_sentence')
            ]),
        train_data=data_config)
    task = masked_lm.MaskedLMTask(config)
    model = task.build_model()
    metrics = task.build_metrics()

  @tf.function
  def step_fn(features):
    return task.validation_step(features, model, metrics=metrics)

  distributed_outputs = distribution_strategy.run(
      step_fn, args=(next(train_iter),))
  local_results = tf.nest.map_structure(
      distribution_strategy.experimental_local_results, distributed_outputs)
  logging.info('Dynamic padding: local_results= %s', str(local_results))
  dynamic_metrics = {}
  for metric in metrics:
    dynamic_metrics[metric.name] = metric.result()

  data_config = pretrain_dataloader.BertPretrainDataConfig(
      is_training=False,
      input_path=input_path,
      seq_length=max_seq_length,
      max_predictions_per_seq=20,
      global_batch_size=batch_size)
  dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config)
  distributed_ds = orbit.utils.make_distributed_dataset(
      distribution_strategy, dataloader.load)
  train_iter = iter(distributed_ds)
  with distribution_strategy.scope():
    metrics = task.build_metrics()

  @tf.function
  def step_fn_b(features):
    return task.validation_step(features, model, metrics=metrics)

  distributed_outputs = distribution_strategy.run(
      step_fn_b, args=(next(train_iter),))
  local_results = tf.nest.map_structure(
      distribution_strategy.experimental_local_results, distributed_outputs)
  logging.info('Static padding: local_results= %s', str(local_results))
  static_metrics = {}
  for metric in metrics:
    static_metrics[metric.name] = metric.result()
  for key in static_metrics:
    # We need to investigate the differences on losses.
    if key != 'next_sentence_loss':
      self.assertEqual(dynamic_metrics[key], static_metrics[key])
def test_task(self):
  # Prepare a checkpoint and test data.
  ckpt_path = self._create_bert_ckpt()
  input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
  seq_length = 128
  _create_fake_preprocessed_dataset(input_path, seq_length, tf.float32)

  # Set up the data configs.
  train_data_config = tfrbert_task.TFRBertDataConfig(
      input_path=input_path,
      is_training=True,
      global_batch_size=5,
      list_size=3,
      dataset_fn='tfrecord',
      seq_length=128)
  validation_data_config = tfrbert_task.TFRBertDataConfig(
      input_path=input_path,
      is_training=False,
      global_batch_size=5,
      list_size=3,
      dataset_fn='tfrecord',
      seq_length=128,
      read_query_id=True,
      read_document_id=True)

  # Set up the task config.
  task_config = tfrbert_task.TFRBertConfig(
      output_preds=True,
      init_checkpoint=ckpt_path,
      aggregated_metrics=True,
      train_data=train_data_config,
      validation_data=validation_data_config,
      model=tfrbert_task.TFRBertModelConfig(
          encoder=encoders.EncoderConfig(
              bert=encoders.BertEncoderConfig(num_layers=1))))

  # Set up the TFRBertTask.
  label_spec = ('label',
                tf.io.FixedLenFeature(
                    shape=(1,), dtype=tf.int64, default_value=-1))
  task = tfrbert_task.TFRBertTask(
      task_config,
      label_spec=label_spec,
      dataset_fn=tf.data.TFRecordDataset,
      logging_dir=self._logging_dir)

  # Run one train and one validation step, then aggregate the logs.
  model = task.build_model()
  metrics = task.build_metrics()
  train_dataset = task.build_inputs(task_config.train_data)
  vali_dataset = task.build_inputs(task_config.validation_data)
  task.initialize(model)
  train_iterator = iter(train_dataset)
  vali_iterator = iter(vali_dataset)
  optimizer = tf.keras.optimizers.SGD(lr=0.1)
  task.train_step(next(train_iterator), model, optimizer, metrics=metrics)
  logs = task.validation_step(next(vali_iterator), model, metrics=metrics)
  logs = {x: (logs[x],) for x in logs}
  logs = task.aggregate_logs(step_outputs=logs)
  self.assertEqual(tf.constant(logs['query_id']).shape, (1, 5, 3))
  self.assertEqual(tf.constant(logs['document_id']).shape, (1, 5, 3))
  self.assertEqual(
      tf.constant(logs[tfrbert_task._PREDICTION]).shape, (1, 5, 3))
  self.assertEqual(tf.constant(logs[tfrbert_task._LABEL]).shape, (1, 5, 3))
  metrics = task.reduce_aggregated_logs(logs)