def bert_squad() -> cfg.ExperimentConfig:
  """BERT Squad V1/V2."""
  # AdamW with weight decay excluded from LayerNorm/bias variables, a
  # linearly-decaying polynomial schedule, and polynomial warmup.
  optimizer_settings = optimization.OptimizationConfig({
      'optimizer': {
          'type': 'adamw',
          'adamw': {
              'weight_decay_rate': 0.01,
              'exclude_from_weight_decay': ['LayerNorm', 'layer_norm', 'bias'],
          }
      },
      'learning_rate': {
          'type': 'polynomial',
          'polynomial': {
              'initial_learning_rate': 8e-5,
              'end_learning_rate': 0.0,
          }
      },
      'warmup': {
          'type': 'polynomial'
      }
  })
  task_settings = question_answering.QuestionAnsweringConfig(
      train_data=question_answering_dataloader.QADataConfig(),
      validation_data=question_answering_dataloader.QADataConfig())
  config = cfg.ExperimentConfig(
      task=task_settings,
      trainer=cfg.TrainerConfig(optimizer_config=optimizer_settings),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  # Pin the encoder architecture to plain BERT.
  config.task.model.encoder.type = 'bert'
  return config
def teams_squad() -> cfg.ExperimentConfig:
  """Teams Squad V1/V2."""
  # A single-layer TEAMS encoder is plugged in through the generic "any" slot.
  encoder_config = encoders.EncoderConfig(
      type="any", any=teams.TeamsEncoderConfig(num_layers=1))
  task_config = question_answering.QuestionAnsweringConfig(
      model=question_answering.ModelConfig(encoder=encoder_config),
      train_data=question_answering_dataloader.QADataConfig(),
      validation_data=question_answering_dataloader.QADataConfig())
  return cfg.ExperimentConfig(
      task=task_config,
      trainer=cfg.TrainerConfig(optimizer_config=TeamsOptimizationConfig()),
      restrictions=[
          "task.train_data.is_training != None",
          "task.validation_data.is_training != None"
      ])
def test_load_dataset(self):
  """Loads a batch from a fake tf_record and verifies keys and shapes."""
  seq_length = 128
  batch_size = 10
  input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
  _create_fake_dataset(input_path, seq_length)
  data_config = question_answering_dataloader.QADataConfig(
      is_training=True,
      input_path=input_path,
      seq_length=seq_length,
      global_batch_size=batch_size)
  loader = question_answering_dataloader.QuestionAnsweringDataLoader(
      data_config)
  features, labels = next(iter(loader.load()))
  # Features: three token-level tensors, each (batch, seq_length).
  self.assertCountEqual(
      ['input_word_ids', 'input_mask', 'input_type_ids'], features.keys())
  for feature_name in ('input_word_ids', 'input_mask', 'input_type_ids'):
    self.assertEqual(features[feature_name].shape, (batch_size, seq_length))
  # Labels: per-example start/end answer positions, each (batch,).
  self.assertCountEqual(['start_positions', 'end_positions'], labels.keys())
  for label_name in ('start_positions', 'end_positions'):
    self.assertEqual(labels[label_name].shape, (batch_size,))
def write_question_answering(task, model, input_file, output_file,
                             predict_batch_size, seq_length, tokenization,
                             vocab_file, do_lower_case,
                             version_2_with_negative=False):
  """Makes question answering predictions and writes to output file."""
  # Inference-mode config: no shuffling/training, keep partial final batch.
  predict_data_config = question_answering_dataloader.QADataConfig(
      is_training=False,
      input_path=input_file,
      global_batch_size=predict_batch_size,
      seq_length=seq_length,
      query_length=64,
      doc_stride=128,
      drop_remainder=False,
      tokenization=tokenization,
      vocab_file=vocab_file,
      do_lower_case=do_lower_case,
      version_2_with_negative=version_2_with_negative)
  all_predictions, _, _ = question_answering.predict(
      task, predict_data_config, model)
  # One pretty-printed JSON document, newline-terminated.
  with tf.io.gfile.GFile(output_file, 'w') as writer:
    writer.write(json.dumps(all_predictions, indent=4) + '\n')
def _get_validation_data_config(self, version_2_with_negative=False):
  """Builds the WordPiece validation data config over the temp val file."""
  validation_config = question_answering_dataloader.QADataConfig(
      do_lower_case=True,
      global_batch_size=1,
      input_path=self._val_input_path,
      input_preprocessed_data_path=self.get_temp_dir(),
      is_training=False,
      seq_length=128,
      tokenization="WordPiece",
      version_2_with_negative=version_2_with_negative,
      vocab_file=self._test_vocab)
  return validation_config
def _representative_dataset():
  """Yields 100 batches of input tensors for post-training quantization."""
  params = question_answering_dataloader.QADataConfig()
  params.is_training = True
  params.input_path = SQUAD_TRAIN_SPLIT
  params.global_batch_size = 1
  params.drop_remainder = False
  strategy = tf.distribute.get_strategy()
  dataset = orbit.utils.make_distributed_dataset(strategy, build_inputs,
                                                 params)
  for example in dataset.take(100):
    features = example[0]
    # Calibration order must match the model's input signature.
    yield [
        features['input_word_ids'],
        features['input_mask'],
        features['input_type_ids'],
    ]
def setUp(self):
  """Creates encoder/train-data configs plus a tiny SQuAD v2 file and vocab.

  Fixtures produced:
    self._encoder_config: single-layer XLNet encoder config.
    self._train_data_config: QA data config in XLNet format.
    self._val_input_path: path to a minimal SQuAD v2-format JSON file.
    self._test_vocab: path to an 8-token WordPiece vocab file.
  """
  # Zero-argument super() (PEP 3135) replaces the dated Py2-style
  # super(XLNetQuestionAnsweringTaskTest, self) call.
  super().setUp()
  self._encoder_config = encoders.EncoderConfig(
      type="xlnet",
      xlnet=encoders.XLNetEncoderConfig(vocab_size=30522, num_layers=1))
  self._train_data_config = question_answering_dataloader.QADataConfig(
      input_path="dummy",
      seq_length=128,
      global_batch_size=2,
      xlnet_format=True)
  # Minimal SQuAD v2 payload: one context/question pair. The answer is
  # repeated three times, mirroring real eval files that carry multiple
  # annotator answers per question.
  val_data = {
      "version": "2.0",
      "data": [{
          "paragraphs": [{
              "context": "Sky is blue.",
              "qas": [{
                  "question": "What is blue?",
                  "id": "1234",
                  "answers": [{
                      "text": "Sky",
                      "answer_start": 0
                  }, {
                      "text": "Sky",
                      "answer_start": 0
                  }, {
                      "text": "Sky",
                      "answer_start": 0
                  }]
              }]
          }]
      }]
  }
  self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
  with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
    writer.write(json.dumps(val_data, indent=4) + "\n")
  self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
  with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
    writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")