Example #1
def bert_squad() -> cfg.ExperimentConfig:
    """BERT Squad V1/V2."""
    config = cfg.ExperimentConfig(
        task=question_answering.QuestionAnsweringConfig(
            train_data=question_answering_dataloader.QADataConfig(),
            validation_data=question_answering_dataloader.QADataConfig()),
        trainer=cfg.TrainerConfig(
            optimizer_config=optimization.OptimizationConfig({
                'optimizer': {
                    'type': 'adamw',
                    'adamw': {
                        'weight_decay_rate': 0.01,
                        'exclude_from_weight_decay': ['LayerNorm', 'layer_norm', 'bias'],
                    }
                },
                'learning_rate': {
                    'type': 'polynomial',
                    'polynomial': {
                        'initial_learning_rate': 8e-5,
                        'end_learning_rate': 0.0,
                    }
                },
                'warmup': {
                    'type': 'polynomial'
                }
            })),
        restrictions=[
            'task.train_data.is_training != None',
            'task.validation_data.is_training != None'
        ])
    config.task.model.encoder.type = 'bert'
    return config
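The returned ExperimentConfig is a plain hyperparams config, so individual fields can be overridden after construction (Example #1 already does this for the encoder type). A minimal usage sketch; the input path and batch size below are hypothetical placeholders:

config = bert_squad()
# Hypothetical overrides; any ExperimentConfig field can be set this way.
config.task.train_data.input_path = '/path/to/train.tf_record'
config.task.train_data.global_batch_size = 32
print(config.as_dict())  # configs serialize to nested Python dicts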
Example #2
def teams_squad() -> cfg.ExperimentConfig:
  """Teams Squad V1/V2."""
  config = cfg.ExperimentConfig(
      task=question_answering.QuestionAnsweringConfig(
          model=question_answering.ModelConfig(
              encoder=encoders.EncoderConfig(
                  type="any", any=teams.TeamsEncoderConfig(num_layers=1))),
          train_data=question_answering_dataloader.QADataConfig(),
          validation_data=question_answering_dataloader.QADataConfig()),
      trainer=cfg.TrainerConfig(optimizer_config=TeamsOptimizationConfig()),
      restrictions=[
          "task.train_data.is_training != None",
          "task.validation_data.is_training != None"
      ])
  return config
Example #3
    def test_load_dataset(self):
        seq_length = 128
        batch_size = 10
        input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
        _create_fake_dataset(input_path, seq_length)
        data_config = question_answering_dataloader.QADataConfig(
            is_training=True,
            input_path=input_path,
            seq_length=seq_length,
            global_batch_size=batch_size)
        dataset = question_answering_dataloader.QuestionAnsweringDataLoader(
            data_config).load()
        features, labels = next(iter(dataset))

        self.assertCountEqual(
            ['input_word_ids', 'input_mask', 'input_type_ids'],
            features.keys())
        self.assertEqual(features['input_word_ids'].shape,
                         (batch_size, seq_length))
        self.assertEqual(features['input_mask'].shape,
                         (batch_size, seq_length))
        self.assertEqual(features['input_type_ids'].shape,
                         (batch_size, seq_length))

        self.assertCountEqual(['start_positions', 'end_positions'],
                              labels.keys())
        self.assertEqual(labels['start_positions'].shape, (batch_size,))
        self.assertEqual(labels['end_positions'].shape, (batch_size,))
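Example #3's `_create_fake_dataset` helper is not shown above. A minimal sketch of what it could look like, assuming the loader's raw TFRecord schema uses `input_ids`, `input_mask`, `segment_ids`, `start_positions`, and `end_positions` (an assumption; the loader would then remap the first three to the `input_word_ids`/`input_mask`/`input_type_ids` keys asserted in the test):

import tensorflow as tf


def _create_fake_dataset(output_path, seq_length, num_examples=10):
    """Writes fake SQuAD-style training records to output_path."""

    def _int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    with tf.io.TFRecordWriter(output_path) as writer:
        for _ in range(num_examples):
            features = {
                # Assumed raw feature names; the dataloader renames them.
                'input_ids': _int64_feature([1] * seq_length),
                'input_mask': _int64_feature([1] * seq_length),
                'segment_ids': _int64_feature([0] * seq_length),
                'start_positions': _int64_feature([0]),
                'end_positions': _int64_feature([1]),
            }
            writer.write(
                tf.train.Example(
                    features=tf.train.Features(feature=features)
                ).SerializeToString())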
Example #4
def write_question_answering(task,
                             model,
                             input_file,
                             output_file,
                             predict_batch_size,
                             seq_length,
                             tokenization,
                             vocab_file,
                             do_lower_case,
                             version_2_with_negative=False):
  """Makes question answering predictions and writes to output file."""
  data_config = question_answering_dataloader.QADataConfig(
      do_lower_case=do_lower_case,
      doc_stride=128,
      drop_remainder=False,
      global_batch_size=predict_batch_size,
      input_path=input_file,
      is_training=False,
      query_length=64,
      seq_length=seq_length,
      tokenization=tokenization,
      version_2_with_negative=version_2_with_negative,
      vocab_file=vocab_file)
  all_predictions, _, _ = question_answering.predict(task, data_config, model)
  with tf.io.gfile.GFile(output_file, 'w') as writer:
    writer.write(json.dumps(all_predictions, indent=4) + '\n')
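A hypothetical call site for the helper above; the task and model objects, file paths, and tokenizer settings are all placeholders:

# All values below are placeholders for illustration.
write_question_answering(
    task=task,                        # a configured question-answering task
    model=model,                      # the trained Keras model
    input_file='/tmp/dev-v1.1.json',
    output_file='/tmp/predictions.json',
    predict_batch_size=8,
    seq_length=384,
    tokenization='WordPiece',
    vocab_file='/tmp/vocab.txt',
    do_lower_case=True)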
Example #5
    def _get_validation_data_config(self, version_2_with_negative=False):
        return question_answering_dataloader.QADataConfig(
            is_training=False,
            input_path=self._val_input_path,
            input_preprocessed_data_path=self.get_temp_dir(),
            seq_length=128,
            global_batch_size=1,
            version_2_with_negative=version_2_with_negative,
            vocab_file=self._test_vocab,
            tokenization="WordPiece",
            do_lower_case=True)
Example #6
    def _representative_dataset():
        dataset_params = question_answering_dataloader.QADataConfig()
        dataset_params.input_path = SQUAD_TRAIN_SPLIT
        dataset_params.drop_remainder = False
        dataset_params.global_batch_size = 1
        dataset_params.is_training = True

        dataset = orbit.utils.make_distributed_dataset(
            tf.distribute.get_strategy(), build_inputs, dataset_params)
        # Yield 100 batches so the converter can calibrate activation ranges.
        for example in dataset.take(100):
            inputs = example[0]
            input_word_ids = inputs['input_word_ids']
            input_mask = inputs['input_mask']
            input_type_ids = inputs['input_type_ids']
            yield [input_word_ids, input_mask, input_type_ids]
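A generator like `_representative_dataset` is typically wired into TFLite post-training quantization, which is why it yields plain input tensors. A sketch, assuming a SavedModel already exported to a hypothetical `saved_model_dir`:

# Sketch: calibrate post-training quantization with the generator above.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = _representative_dataset
tflite_model = converter.convert()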
Example #7
    def setUp(self):
        super(XLNetQuestionAnsweringTaskTest, self).setUp()
        self._encoder_config = encoders.EncoderConfig(
            type="xlnet",
            xlnet=encoders.XLNetEncoderConfig(vocab_size=30522, num_layers=1))
        self._train_data_config = question_answering_dataloader.QADataConfig(
            input_path="dummy",
            seq_length=128,
            global_batch_size=2,
            xlnet_format=True)

        val_data = {
            "version": "2.0",
            "data": [{
                "paragraphs": [{
                    "context": "Sky is blue.",
                    "qas": [{
                        "question": "What is blue?",
                        "id": "1234",
                        # SQuAD-style eval expects several human annotations,
                        # hence the repeated answers.
                        "answers": [{
                            "text": "Sky",
                            "answer_start": 0
                        }, {
                            "text": "Sky",
                            "answer_start": 0
                        }, {
                            "text": "Sky",
                            "answer_start": 0
                        }]
                    }]
                }]
            }]
        }
        self._val_input_path = os.path.join(self.get_temp_dir(),
                                            "val_data.json")
        with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
            writer.write(json.dumps(val_data, indent=4) + "\n")

        self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
        with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
            writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")