Example #1
def generate_retrieval_dataset():
    """Generate retrieval test and dev dataset and returns input meta data."""
    assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    processors = {
        "bucc": sentence_retrieval_lib.BuccProcessor,
        "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
    }

    task_name = FLAGS.retrieval_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)

    return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
        processor, FLAGS.input_data_dir, tokenizer,
        FLAGS.eval_data_output_path, FLAGS.test_data_output_path,
        FLAGS.max_seq_length)
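
These examples read their configuration from absl flags defined elsewhere in the same script. A minimal sketch of the flag definitions this first example relies on is shown below; the flag names are taken from the code above, while the defaults and help strings are assumptions.

from absl import flags

FLAGS = flags.FLAGS

# Tokenizer selection and resources (defaults are assumptions).
flags.DEFINE_enum("tokenizer_impl", "word_piece",
                  ["word_piece", "sentence_piece"],
                  "Which tokenizer implementation to use.")
flags.DEFINE_string("vocab_file", None, "WordPiece vocabulary file.")
flags.DEFINE_string("sp_model_file", None, "SentencePiece model file.")
flags.DEFINE_bool("do_lower_case", True, "Whether to lower-case the input.")

# Retrieval task input/output locations.
flags.DEFINE_string("input_data_dir", None, "Directory with the raw task data.")
flags.DEFINE_string("retrieval_task_name", None, "Either 'bucc' or 'tatoeba'.")
flags.DEFINE_string("eval_data_output_path", None, "Output TFRecord for dev data.")
flags.DEFINE_string("test_data_output_path", None, "Output TFRecord for test data.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
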
Example #2
def generate_tagging_dataset():
    """Generates tagging dataset."""
    processors = {
        "panx": tagging_data_lib.PanxProcessor,
        "udpos": tagging_data_lib.UdposProcessor,
    }
    task_name = FLAGS.tagging_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenizer_impl == "sentence_piece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)
    else:
        raise ValueError("Unsupported tokenizer_impl: %s" %
                         FLAGS.tokenizer_impl)

    processor = processors[task_name]()
    return tagging_data_lib.generate_tf_record_from_data_file(
        processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
        FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
        FLAGS.test_data_output_path, processor_text_fn)
Example #3
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data."""
    assert FLAGS.input_data_dir and FLAGS.classification_task_name

    processors = {
        "cola": classifier_data_lib.ColaProcessor,
        "mnli": classifier_data_lib.MnliProcessor,
        "mrpc": classifier_data_lib.MrpcProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "sst-2": classifier_data_lib.SstProcessor,
        "xnli": classifier_data_lib.XnliProcessor,
    }
    task_name = FLAGS.classification_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    processor = processors[task_name](processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
Example #4
def generate_regression_dataset():
    """Generates regression dataset and returns input meta data."""
    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    if FLAGS.tfds_params:
        processor = classifier_data_lib.TfdsProcessor(
            tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            None,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
    else:
        raise ValueError(
            "No data processor found for the given regression task.")
Example #5
    def _preprocess_eval_data(self, params):
        eval_examples = self.squad_lib.read_squad_examples(
            input_file=params.input_path,
            is_training=False,
            version_2_with_negative=params.version_2_with_negative)

        temp_file_path = params.input_preprocessed_data_path or self.logging_dir
        if not temp_file_path:
            raise ValueError(
                'You must specify a temporary directory, either in '
                'params.input_preprocessed_data_path or logging_dir to '
                'store intermediate evaluation TFRecord data.')
        eval_writer = self.squad_lib.FeatureWriter(
            filename=os.path.join(temp_file_path, 'eval.tf_record'),
            is_training=False)
        eval_features = []

        def _append_feature(feature, is_padding):
            if not is_padding:
                eval_features.append(feature)
            eval_writer.process_feature(feature)

        # XLNet preprocesses SQuAD examples in a P, Q, class order whereas
        # BERT preprocesses in a class, Q, P order.
        xlnet_ordering = self.task_config.model.encoder.type == 'xlnet'
        kwargs = dict(examples=eval_examples,
                      max_seq_length=params.seq_length,
                      doc_stride=params.doc_stride,
                      max_query_length=params.query_length,
                      is_training=False,
                      output_fn=_append_feature,
                      batch_size=params.global_batch_size,
                      xlnet_format=xlnet_ordering)

        if params.tokenization == 'SentencePiece':
            # squad_lib_sp requires one more argument 'do_lower_case'.
            kwargs['do_lower_case'] = params.do_lower_case
            kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
                sp_model_file=params.vocab_file)
        elif params.tokenization == 'WordPiece':
            kwargs['tokenizer'] = tokenization.FullTokenizer(
                vocab_file=params.vocab_file,
                do_lower_case=params.do_lower_case)
        else:
            raise ValueError('Unexpected tokenization: %s' %
                             params.tokenization)

        eval_dataset_size = self.squad_lib.convert_examples_to_features(
            **kwargs)
        eval_writer.close()

        logging.info('***** Evaluation input stats *****')
        logging.info('  Num orig examples = %d', len(eval_examples))
        logging.info('  Num split examples = %d', len(eval_features))
        logging.info('  Batch size = %d', params.global_batch_size)
        logging.info('  Dataset size = %d', eval_dataset_size)

        return eval_writer.filename, eval_examples, eval_features
Example #6
def predict_squad(strategy, input_meta_data):
    """Makes predictions for the squad dataset."""
    bert_config = albert_configs.AlbertConfig.from_json_file(
        FLAGS.bert_config_file)
    tokenizer = tokenization.FullSentencePieceTokenizer(
        sp_model_file=FLAGS.sp_model_file)

    run_squad_helper.predict_squad(strategy, input_meta_data, tokenizer,
                                   bert_config, squad_lib_sp)
Example #7
def eval_squad(strategy, input_meta_data):
  """Evaluate on the squad dataset."""
  bert_config = albert_configs.AlbertConfig.from_json_file(
      FLAGS.bert_config_file)
  tokenizer = tokenization.FullSentencePieceTokenizer(
      sp_model_file=FLAGS.sp_model_file)

  eval_metrics = run_squad_helper.eval_squad(
      strategy, input_meta_data, tokenizer, bert_config, squad_lib_sp)
  return eval_metrics
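
A hedged sketch of how predict_squad and eval_squad above are typically driven from a main function; the input_meta_data_path flag, the meta-data file format, and the choice of distribution strategy are assumptions here.

import json

import tensorflow as tf


def main(_):
    # input_meta_data is assumed to be the JSON written at data-preparation time.
    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, "rb") as reader:
        input_meta_data = json.loads(reader.read().decode("utf-8"))

    strategy = tf.distribute.MirroredStrategy()
    predict_squad(strategy, input_meta_data)
    eval_metrics = eval_squad(strategy, input_meta_data)
    logging.info("Evaluation metrics: %s", eval_metrics)
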
Example #8
def main(_):
  tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
  create_tfrecords(
      tokenizer=tokenizer,
      input_file_or_files=FLAGS.input_file,
      use_eod_token=FLAGS.use_eod_token,
      do_lower_case=FLAGS.do_lower_case,
      per_host_batch_size=FLAGS.per_host_batch_size,
      seq_length=FLAGS.seq_length,
      reuse_length=FLAGS.reuse_length,
      bi_data=FLAGS.bi_data,
      num_cores_per_host=FLAGS.num_cores_per_host,
      save_dir=FLAGS.save_dir,
      prefix=FLAGS.prefix,
      suffix=FLAGS.suffix,
      num_tasks=FLAGS.num_tasks,
      task_id=FLAGS.task_id,
      num_passes=FLAGS.num_passes)
Example #9
def generate_tf_record_from_json_file(input_file_path,
                                      sp_model_file,
                                      output_path,
                                      translated_input_folder=None,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      xlnet_format=False,
                                      version_2_with_negative=False):
  """Generates and saves training data into a tf record file."""
  train_examples = read_squad_examples(
      input_file=input_file_path,
      is_training=True,
      version_2_with_negative=version_2_with_negative,
      translated_input_folder=translated_input_folder)
  tokenizer = tokenization.FullSentencePieceTokenizer(
      sp_model_file=sp_model_file)
  train_writer = FeatureWriter(
      filename=output_path, is_training=True)
  number_of_examples = convert_examples_to_features(
      examples=train_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature,
      xlnet_format=xlnet_format,
      do_lower_case=do_lower_case)
  train_writer.close()

  meta_data = {
      "task_type": "bert_squad",
      "train_data_size": number_of_examples,
      "max_seq_length": max_seq_length,
      "max_query_length": max_query_length,
      "doc_stride": doc_stride,
      "version_2_with_negative": version_2_with_negative,
  }

  return meta_data
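
A minimal usage sketch for the function above; the file paths are placeholders, and saving the returned meta data as JSON is an assumption about how downstream training scripts consume it.

import json

# Placeholder paths for illustration only.
meta_data = generate_tf_record_from_json_file(
    input_file_path="/tmp/squad/train-v1.1.json",
    sp_model_file="/tmp/albert/30k-clean.model",
    output_path="/tmp/squad/train.tf_record",
    max_seq_length=384,
    do_lower_case=True)

with open("/tmp/squad/train_meta_data.json", "w") as writer:
    writer.write(json.dumps(meta_data, indent=4) + "\n")
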
Example #10
    def prepare_input_data(self, data):
        """Prepares text input for transformers as tensors."""
        if self.embedding_type == "bert":
            resolved = self.embedding_layer.resolved_object
            vocab_file = resolved.vocab_file.asset_path.numpy()
            do_lower_case = resolved.do_lower_case.numpy()
            tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
        elif self.embedding_type == "albert":
            resolved = self.embedding_layer.resolved_object
            sp_model_file = resolved.sp_model_file.asset_path.numpy()
            tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file)
        else:
            raise ValueError("Unsupported embedding_type: %s" %
                             self.embedding_type)

        input_ids, input_masks, input_segments = [], [], []

        for s in data:
            stokens = tokenizer.tokenize(s)
            stokens = ["[CLS]"] + stokens + ["[SEP]"]
            input_ids.append(get_ids(stokens, tokenizer, self.max_seq_len))
            input_masks.append(get_masks(stokens, self.max_seq_len))
            input_segments.append(get_segments(stokens, self.max_seq_len))
        return input_ids, input_masks, input_segments
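
The helpers get_ids, get_masks, and get_segments used above are not shown in this example. A minimal sketch of what they typically do for BERT-style inputs, padding every list to max_seq_len, is given below; these are assumptions, not the original implementation.

def get_ids(tokens, tokenizer, max_seq_len):
    """Converts tokens to vocabulary ids and zero-pads to max_seq_len."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_seq_len - len(token_ids))


def get_masks(tokens, max_seq_len):
    """Marks real tokens with 1 and padding positions with 0."""
    return [1] * len(tokens) + [0] * (max_seq_len - len(tokens))


def get_segments(tokens, max_seq_len):
    """Assigns segment id 0 up to the first [SEP] and 1 afterwards."""
    segments = []
    current_segment = 0
    for token in tokens:
        segments.append(current_segment)
        if token == "[SEP]":
            current_segment = 1
    return segments + [0] * (max_seq_len - len(tokens))
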
Example #11
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data."""
    assert (FLAGS.input_data_dir and FLAGS.classification_task_name
            or FLAGS.tfds_params)

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    if FLAGS.tfds_params:
        processor = classifier_data_lib.TfdsProcessor(
            tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            None,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
    else:
        processors = {
            "cola": classifier_data_lib.ColaProcessor,
            "mnli": classifier_data_lib.MnliProcessor,
            "mrpc": classifier_data_lib.MrpcProcessor,
            "qnli": classifier_data_lib.QnliProcessor,
            "qqp": classifier_data_lib.QqpProcessor,
            "rte": classifier_data_lib.RteProcessor,
            "sst-2": classifier_data_lib.SstProcessor,
            "sts-b": classifier_data_lib.StsBProcessor,
            "xnli": functools.partial(
                classifier_data_lib.XnliProcessor,
                language=FLAGS.xnli_language),
            "paws-x": functools.partial(
                classifier_data_lib.PawsxProcessor,
                language=FLAGS.pawsx_language),
            "wnli": classifier_data_lib.WnliProcessor,
            "xtreme-xnli": classifier_data_lib.XtremeXnliProcessor,
            "xtreme-paws-x": classifier_data_lib.XtremePawsxProcessor,
        }
        task_name = FLAGS.classification_task_name.lower()
        if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

        processor = processors[task_name](process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            FLAGS.input_data_dir,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
Example #12
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data."""
    if FLAGS.classification_task_name in [
            "COLA",
            "WNLI",
            "SST-2",
            "MRPC",
            "QQP",
            "STS-B",
            "MNLI",
            "QNLI",
            "RTE",
            "AX",
            "SUPERGLUE-RTE",
            "CB",
            "BoolQ",
            "WIC",
    ]:
        assert not FLAGS.input_data_dir or FLAGS.tfds_params
    else:
        assert (FLAGS.input_data_dir and FLAGS.classification_task_name
                or FLAGS.tfds_params)

    if FLAGS.tokenization == "WordPiece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenization == "SentencePiece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    if FLAGS.tfds_params:
        processor = classifier_data_lib.TfdsProcessor(
            tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            None,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
    else:
        processors = {
            "ax": classifier_data_lib.AxProcessor,
            "cola": classifier_data_lib.ColaProcessor,
            "imdb": classifier_data_lib.ImdbProcessor,
            "mnli": functools.partial(
                classifier_data_lib.MnliProcessor,
                mnli_type=FLAGS.mnli_type),
            "mrpc": classifier_data_lib.MrpcProcessor,
            "qnli": classifier_data_lib.QnliProcessor,
            "qqp": classifier_data_lib.QqpProcessor,
            "rte": classifier_data_lib.RteProcessor,
            "sst-2": classifier_data_lib.SstProcessor,
            "sts-b": classifier_data_lib.StsBProcessor,
            "xnli": functools.partial(
                classifier_data_lib.XnliProcessor,
                language=FLAGS.xnli_language),
            "paws-x": functools.partial(
                classifier_data_lib.PawsxProcessor,
                language=FLAGS.pawsx_language),
            "wnli": classifier_data_lib.WnliProcessor,
            "xtreme-xnli": functools.partial(
                classifier_data_lib.XtremeXnliProcessor,
                translated_data_dir=FLAGS.translated_input_data_dir,
                only_use_en_dev=FLAGS.only_use_en_dev),
            "xtreme-paws-x": functools.partial(
                classifier_data_lib.XtremePawsxProcessor,
                translated_data_dir=FLAGS.translated_input_data_dir,
                only_use_en_dev=FLAGS.only_use_en_dev),
            "ax-g": classifier_data_lib.AXgProcessor,
            "superglue-rte": classifier_data_lib.SuperGLUERTEProcessor,
            "cb": classifier_data_lib.CBProcessor,
            "boolq": classifier_data_lib.BoolQProcessor,
            "wic": classifier_data_lib.WnliProcessor,
        }
        task_name = FLAGS.classification_task_name.lower()
        if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

        processor = processors[task_name](process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            FLAGS.input_data_dir,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
Example #13
    def init_tokenizer(self):
        sp_model_file = (
            self.model_layer.resolved_object.sp_model_file.asset_path.numpy())
        return tokenization.FullSentencePieceTokenizer(sp_model_file)
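
A minimal usage sketch for the tokenizer returned above; the instance name "model" and the sample sentence are illustrative assumptions, not part of the original class.

# "model" is a hypothetical instance of the class that defines init_tokenizer().
tokenizer = model.init_tokenizer()
tokens = tokenizer.tokenize("ALBERT uses a SentencePiece vocabulary.")
token_ids = tokenizer.convert_tokens_to_ids(tokens)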