def generate_retrieval_dataset():
  """Generates retrieval test and dev datasets and returns input meta data."""
  assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
  if FLAGS.tokenization == "WordPiece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenization == "SentencePiece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  processors = {
      "bucc": sentence_retrieval_lib.BuccProcessor,
      "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
  }

  task_name = FLAGS.retrieval_task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name](process_text_fn=processor_text_fn)

  return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
      processor, FLAGS.input_data_dir, tokenizer, FLAGS.eval_data_output_path,
      FLAGS.test_data_output_path, FLAGS.max_seq_length)

def generate_tagging_dataset():
  """Generates tagging dataset."""
  processors = {
      "panx":
          functools.partial(
              tagging_data_lib.PanxProcessor,
              only_use_en_train=FLAGS.tagging_only_use_en_train,
              only_use_en_dev=FLAGS.only_use_en_dev),
      "udpos":
          functools.partial(
              tagging_data_lib.UdposProcessor,
              only_use_en_train=FLAGS.tagging_only_use_en_train,
              only_use_en_dev=FLAGS.only_use_en_dev),
  }
  task_name = FLAGS.tagging_task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  if FLAGS.tokenization == "WordPiece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  elif FLAGS.tokenization == "SentencePiece":
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
  else:
    raise ValueError("Unsupported tokenization: %s" % FLAGS.tokenization)

  processor = processors[task_name]()
  return tagging_data_lib.generate_tf_record_from_data_file(
      processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
      FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
      FLAGS.test_data_output_path, processor_text_fn)

def generate_regression_dataset():
  """Generates regression dataset and returns input meta data."""
  if FLAGS.tokenization == "WordPiece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenization == "SentencePiece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  if FLAGS.tfds_params:
    processor = classifier_data_lib.TfdsProcessor(
        tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        None,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
  else:
    raise ValueError("No data processor found for the given regression task.")

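# Refactoring sketch (assumption, not part of the original file): the dataset
# generators above repeat the same tokenizer-selection branch, which could be
# shared as below. Only FLAGS names already referenced above are used.
def _build_tokenizer_and_text_fn():
  if FLAGS.tokenization == "WordPiece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenization == "SentencePiece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
  return tokenizer, processor_text_fn
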
def main(_):
  """Creates masked-LM pretraining examples from the input text files."""
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng, FLAGS.do_whole_word_mask, FLAGS.max_ngram_size)

  output_files = FLAGS.output_file.split(",")
  logging.info("*** Writing to output files ***")
  for output_file in output_files:
    logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files,
                                  FLAGS.gzip_compress,
                                  FLAGS.use_v2_feature_names)

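# Module-tail sketch (assumption, not shown in the original excerpt): main() is
# a flags-driven entry point, so a typical tail looks like the following. The
# flag names match the ones referenced in main() above; app and flags are
# assumed to be imported from absl, as the FLAGS usage implies.
if __name__ == "__main__":
  flags.mark_flag_as_required("input_file")
  flags.mark_flag_as_required("output_file")
  flags.mark_flag_as_required("vocab_file")
  app.run(main)
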
def predict_squad(strategy, input_meta_data):
  """Makes predictions for the SQuAD dataset."""
  bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  run_squad_helper.predict_squad(strategy, input_meta_data, tokenizer,
                                 bert_config, squad_lib_wp)

def _preprocess_eval_data(self, params):
  """Preprocesses SQuAD evaluation examples into a temporary TFRecord file."""
  eval_examples = self.squad_lib.read_squad_examples(
      input_file=params.input_path,
      is_training=False,
      version_2_with_negative=params.version_2_with_negative)

  temp_file_path = params.input_preprocessed_data_path or self.logging_dir
  if not temp_file_path:
    raise ValueError('You must specify a temporary directory, either in '
                     'params.input_preprocessed_data_path or logging_dir to '
                     'store intermediate evaluation TFRecord data.')
  eval_writer = self.squad_lib.FeatureWriter(
      filename=os.path.join(temp_file_path, 'eval.tf_record'),
      is_training=False)
  eval_features = []

  def _append_feature(feature, is_padding):
    if not is_padding:
      eval_features.append(feature)
    eval_writer.process_feature(feature)

  # XLNet preprocesses SQuAD examples in a P, Q, class order whereas
  # BERT preprocesses in a class, Q, P order.
  xlnet_ordering = self.task_config.model.encoder.type == 'xlnet'
  kwargs = dict(
      examples=eval_examples,
      max_seq_length=params.seq_length,
      doc_stride=params.doc_stride,
      max_query_length=params.query_length,
      is_training=False,
      output_fn=_append_feature,
      batch_size=params.global_batch_size,
      xlnet_format=xlnet_ordering)

  if params.tokenization == 'SentencePiece':
    # squad_lib_sp requires one more argument 'do_lower_case'.
    kwargs['do_lower_case'] = params.do_lower_case
    kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
        sp_model_file=params.vocab_file)
  elif params.tokenization == 'WordPiece':
    kwargs['tokenizer'] = tokenization.FullTokenizer(
        vocab_file=params.vocab_file, do_lower_case=params.do_lower_case)
  else:
    raise ValueError('Unexpected tokenization: %s' % params.tokenization)

  eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
  eval_writer.close()

  logging.info('***** Evaluation input stats *****')
  logging.info('  Num orig examples = %d', len(eval_examples))
  logging.info('  Num split examples = %d', len(eval_features))
  logging.info('  Batch size = %d', params.global_batch_size)
  logging.info('  Dataset size = %d', eval_dataset_size)

  return eval_writer.filename, eval_examples, eval_features

def eval_squad(strategy, input_meta_data):
  """Evaluates on the SQuAD dataset."""
  bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  eval_metrics = run_squad_helper.eval_squad(strategy, input_meta_data,
                                             tokenizer, bert_config,
                                             squad_lib_wp)
  return eval_metrics

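# Usage sketch (assumption, not part of the original file): both SQuAD helpers
# above take a tf.distribute strategy plus the input_meta_data dict produced by
# the data-generation step. The metadata path below is a hypothetical
# placeholder, and json/tf are assumed to be imported at module level.
def _example_predict_and_eval_squad():
  with tf.io.gfile.GFile("/tmp/squad_meta_data.json", "rb") as reader:
    input_meta_data = json.loads(reader.read().decode("utf-8"))
  strategy = tf.distribute.MirroredStrategy()
  predict_squad(strategy, input_meta_data)
  return eval_squad(strategy, input_meta_data)
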
def test_generate_tf_record(self, task_type):
  processor = self.processors[task_type]()
  input_data_dir = os.path.join(self.get_temp_dir(), task_type)
  tf.io.gfile.mkdir(input_data_dir)
  # Write fake train file.
  _create_fake_file(
      os.path.join(input_data_dir, "train-en.tsv"),
      processor.get_labels(),
      is_test=False)

  # Write fake dev file.
  _create_fake_file(
      os.path.join(input_data_dir, "dev-en.tsv"),
      processor.get_labels(),
      is_test=False)

  # Write fake test files.
  for lang in processor.supported_languages:
    _create_fake_file(
        os.path.join(input_data_dir, "test-%s.tsv" % lang),
        processor.get_labels(),
        is_test=True)

  output_path = os.path.join(self.get_temp_dir(), task_type, "output")
  tokenizer = tokenization.FullTokenizer(
      vocab_file=self.vocab_file, do_lower_case=True)
  metadata = tagging_data_lib.generate_tf_record_from_data_file(
      processor,
      input_data_dir,
      tokenizer,
      max_seq_length=8,
      train_data_output_path=os.path.join(output_path, "train.tfrecord"),
      eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
      test_data_output_path=os.path.join(output_path, "test_{}.tfrecord"),
      text_preprocessing=tokenization.convert_to_unicode)

  self.assertEqual(metadata["train_data_size"], 5)
  files = tf.io.gfile.glob(output_path + "/*")
  expected_files = []
  expected_files.append(os.path.join(output_path, "train.tfrecord"))
  expected_files.append(os.path.join(output_path, "eval.tfrecord"))
  for lang in processor.supported_languages:
    expected_files.append(
        os.path.join(output_path, "test_%s.tfrecord" % lang))
  self.assertCountEqual(files, expected_files)

def setUp(self):
  super(BertClassifierLibTest, self).setUp()
  self.model_dir = self.get_temp_dir()
  self.processors = {
      "CB": classifier_data_lib.CBProcessor,
      "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
      "BOOLQ": classifier_data_lib.BoolQProcessor,
      "WIC": classifier_data_lib.WiCProcessor,
  }

  vocab_tokens = [
      "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
      "##ing", ","
  ]
  with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
    vocab_writer.write("".join(
        [x + "\n" for x in vocab_tokens]).encode("utf-8"))
    vocab_file = vocab_writer.name
  self.tokenizer = tokenization.FullTokenizer(vocab_file)

def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
                                      translated_input_folder=None,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      version_2_with_negative=False,
                                      xlnet_format=False):
  """Generates and saves training data into a tf record file."""
  train_examples = read_squad_examples(
      input_file=input_file_path,
      is_training=True,
      version_2_with_negative=version_2_with_negative,
      translated_input_folder=translated_input_folder)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file_path, do_lower_case=do_lower_case)
  train_writer = FeatureWriter(filename=output_path, is_training=True)
  number_of_examples = convert_examples_to_features(
      examples=train_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature,
      xlnet_format=xlnet_format)
  train_writer.close()

  meta_data = {
      "task_type": "bert_squad",
      "train_data_size": number_of_examples,
      "max_seq_length": max_seq_length,
      "max_query_length": max_query_length,
      "doc_stride": doc_stride,
      "version_2_with_negative": version_2_with_negative,
  }

  return meta_data

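# Usage sketch (assumption; the paths below are hypothetical placeholders, and
# logging is assumed to be the absl logger used elsewhere in this codebase):
# calls the function above end-to-end and reports the size of the generated
# training set from the returned meta data.
def _example_generate_squad_training_data():
  meta_data = generate_tf_record_from_json_file(
      input_file_path="/tmp/squad/train-v1.1.json",
      vocab_file_path="/tmp/bert/vocab.txt",
      output_path="/tmp/squad/train.tf_record",
      max_seq_length=384,
      doc_stride=128,
      max_query_length=64)
  logging.info("Wrote %d training features.", meta_data["train_data_size"])
  return meta_data
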
def test_full_tokenizer(self):
  vocab_tokens = [
      "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
      "##ing", ","
  ]
  with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
    if six.PY2:
      vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
    else:
      vocab_writer.write("".join(
          [x + "\n" for x in vocab_tokens]).encode("utf-8"))

    vocab_file = vocab_writer.name

  tokenizer = tokenization.FullTokenizer(vocab_file)
  os.unlink(vocab_file)

  tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
  self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
  self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                      [7, 4, 5, 10, 8, 9])

def __init__(self,
             vocab: str,
             do_lower_case: bool,
             len_title: int = 15,
             len_passage: int = 200,
             max_num_articles: int = 5,
             include_article_title_in_passage: bool = False,
             include_text_snippet_in_example: bool = False):
  """Constructs a RawDataProcessor.

  Args:
    vocab: Filepath of the BERT vocabulary.
    do_lower_case: Whether the vocabulary is uncased or not.
    len_title: Maximum number of tokens in story headline.
    len_passage: Maximum number of tokens in article passage.
    max_num_articles: Maximum number of articles in a story.
    include_article_title_in_passage: Whether to include article title in
      article passage.
    include_text_snippet_in_example: Whether to include text snippet (headline
      and article content) in generated tensorflow Examples, for debug usage.
      If include_article_title_in_passage=True, title and body will be
      separated by [SEP].
  """
  self.articles = dict()
  self.tokenizer = tokenization.FullTokenizer(
      vocab, do_lower_case=do_lower_case, split_on_punc=False)
  self.len_title = len_title
  self.len_passage = len_passage
  self.max_num_articles = max_num_articles
  self.include_article_title_in_passage = include_article_title_in_passage
  self.include_text_snippet_in_example = include_text_snippet_in_example
  # ex_index=5 deactivates printing inside convert_single_example.
  self.ex_index = 5
  # Parameters used in InputExample, not used in NHNet.
  self.label = 0
  self.guid = 0
  self.num_generated_examples = 0

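# Construction sketch (assumption; the vocabulary path is a hypothetical
# placeholder, and the enclosing class is taken to be RawDataProcessor per the
# docstring above): only constructor arguments documented above are passed.
def _example_build_raw_data_processor():
  return RawDataProcessor(
      vocab="/tmp/bert/vocab.txt",
      do_lower_case=True,
      len_title=15,
      len_passage=200,
      max_num_articles=5,
      include_article_title_in_passage=True,
      include_text_snippet_in_example=False)
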
def generate_classifier_dataset():
  """Generates classifier dataset and returns input meta data."""
  if FLAGS.classification_task_name in [
      "COLA",
      "WNLI",
      "SST-2",
      "MRPC",
      "QQP",
      "STS-B",
      "MNLI",
      "QNLI",
      "RTE",
      "AX",
      "SUPERGLUE-RTE",
      "CB",
      "BoolQ",
      "WIC",
  ]:
    assert not FLAGS.input_data_dir or FLAGS.tfds_params
  else:
    assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
            FLAGS.tfds_params)

  if FLAGS.tokenization == "WordPiece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenization == "SentencePiece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  if FLAGS.tfds_params:
    processor = classifier_data_lib.TfdsProcessor(
        tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        None,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
  else:
    processors = {
        "ax": classifier_data_lib.AxProcessor,
        "cola": classifier_data_lib.ColaProcessor,
        "imdb": classifier_data_lib.ImdbProcessor,
        "mnli": functools.partial(
            classifier_data_lib.MnliProcessor, mnli_type=FLAGS.mnli_type),
        "mrpc": classifier_data_lib.MrpcProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "qqp": classifier_data_lib.QqpProcessor,
        "rte": classifier_data_lib.RteProcessor,
        "sst-2": classifier_data_lib.SstProcessor,
        "sts-b": classifier_data_lib.StsBProcessor,
        "xnli": functools.partial(
            classifier_data_lib.XnliProcessor, language=FLAGS.xnli_language),
        "paws-x": functools.partial(
            classifier_data_lib.PawsxProcessor, language=FLAGS.pawsx_language),
        "wnli": classifier_data_lib.WnliProcessor,
        "xtreme-xnli": functools.partial(
            classifier_data_lib.XtremeXnliProcessor,
            translated_data_dir=FLAGS.translated_input_data_dir,
            only_use_en_dev=FLAGS.only_use_en_dev),
        "xtreme-paws-x": functools.partial(
            classifier_data_lib.XtremePawsxProcessor,
            translated_data_dir=FLAGS.translated_input_data_dir,
            only_use_en_dev=FLAGS.only_use_en_dev),
        "ax-g": classifier_data_lib.AXgProcessor,
        "superglue-rte": classifier_data_lib.SuperGLUERTEProcessor,
        "cb": classifier_data_lib.CBProcessor,
        "boolq": classifier_data_lib.BoolQProcessor,
        "wic": classifier_data_lib.WiCProcessor,
    }
    task_name = FLAGS.classification_task_name.lower()
    if task_name not in processors:
      raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)

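# Driver sketch (assumption, not part of the original file): the generator
# above returns an input meta-data dict, which a driver would typically persist
# as JSON next to the TFRecords. The output path is a hypothetical placeholder,
# and json/tf are assumed to be imported at module level.
def _example_run_classifier_data_generation():
  input_meta_data = generate_classifier_dataset()
  with tf.io.gfile.GFile("/tmp/classifier_meta_data.json", "w") as writer:
    writer.write(json.dumps(input_meta_data, indent=4) + "\n")
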