def generate_retrieval_dataset():
  """Generates retrieval test and dev datasets and returns input meta data."""
  assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
  if FLAGS.tokenizer_impl == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenizer_impl == "sentence_piece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  processors = {
      "bucc": sentence_retrieval_lib.BuccProcessor,
      "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
  }

  task_name = FLAGS.retrieval_task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  processor = processors[task_name](process_text_fn=processor_text_fn)

  # Note: "retrevial" is the spelling used by sentence_retrieval_lib.
  return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
      processor, FLAGS.input_data_dir, tokenizer,
      FLAGS.eval_data_output_path, FLAGS.test_data_output_path,
      FLAGS.max_seq_length)
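# A minimal usage sketch for the function above. Paths and values are
# hypothetical; flag names mirror the FLAGS referenced in
# generate_retrieval_dataset(). Flags are parsed in-process via absl's
# callable FLAGS(argv) rather than the command line:
#
#   FLAGS([
#       "create_retrieval_data",
#       "--input_data_dir=/tmp/tatoeba",
#       "--retrieval_task_name=tatoeba",
#       "--tokenizer_impl=sentence_piece",
#       "--sp_model_file=/tmp/xlmr/sentencepiece.model",
#       "--eval_data_output_path=/tmp/tatoeba_dev.tf_record",
#       "--test_data_output_path=/tmp/tatoeba_test.tf_record",
#       "--max_seq_length=128",
#   ])
#   input_meta_data = generate_retrieval_dataset()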
def generate_tagging_dataset():
  """Generates tagging dataset."""
  processors = {
      "panx": tagging_data_lib.PanxProcessor,
      "udpos": tagging_data_lib.UdposProcessor,
  }
  task_name = FLAGS.tagging_task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  if FLAGS.tokenizer_impl == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  elif FLAGS.tokenizer_impl == "sentence_piece":
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
  else:
    raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)

  processor = processors[task_name]()
  return tagging_data_lib.generate_tf_record_from_data_file(
      processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
      FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
      FLAGS.test_data_output_path, processor_text_fn)
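# The tagging path is driven the same way; a hypothetical in-process
# invocation for PAN-X (WikiANN) NER data with a WordPiece vocab:
#
#   FLAGS([
#       "create_tagging_data",
#       "--tagging_task_name=panx",
#       "--input_data_dir=/tmp/panx",
#       "--tokenizer_impl=word_piece",
#       "--vocab_file=/tmp/mbert/vocab.txt",
#       "--train_data_output_path=/tmp/panx_train.tf_record",
#       "--eval_data_output_path=/tmp/panx_dev.tf_record",
#       "--test_data_output_path=/tmp/panx_test.tf_record",
#       "--max_seq_length=128",
#   ])
#   input_meta_data = generate_tagging_dataset()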
def generate_classifier_dataset():
  """Generates classifier dataset and returns input meta data."""
  assert FLAGS.input_data_dir and FLAGS.classification_task_name
  processors = {
      "cola": classifier_data_lib.ColaProcessor,
      "mnli": classifier_data_lib.MnliProcessor,
      "mrpc": classifier_data_lib.MrpcProcessor,
      "qnli": classifier_data_lib.QnliProcessor,
      "sst-2": classifier_data_lib.SstProcessor,
      "xnli": classifier_data_lib.XnliProcessor,
  }
  task_name = FLAGS.classification_task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  if FLAGS.tokenizer_impl == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenizer_impl == "sentence_piece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  processor = processors[task_name](processor_text_fn)
  return classifier_data_lib.generate_tf_record_from_data_file(
      processor,
      FLAGS.input_data_dir,
      tokenizer,
      train_data_output_path=FLAGS.train_data_output_path,
      eval_data_output_path=FLAGS.eval_data_output_path,
      max_seq_length=FLAGS.max_seq_length)
def generate_regression_dataset():
  """Generates regression dataset and returns input meta data."""
  if FLAGS.tokenizer_impl == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenizer_impl == "sentence_piece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  if FLAGS.tfds_params:
    processor = classifier_data_lib.TfdsProcessor(
        tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        None,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
  else:
    raise ValueError("No data processor found for the given regression task.")
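# Regression data can only come from TFDS here. A hedged sketch: the exact
# tfds_params keys are whatever classifier_data_lib.TfdsProcessor accepts,
# and the values below (GLUE STS-B, a sentence-pair regression task) are
# illustrative:
#
#   --tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2"
#   --tokenizer_impl=word_piece --vocab_file=/tmp/bert/vocab.txt
#   --train_data_output_path=... --eval_data_output_path=...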
def _preprocess_eval_data(self, params):
  """Preprocesses SQuAD eval examples into a TFRecord and feature list."""
  eval_examples = self.squad_lib.read_squad_examples(
      input_file=params.input_path,
      is_training=False,
      version_2_with_negative=params.version_2_with_negative)

  temp_file_path = params.input_preprocessed_data_path or self.logging_dir
  if not temp_file_path:
    raise ValueError('You must specify a temporary directory, either in '
                     'params.input_preprocessed_data_path or logging_dir to '
                     'store intermediate evaluation TFRecord data.')
  eval_writer = self.squad_lib.FeatureWriter(
      filename=os.path.join(temp_file_path, 'eval.tf_record'),
      is_training=False)
  eval_features = []

  def _append_feature(feature, is_padding):
    if not is_padding:
      eval_features.append(feature)
    eval_writer.process_feature(feature)

  # XLNet preprocesses SQuAD examples in a P, Q, class order whereas
  # BERT preprocesses in a class, Q, P order.
  xlnet_ordering = self.task_config.model.encoder.type == 'xlnet'
  kwargs = dict(
      examples=eval_examples,
      max_seq_length=params.seq_length,
      doc_stride=params.doc_stride,
      max_query_length=params.query_length,
      is_training=False,
      output_fn=_append_feature,
      batch_size=params.global_batch_size,
      xlnet_format=xlnet_ordering)

  if params.tokenization == 'SentencePiece':
    # squad_lib_sp requires one more argument 'do_lower_case'.
    kwargs['do_lower_case'] = params.do_lower_case
    kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
        sp_model_file=params.vocab_file)
  elif params.tokenization == 'WordPiece':
    kwargs['tokenizer'] = tokenization.FullTokenizer(
        vocab_file=params.vocab_file, do_lower_case=params.do_lower_case)
  else:
    raise ValueError('Unexpected tokenization: %s' % params.tokenization)

  eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
  eval_writer.close()

  logging.info('***** Evaluation input stats *****')
  logging.info('  Num orig examples = %d', len(eval_examples))
  logging.info('  Num split examples = %d', len(eval_features))
  logging.info('  Batch size = %d', params.global_batch_size)
  logging.info('  Dataset size = %d', eval_dataset_size)

  return eval_writer.filename, eval_examples, eval_features
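# A sketch of how the three return values are typically consumed downstream
# (names hypothetical): the TFRecord path feeds the evaluation input
# pipeline, while the raw examples and split features are kept around to map
# predicted start/end logits back to answer text spans.
#
#   eval_tfrecord_path, eval_examples, eval_features = (
#       task._preprocess_eval_data(task.task_config.validation_data))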
def predict_squad(strategy, input_meta_data):
  """Makes predictions for the SQuAD dataset."""
  bert_config = albert_configs.AlbertConfig.from_json_file(
      FLAGS.bert_config_file)
  tokenizer = tokenization.FullSentencePieceTokenizer(
      sp_model_file=FLAGS.sp_model_file)
  run_squad_helper.predict_squad(
      strategy, input_meta_data, tokenizer, bert_config, squad_lib_sp)
def eval_squad(strategy, input_meta_data):
  """Evaluates on the SQuAD dataset."""
  bert_config = albert_configs.AlbertConfig.from_json_file(
      FLAGS.bert_config_file)
  tokenizer = tokenization.FullSentencePieceTokenizer(
      sp_model_file=FLAGS.sp_model_file)
  eval_metrics = run_squad_helper.eval_squad(
      strategy, input_meta_data, tokenizer, bert_config, squad_lib_sp)
  return eval_metrics
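# Both wrappers above are flag-driven; a hedged sketch of wiring them to a
# distribution strategy (strategy construction and input_meta_data loading
# are assumed to happen elsewhere in the surrounding script):
#
#   strategy = tf.distribute.MirroredStrategy()
#   predict_squad(strategy, input_meta_data)
#   metrics = eval_squad(strategy, input_meta_data)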
def main(_):
  tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
  create_tfrecords(
      tokenizer=tokenizer,
      input_file_or_files=FLAGS.input_file,
      use_eod_token=FLAGS.use_eod_token,
      do_lower_case=FLAGS.do_lower_case,
      per_host_batch_size=FLAGS.per_host_batch_size,
      seq_length=FLAGS.seq_length,
      reuse_length=FLAGS.reuse_length,
      bi_data=FLAGS.bi_data,
      num_cores_per_host=FLAGS.num_cores_per_host,
      save_dir=FLAGS.save_dir,
      prefix=FLAGS.prefix,
      suffix=FLAGS.suffix,
      num_tasks=FLAGS.num_tasks,
      task_id=FLAGS.task_id,
      num_passes=FLAGS.num_passes)
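# Hypothetical command line for this entry point (script name and paths are
# illustrative; flag names match the FLAGS forwarded to create_tfrecords()
# above):
#
#   python create_pretraining_data.py \
#     --input_file=/tmp/corpus/*.txt \
#     --sp_model_file=/tmp/xlnet/spiece.model \
#     --seq_length=512 \
#     --reuse_length=256 \
#     --bi_data=True \
#     --per_host_batch_size=32 \
#     --num_cores_per_host=16 \
#     --save_dir=/tmp/pretrain_tfrecords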
def generate_tf_record_from_json_file(input_file_path,
                                      sp_model_file,
                                      output_path,
                                      translated_input_folder=None,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      xlnet_format=False,
                                      version_2_with_negative=False):
  """Generates and saves training data into a tf record file."""
  train_examples = read_squad_examples(
      input_file=input_file_path,
      is_training=True,
      version_2_with_negative=version_2_with_negative,
      translated_input_folder=translated_input_folder)
  tokenizer = tokenization.FullSentencePieceTokenizer(
      sp_model_file=sp_model_file)
  train_writer = FeatureWriter(filename=output_path, is_training=True)
  number_of_examples = convert_examples_to_features(
      examples=train_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature,
      xlnet_format=xlnet_format,
      do_lower_case=do_lower_case)
  train_writer.close()

  meta_data = {
      "task_type": "bert_squad",
      "train_data_size": number_of_examples,
      "max_seq_length": max_seq_length,
      "max_query_length": max_query_length,
      "doc_stride": doc_stride,
      "version_2_with_negative": version_2_with_negative,
  }
  return meta_data
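# A minimal sketch (hypothetical paths and helper name) of calling the
# function above directly and persisting the returned meta data next to the
# TFRecord output:
def _example_write_squad_train_data():
  import json
  import tensorflow as tf

  meta_data = generate_tf_record_from_json_file(
      input_file_path="/tmp/squad/train-v2.0.json",
      sp_model_file="/tmp/albert/30k-clean.model",
      output_path="/tmp/squad/train.tf_record",
      version_2_with_negative=True)
  # Persist meta data so downstream training jobs can read sizes/lengths.
  with tf.io.gfile.GFile("/tmp/squad/train_meta_data", "w") as writer:
    writer.write(json.dumps(meta_data, indent=4) + "\n")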
def prepare_input_data(self, data):
  """Prepares text input for transformer models (ids, masks, segments)."""
  if self.embedding_type == "bert":
    vocab_file = (
        self.embedding_layer.resolved_object.vocab_file.asset_path.numpy())
    do_lower_case = self.embedding_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
  elif self.embedding_type == "albert":
    sp_model_file = (
        self.embedding_layer.resolved_object.sp_model_file.asset_path.numpy())
    tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file)
  else:
    raise ValueError("Unsupported embedding_type: %s" % self.embedding_type)

  input_ids, input_masks, input_segments = [], [], []
  for s in data:
    stokens = tokenizer.tokenize(s)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    input_ids.append(get_ids(stokens, tokenizer, self.max_seq_len))
    input_masks.append(get_masks(stokens, self.max_seq_len))
    input_segments.append(get_segments(stokens, self.max_seq_len))
  return input_ids, input_masks, input_segments
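# A hedged usage sketch, assuming `model` is an instance of the surrounding
# class (its name is not shown here) with max_seq_len set and a TF-Hub BERT
# or ALBERT embedding layer loaded:
#
#   ids, masks, segments = model.prepare_input_data(
#       ["first example sentence", "second example sentence"])
#   # Each list has one fixed-length row per input string, ready to be
#   # stacked into the encoder's (ids, mask, segment) inputs.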
def generate_classifier_dataset():
  """Generates classifier dataset and returns input meta data."""
  assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
          FLAGS.tfds_params)

  if FLAGS.tokenizer_impl == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenizer_impl == "sentence_piece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  if FLAGS.tfds_params:
    processor = classifier_data_lib.TfdsProcessor(
        tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        None,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
  else:
    processors = {
        "cola": classifier_data_lib.ColaProcessor,
        "mnli": classifier_data_lib.MnliProcessor,
        "mrpc": classifier_data_lib.MrpcProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "qqp": classifier_data_lib.QqpProcessor,
        "rte": classifier_data_lib.RteProcessor,
        "sst-2": classifier_data_lib.SstProcessor,
        "sts-b": classifier_data_lib.StsBProcessor,
        "xnli": functools.partial(
            classifier_data_lib.XnliProcessor, language=FLAGS.xnli_language),
        "paws-x": functools.partial(
            classifier_data_lib.PawsxProcessor, language=FLAGS.pawsx_language),
        "wnli": classifier_data_lib.WnliProcessor,
        "xtreme-xnli": classifier_data_lib.XtremeXnliProcessor,
        "xtreme-paws-x": classifier_data_lib.XtremePawsxProcessor,
    }
    task_name = FLAGS.classification_task_name.lower()
    if task_name not in processors:
      raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
def generate_classifier_dataset():
  """Generates classifier dataset and returns input meta data."""
  if FLAGS.classification_task_name in [
      "COLA", "WNLI", "SST-2", "MRPC", "QQP", "STS-B", "MNLI", "QNLI", "RTE",
      "AX", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC",
  ]:
    assert not FLAGS.input_data_dir or FLAGS.tfds_params
  else:
    assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
            FLAGS.tfds_params)

  if FLAGS.tokenization == "WordPiece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  else:
    assert FLAGS.tokenization == "SentencePiece"
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)

  if FLAGS.tfds_params:
    processor = classifier_data_lib.TfdsProcessor(
        tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        None,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
  else:
    processors = {
        "ax": classifier_data_lib.AxProcessor,
        "cola": classifier_data_lib.ColaProcessor,
        "imdb": classifier_data_lib.ImdbProcessor,
        "mnli": functools.partial(
            classifier_data_lib.MnliProcessor, mnli_type=FLAGS.mnli_type),
        "mrpc": classifier_data_lib.MrpcProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "qqp": classifier_data_lib.QqpProcessor,
        "rte": classifier_data_lib.RteProcessor,
        "sst-2": classifier_data_lib.SstProcessor,
        "sts-b": classifier_data_lib.StsBProcessor,
        "xnli": functools.partial(
            classifier_data_lib.XnliProcessor, language=FLAGS.xnli_language),
        "paws-x": functools.partial(
            classifier_data_lib.PawsxProcessor, language=FLAGS.pawsx_language),
        "wnli": classifier_data_lib.WnliProcessor,
        "xtreme-xnli": functools.partial(
            classifier_data_lib.XtremeXnliProcessor,
            translated_data_dir=FLAGS.translated_input_data_dir,
            only_use_en_dev=FLAGS.only_use_en_dev),
        "xtreme-paws-x": functools.partial(
            classifier_data_lib.XtremePawsxProcessor,
            translated_data_dir=FLAGS.translated_input_data_dir,
            only_use_en_dev=FLAGS.only_use_en_dev),
        "ax-g": classifier_data_lib.AXgProcessor,
        "superglue-rte": classifier_data_lib.SuperGLUERTEProcessor,
        "cb": classifier_data_lib.CBProcessor,
        "boolq": classifier_data_lib.BoolQProcessor,
        "wic": classifier_data_lib.WiCProcessor,
    }
    task_name = FLAGS.classification_task_name.lower()
    if task_name not in processors:
      raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
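# A hypothetical in-process invocation for the variant above, producing IMDB
# sentiment records from raw data with a WordPiece vocab (IMDB is not in the
# TFDS-only task list, so --input_data_dir is required; paths illustrative):
#
#   FLAGS([
#       "create_finetuning_data",
#       "--classification_task_name=imdb",
#       "--input_data_dir=/tmp/aclImdb",
#       "--tokenization=WordPiece",
#       "--vocab_file=/tmp/bert/vocab.txt",
#       "--train_data_output_path=/tmp/imdb_train.tf_record",
#       "--eval_data_output_path=/tmp/imdb_dev.tf_record",
#       "--test_data_output_path=/tmp/imdb_test.tf_record",
#       "--max_seq_length=128",
#   ])
#   input_meta_data = generate_classifier_dataset()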
def init_tokenizer(self):
  """Builds a SentencePiece tokenizer from the hub layer's model asset."""
  sp_model_file = (
      self.model_layer.resolved_object.sp_model_file.asset_path.numpy())
  return tokenization.FullSentencePieceTokenizer(sp_model_file)
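# Usage sketch (the wrapper object and hub-layer attribute names follow the
# method above; the input text is illustrative):
#
#   tokenizer = wrapper.init_tokenizer()
#   tokens = tokenizer.tokenize("an example sentence")
#   ids = tokenizer.convert_tokens_to_ids(tokens)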