def setUp(self):
  super(PreprocessingSmithTest, self).setUp()
  doc_one_text = (
      "I am in Dominick's for my dinner. OK, no problem. I am "
      "in Dominick's for my dinner which is the best dinner I have "
      "in my whole life.")
  doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip()
  vocab_tokens = [
      "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in", "for",
      "my", "dinner", "ok", "no", "problem", "which", "is", "the", "be",
      "##s", "##t", ","
  ]
  with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
    vocab_writer.write("".join(
        [x + "\n" for x in vocab_tokens]).encode("utf-8"))
    self.vocab_file = vocab_writer.name
  self.tokenizer = tokenization.FullTokenizer(
      vocab_file=self.vocab_file, do_lower_case=True)
  self.vocab_words = list(self.tokenizer.vocab.keys())
  self.rng = random.Random(12345)
  self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens(
      doc_one_text, self.tokenizer, [0, 0])
  self.max_sent_length_by_word = 20
  self.max_doc_length_by_sentence = 3
  self.greedy_sentence_filling = True
  self.max_predictions_per_seq = 0
  self.masked_lm_prob = 0
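# Illustrative sketch, not part of the original test: since the vocab file
# above is created with delete=False, a tearDown along these lines would
# remove the temporary file once each test finishes.
def tearDown(self):
  if os.path.exists(self.vocab_file):
    os.unlink(self.vocab_file)
  super(PreprocessingSmithTest, self).tearDown()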
def tokenize_with_full_tokenizer(self):
  """Returns tokens and ids processed with FullTokenizer."""
  text = u"UNwant\u00E9d,running [unused0] [CLS] [unused55]"
  vocab_tokens = [
      "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
      "runn", "##ing", ",", "[unused0]"
  ]
  with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
    if six.PY2:
      vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
    else:
      vocab_writer.write("".join(
          [x + "\n" for x in vocab_tokens]).encode("utf-8"))
    vocab_file = vocab_writer.name
  tokenizer = tokenization.FullTokenizer(vocab_file)
  os.unlink(vocab_file)
  tokens = tokenizer.tokenize(text)
  ids = tokenizer.convert_tokens_to_ids(tokens)
  return tokens, ids
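# Illustrative sketch, not part of the original test file: a hypothetical
# test method exercising the helper above. It only checks structural
# invariants of the tokenizer output rather than specific token values.
def test_full_tokenizer_output_shapes(self):
  tokens, ids = self.tokenize_with_full_tokenizer()
  self.assertEqual(len(tokens), len(ids))
  self.assertTrue(all(isinstance(token_id, int) for token_id in ids))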
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))
  tf.logging.info("*** Reading from input files ***")
  for input_file in input_files:
    tf.logging.info("  %s", input_file)
  rng = random.Random(FLAGS.random_seed)

  # Creates training instances.
  max_predictions_per_seq = (
      FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0)
  masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0
  instances, sent_token_counter = create_training_instances_wiki_doc_pair(
      input_file=FLAGS.input_file,
      tokenizer=tokenizer,
      max_sent_length_by_word=FLAGS.max_sent_length_by_word,
      max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence,
      masked_lm_prob=masked_lm_prob,
      max_predictions_per_seq=max_predictions_per_seq,
      rng=rng)

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

  # Transfers training instances into TensorFlow examples and writes the
  # results.
  write_instance_to_example_files(instances, tokenizer, output_files)

  # Finally outputs some data statistics.
  tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d",
                  sent_token_counter[0], sent_token_counter[1],
                  len(instances))
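# Illustrative sketch, assuming the standard TF1/absl entry point used by
# similar preprocessing scripts (and that `from absl import flags` is
# available in this module); the actual file may differ. It marks the key
# flags referenced in main() as required and then dispatches to main().
if __name__ == "__main__":
  flags.mark_flag_as_required("input_file")
  flags.mark_flag_as_required("output_file")
  flags.mark_flag_as_required("vocab_file")
  tf.app.run()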