  def test_vocab_token_counts(self):
    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0)

    expected = {
        u"lollipop": 8,
        u"reverberated": 12,
        u"kattywampus": 11,
        u"balderdash": 10,
        u"jiggery-pokery": 14,
    }
    self.assertDictEqual(expected, token_counts)
  def test_vocab_token_counts_with_max_lines(self):
    # vocab-1 has 2 lines, vocab-2 has 3
    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 5)

    expected = {
        u"lollipop": 8,
        u"reverberated": 12,
        u"kattywampus": 11,
        u"balderdash": 10,
    }
    self.assertDictEqual(expected, token_counts)
def main(unused_argv):
  """Counts tokens from a corpus or vocab files, builds and stores an encoder."""
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    # Count tokens directly from raw text files.
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    # Read pre-computed token counts from vocabulary files.
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
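
# ---------------------------------------------------------------------------
# Hedged sketch, not part of the excerpt above: main() relies on module-level
# FLAGS and on the `text_encoder` and `tokenizer` modules. The definitions
# below show one plausible setup, assuming the tensor2tensor module layout and
# the TF1-style tf.flags / tf.app.run() API; flag defaults and help strings
# are illustrative guesses. In the real script they would sit above main().
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import tokenizer

import tensorflow as tf

tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
                       'Where to write the resulting SubwordTextEncoder.')
tf.flags.DEFINE_string('corpus_filepattern', '',
                       'Filepattern of text files to count tokens in.')
tf.flags.DEFINE_string('vocab_filepattern', '',
                       'Filepattern of "token,count" vocabulary files.')
tf.flags.DEFINE_integer('min_count', 5,
                        'Minimum subtoken count in the corpus.')
tf.flags.DEFINE_integer('corpus_max_lines', 10000,
                        'Maximum number of lines to read.')
tf.flags.DEFINE_integer('num_iterations', 4,
                        'Number of build_from_token_counts iterations.')
tf.flags.DEFINE_bool('split_on_newlines', True,
                     'Break the corpus into lines before tokenizing.')
FLAGS = tf.flags.FLAGS

if __name__ == '__main__':
  tf.app.run()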