Пример #1
0
  def test_vocab_token_counts(self):
    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0)

    expected = {
        u"lollipop": 8,
        u"reverberated": 12,
        u"kattywampus": 11,
        u"balderdash": 10,
        u"jiggery-pokery": 14,
    }
    self.assertDictEqual(expected, token_counts)
Пример #2
0
  def test_vocab_token_counts_with_max_lines(self):
    # vocab-1 has 2 lines, vocab-2 has 3
    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 5)

    expected = {
        u"lollipop": 8,
        u"reverberated": 12,
        u"kattywampus": 11,
        u"balderdash": 10,
    }
    self.assertDictEqual(expected, token_counts)
Пример #3
0
def main(unused_argv):
    if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )

    elif FLAGS.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(
            FLAGS.corpus_filepattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines)

    elif FLAGS.vocab_filepattern:
        token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                    FLAGS.corpus_max_lines)

    else:
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                    FLAGS.num_iterations)
    encoder.store_to_file(FLAGS.output_filename)