Example #1
    def test_corpus_token_counts_split_with_max_lines(self):
        token_counts = tokenizer.corpus_token_counts(self.corpus_path,
                                                     corpus_max_lines=5,
                                                     split_on_newlines=True)

        self.assertIn(u"slept", token_counts)
        self.assertNotIn(u"Mitch", token_counts)
Example #2
  def test_corpus_token_counts_no_split_with_max_lines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=5, split_on_newlines=False)

    self.assertIn(u"slept", token_counts)
    self.assertNotIn(u"Mitch", token_counts)
    self.assertDictContainsSubset({
        u".\n\n": 1,
        u"\n": 2,
        u".\n": 1
    }, token_counts)
Example #4
def main(unused_argv):
    gs = text_encoder.SubwordTextEncoder()
    if not FLAGS.corpus_filepattern:
        raise ValueError('Must provide --corpus_filepattern')
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
    gs.build_from_token_counts(token_counts, FLAGS.min_count,
                               FLAGS.num_iterations)
    gs.store_to_file(FLAGS.output_fn)
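Stripped of the absl flags, the pipeline in this main() is: count tokens in a corpus, grow a subword vocabulary from those counts, and write it to disk. A hedged sketch of the same three steps with placeholder values (the paths and numbers below are made up, not defaults of the script):

from tensor2tensor.data_generators import text_encoder, tokenizer

token_counts = tokenizer.corpus_token_counts(
    "/data/train-*.txt",        # hypothetical corpus glob
    corpus_max_lines=100000,    # cap on lines read; 0 reads the whole corpus
    split_on_newlines=True)

encoder = text_encoder.SubwordTextEncoder()
encoder.build_from_token_counts(token_counts,
                                min_count=5,       # minimum subtoken count
                                num_iterations=4)  # refinement passes
encoder.store_to_file("/data/vocab.subwords")      # hypothetical output path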
Example #5
def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Read or create vocabulary."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print('Vocab file written to: ' + vocab_filepath)

  if tf.gfile.Exists(vocab_filepath):
    gs = text_encoder.SubwordTextEncoder(vocab_filepath)
    return gs
  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      example_file, corpus_max_lines=1000000)
  gs = gs.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)
  return gs
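The reassignment gs = gs.build_to_target_size(...) matters: build_to_target_size returns the encoder it builds, searching between min_val and max_val for a minimum token count that lands the subword vocabulary near vocab_size (a binary search in the tensor2tensor implementation, but treat that as a detail). A hypothetical standalone use with made-up sizes:

from tensor2tensor.data_generators import text_encoder, tokenizer

token_counts = tokenizer.corpus_token_counts(
    "/tmp/examples.txt", corpus_max_lines=1000000)  # hypothetical corpus file

encoder = text_encoder.SubwordTextEncoder()
encoder = encoder.build_to_target_size(
    8192,          # desired vocabulary size (made up)
    token_counts,
    min_val=1,     # lower bound of the searched minimum count
    max_val=1000)  # upper bound of the searched minimum count
encoder.store_to_file("/tmp/vocab.subwords")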
Example #7
  def test_corpus_token_counts_split_on_newlines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=0, split_on_newlines=True)

    expected = {
        u"'": 2,
        u".": 2,
        u". ": 1,
        u"... ": 1,
        u"Groucho": 1,
        u"Marx": 1,
        u"Mitch": 1,
        u"Hedberg": 1,
        u"I": 3,
        u"in": 2,
        u"my": 2,
        u"pajamas": 2,
    }
    self.assertDictContainsSubset(expected, token_counts)
    self.assertNotIn(u".\n\n", token_counts)
    self.assertNotIn(u"\n", token_counts)
Example #9
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')

  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)

  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)

  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
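Compared with Example #4, this main() adds a second input path: --vocab_filepattern hands vocab_token_counts a file of pre-computed token counts instead of tokenizing a raw corpus, and both branches feed the same build_from_token_counts call. A hypothetical direct call (the path is made up, and the on-disk format the vocabulary file must have is version-dependent, so check tokenizer.vocab_token_counts in your checkout):

from tensor2tensor.data_generators import tokenizer

# Second argument is the max-lines cap, mirroring how FLAGS.corpus_max_lines
# is passed through above; 0 means no limit in these examples.
token_counts = tokenizer.vocab_token_counts("/data/vocab_counts.txt", 0)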
Example #11
    def test_corpus_token_counts_no_split_on_newlines(self):
        token_counts = tokenizer.corpus_token_counts(self.corpus_path,
                                                     corpus_max_lines=0,
                                                     split_on_newlines=False)

        self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts)
Example #12
  def test_corpus_token_counts_split_with_max_lines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=5, split_on_newlines=True)

    self.assertIn(u"slept", token_counts)
    self.assertNotIn(u"Mitch", token_counts)
Example #13
  def test_corpus_token_counts_no_split_on_newlines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=0, split_on_newlines=False)

    self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts)
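The split_on_newlines flag is the only thing separating the two families of tests above: with splitting enabled every line is tokenized on its own, so no token can contain a newline, while with splitting disabled whole documents are tokenized and runs such as u".\n\n" survive as tokens. A small self-contained sketch of the difference (the temporary file and its contents are made up):

import os
import tempfile

from tensor2tensor.data_generators import tokenizer

tmp_dir = tempfile.mkdtemp()
corpus_path = os.path.join(tmp_dir, "corpus.txt")
with open(corpus_path, "w") as f:
  f.write("One fish.\n\nTwo fish.\n")

split = tokenizer.corpus_token_counts(
    corpus_path, corpus_max_lines=0, split_on_newlines=True)
merged = tokenizer.corpus_token_counts(
    corpus_path, corpus_max_lines=0, split_on_newlines=False)

assert u"\n" not in split   # newlines never survive per-line tokenization
assert u".\n\n" in merged   # ...but whole-document tokenization keeps them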