Example #1
    def test_corpus_token_counts_split_with_max_lines(self):
        token_counts = tokenizer.corpus_token_counts(self.corpus_path,
                                                     corpus_max_lines=5,
                                                     split_on_newlines=True)

        self.assertIn(u"slept", token_counts)
        self.assertNotIn(u"Mitch", token_counts)
Example #2
  def test_corpus_token_counts_no_split_with_max_lines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=5, split_on_newlines=False)

    self.assertIn(u"slept", token_counts)
    self.assertNotIn(u"Mitch", token_counts)
    self.assertDictContainsSubset({
        u".\n\n": 1,
        u"\n": 2,
        u".\n": 1
    }, token_counts)
Example #4
def main(unused_argv):
    gs = text_encoder.SubwordTextEncoder()
    if not FLAGS.corpus_filepattern:
        raise ValueError('Must provide --corpus_filepattern')
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
    gs.build_from_token_counts(token_counts, FLAGS.min_count,
                               FLAGS.num_iterations)
    gs.store_to_file(FLAGS.output_fn)
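Stripped of the absl flags, the pipeline in this main() is: count tokens in a corpus, grow a subword vocabulary from those counts, and write it to disk. A hedged sketch of the same three steps with placeholder values (the paths and numbers below are made up, not defaults of the script):

from tensor2tensor.data_generators import text_encoder, tokenizer

token_counts = tokenizer.corpus_token_counts(
    "/data/train-*.txt",        # hypothetical corpus glob
    corpus_max_lines=100000,    # cap on lines read; 0 reads the whole corpus
    split_on_newlines=True)

encoder = text_encoder.SubwordTextEncoder()
encoder.build_from_token_counts(token_counts,
                                min_count=5,       # minimum subtoken count
                                num_iterations=4)  # refinement passes
encoder.store_to_file("/data/vocab.subwords")      # hypothetical output path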
Example #5
def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Read or create vocabulary."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print('Vocab file written to: ' + vocab_filepath)

  if tf.gfile.Exists(vocab_filepath):
    gs = text_encoder.SubwordTextEncoder(vocab_filepath)
    return gs
  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      example_file, corpus_max_lines=1000000)
  gs = gs.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)
  return gs
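The reassignment gs = gs.build_to_target_size(...) matters: build_to_target_size returns the encoder it builds, searching between min_val and max_val for a minimum token count that lands the subword vocabulary near vocab_size (a binary search in the tensor2tensor implementation, but treat that as a detail). A hypothetical standalone use with made-up sizes:

from tensor2tensor.data_generators import text_encoder, tokenizer

token_counts = tokenizer.corpus_token_counts(
    "/tmp/examples.txt", corpus_max_lines=1000000)  # hypothetical corpus file

encoder = text_encoder.SubwordTextEncoder()
encoder = encoder.build_to_target_size(
    8192,          # desired vocabulary size (made up)
    token_counts,
    min_val=1,     # lower bound of the searched minimum count
    max_val=1000)  # upper bound of the searched minimum count
encoder.store_to_file("/tmp/vocab.subwords")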
Example #7
  def test_corpus_token_counts_split_on_newlines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=0, split_on_newlines=True)

    expected = {
        u"'": 2,
        u".": 2,
        u". ": 1,
        u"... ": 1,
        u"Groucho": 1,
        u"Marx": 1,
        u"Mitch": 1,
        u"Hedberg": 1,
        u"I": 3,
        u"in": 2,
        u"my": 2,
        u"pajamas": 2,
    }
    self.assertDictContainsSubset(expected, token_counts)
    self.assertNotIn(u".\n\n", token_counts)
    self.assertNotIn(u"\n", token_counts)
Example #9
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')

  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)

  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)

  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
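Compared with Example #4, this main() adds a second input path: --vocab_filepattern hands vocab_token_counts a file of pre-computed token counts instead of tokenizing a raw corpus, and both branches feed the same build_from_token_counts call. A hypothetical direct call (the path is made up, and the on-disk format the vocabulary file must have is version-dependent, so check tokenizer.vocab_token_counts in your checkout):

from tensor2tensor.data_generators import tokenizer

# Second argument is the max-lines cap, mirroring how FLAGS.corpus_max_lines
# is passed through above; 0 means no limit in these examples.
token_counts = tokenizer.vocab_token_counts("/data/vocab_counts.txt", 0)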
Example #11
    def test_corpus_token_counts_no_split_on_newlines(self):
        token_counts = tokenizer.corpus_token_counts(self.corpus_path,
                                                     corpus_max_lines=0,
                                                     split_on_newlines=False)

        self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts)
Example #12
  def test_corpus_token_counts_split_with_max_lines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=5, split_on_newlines=True)

    self.assertIn(u"slept", token_counts)
    self.assertNotIn(u"Mitch", token_counts)
Example #13
  def test_corpus_token_counts_no_split_on_newlines(self):
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=0, split_on_newlines=False)

    self.assertDictContainsSubset({u".\n\n": 2, u"\n": 3}, token_counts)
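The split_on_newlines flag is the only thing separating the two families of tests above: with splitting enabled every line is tokenized on its own, so no token can contain a newline, while with splitting disabled whole documents are tokenized and runs such as u".\n\n" survive as tokens. A small self-contained sketch of the difference (the temporary file and its contents are made up):

import os
import tempfile

from tensor2tensor.data_generators import tokenizer

tmp_dir = tempfile.mkdtemp()
corpus_path = os.path.join(tmp_dir, "corpus.txt")
with open(corpus_path, "w") as f:
  f.write("One fish.\n\nTwo fish.\n")

split = tokenizer.corpus_token_counts(
    corpus_path, corpus_max_lines=0, split_on_newlines=True)
merged = tokenizer.corpus_token_counts(
    corpus_path, corpus_max_lines=0, split_on_newlines=False)

assert u"\n" not in split   # newlines never survive per-line tokenization
assert u".\n\n" in merged   # ...but whole-document tokenization keeps them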