Example #1
 def testEncode(self):
     self.assertEqual(
         tokenizer.encode(u"Dude - that's so cool."),
         [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])
     self.assertEqual(tokenizer.encode(u"Łukasz est né en 1981."),
                      [u"Łukasz", u"est", u"né", u"en", u"1981", u"."])
     self.assertEqual(tokenizer.encode(u" Spaces at the ends "),
                      [u" ", u"Spaces", u"at", u"the", u"ends", u" "])
     self.assertEqual(tokenizer.encode(u"802.11b"), [u"802", u".", u"11b"])
     self.assertEqual(tokenizer.encode(u"two. \nlines"),
                      [u"two", u". \n", u"lines"])
Example #2
 def test_encode(self):
   self.assertListEqual(
       [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
       tokenizer.encode(u"Dude - that's so cool."))
   self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
                        tokenizer.encode(u"Łukasz est né en 1981."))
   self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
                        tokenizer.encode(u" Spaces at the ends "))
   self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
   self.assertListEqual([u"two", u". \n", u"lines"],
                        tokenizer.encode(u"two. \nlines"))
Example #3
def debug_bpe_large_02():
    enc = BPEEncoder.load_from_file("tests/vocab_large.json")
    max_line_count = 100_000

    corpus = line_gen("/home/haukur/Projects/bpe_nmt/eng-isl.tsv")
    corpus = itertools.islice(corpus, max_line_count)

    total_matches = 0
    for line in corpus:
        greedy_ids = enc.encode(line)
        optimal_ids = enc.encode(line, greedy=False)
        total_matches += int(greedy_ids == optimal_ids)
        if greedy_ids != optimal_ids:
            print(line)
            toks = t2t_tokenizer.encode(line)
            toks = [
                tok for tok in toks if enc.encode(tok) != enc.encode(tok, greedy=False)
            ]
            toks = " ".join(toks)
            print([enc.all_symbols[token_id] for token_id in enc.encode(toks)])
            print(
                [
                    enc.all_symbols[token_id]
                    for token_id in enc.encode(toks, greedy=False)
                ]
            )
            print()

    print(total_matches / max_line_count)
Example #4
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath, target_size):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
    if tf.gfile.Exists(vocab_filepath):
        return text_encoder.SubwordTextEncoder(vocab_filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    if target_size == 2**15:
        # legacy behavior
        ret = text_encoder.SubwordTextEncoder()
        ret.build_from_token_counts(token_counts, min_count=5)
    else:
        ret = text_encoder.SubwordTextEncoder.build_to_target_size(
            target_size, token_counts, 1, 1000)
    ret.store_to_file(vocab_filepath)
    return ret
Example #5
    def build_from_generator(cls,
                             generator,
                             target_size,
                             max_subtoken_length=None,
                             reserved_tokens=None):
        """Builds a SubwordTextEncoder from the generated text.

    Args:
      generator: yields text.
      target_size: int, approximate vocabulary size to create.
      max_subtoken_length: Maximum length of a subtoken. If this is not set,
        then the runtime and memory use of creating the vocab is quadratic in
        the length of the longest token. If this is set, then it is instead
        O(max_subtoken_length * length of longest token).
      reserved_tokens: List of reserved tokens. The global variable
        `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this
        argument is `None`, it will use `RESERVED_TOKENS`.

    Returns:
      SubwordTextEncoder with `vocab_size` approximately `target_size`.
    """
        token_counts = collections.defaultdict(int)
        for item in generator:
            for tok in tokenizer.encode(native_to_unicode(item)):
                token_counts[tok] += 1
        encoder = cls.build_to_target_size(
            target_size,
            token_counts,
            1,
            1e3,
            max_subtoken_length=max_subtoken_length,
            reserved_tokens=reserved_tokens)
        return encoder
Example #6
def _token_counts(text, token_set=None):
    counts = collections.defaultdict(int)
    for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
        if token_set and token not in token_set:
            continue
        counts[token] += 1
    return counts
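The helper above counts only tokens that appear in the optional token_set, which is how the tf-idf ranking examples below restrict counting to title tokens. A hypothetical call (the input strings are illustrative, not taken from the original code):

counts = _token_counts("The cat sat on the mat. The cat purred.",
                       token_set={"cat", "mat"})
# counts["cat"] == 2, counts["mat"] == 1; every other token is skipped.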
Example #7
  def build_from_generator(cls,
                           generator,
                           target_size,
                           max_subtoken_length=None,
                           reserved_tokens=None):
    """Builds a SubwordTextEncoder from the generated text.

    Args:
      generator: yields text.
      target_size: int, approximate vocabulary size to create.
      max_subtoken_length: Maximum length of a subtoken. If this is not set,
        then the runtime and memory use of creating the vocab is quadratic in
        the length of the longest token. If this is set, then it is instead
        O(max_subtoken_length * length of longest token).
      reserved_tokens: List of reserved tokens. The global variable
        `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this
        argument is `None`, it will use `RESERVED_TOKENS`.

    Returns:
      SubwordTextEncoder with `vocab_size` approximately `target_size`.
    """
    token_counts = collections.defaultdict(int)
    for item in generator:
      for tok in tokenizer.encode(native_to_unicode(item)):
        token_counts[tok] += 1
    encoder = cls.build_to_target_size(
        target_size, token_counts, 1, 1e3,
        max_subtoken_length=max_subtoken_length,
        reserved_tokens=reserved_tokens)
    return encoder
Example #8
 def build_from_generator(
     cls,
     generator,
     max_size,
     separate_case=True,
     verbose=False,
     use_eow=True,
     max_lines=100000,
     continue_for_alphabet=True,
 ):
     token_counts = collections.defaultdict(int)
     rest = []
     if max_lines is not None:
         rest = iter(generator)
         generator = itertools.islice(generator, max_lines)
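         # Assuming `generator` is a true iterator, `rest` shares its state:
         # once the loop below consumes the first `max_lines` lines through the
         # islice, `rest` yields only the remaining lines (used further down
         # for alphabet coverage).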
     for idx, line in enumerate(generator):
         for token in t2t_tokenizer.encode(line):
             token_counts[token] += 1
     if max_lines is not None and continue_for_alphabet:
         alphabet_rest = set()
         for line in rest:
             alphabet_rest.update(line)
         alphabet_rest = "".join(alphabet_rest)
         token_counts[alphabet_rest] += 1
     return cls.build_from_token_counts(
         token_counts,
         max_size,
         separate_case=separate_case,
         verbose=verbose,
         use_eow=use_eow,
     )
Example #9
def _get_or_build_subword_text_encoder(tmp_dir):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: a string

  Returns:
    a SubwordTextEncoder.
  """
    filename = os.path.join(tmp_dir, "wiki_32k.subword_text_encoder")
    if tf.gfile.Exists(filename):
        return text_encoder.SubwordTextEncoder(filename)
    token_counts = defaultdict(int)
    for page in page_generator(tmp_dir, max_docs=1000):
        tokens = tokenizer.encode(page)
        tokens = set(tokens)
        for tok in tokens:
            token_counts[tok] += 1
    new_token_counts = defaultdict(int)
    for token, count in six.iteritems(token_counts):
        if count >= 3:
            new_token_counts[token] = count
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(new_token_counts, min_count=10)
    ret.store_to_file(filename)
    return ret
Example #10
def _rank_reference_paragraphs(wiki_title, references_content):
    """Rank and return reference paragraphs by tf-idf score on title tokens."""
    title_tokens = _tokens_to_score(
        set(tokenizer.encode(text_encoder.native_to_unicode(wiki_title))))
    ref_paragraph_info = []
    doc_counts = collections.defaultdict(int)
    for ref in references_content:
        for paragraph in ref.split("\n"):
            paragraph = _normalize_text(paragraph)
            if cc_utils.filter_paragraph(paragraph):
                # Skip paragraph
                continue
            counts = _token_counts(paragraph, title_tokens)
            for token in title_tokens:
                if counts[token]:
                    doc_counts[token] += 1
            info = {"content": paragraph, "counts": counts}
            ref_paragraph_info.append(info)

    for info in ref_paragraph_info:
        score = 0.
        for token in title_tokens:
            term_frequency = info["counts"][token]
            inv_doc_frequency = (float(len(ref_paragraph_info)) /
                                 max(doc_counts[token], 1))
            score += term_frequency * math.log(inv_doc_frequency)
        info["score"] = score

    ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
    return [info["content"] for info in ref_paragraph_info]
Example #11
def get_or_generate_vocab_es(tmp_dir, vocab_filename, vocab_size, datasets):
  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print(vocab_filepath)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  sources = datasets
  tf.logging.info("Generating vocab from: %s", str(sources))
  token_counts = defaultdict(int)
  for source in sources:
    for lang_file in source[0]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)
      print(filepath)

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          for tok in tokenizer.encode(text_encoder.native_to_unicode(line)):
            token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
Example #12
def _get_or_build_subword_text_encoder(tmp_dir):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
  Returns:
    a SubwordTextEncoder.
  """
    filepath = os.path.join(tmp_dir, "lm1b_32k.subword_text_encoder")
    if tf.gfile.Exists(filepath):
        return text_encoder.SubwordTextEncoder(filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
    ret.store_to_file(filepath)
    return ret
Example #13
def _token_counts(text, token_set=None):
  counts = collections.defaultdict(int)
  for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
    if token_set and token not in token_set:
      continue
    counts[token] += 1
  return counts
Example #14
def rank_reference_paragraphs(wiki_title, references_content, normalize=True):
  """Rank and return reference paragraphs by tf-idf score on title tokens."""
  normalized_title = _normalize_text(wiki_title)
  title_tokens = _tokens_to_score(
      set(tokenizer.encode(text_encoder.native_to_unicode(normalized_title))))
  ref_paragraph_info = []
  doc_counts = collections.defaultdict(int)
  for ref in references_content:
    for paragraph in ref.split("\n"):
      normalized_paragraph = _normalize_text(paragraph)
      if cc_utils.filter_paragraph(normalized_paragraph):
        # Skip paragraph
        continue
      counts = _token_counts(normalized_paragraph, title_tokens)
      for token in title_tokens:
        if counts[token]:
          doc_counts[token] += 1
      content = normalized_paragraph if normalize else paragraph
      info = {"content": content, "counts": counts}
      ref_paragraph_info.append(info)

  for info in ref_paragraph_info:
    score = 0.
    for token in title_tokens:
      term_frequency = info["counts"][token]
      inv_doc_frequency = (
          float(len(ref_paragraph_info)) / max(doc_counts[token], 1))
      score += term_frequency * math.log(inv_doc_frequency)
    info["score"] = score

  ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
  return [info["content"] for info in ref_paragraph_info]
Example #15
def generate_bpe_vocab(file_list, targeted_vocab_size):
    token_counts = defaultdict(int)
    for item in generator_fn(file_list):
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        targeted_vocab_size, token_counts, 1, 1e3)
    return vocab
Example #16
 def encode_with_dropout(self, text, dropout):
     ret = []
     for token in t2t_tokenizer.encode(text):
         if self._ignore_ooa:
             token = enforce_alphabet(token, self._alphabet_set)
         token = self.maybe_add_meta_symbols(token)
         ret.extend(self._encode_token_with_dropout(token, dropout))
     return ret
Example #17
def get_or_generate_vocab(data_dir,
                          tmp_dir,
                          vocab_filename,
                          vocab_size,
                          sources=None):
    """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    sources = sources or _DATA_FILE_URLS
    tf.logging.info("Generating vocab from: %s", str(sources))
    token_counts = defaultdict(int)
    for source in sources:
        url = source[0]
        filename = os.path.basename(url)
        read_type = "r:gz" if "tgz" in filename else "r"

        compressed_file = maybe_download(tmp_dir, filename, url)

        with tarfile.open(compressed_file, read_type) as corpus_tar:
            corpus_tar.extractall(tmp_dir)

        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # For some datasets a second extraction is necessary.
            if ".gz" in lang_file:
                new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                if tf.gfile.Exists(new_filepath):
                    tf.logging.info(
                        "Subdirectory %s already exists, skipping unpacking" %
                        filepath)
                else:
                    tf.logging.info("Unpacking subdirectory %s" % filepath)
                    gunzip_file(filepath, new_filepath)
                filepath = new_filepath

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget = 3.5e5 if "en" in filepath else 7e5
                for line in source_file:
                    if file_byte_budget <= 0:
                        break
                    line = line.strip()
                    file_byte_budget -= len(line)
                    for tok in tokenizer.encode(
                            text_encoder.native_to_unicode(line)):
                        token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
Example #18
 def encode(self, text, greedy=True):
     tokens = []
     for token in t2t_tokenizer.encode(text):
         token = self.maybe_add_meta_symbols(token)
         if self._ignore_ooa:
             token = enforce_alphabet(token,
                                      self._alphabet_set,
                                      lower=self.separate_case)
         tokens.append(token)
     return self._encode_tokens(tokens, greedy=greedy)
Example #19
    def encode(self, s):
        """Converts a native string to a list of subtoken ids.

    Args:
      s: a native string.
    Returns:
      a list of integers in the range [0, vocab_size)
    """
        return self._tokens_to_subtoken_ids(
            tokenizer.encode(native_to_unicode(s)))
Example #20
  def encode(self, raw_text):
    """Converts a native string to a list of subtoken ids.

    Args:
      raw_text: a native string.
    Returns:
      a list of integers in the range [0, vocab_size)
    """
    return self._tokens_to_subtoken_ids(
        tokenizer.encode(native_to_unicode(raw_text)))
Example #21
def t2t_tokenize_to_ids(text):
    """Tokenize text string with tensor2tensor tokenizer."""
    token_vocab = _get_token_vocab()
    tokens = t2t_tokenizer.encode(text)
    token_ids = []
    for token in tokens:
        if token not in token_vocab:
            raise UnknownTokenError('Unknown token %s' % token)
        else:
            token_ids.append(token_vocab[token])
    return token_ids, tokens
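A hypothetical use of the function above (the token vocabulary and UnknownTokenError come from the surrounding module, which is not shown here):

try:
    token_ids, tokens = t2t_tokenize_to_ids("some input text")
except UnknownTokenError:
    # Handle out-of-vocabulary tokens however the caller prefers.
    token_ids, tokens = [], []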
Example #22
def _normalize_string(raw_str):
  """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
   A string which is ready to be tokenized using split()
  """
  return ' '.join(
      token.strip()
      for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
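Combined with the tokenizer behavior pinned down in Example #1, every token boundary becomes a single space. For instance (derived from the Example #1 test case, not taken from the original source):

_normalize_string(u"Dude - that's so cool.")
# -> u"Dude - that ' s so cool ."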
Example #23
def _normalize_string(raw_str):
    """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
   A string which is ready to be tokenized using split()
  """
    return " ".join(
        token.strip()
        for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
Example #24
def BuildStatFromCorpus(callback,
                        filter_freq=-1,
                        input_delimiters=[],
                        delim_ends=''):
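    # Note: this function iterates a module-level `all_pairs` sequence (pairs
    # of strings) that is defined elsewhere in the original script.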
    if filter_freq < 0:
        filter_freq = 0
    ignore_delimiters = len(input_delimiters) > 0

    token_dict = [{}, {}]
    size = len(all_pairs)
    for idx in range(size):
        p = all_pairs[idx]
        for i in range(2):
            if i == 0 and not ignore_delimiters:
                last_delimiter_idx = max(
                    [0] +
                    [p[i].rfind(delimiter) for delimiter in input_delimiters])
                if last_delimiter_idx > len(p[i]) - 2:
                    continue
                string_to_be_tokenized = p[i][
                    last_delimiter_idx + 2 if last_delimiter_idx > 0 else 0:]
            else:
                string_to_be_tokenized = p[i]
            toks = tokenizer.encode(string_to_be_tokenized)
            toks_processed = []
            for tok in toks:
                #if is_float(tok):
                #    continue
                if has_digits(tok):
                    continue
                toks_processed.extend(callback(tok.strip()))
            for tok in toks_processed:
                if tok in token_dict[i]:
                    token_dict[i][tok] += 1
                else:
                    token_dict[i][tok] = 1
        if idx % 1000 == 0:
            print('    %d%%' % int(idx / size * 100), end='\r')
    token_dict = dict(Counter(token_dict[0]) + Counter(token_dict[1]))
    del_keys = []
    for key in token_dict.keys():
        if token_dict[key] <= filter_freq:
            del_keys.append(key)
    for key in del_keys:
        token_dict.pop(key)
    digits = []
    digits.extend([str(d) for d in range(10) if str(d) not in digits])
    digits.extend(
        [str(d) + '_' for d in range(10) if str(d) + '_' not in digits])
    digits.append('.')
    return digits + list(token_dict.keys())
Example #25
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator_fn):
    """Inner implementation for vocab generators."""
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    token_counts = defaultdict(int)
    for item in generator_fn():
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
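A hypothetical invocation of the helper above (the paths, filename, vocabulary size, and generator are illustrative only):

def lines_from_corpus():
    with tf.gfile.Open("/tmp/corpus.txt") as corpus_file:
        for line in corpus_file:
            yield line.strip()

vocab = get_or_generate_vocab_inner(
    data_dir="/tmp/t2t_data",
    vocab_filename="vocab.subwords",
    vocab_size=8192,
    generator_fn=lines_from_corpus)
subtoken_ids = vocab.encode("some raw text")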
Example #26
def create_token_id_files(corpus_dir, output_vocab_dir):
    """Creates token id csv  files.

  Args:
    corpus_dir: input corpus directory
    output_vocab_dir: output token vocabulary csv file directory
  """
    walking_iter = gfile.Walk(corpus_dir)
    for iter_rst in walking_iter:
        valid_filenames = [
            filename for filename in iter_rst[2]
            if ".txt" in filename or "wadata" in filename
        ]
        if not valid_filenames:
            continue
        input_file_dir = iter_rst[0]
        for filename in valid_filenames:
            path = os.path.join(input_file_dir, filename)
            with gfile.Open(path, "r") as f:
                for line in f.read().lower().split("\n"):
                    tokens = tokenizer.encode(line)
                    for token in tokens:
                        word_count[token] += 1

    sorted_vocab = sorted(word_count.items(), key=operator.itemgetter(1))
    tf.logging.info("%d items in vocb", sum(word_count.values()))

    csv_file = gfile.Open(os.path.join(output_vocab_dir, "vocab.csv"), "w+")
    csv_writter = csv.writer(csv_file)

    rows = [["<PAD>", 0, 0], ["<EOS>", 0, 1], ["<UKN>", 0, 2],
            ["<START>", 0, 3]]
    for row in rows:
        csv_writter.writerow(row)
    start_index = len(rows)
    for word_freq in reversed(sorted_vocab):
        row = [word_freq[0], word_freq[1], start_index]
        freq_count[word_freq[1]] += 1
        start_index += 1
        csv_writter.writerow(row)
    tf.logging.info("vocab_size=%d", start_index)
    tf.logging.info("token frequency count")
    tf.logging.info(sorted(freq_count.items(), key=operator.itemgetter(1)))
    csv_file.close()
Example #27
def get_or_generate_vocab_inner(data_dir,
                                vocab_filename,
                                vocab_size,
                                generator,
                                num_iterations=1e3):
    """Inner implementation for vocab generators.

    Note: this variant sets the minimum token count to 50.

    Args:
        data_dir: The base directory where data and vocab files are stored. If None,
            then do not save the vocab even if it doesn't exist.
        vocab_filename: relative filename where vocab file is stored
        vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
        generator: a generator that produces tokens from the vocabulary

    Returns:
        A SubwordTextEncoder vocabulary object.
    """
    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = defaultdict(int)
    for item in generator:
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 50, 1e3, int(num_iterations))

    if vocab_filepath is not None:
        vocab.store_to_file(vocab_filepath)
    return vocab
Example #28
def generate_bpe_vocab(file_list, targeted_vocab_size):
    def generator_fn():
        for filepath in file_list:
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                #file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
                for line in source_file:
                    #if file_byte_budget <= 0:
                    #    break
                    line = line.strip()
                    #file_byte_budget -= len(line)
                    yield line

    token_counts = defaultdict(int)
    for item in generator_fn():
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        targeted_vocab_size, token_counts, 1, 1e3)

    return vocab
Example #29
def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index,
                                 vocab_filename, vocab_size):
    r"""Generate a vocabulary from a tabbed source file.

  The source is a file of source, target pairs, where each line contains
  a source string and a target string, separated by a tab ('\t') character.
  The index parameter specifies 0 for the source or 1 for the target.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to the temporary directory.
    source_filename: the name of the tab-separated source file.
    index: index.
    vocab_filename: the name of the vocabulary file.
    vocab_size: vocabulary size.

  Returns:
    The vocabulary.
  """
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    # Use Tokenizer to count the word occurrences.
    token_counts = defaultdict(int)
    filepath = os.path.join(tmp_dir, source_filename)
    with tf.gfile.GFile(filepath, mode="r") as source_file:
        for line in source_file:
            line = line.strip()
            if line and "\t" in line:
                parts = line.split("\t", maxsplit=1)
                part = parts[index].strip()
                for tok in tokenizer.encode(
                        text_encoder.native_to_unicode(part)):
                    token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
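The docstring above describes the expected input: one source/target pair per line, separated by a tab. A minimal sketch of such a file and call (the file name, contents, and vocabulary size are hypothetical):

# contents of tmp_dir/eng-deu.tsv, tab-separated:
#   Hello world<TAB>Hallo Welt
#   How are you?<TAB>Wie geht es dir?
source_vocab = get_or_generate_tabbed_vocab(
    data_dir, tmp_dir, source_filename="eng-deu.tsv", index=0,
    vocab_filename="vocab.eng.subwords", vocab_size=8192)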
Example #30
def _get_or_build_subword_text_encoder(tmp_dir,
                                       vocab_filepath,
                                       target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  if target_size == 2 ** 15:
    # legacy behavior
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
  else:
    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret
Example #31
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
  """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If None,
        then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)

  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)

  if vocab_filepath is not None:
    vocab.store_to_file(vocab_filepath)
  return vocab
Example #32
 def encode(text):
     if mode == 'character':
         return list(text)
     else:
         return tokenizer.encode(text)
Example #33
 def test_invertibility_on_random_strings(self):
     for _ in range(1000):
         s = u"".join(
             six.unichr(random.randint(0, 65535)) for _ in range(10))
         self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #34
    def encode(self, sentence, emit_eos=False):
        toks = tokenizer.encode(sentence)
        if len(toks) == 0:
            print("|%s|" % sentence)
        if self.emit_spaces:
            toks_spaced = []
            for tok_idx in range(len(toks)):
                toks_spaced.append(toks[tok_idx])
                if not (tok_idx == len(toks) - 1 and self.omit_ending_space):
                    toks_spaced.append('П')
            toks = toks_spaced
        if self.emit_newword:
            toks = divide_numbers(toks)
        results = []
        emitted_nw = 0
        emitted_toks = 0
        #print("Encoding '%s'" % sentence)
        for enc_idx in range(len(self.encoders)):
            results.append([])
            emitted = 0
            enc = self.encoders[enc_idx]

            cut_single_for_copy = False
            if enc['type'] == 'copy':
                cut_single_for_copy = len(self.encoders) > 1
                enc = enc['mirror']

            if enc['type'] == 'char':
                if enc['drop_end']:
                    encode_func = lambda tok: \
                        self.t2t_encs[enc_idx].encode(Multiencoder.prepare_char_drop_end(tok))
                else:
                    encode_func = lambda tok: self.t2t_encs[
                        enc_idx].encode_without_tokenizing(tok)
            elif enc['type'] == 'bigram':
                encode_func = \
                    lambda tok: self.t2t_encs[enc_idx]\
                                    .encode(\
                                        ' '.join(Multiencoder.prepare_ngrams(tok.strip(), 2, enc['drop_end']))
                                           )
            elif enc['type'] == 'trigram':
                encode_func = \
                    lambda tok: self.t2t_encs[enc_idx]\
                                    .encode(\
                                        ' '.join(Multiencoder.prepare_ngrams(tok.strip(), 3, enc['drop_end']))
                                           )
            elif enc['type'] == 'wordpiece':
                encode_func = lambda tok: self.t2t_encs[enc_idx].encode(tok)
            elif enc['type'] == 'token':
                encode_func = lambda tok: self.t2t_encs[enc_idx].encode(
                    self.prepare_token(tok))
            for tok in toks:
                encoded_tok = encode_func(tok)
                if cut_single_for_copy:
                    results[enc_idx].append(encoded_tok[0])
                else:
                    results[enc_idx].extend(encoded_tok)
                if self.emit_newword and enc[
                        'type'] != 'token' and not cut_single_for_copy:
                    results[enc_idx].append(self.newword_codes[enc_idx])
                    emitted_nw += 1
                elif enc['type'] == 'token' or cut_single_for_copy:
                    emitted_toks += 1
            if emit_eos:
                results[enc_idx].append(1)
        if emitted_toks % len(toks) or emitted_nw % len(toks):
            raise NameError(
                'Bad number of tokens or NEWWORDs : emitted_toks = %d, emitted_nw = %d, toks = %d'
                % (emitted_toks, emitted_nw, len(toks)))
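        # Lay the per-encoder id sequences out in one flat array: encoder i
        # occupies slot [i * max_source_len, (i + 1) * max_source_len),
        # zero-padded on the right.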
        result = np.zeros([len(self.encoders) * self.max_source_len],
                          dtype=int)
        for enc_idx in range(len(self.encoders)):
            if len(results[enc_idx]) > self.max_source_len:
                raise NameError('Cannot fit sentence in max_*_len = %d slots' %
                                self.max_source_len)

        for enc_idx in range(len(self.encoders)):
            sublist_start = self.max_source_len * enc_idx
            result[sublist_start :
                   sublist_start +
                   len(results[enc_idx])] = \
                   results[enc_idx]
            #print('Encoding %ss: ' % (self.encoders[enc_idx]['type']), results[enc_idx])
        return result
Example #35
 def count_length(self, sentence):
     toks = tokenizer.encode(sentence)
     toks = divide_numbers(toks)
     return len(toks)
Example #36
 def testInvertibilityOnRandomStrings(self):
     random.seed(123)
     for _ in xrange(1000):
         s = u"".join(
             [unichr(random.randint(0, 65535)) for _ in xrange(10)])
         self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #37
 def test_invertibility_on_random_strings(self):
   for _ in range(1000):
     s = u"".join(six.unichr(random.randint(0, 65535)) for _ in range(10))
     self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #38
from tensor2tensor.data_generators import tokenizer

doc = ["I've got it! Don't you see? :)", 
"TY KU /taɪkuː/ is an American alcoholic beverage company",
"The GOAT Store (Games Of All Type Store) LLC is one of",
"188BET is an online sportsbook provider. 188BET is owned by Cube Limited",
"Avista Utilities is a U.S. energy company.",
"it has 18ml.",
"''i've got it!!!",
"小燕子穿花衣,年年春天来这里。"]

for text in doc:
  tokens = tokenizer.encode(text)
  print(tokens)

# import sys
# import six
# import unicodedata

# _ALPHANUMERIC_CHAR_SET = set(
#     six.unichr(i) for i in range(sys.maxunicode)
#     if (unicodedata.category(six.unichr(i)).startswith("L") or
#         unicodedata.category(six.unichr(i)).startswith("N")))
# n = len(_ALPHANUMERIC_CHAR_SET)
# print(n)