Example #1
 def testEncode(self):
     self.assertEqual(
         tokenizer.encode(u"Dude - that's so cool."),
         [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])
     self.assertEqual(tokenizer.encode(u"Łukasz est né en 1981."),
                      [u"Łukasz", u"est", u"né", u"en", u"1981", u"."])
     self.assertEqual(tokenizer.encode(u" Spaces at the ends "),
                      [u" ", u"Spaces", u"at", u"the", u"ends", u" "])
     self.assertEqual(tokenizer.encode(u"802.11b"), [u"802", u".", u"11b"])
     self.assertEqual(tokenizer.encode(u"two. \nlines"),
                      [u"two", u". \n", u"lines"])
Example #2
 def test_encode(self):
   self.assertListEqual(
       [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
       tokenizer.encode(u"Dude - that's so cool."))
   self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
                        tokenizer.encode(u"Łukasz est né en 1981."))
   self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
                        tokenizer.encode(u" Spaces at the ends "))
   self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
   self.assertListEqual([u"two", u". \n", u"lines"],
                        tokenizer.encode(u"two. \nlines"))
Example #3
def debug_bpe_large_02():
    enc = BPEEncoder.load_from_file("tests/vocab_large.json")
    max_line_count = 100_000

    corpus = line_gen("/home/haukur/Projects/bpe_nmt/eng-isl.tsv")
    corpus = itertools.islice(corpus, max_line_count)

    total_matches = 0
    for line in corpus:
        greedy_ids = enc.encode(line)
        optimal_ids = enc.encode(line, greedy=False)
        total_matches += int(greedy_ids == optimal_ids)
        if greedy_ids != optimal_ids:
            print(line)
            toks = t2t_tokenizer.encode(line)
            toks = [
                tok for tok in toks if enc.encode(tok) != enc.encode(tok, greedy=False)
            ]
            toks = " ".join(toks)
            print([enc.all_symbols[token_id] for token_id in enc.encode(toks)])
            print(
                [
                    enc.all_symbols[token_id]
                    for token_id in enc.encode(toks, greedy=False)
                ]
            )
            print()

    print(total_matches / max_line_count)
Example #4
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath, target_size):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
    if tf.gfile.Exists(vocab_filepath):
        return text_encoder.SubwordTextEncoder(vocab_filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    if target_size == 2**15:
        # legacy behavior
        ret = text_encoder.SubwordTextEncoder()
        ret.build_from_token_counts(token_counts, min_count=5)
    else:
        ret = text_encoder.SubwordTextEncoder.build_to_target_size(
            target_size, token_counts, 1, 1000)
    ret.store_to_file(vocab_filepath)
    return ret
Example #5
    def build_from_generator(cls,
                             generator,
                             target_size,
                             max_subtoken_length=None,
                             reserved_tokens=None):
        """Builds a SubwordTextEncoder from the generated text.

    Args:
      generator: yields text.
      target_size: int, approximate vocabulary size to create.
      max_subtoken_length: Maximum length of a subtoken. If this is not set,
        then the runtime and memory use of creating the vocab is quadratic in
        the length of the longest token. If this is set, then it is instead
        O(max_subtoken_length * length of longest token).
      reserved_tokens: List of reserved tokens. The global variable
        `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this
        argument is `None`, it will use `RESERVED_TOKENS`.

    Returns:
      SubwordTextEncoder with `vocab_size` approximately `target_size`.
    """
        token_counts = collections.defaultdict(int)
        for item in generator:
            for tok in tokenizer.encode(native_to_unicode(item)):
                token_counts[tok] += 1
        encoder = cls.build_to_target_size(
            target_size,
            token_counts,
            1,
            1e3,
            max_subtoken_length=max_subtoken_length,
            reserved_tokens=reserved_tokens)
        return encoder
Example #6
def _token_counts(text, token_set=None):
    counts = collections.defaultdict(int)
    for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
        if token_set and token not in token_set:
            continue
        counts[token] += 1
    return counts
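The helper above counts only tokens that appear in the optional token_set, which is how the tf-idf ranking examples below restrict counting to title tokens. A hypothetical call (the input strings are illustrative, not taken from the original code):

counts = _token_counts("The cat sat on the mat. The cat purred.",
                       token_set={"cat", "mat"})
# counts["cat"] == 2, counts["mat"] == 1; every other token is skipped.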
Example #7
  def build_from_generator(cls,
                           generator,
                           target_size,
                           max_subtoken_length=None,
                           reserved_tokens=None):
    """Builds a SubwordTextEncoder from the generated text.

    Args:
      generator: yields text.
      target_size: int, approximate vocabulary size to create.
      max_subtoken_length: Maximum length of a subtoken. If this is not set,
        then the runtime and memory use of creating the vocab is quadratic in
        the length of the longest token. If this is set, then it is instead
        O(max_subtoken_length * length of longest token).
      reserved_tokens: List of reserved tokens. The global variable
        `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this
        argument is `None`, it will use `RESERVED_TOKENS`.

    Returns:
      SubwordTextEncoder with `vocab_size` approximately `target_size`.
    """
    token_counts = collections.defaultdict(int)
    for item in generator:
      for tok in tokenizer.encode(native_to_unicode(item)):
        token_counts[tok] += 1
    encoder = cls.build_to_target_size(
        target_size, token_counts, 1, 1e3,
        max_subtoken_length=max_subtoken_length,
        reserved_tokens=reserved_tokens)
    return encoder
Example #8
 def build_from_generator(
     cls,
     generator,
     max_size,
     separate_case=True,
     verbose=False,
     use_eow=True,
     max_lines=100000,
     continue_for_alphabet=True,
 ):
     token_counts = collections.defaultdict(int)
     rest = []
     if max_lines is not None:
         rest = iter(generator)
         generator = itertools.islice(generator, max_lines)
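         # Assuming `generator` is a true iterator, `rest` shares its state:
         # once the loop below consumes the first `max_lines` lines through the
         # islice, `rest` yields only the remaining lines (used further down
         # for alphabet coverage).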
     for idx, line in enumerate(generator):
         for token in t2t_tokenizer.encode(line):
             token_counts[token] += 1
     if max_lines is not None and continue_for_alphabet:
         alphabet_rest = set()
         for line in rest:
             alphabet_rest.update(line)
         alphabet_rest = "".join(alphabet_rest)
         token_counts[alphabet_rest] += 1
     return cls.build_from_token_counts(
         token_counts,
         max_size,
         separate_case=separate_case,
         verbose=verbose,
         use_eow=use_eow,
     )
Example #9
def _get_or_build_subword_text_encoder(tmp_dir):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: a string

  Returns:
    a SubwordTextEncoder.
  """
    filename = os.path.join(tmp_dir, "wiki_32k.subword_text_encoder")
    if tf.gfile.Exists(filename):
        return text_encoder.SubwordTextEncoder(filename)
    token_counts = defaultdict(int)
    for page in page_generator(tmp_dir, max_docs=1000):
        tokens = tokenizer.encode(page)
        tokens = set(tokens)
        for tok in tokens:
            token_counts[tok] += 1
    new_token_counts = defaultdict(int)
    for token, count in six.iteritems(token_counts):
        if count >= 3:
            new_token_counts[token] = count
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(new_token_counts, min_count=10)
    ret.store_to_file(filename)
    return ret
Example #10
def _rank_reference_paragraphs(wiki_title, references_content):
    """Rank and return reference paragraphs by tf-idf score on title tokens."""
    title_tokens = _tokens_to_score(
        set(tokenizer.encode(text_encoder.native_to_unicode(wiki_title))))
    ref_paragraph_info = []
    doc_counts = collections.defaultdict(int)
    for ref in references_content:
        for paragraph in ref.split("\n"):
            paragraph = _normalize_text(paragraph)
            if cc_utils.filter_paragraph(paragraph):
                # Skip paragraph
                continue
            counts = _token_counts(paragraph, title_tokens)
            for token in title_tokens:
                if counts[token]:
                    doc_counts[token] += 1
            info = {"content": paragraph, "counts": counts}
            ref_paragraph_info.append(info)

    for info in ref_paragraph_info:
        score = 0.
        for token in title_tokens:
            term_frequency = info["counts"][token]
            inv_doc_frequency = (float(len(ref_paragraph_info)) /
                                 max(doc_counts[token], 1))
            score += term_frequency * math.log(inv_doc_frequency)
        info["score"] = score

    ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
    return [info["content"] for info in ref_paragraph_info]
Example #11
def get_or_generate_vocab_es(tmp_dir, vocab_filename, vocab_size, datasets):
  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print(vocab_filepath)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  sources = datasets
  tf.logging.info("Generating vocab from: %s", str(sources))
  token_counts = defaultdict(int)
  for source in sources:
    for lang_file in source[0]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)
      print(filepath)

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          for tok in tokenizer.encode(text_encoder.native_to_unicode(line)):
            token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
Example #12
def _get_or_build_subword_text_encoder(tmp_dir):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
  Returns:
    a SubwordTextEncoder.
  """
    filepath = os.path.join(tmp_dir, "lm1b_32k.subword_text_encoder")
    if tf.gfile.Exists(filepath):
        return text_encoder.SubwordTextEncoder(filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
    ret.store_to_file(filepath)
    return ret
Example #13
def _token_counts(text, token_set=None):
  counts = collections.defaultdict(int)
  for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
    if token_set and token not in token_set:
      continue
    counts[token] += 1
  return counts
Example #14
def rank_reference_paragraphs(wiki_title, references_content, normalize=True):
  """Rank and return reference paragraphs by tf-idf score on title tokens."""
  normalized_title = _normalize_text(wiki_title)
  title_tokens = _tokens_to_score(
      set(tokenizer.encode(text_encoder.native_to_unicode(normalized_title))))
  ref_paragraph_info = []
  doc_counts = collections.defaultdict(int)
  for ref in references_content:
    for paragraph in ref.split("\n"):
      normalized_paragraph = _normalize_text(paragraph)
      if cc_utils.filter_paragraph(normalized_paragraph):
        # Skip paragraph
        continue
      counts = _token_counts(normalized_paragraph, title_tokens)
      for token in title_tokens:
        if counts[token]:
          doc_counts[token] += 1
      content = normalized_paragraph if normalize else paragraph
      info = {"content": content, "counts": counts}
      ref_paragraph_info.append(info)

  for info in ref_paragraph_info:
    score = 0.
    for token in title_tokens:
      term_frequency = info["counts"][token]
      inv_doc_frequency = (
          float(len(ref_paragraph_info)) / max(doc_counts[token], 1))
      score += term_frequency * math.log(inv_doc_frequency)
    info["score"] = score

  ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
  return [info["content"] for info in ref_paragraph_info]
Example #15
def generate_bpe_vocab(file_list, targeted_vocab_size):
    token_counts = defaultdict(int)
    for item in generator_fn(file_list):
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        targeted_vocab_size, token_counts, 1, 1e3)
    return vocab
Example #16
 def encode_with_dropout(self, text, dropout):
     ret = []
     for token in t2t_tokenizer.encode(text):
         if self._ignore_ooa:
             token = enforce_alphabet(token, self._alphabet_set)
         token = self.maybe_add_meta_symbols(token)
         ret.extend(self._encode_token_with_dropout(token, dropout))
     return ret
Example #17
def get_or_generate_vocab(data_dir,
                          tmp_dir,
                          vocab_filename,
                          vocab_size,
                          sources=None):
    """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    sources = sources or _DATA_FILE_URLS
    tf.logging.info("Generating vocab from: %s", str(sources))
    token_counts = defaultdict(int)
    for source in sources:
        url = source[0]
        filename = os.path.basename(url)
        read_type = "r:gz" if "tgz" in filename else "r"

        compressed_file = maybe_download(tmp_dir, filename, url)

        with tarfile.open(compressed_file, read_type) as corpus_tar:
            corpus_tar.extractall(tmp_dir)

        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # For some datasets a second extraction is necessary.
            if ".gz" in lang_file:
                new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                if tf.gfile.Exists(new_filepath):
                    tf.logging.info(
                        "Subdirectory %s already exists, skipping unpacking" %
                        filepath)
                else:
                    tf.logging.info("Unpacking subdirectory %s" % filepath)
                    gunzip_file(filepath, new_filepath)
                filepath = new_filepath

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget = 3.5e5 if "en" in filepath else 7e5
                for line in source_file:
                    if file_byte_budget <= 0:
                        break
                    line = line.strip()
                    file_byte_budget -= len(line)
                    for tok in tokenizer.encode(
                            text_encoder.native_to_unicode(line)):
                        token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
Example #18
 def encode(self, text, greedy=True):
     tokens = []
     for token in t2t_tokenizer.encode(text):
         token = self.maybe_add_meta_symbols(token)
         if self._ignore_ooa:
             token = enforce_alphabet(token,
                                      self._alphabet_set,
                                      lower=self.separate_case)
         tokens.append(token)
     return self._encode_tokens(tokens, greedy=greedy)
Example #19
    def encode(self, s):
        """Converts a native string to a list of subtoken ids.

    Args:
      s: a native string.
    Returns:
      a list of integers in the range [0, vocab_size)
    """
        return self._tokens_to_subtoken_ids(
            tokenizer.encode(native_to_unicode(s)))
Example #20
  def encode(self, raw_text):
    """Converts a native string to a list of subtoken ids.

    Args:
      raw_text: a native string.
    Returns:
      a list of integers in the range [0, vocab_size)
    """
    return self._tokens_to_subtoken_ids(
        tokenizer.encode(native_to_unicode(raw_text)))
Example #21
def t2t_tokenize_to_ids(text):
    """Tokenize text string with tensor2tensor tokenizer."""
    token_vocab = _get_token_vocab()
    tokens = t2t_tokenizer.encode(text)
    token_ids = []
    for token in tokens:
        if token not in token_vocab:
            raise UnknownTokenError('Unknown token %s' % token)
        else:
            token_ids.append(token_vocab[token])
    return token_ids, tokens
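A hypothetical use of the function above (the token vocabulary and UnknownTokenError come from the surrounding module, which is not shown here):

try:
    token_ids, tokens = t2t_tokenize_to_ids("some input text")
except UnknownTokenError:
    # Handle out-of-vocabulary tokens however the caller prefers.
    token_ids, tokens = [], []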
Example #22
def _normalize_string(raw_str):
  """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
   A string which is ready to be tokenized using split()
  """
  return ' '.join(
      token.strip()
      for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
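Combined with the tokenizer behavior pinned down in Example #1, every token boundary becomes a single space. For instance (derived from the Example #1 test case, not taken from the original source):

_normalize_string(u"Dude - that's so cool.")
# -> u"Dude - that ' s so cool ."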
Example #23
def _normalize_string(raw_str):
    """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
   A string which is ready to be tokenized using split()
  """
    return " ".join(
        token.strip()
        for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
Example #24
def BuildStatFromCorpus(callback,
                        filter_freq=-1,
                        input_delimiters=[],
                        delim_ends=''):
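    # Note: this function iterates a module-level `all_pairs` sequence (pairs
    # of strings) that is defined elsewhere in the original script.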
    if filter_freq < 0:
        filter_freq = 0
    ignore_delimiters = len(input_delimiters) > 0

    token_dict = [{}, {}]
    size = len(all_pairs)
    for idx in range(size):
        p = all_pairs[idx]
        for i in range(2):
            if i == 0 and not ignore_delimiters:
                last_delimiter_idx = max(
                    [0] +
                    [p[i].rfind(delimiter) for delimiter in input_delimiters])
                if last_delimiter_idx > len(p[i]) - 2:
                    continue
                string_to_be_tokenized = p[i][
                    last_delimiter_idx + 2 if last_delimiter_idx > 0 else 0:]
            else:
                string_to_be_tokenized = p[i]
            toks = tokenizer.encode(string_to_be_tokenized)
            toks_processed = []
            for tok in toks:
                #if is_float(tok):
                #    continue
                if has_digits(tok):
                    continue
                toks_processed.extend(callback(tok.strip()))
            for tok in toks_processed:
                if tok in token_dict[i]:
                    token_dict[i][tok] += 1
                else:
                    token_dict[i][tok] = 1
        if idx % 1000 == 0:
            print('    %d%%' % int(idx / size * 100), end='\r')
    token_dict = dict(Counter(token_dict[0]) + Counter(token_dict[1]))
    del_keys = []
    for key in token_dict.keys():
        if token_dict[key] <= filter_freq:
            del_keys.append(key)
    for key in del_keys:
        token_dict.pop(key)
    digits = []
    digits.extend([str(d) for d in range(10) if str(d) not in digits])
    digits.extend(
        [str(d) + '_' for d in range(10) if str(d) + '_' not in digits])
    digits.append('.')
    return digits + list(token_dict.keys())
Example #25
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator_fn):
    """Inner implementation for vocab generators."""
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    token_counts = defaultdict(int)
    for item in generator_fn():
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
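A hypothetical invocation of the helper above (the paths, filename, vocabulary size, and generator are illustrative only):

def lines_from_corpus():
    with tf.gfile.Open("/tmp/corpus.txt") as corpus_file:
        for line in corpus_file:
            yield line.strip()

vocab = get_or_generate_vocab_inner(
    data_dir="/tmp/t2t_data",
    vocab_filename="vocab.subwords",
    vocab_size=8192,
    generator_fn=lines_from_corpus)
subtoken_ids = vocab.encode("some raw text")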
Example #26
def create_token_id_files(corpus_dir, output_vocab_dir):
    """Creates token id csv  files.

  Args:
    corpus_dir: input corpus directory
    output_vocab_dir: output token vocabulary csv file directory
  """
    walking_iter = gfile.Walk(corpus_dir)
    for iter_rst in walking_iter:
        valid_filenames = [
            filename for filename in iter_rst[2]
            if ".txt" in filename or "wadata" in filename
        ]
        if not valid_filenames:
            continue
        input_file_dir = iter_rst[0]
        for filename in valid_filenames:
            path = os.path.join(input_file_dir, filename)
            with gfile.Open(path, "r") as f:
                for line in f.read().lower().split("\n"):
                    tokens = tokenizer.encode(line)
                    for token in tokens:
                        word_count[token] += 1

    sorted_vocab = sorted(word_count.items(), key=operator.itemgetter(1))
    tf.logging.info("%d items in vocb", sum(word_count.values()))

    csv_file = gfile.Open(os.path.join(output_vocab_dir, "vocab.csv"), "w+")
    csv_writter = csv.writer(csv_file)

    rows = [["<PAD>", 0, 0], ["<EOS>", 0, 1], ["<UKN>", 0, 2],
            ["<START>", 0, 3]]
    for row in rows:
        csv_writter.writerow(row)
    start_index = len(rows)
    for word_freq in reversed(sorted_vocab):
        row = [word_freq[0], word_freq[1], start_index]
        freq_count[word_freq[1]] += 1
        start_index += 1
        csv_writter.writerow(row)
    tf.logging.info("vocab_size=%d", start_index)
    tf.logging.info("token frequency count")
    tf.logging.info(sorted(freq_count.items(), key=operator.itemgetter(1)))
    csv_file.close()
Example #27
def get_or_generate_vocab_inner(data_dir,
                                vocab_filename,
                                vocab_size,
                                generator,
                                num_iterations=1e3):
    """Inner implementation for vocab generators.

    Note: this variant sets the minimum token count to 50.

    Args:
        data_dir: The base directory where data and vocab files are stored. If None,
            then do not save the vocab even if it doesn't exist.
        vocab_filename: relative filename where vocab file is stored
        vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
        generator: a generator that produces tokens from the vocabulary

    Returns:
        A SubwordTextEncoder vocabulary object.
    """
    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = defaultdict(int)
    for item in generator:
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 50, 1e3, int(num_iterations))

    if vocab_filepath is not None:
        vocab.store_to_file(vocab_filepath)
    return vocab
Example #28
def generate_bpe_vocab(file_list, targeted_vocab_size):
    def generator_fn():
        for filepath in file_list:
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                #file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
                for line in source_file:
                    #if file_byte_budget <= 0:
                    #    break
                    line = line.strip()
                    #file_byte_budget -= len(line)
                    yield line

    token_counts = defaultdict(int)
    for item in generator_fn():
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        targeted_vocab_size, token_counts, 1, 1e3)

    return vocab
Example #29
def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index,
                                 vocab_filename, vocab_size):
    r"""Generate a vocabulary from a tabbed source file.

  The source is a file of source, target pairs, where each line contains
  a source string and a target string, separated by a tab ('\t') character.
  The index parameter specifies 0 for the source or 1 for the target.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to the temporary directory.
    source_filename: the name of the tab-separated source file.
    index: index.
    vocab_filename: the name of the vocabulary file.
    vocab_size: vocabulary size.

  Returns:
    The vocabulary.
  """
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    # Use Tokenizer to count the word occurrences.
    token_counts = defaultdict(int)
    filepath = os.path.join(tmp_dir, source_filename)
    with tf.gfile.GFile(filepath, mode="r") as source_file:
        for line in source_file:
            line = line.strip()
            if line and "\t" in line:
                parts = line.split("\t", maxsplit=1)
                part = parts[index].strip()
                for tok in tokenizer.encode(
                        text_encoder.native_to_unicode(part)):
                    token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
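The docstring above describes the expected input: one source/target pair per line, separated by a tab. A minimal sketch of such a file and call (the file name, contents, and vocabulary size are hypothetical):

# contents of tmp_dir/eng-deu.tsv, tab-separated:
#   Hello world<TAB>Hallo Welt
#   How are you?<TAB>Wie geht es dir?
source_vocab = get_or_generate_tabbed_vocab(
    data_dir, tmp_dir, source_filename="eng-deu.tsv", index=0,
    vocab_filename="vocab.eng.subwords", vocab_size=8192)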
Example #30
def _get_or_build_subword_text_encoder(tmp_dir,
                                       vocab_filepath,
                                       target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  if target_size == 2 ** 15:
    # legacy behavior
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
  else:
    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret
Example #31
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
  """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If None,
        then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)

  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)

  if vocab_filepath is not None:
    vocab.store_to_file(vocab_filepath)
  return vocab
Example #32
 def encode(text):
     if mode == 'character':
         return list(text)
     else:
         return tokenizer.encode(text)
Example #33
 def test_invertibility_on_random_strings(self):
     for _ in range(1000):
         s = u"".join(
             six.unichr(random.randint(0, 65535)) for _ in range(10))
         self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #34
    def encode(self, sentence, emit_eos=False):
        toks = tokenizer.encode(sentence)
        if len(toks) == 0:
            print("|%s|" % sentence)
        if self.emit_spaces:
            toks_spaced = []
            for tok_idx in range(len(toks)):
                toks_spaced.append(toks[tok_idx])
                if not (tok_idx == len(toks) - 1 and self.omit_ending_space):
                    toks_spaced.append('П')
            toks = toks_spaced
        if self.emit_newword:
            toks = divide_numbers(toks)
        results = []
        emitted_nw = 0
        emitted_toks = 0
        #print("Encoding '%s'" % sentence)
        for enc_idx in range(len(self.encoders)):
            results.append([])
            emitted = 0
            enc = self.encoders[enc_idx]

            cut_single_for_copy = False
            if enc['type'] == 'copy':
                cut_single_for_copy = len(self.encoders) > 1
                enc = enc['mirror']

            if enc['type'] == 'char':
                if enc['drop_end']:
                    encode_func = lambda tok: \
                        self.t2t_encs[enc_idx].encode(Multiencoder.prepare_char_drop_end(tok))
                else:
                    encode_func = lambda tok: self.t2t_encs[
                        enc_idx].encode_without_tokenizing(tok)
            elif enc['type'] == 'bigram':
                encode_func = \
                    lambda tok: self.t2t_encs[enc_idx]\
                                    .encode(\
                                        ' '.join(Multiencoder.prepare_ngrams(tok.strip(), 2, enc['drop_end']))
                                           )
            elif enc['type'] == 'trigram':
                encode_func = \
                    lambda tok: self.t2t_encs[enc_idx]\
                                    .encode(\
                                        ' '.join(Multiencoder.prepare_ngrams(tok.strip(), 3, enc['drop_end']))
                                           )
            elif enc['type'] == 'wordpiece':
                encode_func = lambda tok: self.t2t_encs[enc_idx].encode(tok)
            elif enc['type'] == 'token':
                encode_func = lambda tok: self.t2t_encs[enc_idx].encode(
                    self.prepare_token(tok))
            for tok in toks:
                encoded_tok = encode_func(tok)
                if cut_single_for_copy:
                    results[enc_idx].append(encoded_tok[0])
                else:
                    results[enc_idx].extend(encoded_tok)
                if self.emit_newword and enc[
                        'type'] != 'token' and not cut_single_for_copy:
                    results[enc_idx].append(self.newword_codes[enc_idx])
                    emitted_nw += 1
                elif enc['type'] == 'token' or cut_single_for_copy:
                    emitted_toks += 1
            if emit_eos:
                results[enc_idx].append(1)
        if emitted_toks % len(toks) or emitted_nw % len(toks):
            raise NameError(
                'Bad number of tokens or NEWWORDs : emitted_toks = %d, emitted_nw = %d, toks = %d'
                % (emitted_toks, emitted_nw, len(toks)))
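        # Lay the per-encoder id sequences out in one flat array: encoder i
        # occupies slot [i * max_source_len, (i + 1) * max_source_len),
        # zero-padded on the right.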
        result = np.zeros([len(self.encoders) * self.max_source_len],
                          dtype=int)
        for enc_idx in range(len(self.encoders)):
            if len(results[enc_idx]) > self.max_source_len:
                raise NameError('Cannot fit sentence in max_*_len = %d slots' %
                                self.max_source_len)

        for enc_idx in range(len(self.encoders)):
            sublist_start = self.max_source_len * enc_idx
            result[sublist_start :
                   sublist_start +
                   len(results[enc_idx])] = \
                   results[enc_idx]
            #print('Encoding %ss: ' % (self.encoders[enc_idx]['type']), results[enc_idx])
        return result
Example #35
 def count_length(self, sentence):
     toks = tokenizer.encode(sentence)
     toks = divide_numbers(toks)
     return len(toks)
Example #36
 def testInvertibilityOnRandomStrings(self):
     random.seed(123)
     for _ in xrange(1000):
         s = u"".join(
             [unichr(random.randint(0, 65535)) for _ in xrange(10)])
         self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #37
 def test_invertibility_on_random_strings(self):
   for _ in range(1000):
     s = u"".join(six.unichr(random.randint(0, 65535)) for _ in range(10))
     self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
Example #38
from tensor2tensor.data_generators import tokenizer

doc = ["I've got it! Don't you see? :)", 
"TY KU /taɪkuː/ is an American alcoholic beverage company",
"The GOAT Store (Games Of All Type Store) LLC is one of",
"188BET is an online sportsbook provider. 188BET is owned by Cube Limited",
"Avista Utilities is a U.S. energy company.",
"it has 18ml.",
"''i've got it!!!",
"小燕子穿花衣,年年春天来这里。"]

for text in doc:
  tokens = tokenizer.encode(text)
  print(tokens)

# import sys
# import six
# import unicodedata

# _ALPHANUMERIC_CHAR_SET = set(
#     six.unichr(i) for i in range(sys.maxunicode)
#     if (unicodedata.category(six.unichr(i)).startswith("L") or
#         unicodedata.category(six.unichr(i)).startswith("N")))
# n = len(_ALPHANUMERIC_CHAR_SET)
# print(n)