Example #1
def test_encode(self):
  self.assertListEqual(
      [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
      tokenizer.encode(u"Dude - that's so cool."))
  self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
                       tokenizer.encode(u"Łukasz est né en 1981."))
  self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
                       tokenizer.encode(u" Spaces at the ends "))
  self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
  self.assertListEqual([u"two", u". \n", u"lines"],
                       tokenizer.encode(u"two. \nlines"))
Example #2
@classmethod
def build_from_generator(cls,
                         generator,
                         target_size,
                         max_subtoken_length=None,
                         reserved_tokens=None):
  """Builds a SubwordTextEncoder from the generated text.

  Args:
    generator: yields text.
    target_size: int, approximate vocabulary size to create.
    max_subtoken_length: Maximum length of a subtoken. If this is not set,
      then the runtime and memory use of creating the vocab is quadratic in
      the length of the longest token. If this is set, then it is instead
      O(max_subtoken_length * length of longest token).
    reserved_tokens: List of reserved tokens. The global variable
      `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this
      argument is `None`, it will use `RESERVED_TOKENS`.

  Returns:
    SubwordTextEncoder with `vocab_size` approximately `target_size`.
  """
  # Count token frequencies over the whole corpus.
  token_counts = collections.defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(native_to_unicode(item)):
      token_counts[tok] += 1
  # 1 and 1e3 bound the binary search over the minimum token count
  # used when growing the vocabulary toward `target_size`.
  encoder = cls.build_to_target_size(
      target_size,
      token_counts,
      1,
      1e3,
      max_subtoken_length=max_subtoken_length,
      reserved_tokens=reserved_tokens)
  return encoder
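A hedged usage sketch for the classmethod above: the corpus generator and target size are placeholders, and the import path assumes the tensor2tensor package.

from tensor2tensor.data_generators import text_encoder

def my_corpus():  # placeholder generator; any iterable of text works
  yield u"Dude - that's so cool."
  yield u"Łukasz est né en 1981."

encoder = text_encoder.SubwordTextEncoder.build_from_generator(
    my_corpus(), target_size=1024)
# For a realistically sized corpus, vocab_size lands near target_size
# (plus the reserved tokens such as padding and EOS).
print(encoder.vocab_size)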
Example #3
def encode(self, s):
  """Converts a native string to a list of subtoken ids.

  Args:
    s: a native string.

  Returns:
    a list of integers in the range [0, vocab_size)
  """
  return self._tokens_to_subtoken_ids(
      tokenizer.encode(native_to_unicode(s)))
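Continuing the sketch from Example #2 (so `encoder` here is assumed to be the SubwordTextEncoder built there), the round trip through subtoken ids looks like:

ids = encoder.encode("Dude - that's so cool.")
print(ids)                  # a list of ints, each in [0, encoder.vocab_size)
print(encoder.decode(ids))  # "Dude - that's so cool."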
Example #4
def test_invertibility_on_random_strings(self):
  for _ in range(1000):
    s = u"".join(
        six.unichr(random.randint(0, 65535)) for _ in range(10))
    self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))
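Why the round trip in this test never loses information: encode() keeps every character except the single spaces between alphanumeric tokens, which decode() reinserts. A rough sketch of that inverse, using only the convention visible in Example #1 (`decode_sketch` and `is_alnum` are hypothetical names here, not the library's API):

import unicodedata

def is_alnum(tok):
  # A token counts as alphanumeric if its first character is a
  # Unicode letter (L*) or number (N*).
  return unicodedata.category(tok[0])[0] in ("L", "N")

def decode_sketch(tokens):
  out = []
  for i, tok in enumerate(tokens):
    # Reinsert the single space that encode() dropped between two
    # adjacent alphanumeric tokens.
    if i > 0 and is_alnum(tokens[i - 1]) and is_alnum(tok):
      out.append(u" ")
    out.append(tok)
  return u"".join(out)

print(decode_sketch([u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]))
# -> Dude - that's so cool.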