Example #1
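The constructor of tensor2tensor's SubwordTextEncoder: it creates an internal Tokenizer, optionally loads a subword vocabulary from filename, and lets the base class reserve the first num_reserved_ids ids.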
  def __init__(self, filename=None, num_reserved_ids=2):
    """Initialize and read from a file, if provided."""
    self._tokenizer = tokenizer.Tokenizer()
    if filename is not None:
      self._load_from_file(filename)

    super(SubwordTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
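A minimal usage sketch (not part of the source), assuming the class above; "vocab.subwords" is a made-up filename, and encode/decode follow tensor2tensor's TextEncoder convention of converting between text and subword ids.

# Hypothetical usage; "vocab.subwords" is an assumed filename.
encoder = SubwordTextEncoder(filename="vocab.subwords")
ids = encoder.encode("Dude - that's so cool.")  # text -> list of subword ids
text = encoder.decode(ids)                      # subword ids -> text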
Example #2
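An invertibility test: for 1000 random 10-character Unicode strings, decode(encode(s)) must reproduce s exactly.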
 def testInvertibilityOnRandomStrings(self):
     # Python 2-era test: xrange and unichr are Python 2 builtins
     # (six.moves.xrange and six.unichr are the portable equivalents).
     t = tokenizer.Tokenizer()
     random.seed(123)
     for _ in xrange(1000):
         # A random 10-character Unicode string from code points 0-65535.
         s = u"".join(
             [unichr(random.randint(0, 65535)) for _ in xrange(10)])
         self.assertEqual(s, t.decode(t.encode(s)))
Example #3
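Expected tokenizer behavior on edge cases: tokens alternate between alphanumeric and non-alphanumeric runs, single spaces between alphanumeric tokens are dropped, and leading/trailing spaces and newlines survive as tokens.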
 def testEncode(self):
     t = tokenizer.Tokenizer()
     self.assertEqual(
         t.encode(u"Dude - that's so cool."),
         [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])
     self.assertEqual(t.encode(u"Łukasz est né en 1981."),
                      [u"Łukasz", u"est", u"né", u"en", u"1981", u"."])
     self.assertEqual(t.encode(u" Spaces at the ends "),
                      [u" ", u"Spaces", u"at", u"the", u"ends", u" "])
     self.assertEqual(t.encode(u"802.11b"), [u"802", u".", u"11b"])
     self.assertEqual(t.encode(u"two. \nlines"),
                      [u"two", u". \n", u"lines"])
Example #4
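A corpus pass that relies on a side effect: Tokenizer.encode() updates an internal token_counts dictionary, which is returned after every file matching text_filepattern has been read (or earlier, once the optional line cap is exceeded).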
 @classmethod
 def get_token_counts(cls, text_filepattern, corpus_max_lines):
   """Read the corpus and compute a dictionary of token counts."""
   tok = tokenizer.Tokenizer()
   lines_read = 0
   filenames = tf.gfile.Glob(text_filepattern)
   for text_filename in filenames:
     with tf.gfile.Open(text_filename) as f:
       for line in f:
         # The tokenizer updates token_counts in encode()
         tok.encode(_native_to_unicode(line.strip()))
         lines_read += 1
         if corpus_max_lines > 0 and lines_read > corpus_max_lines:
           return tok.token_counts
   return tok.token_counts
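A hedged invocation sketch (not from the source), assuming get_token_counts is a classmethod on SubwordTextEncoder; the file pattern and line cap are made-up values.

# Hypothetical call; "corpus/*.txt" and 10000 are assumed values.
counts = SubwordTextEncoder.get_token_counts("corpus/*.txt", corpus_max_lines=10000)
top = sorted(counts.items(), key=lambda kv: -kv[1])[:10]  # ten most frequent tokens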
Example #5
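The same encoder test as Example #3, but on native strings; the Unicode case is commented out pending a fix (see the TODO).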
 def testEncode(self):
   t = tokenizer.Tokenizer()
   self.assertEqual(
       t.encode("Dude - that's so cool."),
       ["Dude", " - ", "that", "'", "s", "so", "cool", "."])
   # TODO(lukaszkaiser): make it work again with Unicode.
   # self.assertEqual(
   #     t.encode("Łukasz est né en 1981."),
   #     ["Łukasz", "est", "né", "en", "1981", "."])
   self.assertEqual(
       t.encode(" Spaces at the ends "),
       [" ", "Spaces", "at", "the", "ends", " "])
   self.assertEqual(t.encode("802.11b"), ["802", ".", "11b"])
   self.assertEqual(t.encode("two. \nlines"), ["two", ". \n", "lines"])
Example #6
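A variant of Example #4 that builds the token-count dictionary itself from encode()'s output instead of reading the tokenizer's internal counter.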
 @classmethod
 def get_token_counts(cls, text_filepattern, corpus_max_lines):
     """Read the corpus and compute a dictionary of word counts."""
     tok = tokenizer.Tokenizer()
     token_counts = {}
     lines_read = 0
     filenames = tf.gfile.Glob(text_filepattern)
     for text_filename in filenames:
         with tf.gfile.Open(text_filename) as f:
             for line in f:
                 tokens = tok.encode(line.strip())
                 for t in tokens:
                     token_counts[t] = token_counts.get(t, 0) + 1
                 lines_read += 1
                 if corpus_max_lines > 0 and lines_read > corpus_max_lines:
                     return token_counts
     return token_counts
Example #7
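Decoding a token list back into the original sentence.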
 def testDecode(self):
     t = tokenizer.Tokenizer()
     self.assertEqual(
         t.decode(
             [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."]),
         u"Dude - that's so cool.")
Example #8
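The decoder on CJK tokens: this test expects the tokens to be rejoined with single spaces.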
 def testDecode(self):
   t = tokenizer.Tokenizer()
   self.assertEqual(
       t.decode(["你", "好", "-", "?"]),
       "你 好 - ?")
Example #9
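The inverse of Example #8: encoding the spaced CJK string yields the four individual tokens.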
 def testEncode(self):
   t = tokenizer.Tokenizer()
   self.assertEqual(
       t.encode("你 好 - ?"),
       ["你", "好", "-", "?"])
Example #10
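An invertibility test over random byte strings, currently disabled (see the TODO in the loop).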
 def testInvertibilityOnRandomStrings(self):
   t = tokenizer.Tokenizer()
   random.seed(123)
   # Python 2 byte-string round trip; six.int2byte returns a one-byte string.
   # The xrange(0) bound leaves the loop disabled until the TODO is resolved.
   for _ in xrange(0):  # TODO(lukaszkaiser): make it work again with Unicode.
     s = "".join([six.int2byte(random.randint(0, 255)) for _ in xrange(10)])
     self.assertEqual(s, t.decode(t.encode(s)))