Example #1
def test_with_nonalphanum(self):
    text = 'hi world<<>><<>foo!^* bar &&  bye (%s hi)' % ZH_HELLO
    # ZH_HELLO carries a trailing space, so '(%s hi)' puts two spaces
    # between the greeting and 'hi'.
    tokens = [
        'hi', ' ', 'world', '<<>><<>', 'foo', '!^* ', 'bar', ' &&  ', 'bye',
        ' (',
        ZH_HELLO.strip(), '  ', 'hi', ')'
    ]
    tokenizer = text_encoder.Tokenizer(alphanum_only=False)
    self.assertEqual(tokens, tokenizer.tokenize(text))
    self.assertEqual(text, tokenizer.join(tokenizer.tokenize(text)))
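
With alphanum_only=False the tokenizer emits alternating alphanumeric and non-alphanumeric runs, so join is a lossless inverse of tokenize. A minimal round-trip sketch; the import path is an assumption, since the snippets only show the module name text_encoder:

# Import path is an assumption; in tensorflow_datasets this module has
# lived under core.features.text and, later, core.deprecated.text.
from tensorflow_datasets.core.deprecated.text import text_encoder

tokenizer = text_encoder.Tokenizer(alphanum_only=False)
text = 'ids: 1, 2 && 3!'
tokens = tokenizer.tokenize(text)
assert tokenizer.join(tokens) == text  # every character survives the round trip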
Example #2
def test_reserved_tokens_with_regex_chars(self):
    text = r'hello worldba\)r bar foozoo zoo FOO|<EOS>'
    tokens = [
        'hello', ' ', 'world', r'ba\)r', ' ', 'bar', ' ', 'foozoo', ' ',
        'zoo', ' ', 'FOO|', '<EOS>'
    ]
    tokenizer = text_encoder.Tokenizer(
        alphanum_only=False, reserved_tokens=['<EOS>', 'FOO|', r'ba\)r'])
    self.assertEqual(tokens, tokenizer.tokenize(text))
    self.assertEqual(text, tokenizer.join(tokenizer.tokenize(text)))
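
What this test pins down is that reserved tokens match literally even when they contain regex metacharacters. A stdlib illustration of the escaping idea, not the library's actual implementation:

import re

reserved = ['<EOS>', 'FOO|', r'ba\)r']
# Without escaping, 'FOO|' would match 'FOO' or the empty string, and
# r'ba\)r' would match 'ba)r' rather than the literal backslash sequence.
pattern = re.compile('|'.join(re.escape(tok) for tok in reserved))
assert pattern.findall(r'worldba\)r FOO|<EOS>') == [r'ba\)r', 'FOO|', '<EOS>']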
Example #3
def _token_counts_from_generator(generator, max_chars, reserved_tokens):
    """Builds token counts from generator."""
    reserved_tokens = list(reserved_tokens) + [_UNDERSCORE_REPLACEMENT]
    tokenizer = text_encoder.Tokenizer(alphanum_only=False,
                                       reserved_tokens=reserved_tokens)
    num_chars = 0
    token_counts = collections.defaultdict(int)
    for s in generator:
        s = tf.compat.as_text(s)
        # Truncate so that at most max_chars characters are ever tokenized;
        # a falsy max_chars disables the budget.
        if max_chars and (num_chars + len(s)) >= max_chars:
            s = s[:(max_chars - num_chars)]
        tokens = tokenizer.tokenize(s)
        tokens = _prepare_tokens_for_encode(tokens)
        for t in tokens:
            token_counts[t] += 1
        if max_chars:
            num_chars += len(s)
            if num_chars > max_chars:
                break
    return token_counts
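
A hedged usage sketch for the helper above; it depends on module-level names from the surrounding file (_UNDERSCORE_REPLACEMENT, _prepare_tokens_for_encode) plus the collections and tf imports, and the corpus here is purely illustrative:

def corpus():
    yield 'hello hello world'
    yield b'goodbye world'  # tf.compat.as_text also accepts UTF-8 bytes

counts = _token_counts_from_generator(
    corpus(), max_chars=10000, reserved_tokens=['<EOS>'])
# `counts` maps each prepared token to its occurrence count over the
# (budget-truncated) corpus.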
Example #4
def test_whitespace(self, s, exp):
    tokenizer = text_encoder.Tokenizer(alphanum_only=False)
    self.assertEqual(exp, tokenizer.tokenize(s))
    self.assertEqual(s, tokenizer.join(tokenizer.tokenize(s)))
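
The (s, exp) pairs arrive from a parameterized decorator that the snippet omits. Plausible cases, inferred from the alphanum_only=False behavior in Example #1, where a maximal whitespace run stays intact as a single token:

tokenizer = text_encoder.Tokenizer(alphanum_only=False)
assert tokenizer.tokenize('hi  there') == ['hi', '  ', 'there']
assert tokenizer.tokenize('hi\n\tthere') == ['hi', '\n\t', 'there']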
Example #5
def test_default(self):
    text = 'hi<<>><<>foo!^* bar &&  bye (%s hi)' % ZH_HELLO
    self.assertEqual(['hi', 'foo', 'bar', 'bye',
                      ZH_HELLO.strip(), 'hi'],
                     text_encoder.Tokenizer().tokenize(text))
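
In contrast to Example #1, the default Tokenizer drops non-alphanumeric runs entirely, so tokenization is lossy and no join can rebuild the input. A minimal sketch:

tokenizer = text_encoder.Tokenizer()  # alphanum_only defaults to True, per the test above
assert tokenizer.tokenize('hi!! && bye') == ['hi', 'bye']  # punctuation and spaces are gone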