def tokenize(s):
    """Split ``s`` into a list of ``(token_text, token_type)`` pairs.

    The input is run through ``safe_decode`` before being handed to
    ``_tokenize.tokenize``, which yields ``(start, length, token_type)``
    triples. Each span is then cut out of the ``safe_encode``-d form of the
    input, i.e. the offsets are applied to the encoded string rather than
    the decoded one (presumably they are byte offsets -- confirm against
    ``_tokenize``).

    Returns:
        A list of 2-tuples: the span's text passed back through
        ``safe_decode``, and the result of ``token_types.from_id`` for the
        span's type id.
    """
    decoded = safe_decode(s)
    encoded = safe_encode(s)

    tokens = []
    for offset, size, type_id in _tokenize.tokenize(decoded):
        # Slice the encoded form, then decode just that piece.
        text = safe_decode(encoded[offset:offset + size])
        tokens.append((text, token_types.from_id(type_id)))
    return tokens
def tokenize_raw(s):
    """Return the unprocessed output of ``_tokenize.tokenize`` for ``s``.

    The input is passed through ``safe_decode`` first; the tokenizer's
    result is returned as-is, with no slicing or type-id conversion.
    """
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)