Example #1
def tokenize(self, text):
    """Split text into Token(word, wordpiece) pairs."""
    # Normalized empty input maps to a single sentinel token.
    if text_utils.format_text(text) == constants.EMPTY_TEXT:
        return [Token(_EMPTY, _EMPTY)]
    tokens = []
    for token in self._basic_tokenizer.tokenize(text):
        # Keep the originating word alongside each of its wordpieces.
        for piece in self._wp_tokenizer.tokenize(token):
            tokens.append(Token(token, piece))
    return tokens
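
The method pairs every wordpiece with the word it was split from, so the mapping back to the original word survives subword tokenization. A minimal self-contained sketch of that pattern, with hypothetical stub classes standing in for the project's _basic_tokenizer and _wp_tokenizer:

from collections import namedtuple

Token = namedtuple("Token", ["original_text", "piece"])


class _StubBasicTokenizer:
    # Stand-in: a real basic tokenizer also handles punctuation and casing.
    def tokenize(self, text):
        return text.lower().split()


class _StubWordpieceTokenizer:
    # Stand-in: a real wordpiece tokenizer splits using a trained vocabulary.
    def tokenize(self, word):
        # Crude split: first four chars, then a "##"-prefixed remainder.
        if len(word) <= 4:
            return [word]
        return [word[:4], "##" + word[4:]]


basic = _StubBasicTokenizer()
wp = _StubWordpieceTokenizer()
tokens = [
    Token(word, piece)
    for word in basic.tokenize("Tokenization example")
    for piece in wp.tokenize(word)
]
# Each piece still knows its source word:
# [Token('tokenization', 'toke'), Token('tokenization', '##nization'),
#  Token('example', 'exam'), Token('example', '##ple')]
print(tokens)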
Example #2
from nltk.stem.porter import PorterStemmer


def get_cleaned_seq_tokens(str_tokens):
    """Transform a string into a cleaned list of stemmed tokens.

    Args:
      str_tokens: the string to tokenize.

    Returns:
      A list of stemmed tokens.
    """
    stemmer = PorterStemmer()
    tokens = text_utils.tokenize_text(text_utils.format_text(str_tokens))
    return [stemmer.stem(token) for token in tokens]
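
text_utils.format_text and text_utils.tokenize_text are project-local helpers, so a usage sketch has to stand in for them; below, a lowercase whitespace split plays that role, while the stemming uses the real NLTK PorterStemmer API:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# Stand-in for text_utils.format_text + text_utils.tokenize_text.
tokens = "Running the tokenizers quickly".lower().split()
print([stemmer.stem(token) for token in tokens])
# Porter stemming strips common suffixes, e.g. 'running' -> 'run'
# and 'quickly' -> 'quickli'.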
Example #3
def _tokenize(text):
    """Normalize the text, then split it into tokens."""
    return text_utils.tokenize_text(text_utils.format_text(text))