def tokenize(self, text):
  """Tokenizes text into Tokens pairing each word with its wordpieces."""
  # Input that normalizes to the empty text gets a single sentinel Token.
  if text_utils.format_text(text) == constants.EMPTY_TEXT:
    return [Token(_EMPTY, _EMPTY)]
  tokens = []
  for token in self._basic_tokenizer.tokenize(text):
    for piece in self._wp_tokenizer.tokenize(token):
      tokens.append(Token(token, piece))
  return tokens
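
# A minimal, self-contained sketch of the word/wordpiece pairing produced
# above, assuming `Token` is an (original_text, piece) pair, the basic
# tokenizer splits on whitespace, and the wordpiece tokenizer is a greedy
# longest-match-first BERT-style splitter. All names below (`DemoToken`,
# `_demo_wordpiece`, `_DEMO_VOCAB`) and the tiny vocabulary are illustrative
# stand-ins, not the project's actual tokenizers.
import collections

DemoToken = collections.namedtuple("DemoToken", ["original_text", "piece"])

_DEMO_VOCAB = {"hello", "wor", "##ld", "[UNK]"}


def _demo_wordpiece(word):
  """Greedy longest-match-first wordpiece split over _DEMO_VOCAB."""
  pieces, start = [], 0
  while start < len(word):
    end = len(word)
    while end > start:
      sub = word[start:end]
      if start > 0:
        sub = "##" + sub  # continuation pieces carry the "##" prefix
      if sub in _DEMO_VOCAB:
        pieces.append(sub)
        break
      end -= 1
    else:
      return ["[UNK]"]  # no vocabulary entry matched this span
    start = end
  return pieces


def demo_tokenize(text):
  """Pairs each whitespace-delimited word with each of its wordpieces."""
  return [DemoToken(word, piece)
          for word in text.split()
          for piece in _demo_wordpiece(word)]

# demo_tokenize("hello world") ==
#   [DemoToken("hello", "hello"),
#    DemoToken("world", "wor"), DemoToken("world", "##ld")]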
def get_cleaned_seq_tokens(str_tokens):
  """Transforms a string into a cleaned list of stemmed tokens.

  Args:
    str_tokens: the string to tokenize.

  Returns:
    A list of Porter-stemmed tokens.
  """
  stemmer = PorterStemmer()
  tokens = text_utils.tokenize_text(text_utils.format_text(str_tokens))
  return [stemmer.stem(token) for token in tokens]
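
# Usage sketch for the normalize-then-stem pipeline above, with stand-ins
# for `text_utils.format_text` (assumed here to lowercase and collapse
# whitespace) and `text_utils.tokenize_text` (assumed to split on
# whitespace); the real helpers may do more, e.g. punctuation stripping.
# PorterStemmer is NLTK's (pip install nltk); it is purely algorithmic, so
# no corpus download is needed.
from nltk.stem.porter import PorterStemmer


def _demo_format_text(text):
  """Stand-in normalizer: lowercase and collapse whitespace."""
  return " ".join(text.lower().split())


def _demo_tokenize_text(text):
  """Stand-in tokenizer: split on whitespace."""
  return text.split()

_demo_stemmer = PorterStemmer()
# [_demo_stemmer.stem(t)
#  for t in _demo_tokenize_text(_demo_format_text("Running the Tokenizers"))]
# -> ['run', 'the', 'token']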
def _tokenize(text):
  """Normalizes text with format_text, then splits it into tokens."""
  return text_utils.tokenize_text(text_utils.format_text(text))