Example #1
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        from janome.tokenizer import Tokenizer

        text = message.get(attribute)
        text = self.removePunctuation(text)

        # Building a janome Tokenizer loads its dictionary, so reusing a
        # single instance across calls would be cheaper than creating one
        # per message.
        tokenizer = Tokenizer()
        tokenized = tokenizer.tokenize(text)

        tokens = []
        for token in tokenized:
            # janome's lattice positions start at 1, hence the -1 to turn
            # the position into a zero-based start offset for Rasa's Token.
            tokens.append(Token(token.node.surface, token.node.pos - 1))

        return self._apply_token_pattern(tokens)
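
For readers who have not used janome before, the loop above maps janome tokens onto Rasa Token objects. Below is a minimal standalone sketch of the underlying janome call, independent of Rasa; the sample sentence is illustrative, and only the janome package is required.

from janome.tokenizer import Tokenizer

# janome ships its own dictionary, so no external data files are needed.
tokenizer = Tokenizer()
for token in tokenizer.tokenize("私はラーメンが好きです"):
    # token.surface is the token text; part_of_speech is a comma-separated
    # tag string such as "名詞,一般,*,*".
    print(token.surface, token.part_of_speech)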
Example #2
def _apply_tokenizer_to_states(tokenizer: Tokenizer,
                               states: List[State]) -> None:
    """Split each user text into tokens and concatenate them again.

    Args:
        tokenizer: A tokenizer to tokenize the user messages.
        states: The states to be tokenized.
    """
    for state in states:
        # Only states that carry a user utterance with text are rewritten.
        if USER in state and TEXT in state[USER]:
            # Replace the raw text with the whitespace-joined token texts,
            # mutating the state in place (hence the None return type).
            state[USER][TEXT] = " ".join(
                token.text for token in tokenizer.tokenize(
                    Message({TEXT: state[USER][TEXT]}), TEXT))
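
A hypothetical usage sketch, assuming Rasa 2.x import paths (the built-in WhitespaceTokenizer and the USER and TEXT constants) and a call from inside the module that defines this private helper; the sample state is illustrative.

from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.shared.core.constants import USER
from rasa.shared.nlu.constants import TEXT

tokenizer = WhitespaceTokenizer()
states = [{USER: {TEXT: "  hello   there  "}}]
_apply_tokenizer_to_states(tokenizer, states)

# The user text has been replaced in place by the re-joined tokens:
print(states[0][USER][TEXT])  # expected: "hello there"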