Example #1
    def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
            Union[List[List[str]], List[str]]:
        """Tokenize or detokenize strings, depends on the type structure of passed arguments.

        Args:
            batch: a batch of documents to tokenize/lemmatize,
             or a batch of lists of tokens/lemmas to detokenize

        Returns:
            a batch of lists of tokens/lemmas; or a batch of detokenized strings

        Raises:
            TypeError: If the first element of ``batch`` is neither ``List`` nor ``str``.

        """
        try:
            if isinstance(batch[0], str):
                if self.lemmas:
                    return list(self._lemmatize(batch))
                else:
                    return list(self._tokenize(batch))
            if isinstance(batch[0], list):
                return [detokenize(doc) for doc in batch]
        except Exception:
            # Persist state and shut the worker manager down before the error
            # surfaces, then re-raise the original exception.
            self.save(self.save_path)
            self.manager.shutdown()
            raise

        raise TypeError(
            "StreamSpacyTokenizer.__call__() is not implemented for `{}`".
            format(type(batch[0])))
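
The ``except`` block above saves state and shuts down a worker manager before the error propagates. A minimal, self-contained sketch of that cleanup-then-re-raise pattern (``save_state``, ``tokenize_batch``, and ``checkpoint.json`` are illustrative stand-ins, not part of StreamSpacyTokenizer):

import json


def save_state(state: dict, path: str) -> None:
    # Persist whatever has been computed so far before the process dies.
    with open(path, 'w', encoding='utf-8') as fh:
        json.dump(state, fh)


def tokenize_batch(batch: list, state: dict, save_path: str = 'checkpoint.json') -> list:
    try:
        return [doc.split() for doc in batch]
    except Exception:
        # Save state first, then re-raise so the caller still sees the
        # original exception rather than a misleading follow-up error.
        save_state(state, save_path)
        raise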
Example #2
    def __call__(self, batch):
        # A batch of strings -> tokenize (or lemmatize) each document.
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        # A batch of token lists -> detokenize each back into a string.
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
        raise TypeError(
            "StreamSpacyTokenizer.__call__() is not implemented for `{}`".
            format(type(batch[0])))
Example #3
    def __call__(self, dialogs: Sequence[Dialog]) -> Sequence[str]:
        new_responses = []
        for d in dialogs:
            # get tokens & tags of the last utterance
            response = d['utterances'][-1]
            ner_annotations = response['annotations']['ner']
            user_name = d['user']['profile']['name']
            # replace detected person names with the user's name
            if ner_annotations and (response['active_skill'] == 'chitchat'):
                response_toks_norm, _ = self.person_normalizer(
                    [ner_annotations['tokens']],
                    [ner_annotations['tags']],
                    [user_name])
                response_toks_norm = response_toks_norm[0]
                # detokenize the normalized tokens back into a string
                new_responses.append(detokenize(response_toks_norm))
            else:
                new_responses.append(response['text'])
        return new_responses
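
A self-contained sketch of the same flow with stand-ins for ``Dialog``, ``detokenize``, and ``self.person_normalizer`` (all hypothetical here, with a plain dict in place of the real dialog structure): take the last utterance of each dialog, and when NER annotations are present and the active skill is chitchat, substitute the tokens tagged as a person with the user's name and detokenize; otherwise pass the response text through unchanged.

from typing import List


def detokenize(tokens: List[str]) -> str:
    # Naive whitespace detokenizer, used only for this sketch.
    return ' '.join(tokens)


def person_normalizer(tokens: List[str], tags: List[str], user_name: str) -> List[str]:
    # Replace tokens tagged as a person with the user's name (illustrative only).
    return [user_name if tag.endswith('PER') else tok for tok, tag in zip(tokens, tags)]


def rewrite_responses(dialogs: List[dict]) -> List[str]:
    new_responses = []
    for d in dialogs:
        response = d['utterances'][-1]
        ner = response['annotations']['ner']
        user_name = d['user']['profile']['name']
        if ner and response['active_skill'] == 'chitchat':
            toks = person_normalizer(ner['tokens'], ner['tags'], user_name)
            new_responses.append(detokenize(toks))
        else:
            new_responses.append(response['text'])
    return new_responses


dialog = {
    'user': {'profile': {'name': 'Alice'}},
    'utterances': [{
        'active_skill': 'chitchat',
        'text': 'nice to meet you John',
        'annotations': {'ner': {
            'tokens': ['nice', 'to', 'meet', 'you', 'John'],
            'tags': ['O', 'O', 'O', 'O', 'B-PER'],
        }},
    }],
}
print(rewrite_responses([dialog]))  # ['nice to meet you Alice']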
Example #4
    def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
            Union[List[List[str]], List[str]]:
        """Tokenize or detokenize strings, depends on the type structure of passed arguments.

        Args:
            batch: a batch of documents to tokenize/lemmatize,
             or a batch of lists of tokens/lemmas to detokenize

        Returns:
            a batch of lists of tokens/lemmas; or a batch of detokenized strings

        Raises:
            TypeError: If the first element of ``batch`` is neither ``List`` nor ``str``.

        """
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
        raise TypeError(
            "StreamSpacyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))