Exemplo n.º 1
0
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     """Tokenize ``corpus`` using a tokenizer that exists only for this call.

     Compiled regexes are NOT deepcopy-able, so to keep Corpus deepcopy-able
     we cannot store them on the instance (Corpus also stores
     used_preprocessor for BoW compute values). To bypass the problem the
     tokenizer is built from the stored pattern at the start of every call
     and discarded before returning -- including when the call fails.

     Args:
         corpus: The corpus to tokenize.
         callback: Optional progress callback invoked as
             ``callback(0, "Tokenizing...")`` and then handed on to
             ``_store_tokens_from_documents``; ``dummy_callback`` is used
             when omitted.

     Returns:
         The corpus returned by ``_store_tokens_from_documents``.
     """
     self.tokenizer = self.tokenizer_cls(self.__pattern)
     try:
         corpus = Preprocessor.__call__(self, corpus)
         if callback is None:
             callback = dummy_callback
         callback(0, "Tokenizing...")
         return self._store_tokens_from_documents(corpus, callback)
     finally:
         # Always drop the tokenizer, even on error, so the instance never
         # keeps a non-deepcopy-able object after the call.
         self.tokenizer = None
Exemplo n.º 2
0
    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
        """Load the UDPipe model for the configured language and process ``corpus``.

        When the UDPipe tokenizer is enabled, the corpus is passed through
        ``Preprocessor.__call__`` and then ``_store_tokens_from_documents``;
        otherwise the work is delegated to the parent class' ``__call__``.

        Args:
            corpus: The corpus to process.
            callback: Optional progress callback. On the tokenizer path
                ``dummy_callback`` is substituted when it is ``None``; on the
                other path it is forwarded to the parent unchanged.

        Returns:
            The processed corpus.

        Raises:
            UDPipeStopIteration: If ``StopIteration`` is raised while looking
                up or loading the model.
        """
        try:
            model_file = self.models[self.__language]
            self.__model = udpipe.Model.load(model_file)
        except StopIteration:
            raise UDPipeStopIteration

        # Without the UDPipe tokenizer the parent implementation does the work.
        if not self.__use_tokenizer:
            return super().__call__(corpus, callback)

        corpus = Preprocessor.__call__(self, corpus)
        progress = dummy_callback if callback is None else callback
        progress(0, "Normalizing...")
        return self._store_tokens_from_documents(corpus, progress)