def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    """Process ``corpus`` with a tokenizer that exists only during this call.

    The corpus is first passed through ``Preprocessor.__call__`` and then
    handed to ``_store_tokens_from_documents``.

    Args:
        corpus: The corpus to process.
        callback: Optional progress callable. It is invoked once as
            ``callback(0, "Tokenizing...")`` and then forwarded to
            ``_store_tokens_from_documents``. When ``None``,
            ``dummy_callback`` is used instead.

    Returns:
        The corpus returned by ``_store_tokens_from_documents``.
    """
    # Compiled regexes are NOT deepcopy-able, so to keep Corpus deepcopy-able
    # we cannot store them (Corpus also stores used_preprocessor for BoW
    # compute values). To bypass the problem the tokenizer is built before
    # every __call__ and discarded right after.
    self.tokenizer = self.tokenizer_cls(self.__pattern)
    try:
        corpus = Preprocessor.__call__(self, corpus)
        if callback is None:
            callback = dummy_callback
        callback(0, "Tokenizing...")
        return self._store_tokens_from_documents(corpus, callback)
    finally:
        # Discard the tokenizer even when processing raises; otherwise a
        # failed call would leave the non-deepcopy-able object attached.
        self.tokenizer = None
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    """Load the UDPipe model for the configured language and process ``corpus``.

    Args:
        corpus: The corpus to process.
        callback: Optional progress callable; ``dummy_callback`` replaces
            ``None`` on the tokenizer path.

    Returns:
        The corpus produced by ``_store_tokens_from_documents`` when the
        UDPipe tokenizer is enabled, otherwise the result of the parent
        class's ``__call__``.

    Raises:
        UDPipeStopIteration: If looking up or loading the model raises
            ``StopIteration``.
    """
    # The lookup in self.models stays inside the try block: it may be the
    # expression that raises StopIteration, not only Model.load.
    try:
        self.__model = udpipe.Model.load(self.models[self.__language])
    except StopIteration:
        raise UDPipeStopIteration

    # Without the UDPipe tokenizer, defer entirely to the parent class.
    if not self.__use_tokenizer:
        return super().__call__(corpus, callback)

    corpus = Preprocessor.__call__(self, corpus)
    progress = dummy_callback if callback is None else callback
    progress(0, "Normalizing...")
    return self._store_tokens_from_documents(corpus, progress)