def __call__(self, corpus: Corpus, callback: Callable = None,
             **kw) -> Corpus:
    """
    Tag the tokens of a corpus with part-of-speech labels.

    :param corpus: Corpus
    :param callback: progress callback function; ``dummy_callback`` is
        substituted when it is None
    :param kw: keyword arguments forwarded unchanged to ``_preprocess``
    :return: Corpus
        Corpus returned by the parent ``__call__`` with ``pos_tags`` set.
    """
    progress = dummy_callback if callback is None else callback
    # The parent call reports through a callback wrapped with end=0.2;
    # tagging itself is announced at 0.2 once the parent call returns.
    corpus = super().__call__(corpus, wrap_callback(progress, end=0.2))
    assert corpus.has_tokens()
    progress(0.2, "POS Tagging...")
    corpus.pos_tags = np.array(
        self._preprocess(corpus.tokens, **kw), dtype=object
    )
    return corpus
def _store_tokens_from_documents(self, corpus: Corpus,
                                 callback: Callable) -> Corpus:
    """
    Run ``_preprocess`` on each entry of ``corpus.pp_documents`` and store
    the results as the tokens of the corpus.

    Progress is reported before each document as ``index / count``.
    ``corpus.pos_tags`` is set to None before the new tokens are stored.

    :param corpus: Corpus
    :param callback: progress callback function (must not be None)
    :return: Corpus
        The same corpus object with freshly stored tokens.
    """
    assert callback is not None
    total = len(corpus.pp_documents)
    per_document_tokens = []
    for position, document in enumerate(corpus.pp_documents):
        callback(position / total)
        per_document_tokens.append(self._preprocess(document))
    corpus.pos_tags = None
    corpus.store_tokens(per_document_tokens)
    return corpus
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
    """
    Keep only the tokens whose POS tag is contained in ``self._tags``.

    A corpus whose ``pos_tags`` is None is returned untouched and the
    callback is not invoked. Otherwise both the tokens and the tags are
    filtered in step, document by document; tokens are written back via
    ``corpus.store_tokens`` and the tags are assigned as a list of lists.

    :param corpus: Corpus
    :param callback: progress callback function
    :return: Corpus
        The same corpus object, filtered when POS tags were available.
    """
    if corpus.pos_tags is None:
        return corpus
    callback(0, "Filtering...")
    kept_tags, kept_tokens = [], []
    for doc_tags, doc_tokens in zip(corpus.pos_tags, corpus.tokens):
        # should we consider partial matches, i.e. "NN" for "NNS"?
        survivors = [
            (tag, token)
            for tag, token in zip(doc_tags, doc_tokens)
            if tag in self._tags
        ]
        kept_tags.append([tag for tag, _ in survivors])
        kept_tokens.append([token for _, token in survivors])
    corpus.store_tokens(kept_tokens)
    corpus.pos_tags = kept_tags
    return corpus
def _filter_tokens(self, corpus: Corpus, callback: Callable,
                   dictionary=None) -> Corpus:
    """
    Filter the tokens of every document with the mask from ``_preprocess``.

    For each document ``_preprocess`` is called on its tokens and the
    result is used as the selector for ``itertools.compress``. When the
    corpus has POS tags, the same selector object is applied to the tags
    of that document as well.

    :param corpus: Corpus
    :param callback: progress callback function
    :param dictionary: when not None, passed as the second argument to
        ``corpus.store_tokens``
    :return: Corpus
        The same corpus object with filtered tokens (and tags, if any).
    """
    callback(0, "Filtering...")
    kept_tokens = []
    kept_tags = []
    for position, doc_tokens in enumerate(corpus.tokens):
        keep_mask = self._preprocess(doc_tokens)
        kept_tokens.append(list(compress(doc_tokens, keep_mask)))
        if corpus.pos_tags is None:
            continue
        kept_tags.append(
            list(compress(corpus.pos_tags[position], keep_mask))
        )

    if dictionary is None:
        store_args = (kept_tokens,)
    else:
        store_args = (kept_tokens, dictionary)
    corpus.store_tokens(*store_args)

    # pos_tags is only overwritten when at least one tag list was collected.
    if kept_tags:
        corpus.pos_tags = np.array(kept_tags, dtype=object)
    return corpus