def corpus_mock(tokens):
    """Return a Corpus whose single "texts" string meta joins each token list."""
    joined_texts = np.array([[" ".join(token_list)] for token_list in tokens])
    return Corpus.from_numpy(
        Domain([], metas=[StringVariable("texts")]),
        np.empty((len(tokens), 0)),
        metas=joined_texts,
    )
def _send_output(self) -> None:
    """
    Create corpus with scores and output it
    """
    # No input corpus: clear both outputs and stop.
    if self.corpus is None:
        self.Outputs.corpus.send(None)
        self.Outputs.selected_documents.send(None)
        return
    scores, labels = self._gather_scores()
    if labels:
        # Extend the input domain with one continuous meta column per score
        # label; get_unique_names prevents clashes with existing variable names.
        d = self.corpus.domain
        domain = Domain(
            d.attributes,
            d.class_var,
            metas=d.metas + tuple(
                ContinuousVariable(get_unique_names(d, l))
                for l in labels),
        )
        # Rebuild the corpus with the score columns appended to the metas.
        out_corpus = Corpus.from_numpy(
            domain,
            self.corpus.X,
            self.corpus.Y,
            np.hstack([self.corpus.metas, scores]),
        )
        # Carry the original corpus's preprocessing over to the new corpus.
        Corpus.retain_preprocessing(self.corpus, out_corpus)
    else:
        # No scores were produced: pass the input corpus through unchanged.
        out_corpus = self.corpus
    self.Outputs.corpus.send(
        create_annotated_table(out_corpus, self.selected_rows))
    # Send only the selected rows, or None when the selection is empty.
    self.Outputs.selected_documents.send(
        out_corpus[self.selected_rows] if self.selected_rows else None)
def _preprocess_words(
    corpus: Corpus, words: List[str], callback: Callable
) -> List[str]:
    """
    Run the corpus's word-level preprocessors (transformers and normalizers)
    over a plain list of words so the words match the corpus's tokens.
    """
    # Workaround: preprocessors operate only on Corpus objects, so wrap the
    # words into a throwaway single-column corpus first.
    # TODO: currently preprocessors work only on corpus, when there will be more
    # cases like this think about implementation of preprocessors for a list
    # of strings
    feature = StringVariable("words")
    temp_corpus = Corpus.from_numpy(
        Domain([], metas=[feature]),
        np.empty((len(words), 0)),
        metas=np.array([[w] for w in words]),
        text_features=[feature],
    )
    # Only transformers and normalizers act on the word level.
    word_level = [
        p
        for p in corpus.used_preprocessor.preprocessors
        if isinstance(p, (BaseTransformer, BaseNormalizer))
    ]
    for done, p in enumerate(word_level, start=1):
        temp_corpus = p(temp_corpus)
        callback(done / len(word_level))
    # Preprocessing may drop a word entirely; keep only non-empty tokens.
    return [tok[0] for tok in temp_corpus.tokens if len(tok)]
def create_corpus(texts: List[str]) -> Corpus:
    """Create sample corpus with texts passed"""
    variable = StringVariable("Text")
    corpus = Corpus.from_numpy(
        Domain([], metas=[variable]),
        X=np.empty((len(texts), 0)),
        metas=np.array(texts).reshape(-1, 1),
        text_features=[variable],
    )
    # Lower-case the documents before returning the sample corpus.
    return preprocess.LowercaseTransformer()(corpus)
def commit(self):
    """Assemble a corpus from the collected texts and send it downstream."""
    title_var = StringVariable("Title")
    doc_var = StringVariable("Document")
    out = Corpus.from_numpy(
        Domain([], metas=[title_var, doc_var]),
        np.empty((len(self.texts), 0)),
        metas=np.array(self.texts),
        text_features=[doc_var],
    )
    # Mark the "Title" column as the variable used for document titles.
    out.set_title_variable(title_var)
    self.Outputs.corpus.send(out)
def _create_simple_data(self) -> None:
    """Build a small 4-document dataset and store it in `self.corpus`."""
    documents = np.array([
        ["Lorem ipsum dolor sit amet, consectetur adipiscing elit."],
        ["Duis viverra elit eu mi blandit, {et} sollicitudin nisi "],
        [" a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a"],
        [
            "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per"
        ],
    ])
    text_var = StringVariable("text")
    self.corpus = Corpus.from_numpy(
        Domain([], metas=[text_var]),
        X=np.empty((len(documents), 0)),
        metas=documents,
        text_features=[text_var],
    )
def _create_corpus(self) -> Optional[Corpus]:
    """Convert the gathered tweets into a Corpus; None when none were fetched."""
    if not self.tweets:
        return None

    def encode(variable, value):
        # Discrete values must be registered before converting them to indices.
        if isinstance(variable, DiscreteVariable):
            variable.val_from_str_add(value)
        return variable.to_val(value)

    domain = Domain(
        attributes=[], class_vars=[], metas=[var for var, _ in METAS]
    )
    rows = np.array(
        [
            [encode(var, value) for (var, _), value in zip(METAS, tweet)]
            for tweet in self.tweets.values()
        ],
        dtype=object,
    )
    return Corpus.from_numpy(
        domain,
        np.empty((len(rows), 0)),
        metas=rows,
        text_features=self.text_features,
    )
def run(
        corpus: Optional[Corpus],
        words: Optional[List],
        cached_keywords: Dict,
        scoring_methods: Set,
        scoring_methods_kwargs: Dict,
        agg_method: int,
        state: TaskState
) -> Results:
    """
    Score keywords for a corpus with the selected scoring methods, reusing
    cached per-method keyword lists, optionally restricting the result to a
    given word list, and aggregating scores per word.
    """
    results = Results(scores=[], labels=[], all_keywords={})
    if not corpus:
        return results

    # passed by reference (and not copied) - to save partial results
    results.all_keywords = cached_keywords
    if not scoring_methods:
        return results

    def callback(i: float, status=""):
        # Report progress (0..1 mapped to 0..100) and abort via an exception
        # when the task state has been asked to stop.
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Calculating...")
    scores = {}
    documents = corpus.documents
    # Each selected method gets an equal share of the progress bar.
    step = 1 / len(scoring_methods)
    for method_name, func in ScoringMethods.ITEMS:
        if method_name in scoring_methods:
            if method_name not in results.all_keywords:
                # Not cached yet: compute keywords, scoping the callback to
                # this method's progress slice.
                i = len(results.labels)
                cb = wrap_callback(callback, start=i * step, end=(i + 1) * step)
                needs_tokens = method_name in ScoringMethods.TOKEN_METHODS
                kw = {"progress_callback": cb}
                kw.update(scoring_methods_kwargs.get(method_name, {}))
                keywords = func(corpus if needs_tokens else documents, **kw)
                results.all_keywords[method_name] = keywords
            # Read from the cache so cached and fresh paths are uniform.
            keywords = results.all_keywords[method_name]
            scores[method_name] = \
                dict(AggregationMethods.aggregate(keywords, agg_method))
            results.labels.append(method_name)
    scores = pd.DataFrame(scores)
    if words:
        # Normalize words
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                # Wrap the words in a throwaway corpus so the normalizer,
                # which operates on Corpus objects, can be applied to them.
                dummy = Corpus.from_numpy(
                    Domain((), metas=[StringVariable("Words")]),
                    X=np.empty((len(words), 0)),
                    metas=np.array(words)[:, None]
                )
                words = list(preprocessor(dummy).tokens.flatten())
        # Filter scores using words
        existing_words = [w for w in set(words) if w in scores.index]
        scores = scores.loc[existing_words] if existing_words \
            else scores.iloc[:0]
    # Sort by the first method's score (descending), tie-broken by word.
    results.scores = scores.reset_index().sort_values(
        by=[results.labels[0], "index"],
        ascending=[False, True]
    ).values.tolist()
    return results