def _send_output(self) -> None:
    """
    Send the corpus extended with scores, and the selected documents.

    When there is no input corpus, ``None`` is sent to both outputs.
    Otherwise the scores returned by ``_gather_scores`` are appended to the
    metas as new continuous variables (one per label); when there are no
    labels the input corpus is used unchanged. The result is passed through
    ``create_annotated_table`` for the ``corpus`` output, and the rows in
    ``self.selected_rows`` (or ``None`` when that is empty) go to
    ``selected_documents``.
    """
    outputs = self.Outputs
    if self.corpus is None:
        outputs.corpus.send(None)
        outputs.selected_documents.send(None)
        return

    scores, labels = self._gather_scores()
    out_corpus = self.corpus
    if labels:
        source_domain = self.corpus.domain
        # each new variable name is made unique against the input domain
        score_vars = tuple(
            ContinuousVariable(get_unique_names(source_domain, label))
            for label in labels
        )
        extended_domain = Domain(
            source_domain.attributes,
            source_domain.class_var,
            metas=source_domain.metas + score_vars,
        )
        extended_metas = np.hstack([self.corpus.metas, scores])
        out_corpus = Corpus(
            extended_domain, self.corpus.X, self.corpus.Y, extended_metas
        )
        Corpus.retain_preprocessing(self.corpus, out_corpus)

    outputs.corpus.send(create_annotated_table(out_corpus, self.selected_rows))
    selected = out_corpus[self.selected_rows] if self.selected_rows else None
    outputs.selected_documents.send(selected)
def check_data(self):
    """
    Validate the two inputs and run ``apply`` when they are usable.

    All errors are cleared first. Unless both ``self.data`` and
    ``self.selected_data`` are ``Table`` instances the widget is simply
    cleared. Otherwise an error is raised and the widget cleared when the
    selection is empty, the bag-of-words domain has no attributes, the sum
    over the transformed selection's ``X`` is zero, or the selection is as
    long as the data. Side effect: ``self.data`` is replaced by its
    bag-of-words-domain version.
    """
    self.Error.clear()

    if not isinstance(self.data, Table) \
            or not isinstance(self.selected_data, Table):
        self.clear()
        return

    if len(self.selected_data) == 0:
        self.Error.empty_selection()
        self.clear()
        return

    # keep only BoW features
    bow_domain = self.get_bow_domain()
    if len(bow_domain.attributes) == 0:
        self.Error.no_bow_features()
        self.clear()
        return

    self.data = Corpus.from_table(bow_domain, self.data)
    self.selected_data_transformed = Corpus.from_table(
        bow_domain, self.selected_data)

    if np_sp_sum(self.selected_data_transformed.X) == 0:
        self.Error.no_words_overlap()
        self.clear()
    elif len(self.data) == len(self.selected_data):
        self.Error.all_selected()
        self.clear()
    else:
        self.apply()
def check_data(self):
    """
    Check the inputs; show an error and clear, or call ``apply``.

    Errors are reset on entry. Both ``self.data`` and ``self.selected_data``
    must be ``Table`` instances, otherwise the widget is cleared without an
    error. The checks, in order: empty selection, no attributes in the
    bag-of-words domain, zero sum of the transformed selection's ``X``, and
    selection as long as the data. ``self.data`` is overwritten with the
    table converted to the bag-of-words domain.
    """
    self.Error.clear()
    both_are_tables = isinstance(self.data, Table) and \
        isinstance(self.selected_data, Table)
    if not both_are_tables:
        self.clear()
        return

    if len(self.selected_data) == 0:
        self.Error.empty_selection()
        self.clear()
        return

    # keep only BoW features
    bow_domain = self.get_bow_domain()
    if len(bow_domain.attributes) == 0:
        self.Error.no_bow_features()
        self.clear()
        return

    self.data = Corpus.from_table(bow_domain, self.data)
    self.selected_data_transformed = Corpus.from_table(bow_domain,
                                                       self.selected_data)

    # pick the error to report, if any, then act on it in one place
    if np_sp_sum(self.selected_data_transformed.X) == 0:
        report_error = self.Error.no_words_overlap
    elif len(self.data) == len(self.selected_data):
        report_error = self.Error.all_selected
    else:
        report_error = None

    if report_error is None:
        self.apply()
        return
    report_error()
    self.clear()
def test_output_status(self):
    """
    Check the arguments passed to ``info.set_output_summary`` for a corpus
    with a discrete class, without a class, with a continuous class, and
    after the input is removed.
    """
    def push(corpus):
        # send the corpus to the data input and wait for the widget
        self.send_signal(self.widget.Inputs.data, corpus)
        self.wait_until_finished()

    # when input signal
    data = Corpus.from_file("election-tweets-2016")
    out_sum = self.widget.info.set_output_summary = Mock()

    push(data)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
        "Classification; discrete class with 2 values.")
    out_sum.reset_mock()

    # corpus without class
    classless_domain = Domain(data.domain.attributes,
                              metas=data.domain.metas)
    data1 = Corpus(classless_domain, data.X, metas=data.metas,
                   text_features=data.text_features)
    push(data1)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)")
    out_sum.reset_mock()

    # corpus with continuous class
    regression_domain = Domain(data.domain.attributes,
                               ContinuousVariable("a"),
                               metas=data.domain.metas)
    data1 = Corpus(regression_domain, data.X,
                   np.random.rand(len(data), 1),
                   metas=data.metas, text_features=data.text_features)
    push(data1)
    out_sum.assert_called_with(
        str(len(data)),
        "6444 document(s)\n4 text features(s)\n7 other feature(s)\n"
        "Regression; numerical class.")
    out_sum.reset_mock()

    # default dataset is on the output
    push(None)
    out_sum.assert_called_with(
        "140",
        "140 document(s)\n1 text features(s)\n0 other feature(s)\n"
        "Classification; discrete class with 2 values.")
    out_sum.reset_mock()
def __call__(self, corpus: Corpus, callback: Callable = None, **kw) -> Corpus:
    """
    Tag the tokens of a corpus with POS tags.

    The parent ``__call__`` runs first, reporting through the callback
    wrapped with ``end=0.2``; the resulting corpus must have tokens.

    :param corpus: Corpus
    :param callback: progress callback; ``dummy_callback`` is used when None
    :param kw: keyword arguments forwarded to ``_preprocess``
    :return: Corpus
        The corpus returned by the parent call, with ``pos_tags`` set to an
        object array built from the ``_preprocess`` result.
    """
    progress = dummy_callback if callback is None else callback
    corpus = super().__call__(corpus, wrap_callback(progress, end=0.2))
    assert corpus.has_tokens()

    progress(0.2, "POS Tagging...")
    corpus.pos_tags = np.array(
        self._preprocess(corpus.tokens, **kw), dtype=object
    )
    return corpus
def __call__(self, corpus: Corpus) -> Corpus:
    """
    Prepare a copy of the corpus for preprocessing.

    Subclasses are expected to extend this method and call ``_preprocess``
    on a document or token(s).

    :param corpus: Corpus
    :return: Corpus
        A copy of the input that carries the input's ``ids`` and has
        ``used_preprocessor`` set to this preprocessor.
    """
    original_ids = corpus.ids
    result = corpus.copy()
    # put the input's ids back on the copy
    result.ids = original_ids
    result.used_preprocessor = self
    return result
def search(self, lang, queries, articles_per_query=10, should_break=None, on_progress=None):
    """
    Search Wikipedia and collect the found articles into a corpus.

    Args:
        lang (str): A language code in ISO 639-1 format.
        queries (list of str): A list of queries.
        articles_per_query (int): Number of results requested per query.
        should_break (callback): Checked before every article and after
            every query; when it evaluates to True, downloading stops and
            the documents downloaded so far are returned in a Corpus.
        on_progress (callable): Progress callback, called after every
            article with the completed fraction and the number of
            documents collected so far.

    Returns:
        Corpus built from the collected documents. On an HTTP timeout or
        IOError, ``self.on_error`` is called with the message and the
        search ends early.
    """
    can_break = callable(should_break)
    can_report = callable(on_progress)

    wikipedia.set_lang(lang)
    documents = []
    for query_idx, query in enumerate(queries):
        try:
            titles = wikipedia.search(query, results=articles_per_query)
            for title_idx, title in enumerate(titles):
                if can_break and should_break():
                    break
                documents.extend(self._get(title, query, should_break))
                if can_report:
                    done = query_idx * articles_per_query + title_idx + 1
                    on_progress(
                        done / (len(queries) * articles_per_query),
                        len(documents),
                    )
        except (wikipedia.exceptions.HTTPTimeoutError, IOError) as err:
            self.on_error(str(err))
            break
        # the inner break only leaves the article loop; stop the query loop too
        if can_break and should_break():
            break

    return Corpus.from_documents(
        documents, 'Wikipedia', self.attributes, self.class_vars,
        self.metas, title_indices=[-1],
    )
def load(self):
    """
    Parse the mail files in ``self.directory`` and send them as a corpus.

    Files matching ``self._glob`` are parsed with ``parse_enron_mail``. A
    mail whose fields 0 and 7 repeat an earlier mail gets field 3 set to
    ``self.YESSTRING``. An invalid directory or an empty file list is
    reported on stderr, and a file that fails to parse is printed together
    with the exception; loading still continues in all three cases.

    The progress bar is closed in a ``finally`` clause, so it does not stay
    open when building the domain, table or corpus raises.
    """
    self.progressBarInit()
    try:
        if not os.path.isdir(self.directory):
            print(f"error: {self.directory} is not a valid directory!",
                  file=sys.stderr)
        files = list(Path(self.directory).glob(self._glob))
        if not files:
            print("error: no files found!", file=sys.stderr)

        mails = []
        seen = set()  # keys of the mails parsed so far
        for i, filename in enumerate(files):
            try:
                mails.append(list(parse_enron_mail(filename)))
                key = "#".join([mails[-1][0], mails[-1][7]])
                if key in seen:
                    mails[-1][3] = self.YESSTRING
                seen.add(key)
            except Exception as e:
                # best effort: report the offending file and keep going
                print(filename)
                print(e)
            self.progressBarSet(100 * (i + 1) / len(files))

        domain = self.corpusDomain(mails)
        table = Table.from_list(domain, mails)
        self.Outputs.data.send(Corpus.from_table(table.domain, table))
    finally:
        self.progressBarFinished()
def create_corpus(self):
    """
    Build a corpus named 'Twitter' from ``self.tweets`` using the
    attributes, class variables and metas stored on this object, with
    ``title_indices=[-1]``.
    """
    corpus = Corpus.from_documents(
        self.tweets,
        'Twitter',
        self.attributes,
        self.class_vars,
        self.metas,
        title_indices=[-1],
    )
    return corpus
def corpus_mock(tokens):
    """
    Build a corpus with no attributes and one string meta, "texts", holding
    each token sequence joined with single spaces.
    """
    joined_texts = [[" ".join(doc_tokens)] for doc_tokens in tokens]
    domain = Domain([], metas=[StringVariable("texts")])
    empty_x = np.empty((len(tokens), 0))
    return Corpus.from_numpy(domain, empty_x, metas=np.array(joined_texts))
def test_init(self):
    """
    The constructor keeps the given corpus (9 documents) and raises when
    given a domain instead of a corpus.
    """
    corpus = Corpus.from_file('deerwester')
    builder = CorpusToNetwork(corpus)
    stored = builder.corpus
    self.assertIsInstance(stored, Corpus)
    self.assertEqual(len(stored), 9)

    # a domain is not an acceptable input
    with self.assertRaises(Exception):
        CorpusToNetwork(corpus.domain)
def create_corpus(self):
    """
    Create a corpus from the collected tweets.

    ``self.statuses_lock`` is held while the corpus is built. It is taken
    with a ``with`` statement so that it is released even when
    ``Corpus.from_documents`` raises; a manual acquire/release pair would
    leave the lock held in that case.

    :return: corpus named 'Twitter' built from ``self.tweets`` with
        ``title_indices=[-2]``
    """
    with self.statuses_lock:
        return Corpus.from_documents(
            self.tweets, 'Twitter', self.attributes, self.class_vars,
            self.metas, title_indices=[-2],
        )
def test_compute_values(self):
    """
    Converting ``self.book_data`` to the domain of the computed "Word count"
    output must give the same domain and an ``X`` with one column.
    """
    widget_output = self._compute_features("Word count")
    recomputed = Corpus.from_table(widget_output.domain, self.book_data)

    expected_shape = (len(self.book_data), 1)
    self.assertEqual(widget_output.domain, recomputed.domain)
    self.assertTupleEqual(expected_shape, recomputed.X.shape)
def _preprocess_words(corpus: Corpus, words: List[str], callback: Callable) -> List[str]:
    """
    Apply the corpus's word-level preprocessors to a list of words.

    The corpus's tokens may have been preprocessed, so the words would not
    match them unless preprocessors that change words (e.g. normalization)
    are applied to the words as well. Only the transformers and normalizers
    from ``corpus.used_preprocessor.preprocessors`` are used. The callback
    receives the completed fraction after each preprocessor.

    Returns the first token of every word that still has tokens.
    """
    # workaround to preprocess words
    # TODO: currently preprocessors work only on corpus, when there will be more
    #  cases like this think about implementation of preprocessors for a list
    #  of strings
    word_var = StringVariable("words")
    word_corpus = Corpus(
        Domain([], metas=[word_var]),
        metas=np.array([[word] for word in words]),
        text_features=[word_var],
    )

    # only transformers and normalizers preprocess on the word level
    word_level_types = (BaseTransformer, BaseNormalizer)
    applicable = []
    for preprocessor in corpus.used_preprocessor.preprocessors:
        if isinstance(preprocessor, word_level_types):
            applicable.append(preprocessor)

    n_steps = len(applicable)
    for step, preprocessor in enumerate(applicable, start=1):
        word_corpus = preprocessor(word_corpus)
        callback(step / n_steps)

    return [
        word_tokens[0] for word_tokens in word_corpus.tokens
        if len(word_tokens)
    ]
def dummy_fetch(self, cursors, max_tweets, search_author, callback):
    """
    Stand-in for a fetch call: ignores its arguments and returns a fixed
    corpus with three "Content" metas together with the integer 3.
    """
    contents = np.array([["Abc"], ["Cde"], ["Gf"]])
    domain = Domain([], metas=[StringVariable("Content")])
    return Corpus(domain, metas=contents), 3
def test_titles_no_newline(self):
    """
    After "\\ntest" is appended to the first title, the first cell of the
    view is expected to read "The Little Match-Seller test".
    """
    corpus = Corpus.from_file("andersen")
    with corpus.unlocked():
        corpus.metas[0, 0] += "\ntest"
    corpus.set_title_variable("Title")

    self.send_signal(self.widget.Inputs.corpus, corpus)

    first_cell = self.widget.view.model().index(0, 0)
    self.assertEqual("The Little Match-Seller test", first_cell.data())
def main():
    """
    Manual preview: open the sentiment widget on the first three documents
    of the 'book-excerpts' corpus and run the Qt event loop.
    """
    app = QApplication([])
    widget = OWSentimentAnalysis()
    sample = Corpus.from_file('book-excerpts')[:3]
    widget.set_corpus(sample)
    widget.show()
    app.exec()
def _store_tokens_from_documents(self, corpus: Corpus, callback: Callable) -> Corpus:
    """
    Run ``_preprocess`` on every entry of ``corpus.pp_documents`` and store
    the results as the corpus's tokens.

    :param corpus: Corpus
    :param callback: progress callback function, called before each
        document with the fraction of documents already processed
    :return: Corpus
        The same corpus, with the new tokens stored.
    """
    assert callback is not None
    documents = corpus.pp_documents
    total = len(documents)

    tokenized = []
    for index, document in enumerate(documents):
        callback(index / total)
        tokenized.append(self._preprocess(document))

    corpus.store_tokens(tokenized)
    return corpus
def _store_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
    """
    Run ``_preprocess`` on every token of every document and store the
    results as the corpus's tokens. The corpus must already have tokens.

    :param corpus: Corpus
    :param callback: progress callback function, called before each
        document with the fraction of documents already processed
    :return: Corpus
        The same corpus, with the new tokens stored.
    """
    assert callback is not None
    assert corpus.has_tokens()
    per_document = corpus.tokens
    total = len(per_document)

    processed = []
    for index, doc_tokens in enumerate(per_document):
        callback(index / total)
        processed.append(list(map(self._preprocess, doc_tokens)))

    corpus.store_tokens(processed)
    return corpus
def main():
    """
    Manual preview: open the sentiment widget on the first three documents
    of the 'book-excerpts' corpus and run the Qt event loop.
    """
    app = QApplication([])
    widget = OWSentimentAnalysis()
    # NOTE(review): the dataset was spelled 'bookexcerpts' here while every
    # other loader in this file uses 'book-excerpts'; aligned with the
    # latter -- confirm against the bundled dataset file name.
    corpus = Corpus.from_file('book-excerpts')
    corpus = corpus[:3]
    widget.set_corpus(corpus)
    widget.show()
    app.exec()
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    """
    Transform the documents of a corpus and, when it has tokens, the
    tokens as well.

    :param corpus: Corpus
    :param callback: progress callback; ``dummy_callback`` is used when None
    :return: Corpus
        The result of ``_store_documents``, passed through
        ``_store_tokens`` only when it has tokens.
    """
    corpus = super().__call__(corpus)
    progress = callback if callback is not None else dummy_callback
    progress(0, "Transforming...")

    corpus = self._store_documents(corpus, wrap_callback(progress, end=0.5))
    if not corpus.has_tokens():
        return corpus
    return self._store_tokens(corpus, wrap_callback(progress, start=0.5))
def setUp(self):
    """
    Replace ``SemanticSearch`` in the semantic viewer module with
    ``DummySearch``, then create the widget, the "deerwester" corpus and a
    four-word words table.
    """
    patch_target = ("orangecontrib.text.widgets.owsemanticviewer."
                    "SemanticSearch")
    self.patcher = patch(patch_target, new=DummySearch)
    self.patcher.start()

    self.widget = self.create_widget(OWSemanticViewer)
    self.corpus = Corpus.from_file("deerwester")
    word_list = ["foo", "graph", "minors", "trees"]
    self.words = create_words_table(word_list)
def test_corpus_not_normalized(self):
    """
    The ``corpus_not_normalized`` warning is shown for "book-excerpts" and
    hidden again once ``self.corpus`` is sent.
    """
    corpus_input = self.widget.Inputs.corpus

    # send non-normalized corpus
    raw_corpus = Corpus.from_file("book-excerpts")
    self.send_signal(corpus_input, raw_corpus)
    shown = self.widget.Warning.corpus_not_normalized.is_shown()
    self.assertTrue(shown)

    # when sending normalized corpus error should disappear
    self.send_signal(corpus_input, self.corpus)
    shown = self.widget.Warning.corpus_not_normalized.is_shown()
    self.assertFalse(shown)
def test_append_to_existing_X(self):
    """
    The output's ``X`` must have the same number of rows as the input and
    two more columns.
    """
    data = Corpus.from_file("election-tweets-2016")
    self.send_signal(self.widget.Inputs.corpus, data)
    self.wait_until_finished()

    result = self.get_output(self.widget.Outputs.corpus)
    n_rows, n_cols = data.X.shape
    self.assertTupleEqual((n_rows, n_cols + 2), result.X.shape)
def test_empty(self):
    """A corpus with zero documents must give a network with no nodes or edges."""
    empty_corpus = Corpus.from_file('deerwester')[:0]
    builder = CorpusToNetwork(empty_corpus)
    options = dict(document_nodes=True, window_size=1,
                   threshold=1, freq_threshold=1)

    network = builder(**options)

    self.assertEqual(network.number_of_nodes(), 0)
    self.assertEqual(network.number_of_edges(), 0)
def create_corpus(self, search_author):
    """
    Build a corpus named 'Twitter' from ``self.tweets``.

    When ``search_author`` is truthy, ``self.authors`` is used as the class
    variables; otherwise there are no class variables and the authors are
    appended to the metas.
    """
    if search_author:
        class_vars, metas = self.authors, self.metas
    else:
        class_vars, metas = [], self.metas + self.authors

    return Corpus.from_documents(
        self.tweets, 'Twitter', self.attributes, class_vars, metas,
        title_indices=[-1],
    )
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
    """
    Keep only the tokens whose POS tag is in ``self._tags``.

    A corpus without POS tags is returned untouched and the callback is not
    called. Otherwise the filtered tokens are stored on the corpus and
    ``pos_tags`` is replaced by the matching lists of tags.

    :param corpus: Corpus
    :param callback: progress callback, called once with ``(0, "Filtering...")``
    :return: Corpus
    """
    if corpus.pos_tags is None:
        return corpus

    callback(0, "Filtering...")
    kept_tags, kept_tokens = [], []
    for doc_tags, doc_tokens in zip(corpus.pos_tags, corpus.tokens):
        # should we consider partial matches, i.e. "NN" for "NNS"?
        matching = [
            (tag, token)
            for tag, token in zip(doc_tags, doc_tokens)
            if tag in self._tags
        ]
        kept_tags.append([tag for tag, _ in matching])
        kept_tokens.append([token for _, token in matching])

    corpus.store_tokens(kept_tokens)
    corpus.pos_tags = kept_tags
    return corpus
def create_corpus(texts: List[str]) -> Corpus:
    """
    Create a sample corpus whose single text feature, "Text", holds the
    given texts, and return it passed through ``LowercaseTransformer``.
    """
    text_var = StringVariable("Text")
    text_column = np.array(texts).reshape(-1, 1)
    corpus = Corpus(
        Domain([], metas=[text_var]),
        metas=text_column,
        text_features=[text_var],
    )
    lowercase = preprocess.LowercaseTransformer()
    return lowercase(corpus)
def test_title_already_in_dataset(self):
    """
    The dataset already has a title attribute, so the widget's title
    variable is expected to be set to it ("Content") by default.
    """
    # default corpus dataset
    data = Corpus.from_file("election-tweets-2016")
    self.send_signal(self.widget.Inputs.data, data)

    expected_title = data.domain["Content"]
    self.assertEqual(expected_title, self.widget.title_variable)
    self.check_output("Content")
def test_output_unique(self):
    """
    The input corpus already has a "Word count" column; the selected
    documents output is expected to contain "Word count (1)" in its domain.
    """
    corpus = Corpus.from_file("book-excerpts")
    count_var = ContinuousVariable("Word count")
    ones = np.array([1] * len(corpus))
    corpus = corpus.add_column(count_var, ones)
    words = create_words_table(["doctor", "rum", "house"])

    self.send_signal(self.widget.Inputs.corpus, corpus)
    self.send_signal(self.widget.Inputs.words, words)
    self.wait_until_finished()

    output = self.get_output(self.widget.Outputs.selected_documents)
    renamed_present = "Word count (1)" in output.domain
    self.assertTrue(renamed_present)
def embedding_mock(_, corpus, __):
    """
    Fake embedder returning all-ones vectors with 10 dimensions.

    For a list the result is an array of shape ``(len(corpus), 10)``. For
    anything else it is a ``(Corpus, None)`` tuple, where the corpus has ten
    continuous attributes named "0" to "9" and the same all-ones ``X``.
    The first and the last argument are ignored.
    """
    n_dimensions = 10
    if isinstance(corpus, list):
        return np.ones((len(corpus), n_dimensions))

    # corpus is Corpus
    domain = Domain(
        [ContinuousVariable(str(i)) for i in range(n_dimensions)]
    )
    vectors = np.ones((len(corpus), n_dimensions))
    return Corpus(domain=domain, X=vectors), None
def _create_simple_data(self) -> None:
    """
    Create a simple dataset with 4 documents and save it to `self.corpus`.

    The corpus has no attributes and a single string meta, "text", which is
    also its only text feature.
    """
    texts = [
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "Duis viverra elit eu mi blandit, {et} sollicitudin nisi ",
        " a porta\tleo. Duis vitae ultrices massa. Mauris ut pulvinar a",
        "tortor. Class (aptent) taciti\nsociosqu ad lit1ora torquent per",
    ]
    metas = np.array([[text] for text in texts])
    text_var = StringVariable("text")
    self.corpus = Corpus(
        Domain([], metas=[text_var]),
        X=np.empty((len(metas), 0)),
        metas=metas,
        text_features=[text_var],
    )
def _filter_tokens(self, corpus: Corpus, callback: Callable, dictionary=None) -> Corpus:
    """
    Filter the tokens of every document with the mask from ``_preprocess``.

    The same mask is applied to the document's POS tags when the corpus has
    them, so tokens and tags stay aligned.

    :param corpus: Corpus
    :param callback: progress callback, called once with ``(0, "Filtering...")``
    :param dictionary: passed on to ``store_tokens`` when not None
    :return: Corpus
        The same corpus with the filtered tokens stored and, if any tags
        were collected, ``pos_tags`` replaced by an object array of them.
    """
    callback(0, "Filtering...")
    kept_tokens, kept_tags = [], []
    for doc_index, doc_tokens in enumerate(corpus.tokens):
        keep_mask = self._preprocess(doc_tokens)
        kept_tokens.append(list(compress(doc_tokens, keep_mask)))
        if corpus.pos_tags is None:
            continue
        doc_tags = corpus.pos_tags[doc_index]
        kept_tags.append(list(compress(doc_tags, keep_mask)))

    # the dictionary is forwarded only when one was given
    store_args = (kept_tokens,) if dictionary is None \
        else (kept_tokens, dictionary)
    corpus.store_tokens(*store_args)

    if kept_tags:
        corpus.pos_tags = np.array(kept_tags, dtype=object)
    return corpus
def add_embedding(corpus: Corpus) -> Corpus:
    """
    Return the corpus transformed to a domain that holds its bag-of-words
    attributes plus, as extra metas, the attributes of a two-component PCA
    projection of the bag-of-words data.
    """
    bow_corpus = BowVectorizer().transform(corpus)
    bow_domain = bow_corpus.domain

    pca_model = PCA(n_components=2)(bow_corpus)
    projection = pca_model(bow_corpus)

    metas = chain(bow_domain.metas, projection.domain.attributes)
    domain = Domain(bow_domain.attributes, bow_domain.class_vars, metas)
    return corpus.transform(domain)
def main():
    """
    Manual preview of the word enrichment widget: the bag-of-words version
    of 'book-excerpts' is the data and its first ten rows are the selection.
    """
    bow_corpus = BowVectorizer().transform(Corpus.from_file('book-excerpts'))

    app = QApplication([])
    widget = OWWordEnrichment()
    widget.set_data(bow_corpus)
    widget.set_data_selected(bow_corpus[:10])
    widget.handleNewSignals()
    widget.show()
    app.exec()
class Vader_Sentiment:
    """Sentiment scorer that adds the four Vader polarity scores to a corpus."""

    # keys read from every polarity result, in output column order
    sentiments = ('pos', 'neg', 'neu', 'compound')
    name = 'Vader'

    @wait_nltk_data
    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()

    def transform(self, corpus, copy=True):
        """
        Score every document and append the scores as attributes.

        :param corpus: corpus whose ``documents`` are scored
        :param copy: when true the attributes are added to a copy of the
            corpus, otherwise to the corpus itself
        :return: the corpus that received one new attribute per entry of
            ``sentiments``
        """
        score_rows = [
            [polarity[key] for key in self.sentiments]
            for polarity in map(self.vader.polarity_scores, corpus.documents)
        ]
        X = np.array(score_rows).reshape((-1, len(self.sentiments)))

        # set compute values
        shared_cv = SharedTransform(self)
        cv = [
            VectorizationComputeValue(shared_cv, col)
            for col in self.sentiments
        ]

        target = corpus.copy() if copy else corpus
        target.extend_attributes(X, self.sentiments, compute_values=cv)
        return target


if __name__ == "__main__":
    corpus = Corpus.from_file('deerwester')
    liu = Liu_Hu_Sentiment('Slovenian')
    corpus2 = liu.transform(corpus[:5])