def test_args(self):
    """Check that constructor arguments select the expected weighting.

    A custom global weight that always returns 1 is registered under the
    key ``'const'``. With it, the COUNT and BINARY local weights without
    normalization must produce the same corpus as a vectorizer built with
    only ``wlocal`` set, and with L1 normalization every row must sum to 1.
    """
    corpus = Corpus.from_file('deerwester')

    # ``wglobals`` lives on the class, so the temporary entry would
    # otherwise stay registered for every test that runs afterwards.
    BowVectorizer.wglobals['const'] = lambda df, N: 1
    self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

    vect = BowVectorizer(norm=BowVectorizer.NONE,
                         wlocal=BowVectorizer.COUNT,
                         wglobal='const')
    self.assertEqualCorpus(
        vect.transform(corpus),
        BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

    vect = BowVectorizer(norm=BowVectorizer.NONE,
                         wlocal=BowVectorizer.BINARY,
                         wglobal='const')
    self.assertEqualCorpus(
        vect.transform(corpus),
        BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

    vect = BowVectorizer(norm=BowVectorizer.L1,
                         wlocal=BowVectorizer.COUNT,
                         wglobal='const')
    x = vect.transform(corpus).X
    # Total absolute deviation of the row sums from 1 must vanish.
    self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
def test_transform(self):
    """Transforming 'deerwester' gives a Corpus whose domain has length 43."""
    vectorizer = BowVectorizer()
    documents = Corpus.from_file('deerwester')
    bow = vectorizer.transform(documents)

    self.assertIsInstance(bow, Corpus)
    domain_size = len(bow.domain)
    self.assertEqual(domain_size, 43)
def test_transform(self):
    """Transforming 'deerwester' gives a Corpus with 43 domain variables."""
    bow = BowVectorizer().transform(Corpus.from_file('deerwester'))

    self.assertIsInstance(bow, Corpus)
    variable_count = len(bow.domain.variables)
    self.assertEqual(variable_count, 43)
def test_empty_corpus(self):
    """
    Vectorizing a corpus without documents returns an equal corpus.
    GH-247
    """
    empty = Corpus.from_file("deerwester")[:0]
    transformed = BowVectorizer(norm=BowVectorizer.L1).transform(empty)
    self.assertEqual(transformed, empty)
def test_compute_values(self):
    """Rebuilding the corpus from the BoW domain reproduces domain and X."""
    original = Corpus.from_file('deerwester')
    bow = BowVectorizer().transform(original)

    # Converting the raw corpus into the BoW domain goes through the
    # domain's compute values instead of the vectorizer itself.
    recomputed = Corpus.from_table(bow.domain, original)

    self.assertEqual(bow.domain, recomputed.domain)
    differing = bow.X != recomputed.X
    self.assertEqual(differing.nnz, 0)
def test_callback(self):
    """The progress callback is invoked with 0.3, 0.6, 0.9 and 1 in order."""
    vectorizer = BowVectorizer()
    documents = Corpus.from_file("deerwester")
    progress = MagicMock()

    bow = vectorizer.transform(documents, callback=progress)

    self.assertIsInstance(bow, Corpus)
    self.assertEqual(len(bow.domain.variables), 43)
    expected_calls = [call(step) for step in (0.3, 0.6, 0.9, 1)]
    progress.assert_has_calls(expected_calls)
def test_ngrams(self):
    """Uni-, bi- and trigrams of the first document become attributes."""
    vect = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    # Raw string: '\w' is not a valid string escape, so the plain literal
    # raises an invalid-escape warning on current Python versions.
    corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
    corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
    result = vect.transform(corpus)

    attrs = [attr.name for attr in result.domain.attributes]
    first_doc = corpus.tokens[0]
    self.assertIn(first_doc[1], attrs)
    self.assertIn(' '.join(first_doc[:2]), attrs)
    self.assertIn(' '.join(first_doc[:3]), attrs)
def test_ngrams(self):
    """Uni-, bi- and trigrams of the first document become attributes."""
    vect = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    # Raw string: '\w' is not a valid string escape, so the plain literal
    # raises an invalid-escape warning on current Python versions.
    pr = preprocess.Preprocessor(
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        ngrams_range=(1, 3))
    pr(corpus, inplace=True)
    result = vect.transform(corpus)

    attrs = [attr.name for attr in result.domain.attributes]
    first_doc = corpus.tokens[0]
    self.assertIn(first_doc[1], attrs)
    self.assertIn(' '.join(first_doc[:2]), attrs)
    self.assertIn(' '.join(first_doc[:3]), attrs)
def main():
    """Show OWWordEnrichment fed with a bag-of-words of 'book-excerpts'.

    The full vectorized corpus is sent as data and its first ten
    documents as the selected subset.
    """
    documents = Corpus.from_file('book-excerpts')
    bow = BowVectorizer().transform(documents)

    app = QApplication([])
    widget = OWWordEnrichment()
    widget.set_data(bow)
    widget.set_data_selected(bow[:10])
    widget.handleNewSignals()
    widget.show()
    app.exec()
def test_domain(self):
    """Attributes are sorted by name and only weight tokens of the document."""
    vectorizer = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    bow = vectorizer.transform(corpus)

    names = [variable.name for variable in bow.domain.attributes]
    self.assertEqual(names, sorted(names))

    dense = bow.X.toarray()
    for weights, doc_tokens in zip(dense, corpus.tokens):
        # Every attribute with a noticeable weight must be one of the
        # tokens of the corresponding document.
        weighted = [name for weight, name in zip(weights, names)
                    if weight > .001]
        for name in weighted:
            self.assertIn(name, doc_tokens)
def test_args(self):
    """Check that constructor arguments select the expected weighting.

    A custom global weight that always returns 1 is registered under the
    key ``'const'``. With it, the COUNT and BINARY local weights without
    normalization must produce the same corpus as a vectorizer built with
    only ``wlocal`` set, and with L1 normalization every row must sum to 1.
    """
    corpus = Corpus.from_file('deerwester')

    # ``wglobals`` lives on the class, so the temporary entry would
    # otherwise stay registered for every test that runs afterwards.
    BowVectorizer.wglobals['const'] = lambda df, N: 1
    self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

    vect = BowVectorizer(norm=BowVectorizer.NONE,
                         wlocal=BowVectorizer.COUNT,
                         wglobal='const')
    self.assertEqualCorpus(
        vect.transform(corpus),
        BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

    vect = BowVectorizer(norm=BowVectorizer.NONE,
                         wlocal=BowVectorizer.BINARY,
                         wglobal='const')
    self.assertEqualCorpus(
        vect.transform(corpus),
        BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

    vect = BowVectorizer(norm=BowVectorizer.L1,
                         wlocal=BowVectorizer.COUNT,
                         wglobal='const')
    x = vect.transform(corpus).X
    # Total absolute deviation of the row sums from 1 must vanish.
    self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
def tests_duplicated_names(self):
    """
    BOW adds words to the domain; when an attribute with the same name
    is already present in the domain, the existing attribute is renamed
    by appending a number to its name.
    """
    corpus = Corpus.from_file("deerwester")
    for extra_name in ("human", "testtest"):
        column = np.ones((len(corpus), 1))
        corpus = corpus.extend_attributes(column, [extra_name])

    out = BowVectorizer().transform(corpus)

    # the first attribute was in the data before BoW ran and clashes
    # with a BoW word, so it is expected under its renamed form
    self.assertEqual("human (1)", out.domain[0].name)
    self.assertEqual("testtest", out.domain[1].name)

    # the attributes from index 1 on must include the "human" word
    # attribute
    later_names = [variable.name for variable in out.domain.attributes[1:]]
    self.assertIn("human", later_names)
def test_compute_values_same_tfidf_regardless_num_documents(self):
    """
    TF-IDF obtained through compute values must not depend on how many
    documents the new corpus has - the IDF weighting should use only the
    counts from the original (training) corpus.
    """
    corpus = Corpus.from_file('deerwester')
    train_corpus, test_corpus = corpus[:5], corpus[5:]

    bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(train_corpus)

    # Same documents, once without the first one and once in full.
    without_first = Corpus.from_table(bow.domain, test_corpus[1:])
    complete = Corpus.from_table(bow.domain, test_corpus)

    self.assertEqual(without_first.domain, complete.domain)
    self.assertEqual(bow.domain, complete.domain)
    differing = without_first.X != complete.X[1:]
    self.assertEqual(differing.nnz, 0)
def tfidf_keywords(
        corpus: Corpus,
        progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using TF-IDF.

    Parameters
    ----------
    corpus : Corpus
        Corpus whose documents are vectorized and scored.
    progress_callback : callable
        Function for reporting progress. It is called once per document,
        before that document is processed, with ``index / n_documents``.

    Returns
    -------
    keywords : list
        One list per document holding ``(word, weight)`` pairs for the
        entries of that document's bag-of-words row reported as non-zero.
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    # empty X part - to know that every feature of X is bag of words
    domain = Domain([],
                    class_vars=corpus.domain.class_vars,
                    metas=corpus.domain.metas)
    corpus = corpus.from_table(domain, corpus)

    vectorizer = BowVectorizer(
        wlocal=BowVectorizer.COUNT,
        # A single document is vectorized without the IDF global weight.
        wglobal=BowVectorizer.IDF if len(corpus) > 1 else BowVectorizer.NONE,
        norm=BowVectorizer.L2,
    )
    res = vectorizer.transform(corpus)
    X, words = res.X, [a.name for a in res.domain.attributes]

    keywords = []
    n_docs = X.shape[0]
    for doc_idx, row in enumerate(X):
        progress_callback(doc_idx / n_docs)
        nonzero = row.nonzero()
        # Column indices are only available when nonzero() returns both a
        # row and a column index array.
        if len(nonzero) > 1:
            keywords.append(
                [(words[col], row[0, col]) for col in nonzero[1]])
        else:
            keywords.append([])
    return keywords
def test_result(self):
    """p-values and FDR values for six chosen words match the references."""
    preprocessor = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
    corpus = preprocessor(Corpus.from_file("book-excerpts")[::3])
    bow = BowVectorizer().transform(corpus)

    # Restrict the bag of words to a handful of known words.
    words = ["beheld", "events", "dragged", "basin", "visit", "have"]
    word_domain = Domain([bow.domain[word] for word in words])
    bow = bow.transform(word_domain)

    self.send_signal(self.widget.Inputs.data, bow)
    self.send_signal(self.widget.Inputs.selected_data, bow[:1])
    self.wait_until_finished(timeout=100000)

    expected_p_values = [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872]
    expected_fdr_values = [0.12766, 1, 0.12766, 0.12766, 0.12766, 1]
    results = self.widget.results
    np.testing.assert_array_almost_equal(
        results.p_values, expected_p_values, decimal=5)
    np.testing.assert_array_almost_equal(
        results.fdr_values, expected_fdr_values, decimal=5)
def fp(score):
    """Format a score for display: fixed-point above 10e-3, else exponent."""
    if score > 10e-3:
        return "%0.5f" % score
    return "%0.1e" % score


def fpt(score):
    """Format a score for tooltips; like ``fp`` but with more digits."""
    if score > 10e-3:
        return "%0.9f" % score
    return "%0.5e" % score


class EATreeWidgetItem(QTreeWidgetItem):
    """Tree row showing a word with its p-value and FDR value.

    The raw values are kept in ``data`` so that rows are ordered by the
    underlying values instead of by the formatted text.
    """

    def __init__(self, word, p_value, f_value, parent):
        super().__init__(parent)
        # Indexed by column: 0 = word, 1 = p-value, 2 = FDR value.
        self.data = [word, p_value, f_value]
        self.setText(0, word)
        for column, value in ((1, p_value), (2, f_value)):
            self.setText(column, fp(value))
            self.setToolTip(column, fpt(value))

    def __lt__(self, other):
        # Compare on whichever column the tree is currently sorted by.
        column = self.treeWidget().sortColumn()
        return self.data[column] < other.data[column]


if __name__ == '__main__':
    from orangewidget.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.vectorization import BowVectorizer

    # Preview the widget with the first ten documents as the selection.
    corpus = Corpus.from_file('book-excerpts')
    corpus_vect = BowVectorizer().transform(corpus)
    WidgetPreview(OWWordEnrichment).run(
        set_data_selected=corpus_vect[:10],
        set_data=corpus_vect,
    )
def setUp(self):
    """Create the widget, a bag-of-words corpus and a 5-document subset."""
    self.widget = self.create_widget(OWWordEnrichment)
    # Only every third document is used.
    every_third = Corpus.from_file('book-excerpts')[::3]
    self.corpus_vect = BowVectorizer().transform(every_third)
    self.subset_corpus = self.corpus_vect[:5]
def setUp(self):
    """Create the widget and vectorize the whole 'book-excerpts' corpus."""
    self.widget = self.create_widget(
        OWWordEnrichment)  # type: OWWordEnrichment
    self.corpus = Corpus.from_file('book-excerpts')
    self.corpus_vect = BowVectorizer().transform(self.corpus)
def test_binary(self):
    """With the BINARY local weight the largest matrix entry is 1."""
    vectorizer = BowVectorizer(wlocal=BowVectorizer.BINARY)
    documents = Corpus.from_file('deerwester')
    largest = vectorizer.transform(documents).X.max()
    self.assertEqual(largest, 1.)