def set_corpus(self, data=None):
    self.corpus = data
    if data is not None and not isinstance(data, Corpus):
        self.corpus = Corpus.from_table(data.domain, data)
    self.model.set_corpus(self.corpus)
    self.update_widget()
    self.commit()
def set_data(self, data=None):
    self.closeContext()
    self.reset_widget()
    self.corpus = data
    self.search_features = []
    if data is not None:
        if not isinstance(data, Corpus):
            self.corpus = Corpus.from_table(data.domain, data)
        domain = self.corpus.domain
        # Enable/disable tokens checkbox
        if not self.corpus.has_tokens():
            self.show_tokens_checkbox.setCheckState(False)
        self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())

        self.search_features = list(filter_visible(chain(domain.variables, domain.metas)))
        self.display_features = list(filter_visible(chain(domain.variables, domain.metas)))
        self.search_indices = list(range(len(self.search_features)))
        self.display_indices = list(range(len(self.display_features)))
        self.selection = [0]
        self.openContext(self.corpus)
        self.display_list_indices = self.display_indices
        self.regenerate_docs()
        self.list_docs()
        self.update_info()
        self.set_selection()
        self.show_docs()
    self.commit()
def open_file(self, path=None, data=None):
    self.closeContext()
    self.Error.clear()
    self.unused_attrs_model[:] = []
    self.used_attrs_model[:] = []
    if data:
        self.corpus = Corpus.from_table(data.domain, data)
    elif path:
        try:
            self.corpus = Corpus.from_file(path)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        except BaseException as err:
            self.Error.read_file(path, str(err))
    else:
        return

    self.update_info()
    self.used_attrs = list(self.corpus.text_features)
    if not self.corpus.text_features:
        self.Error.corpus_without_text_features()
        self.Outputs.corpus.send(None)
        return
    self.openContext(self.corpus)
    self.used_attrs_model.extend(self.used_attrs)
    self.unused_attrs_model.extend(
        [f for f in self.corpus.domain.metas
         if f.is_string and f not in self.used_attrs_model])
def test_count_correctness(self):
    """Test that computed counts are correct for the train and test datasets."""
    bow = BowVectorizer().transform(self.small_corpus_train)
    self.assert_bow_same(bow, self.train_counts, self.terms)

    # computed from compute_values - result contains only terms from the train dataset
    bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
    self.assert_bow_same(bow_test, self.test_counts, self.terms)
def set_data(self, data=None):
    self.reset_widget()
    self.corpus = data
    if data is not None:
        if not isinstance(data, Corpus):
            self.corpus = Corpus.from_table(data.domain, data)
        self.load_features()
        self.regenerate_docs()
    self.commit()
def test_from_table(self):
    t = Table.from_file('brown-selected')
    self.assertIsInstance(t, Table)

    c = Corpus.from_table(t.domain, t)
    self.assertIsInstance(c, Corpus)
    self.assertEqual(len(t), len(c))
    np.testing.assert_equal(t.metas, c.metas)
    self.assertEqual(c.text_features, [t.domain.metas[0]])
def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus:
    state.set_status("Loading")
    corpus = None
    if data:
        corpus = Corpus.from_table(data.domain, data)
    elif path:
        corpus = Corpus.from_file(path)
        corpus.name = os.path.splitext(os.path.basename(path))[0]
    return corpus
def test_compute_values(self):
    corpus = Corpus.from_file('deerwester')
    vect = BowVectorizer()
    bow = vect.transform(corpus)

    computed = Corpus.from_table(bow.domain, corpus)
    self.assertEqual(bow.domain, computed.domain)
    self.assertEqual((bow.X != computed.X).nnz, 0)
def test_compute_values_same_tfidf_regardless_num_documents(self):
    """
    When computing TF-IDF from compute values, the result should be the same
    regardless of the length of the new corpus - IDF weighting should consider
    only counts from the original (training) corpus.
    """
    corpus = Corpus.from_file('deerwester')
    train_corpus = corpus[:5]
    test_corpus = corpus[5:]

    vect = BowVectorizer(wglobal=BowVectorizer.IDF)
    bow = vect.transform(train_corpus)
    computed1 = Corpus.from_table(bow.domain, test_corpus[1:])
    computed2 = Corpus.from_table(bow.domain, test_corpus)

    self.assertEqual(computed1.domain, computed2.domain)
    self.assertEqual(bow.domain, computed2.domain)
    self.assertEqual((computed1.X != computed2.X[1:]).nnz, 0)
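# A minimal standalone sketch of the pattern the tests above exercise: the
# vectorizer's compute values live on bow.domain, so converting any other
# corpus through that domain re-applies the train-fitted vocabulary and IDF
# weights. Import paths are the usual orange3-text ones; the variable names
# (train, new_docs, projected) are illustrative, not taken from the source.
from orangecontrib.text import Corpus
from orangecontrib.text.vectorization import BowVectorizer

corpus = Corpus.from_file('deerwester')
train, new_docs = corpus[:5], corpus[5:]

bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(train)
# Same columns and same IDF weights as the training BoW, applied to new documents.
projected = Corpus.from_table(bow.domain, new_docs)
assert projected.domain == bow.domain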
def set_data(self, data=None):
    self.reset_widget()  # Clear any old data.
    if data is not None:
        self.corpus = data
        if isinstance(data, Table):
            self.corpus = Corpus.from_table(data.domain, data)
        self.load_features()
        self.regenerate_documents()
        # Send the corpus to output.
        self.send(Output.CORPUS, self.corpus)
def set_corpus(self, data=None):
    self.closeContext()
    self.corpus = data
    if data is not None and not isinstance(data, Corpus):
        self.corpus = Corpus.from_table(data.domain, data)
    self.model.set_corpus(self.corpus)
    if not self.is_word_on_input:
        self.word = ""
        self.openContext(self.corpus)
    self.set_word()
def on_data(self, data):
    if data and not isinstance(data, Corpus):
        data = Corpus.from_table(data.domain, data)
    self.data = data
    self._repopulate_attr_combo(data)
    if not data:
        self.region_selected('')
        QTimer.singleShot(0, lambda: self.webview.evalJS('DATA = {}; renderMap();'))
    else:
        QTimer.singleShot(0, self.on_attr_change)
def test_from_table_renamed(self):
    c1 = Corpus.from_file('book-excerpts')
    # when text feature renamed
    new_domain = Domain(c1.domain.attributes,
                        metas=[c1.domain.metas[0].renamed("text1")])

    c2 = Corpus.from_table(new_domain, c1)
    self.assertIsInstance(c2, Corpus)
    self.assertEqual(len(c1), len(c2))
    np.testing.assert_equal(c1.metas, c2.metas)
    self.assertEqual(1, len(c2.text_features))
    self.assertEqual("text1", c2.text_features[0].name)
def test_tfidf_correctness(self):
    """
    Test that computed tf-idfs are correct for the train and test datasets.
    When computing tf-idf on the test dataset (via compute values), the
    weights (idf) must be computed from counts in the training dataset.
    """
    bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(
        self.small_corpus_train)

    document_appearance = (self.train_counts != 0).sum(0)
    n = len(self.train_counts)
    idfs_train = self.train_counts * np.log(n / document_appearance)
    self.assert_bow_same(bow, idfs_train, self.terms)

    bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
    # weights computed from counts in the training dataset
    idfs_test = self.test_counts * np.log(n / document_appearance)
    self.assert_bow_same(bow_test, idfs_test, self.terms)
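# For reference, the weighting asserted above is plain (non-smoothed) IDF:
#     weight(term, doc) = count(term, doc) * log(N_train / df_train(term))
# where N_train and df_train(term) come from the training corpus only, which is
# why the test reuses the same `n` and `document_appearance` for the test split.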
def test_no_overlapping(self):
    w = self.widget
    # a one-column bow makes this easier
    corpus_vect = Corpus.from_table(
        Domain(self.corpus_vect.domain.attributes[:1],
               self.corpus_vect.domain.class_var,
               self.corpus_vect.domain.metas),
        self.corpus_vect)
    self.send_signal(w.Inputs.data, corpus_vect[10:15])
    self.send_signal(w.Inputs.selected_data, corpus_vect[4:5])
    self.assertTrue(self.widget.Error.no_words_overlap.is_shown())

    # when controls change while the data is non-valid
    w.controls.filter_by_p.click()
    self.send_signal(w.Inputs.selected_data, self.subset_corpus)
    self.send_signal(w.Inputs.data, self.corpus_vect)
    self.assertFalse(self.widget.Error.no_words_overlap.is_shown())
def test_documents_from_sparse_features(self):
    t = Table.from_file('brown-selected')
    c = Corpus.from_table(t.domain, t)
    c.X = csr_matrix(c.X)

    # docs from X, Y and metas
    docs = c.documents_from_features(
        [t.domain.attributes[0], t.domain.class_var, t.domain.metas[0]])
    self.assertEqual(len(docs), len(t))
    for first_attr, class_val, meta_attr, d in zip(t.X[:, 0], c.Y,
                                                   c.metas[:, 0], docs):
        first_attr = c.domain.attributes[0].str_val(first_attr)
        class_val = c.domain.class_var.str_val(class_val)
        meta_attr = c.domain.metas[0].str_val(meta_attr)
        self.assertIn(class_val, d)
        self.assertIn(first_attr, d)
        self.assertIn(meta_attr, d)

    # docs only from sparse X
    docs = c.documents_from_features([t.domain.attributes[0]])
    self.assertEqual(len(docs), len(t))
    for first_attr, d in zip(t.X[:, 0], docs):
        first_attr = c.domain.attributes[0].str_val(first_attr)
        self.assertIn(first_attr, d)
def addMetaDataColumn(corpus, columnData, columnDomain):
    # addMetaDomain / addMetaData are helpers assumed to be defined elsewhere.
    newDomain = addMetaDomain(corpus.domain, columnDomain)
    newArray = addMetaData(corpus, columnData)
    newTable = Table.from_list(newDomain, newArray)
    newCorpus = Corpus.from_table(newDomain, newTable)
    return newCorpus
def commit(self):
    if self.output_mask is not None:
        output_corpus = Corpus.from_table(self.corpus.domain, self.corpus,
                                          row_indices=self.output_mask)
        self.send(Output.CORPUS, output_corpus)
def test_compute_values(self):
    sentiment = self.method.transform(self.corpus)
    computed = Corpus.from_table(sentiment.domain, self.corpus)

    self.assertEqual(sentiment.domain, computed.domain)
    self.assertTrue((sentiment.X == computed.X).all())