示例#1
0
 def set_corpus(self, data=None):
     """Store the incoming data as the widget corpus and refresh everything.

     ``data`` is kept as-is when it is ``None`` or already a ``Corpus``;
     anything else is converted with ``Corpus.from_table``. The resulting
     corpus is handed to the model, then the widget is updated and the
     output committed.
     """
     self.corpus = data
     already_usable = data is None or isinstance(data, Corpus)
     if not already_usable:
         # Plain tables are wrapped so the rest of the widget sees a Corpus.
         self.corpus = Corpus.from_table(data.domain, data)

     self.model.set_corpus(self.corpus)
     self.update_widget()
     self.commit()
示例#2
0
    def set_data(self, data=None):
        """Accept new input data and rebuild the widget state around it.

        The context is closed and the widget reset first. With ``data`` of
        ``None`` only ``commit`` is called afterwards. Otherwise non-Corpus
        input is converted with ``Corpus.from_table``, the feature lists and
        their index lists are rebuilt from the visible variables and metas,
        the context is reopened, the document views are regenerated, and
        finally ``commit`` is called.
        """
        self.closeContext()
        self.reset_widget()
        self.corpus = data
        self.search_features = []

        if data is None:
            # No input: just propagate the cleared state.
            self.commit()
            return

        if not isinstance(data, Corpus):
            self.corpus = Corpus.from_table(data.domain, data)
        domain = self.corpus.domain

        def visible_features():
            # A fresh list each time so search and display lists stay independent.
            return list(filter_visible(chain(domain.variables, domain.metas)))

        # Enable/disable tokens checkbox
        if not self.corpus.has_tokens():
            self.show_tokens_checkbox.setCheckState(False)
        self.show_tokens_checkbox.setEnabled(self.corpus.has_tokens())

        self.search_features = visible_features()
        self.display_features = visible_features()
        self.search_indices = list(range(len(self.search_features)))
        self.display_indices = list(range(len(self.display_features)))
        self.selection = [0]
        self.openContext(self.corpus)
        # Read after openContext, which runs between the two assignments.
        self.display_list_indices = self.display_indices

        self.regenerate_docs()
        self.list_docs()
        self.update_info()
        self.set_selection()
        self.show_docs()
        self.commit()
示例#3
0
    def open_file(self, path=None, data=None):
        """Load a corpus from ``data`` or from the file at ``path``.

        A truthy ``data`` takes precedence and is converted with
        ``Corpus.from_table``; otherwise ``path`` is read with
        ``Corpus.from_file`` and the corpus is named after the file's base
        name without extension. With neither argument the method only clears
        the errors and the attribute models.

        If reading the file fails, the error is reported through
        ``self.Error.read_file`` and the method returns. If the loaded corpus
        has no text features, an error is shown and ``None`` is sent to the
        corpus output.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                self.corpus = Corpus.from_file(path)
                self.corpus.name = os.path.splitext(os.path.basename(path))[0]
            except Exception as err:
                # Exception (not BaseException) so KeyboardInterrupt and
                # SystemExit still propagate.
                self.Error.read_file(path, str(err))
                # Stop here: the code below dereferences self.corpus, which
                # is either missing or left over from a previous load.
                return
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        self.used_attrs_model.extend(self.used_attrs)
        # Remaining string metas are offered as unused text attributes.
        self.unused_attrs_model.extend(
            [f for f in self.corpus.domain.metas
             if f.is_string and f not in self.used_attrs_model])
示例#4
0
    def open_file(self, path=None, data=None):
        """Populate the widget from ``data`` or from the file at ``path``.

        A truthy ``data`` wins and is converted with ``Corpus.from_table``.
        Otherwise ``path`` is loaded with ``Corpus.from_file`` and the corpus
        takes the file's base name (extension stripped) as its name. When
        both are missing, only the errors and attribute models are cleared.

        A failed file read is reported via ``self.Error.read_file`` and ends
        the call. A corpus without text features triggers an error and sends
        ``None`` on the corpus output.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                self.corpus = Corpus.from_file(path)
                self.corpus.name = os.path.splitext(os.path.basename(path))[0]
            except Exception as err:
                # Only ordinary errors are reported; interpreter-level
                # signals such as KeyboardInterrupt are left to propagate.
                self.Error.read_file(path, str(err))
                # Do not fall through: self.corpus was not replaced, so the
                # code below would crash on None or reuse the old corpus.
                return
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        self.used_attrs_model.extend(self.used_attrs)
        # String metas not already in use become the unused candidates.
        self.unused_attrs_model.extend([
            f for f in self.corpus.domain.metas
            if f.is_string and f not in self.used_attrs_model
        ])
    def test_count_correctness(self):
        """Check bag-of-words counts for the train corpus and the test corpus."""
        vectorizer = BowVectorizer()
        train_bow = vectorizer.transform(self.small_corpus_train)
        self.assert_bow_same(train_bow, self.train_counts, self.terms)

        # Computed from compute_values: the result contains only terms
        # from the train dataset.
        train_domain = train_bow.domain
        test_bow = Corpus.from_table(train_domain, self.small_corpus_test)
        self.assert_bow_same(test_bow, self.test_counts, self.terms)
示例#6
0
 def set_data(self, data=None):
     """Replace the widget's corpus with ``data`` and commit.

     The widget is reset first. Non-``None`` input that is not a ``Corpus``
     is converted with ``Corpus.from_table``, after which features are loaded
     and documents regenerated. ``commit`` runs in every case.
     """
     self.reset_widget()
     self.corpus = data
     has_input = data is not None
     if has_input and not isinstance(data, Corpus):
         self.corpus = Corpus.from_table(data.domain, data)
     if has_input:
         self.load_features()
         self.regenerate_docs()
     self.commit()
示例#7
0
    def test_from_table(self):
        """A Table converted with Corpus.from_table keeps its rows and metas."""
        table = Table.from_file('brown-selected')
        self.assertIsInstance(table, Table)

        corpus = Corpus.from_table(table.domain, table)
        self.assertIsInstance(corpus, Corpus)
        self.assertEqual(len(table), len(corpus))
        np.testing.assert_equal(table.metas, corpus.metas)

        # The first meta of the source table is expected as the only text feature.
        expected_text_features = [table.domain.metas[0]]
        self.assertEqual(corpus.text_features, expected_text_features)
示例#8
0
 def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus:
     state.set_status("Loading")
     corpus = None
     if data:
         corpus = Corpus.from_table(data.domain, data)
     elif path:
         corpus = Corpus.from_file(path)
         corpus.name = os.path.splitext(os.path.basename(path))[0]
     return corpus
示例#9
0
 def set_data(self, data=None):
     """Take new input data, rebuild the corpus state and commit.

     After resetting the widget, ``data`` is stored as the corpus. When it is
     not ``None``, non-Corpus input is first converted with
     ``Corpus.from_table``; then features are loaded and documents
     regenerated. ``commit`` is always the last call.
     """
     self.reset_widget()
     self.corpus = data
     if data is None:
         # Nothing to load; only the cleared state is committed.
         self.commit()
         return

     if not isinstance(data, Corpus):
         self.corpus = Corpus.from_table(data.domain, data)
     self.load_features()
     self.regenerate_docs()
     self.commit()
示例#10
0
    def test_compute_values(self):
        """Rebuilding the corpus in the BoW domain must reproduce the BoW matrix."""
        source = Corpus.from_file('deerwester')
        vectorizer = BowVectorizer()

        transformed = vectorizer.transform(source)
        # Map the untransformed corpus into the transformed domain.
        recomputed = Corpus.from_table(transformed.domain, source)

        self.assertEqual(transformed.domain, recomputed.domain)
        differing = transformed.X != recomputed.X
        self.assertEqual(differing.nnz, 0)
    def test_compute_values(self):
        """BoW features recomputed through the domain must equal the originals."""
        deerwester = Corpus.from_file('deerwester')
        bow = BowVectorizer().transform(deerwester)

        # Converting the raw corpus into the BoW domain recomputes the features.
        bow_domain = bow.domain
        computed = Corpus.from_table(bow_domain, deerwester)

        self.assertEqual(bow_domain, computed.domain)
        mismatch_count = (bow.X != computed.X).nnz
        self.assertEqual(mismatch_count, 0)
示例#12
0
    def test_from_table(self):
        """Corpus.from_table yields a Corpus with the table's length and metas."""
        source_table = Table.from_file('brown-selected')
        self.assertIsInstance(source_table, Table)

        converted = Corpus.from_table(source_table.domain, source_table)
        self.assertIsInstance(converted, Corpus)
        self.assertEqual(len(source_table), len(converted))
        np.testing.assert_equal(source_table.metas, converted.metas)

        # Only the table's first meta should be picked as a text feature.
        first_meta = source_table.domain.metas[0]
        self.assertEqual(converted.text_features, [first_meta])
    def test_compute_values_same_tfidf_regardless_num_documents(self):
        """
        TF-IDF obtained through compute values should not depend on how many
        documents the new corpus has: IDF weighting should consider only the
        counts from the original (training) corpus.
        """
        full_corpus = Corpus.from_file('deerwester')
        train_part, test_part = full_corpus[:5], full_corpus[5:]
        vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)

        train_bow = vectorizer.transform(train_part)
        train_domain = train_bow.domain
        # Same test documents, once without the first row and once complete.
        shorter = Corpus.from_table(train_domain, test_part[1:])
        complete = Corpus.from_table(train_domain, test_part)

        self.assertEqual(shorter.domain, complete.domain)
        self.assertEqual(train_domain, complete.domain)
        differing = shorter.X != complete.X[1:]
        self.assertEqual(differing.nnz, 0)
示例#14
0
 def set_data(self, data=None):
     """Load new input data and forward the resulting corpus.

     The widget is always reset. When ``data`` is ``None`` nothing else
     happens. Otherwise ``data`` becomes the corpus (converted with
     ``Corpus.from_table`` if it is a ``Table`` instance), features are
     loaded, documents regenerated and the corpus sent on ``Output.CORPUS``.
     """
     self.reset_widget()  # Clear any old data.
     if data is None:
         return

     self.corpus = data
     if isinstance(data, Table):
         self.corpus = Corpus.from_table(data.domain, data)
     self.load_features()
     self.regenerate_documents()
     # Send the corpus to output.
     self.send(Output.CORPUS, self.corpus)
示例#15
0
 def set_corpus(self, data=None):
     """Install ``data`` as the corpus and refresh the word state.

     The context is closed, ``data`` is stored (converted with
     ``Corpus.from_table`` unless it is ``None`` or already a ``Corpus``) and
     handed to the model. If no word is supplied on input, the word is
     cleared and the context reopened for the new corpus. ``set_word`` is
     called last in either case.
     """
     self.closeContext()
     self.corpus = data
     needs_wrapping = not (data is None or isinstance(data, Corpus))
     if needs_wrapping:
         self.corpus = Corpus.from_table(data.domain, data)
     self.model.set_corpus(self.corpus)

     word_from_input = self.is_word_on_input
     if not word_from_input:
         # No external word: start from an empty one and reopen the context.
         self.word = ""
         self.openContext(self.corpus)
     self.set_word()
示例#16
0
 def on_data(self, data):
     """Handle new input data and schedule the matching map refresh.

     Truthy non-Corpus input is converted with ``Corpus.from_table``. The
     data is stored and the attribute combo repopulated. With data present,
     ``on_attr_change`` is scheduled; otherwise the selected region is
     cleared and a script that empties the map is scheduled instead.
     """
     if data and not isinstance(data, Corpus):
         data = Corpus.from_table(data.domain, data)
     self.data = data
     self._repopulate_attr_combo(data)

     if data:
         QTimer.singleShot(0, self.on_attr_change)
         return

     # No data: drop the selection and blank the map.
     self.region_selected('')

     def clear_map():
         return self.webview.evalJS('DATA = {}; renderMap();')

     QTimer.singleShot(0, clear_map)
示例#17
0
 def on_data(self, data):
     """Store incoming data and queue the appropriate map update.

     Input that is truthy but not a ``Corpus`` is first converted with
     ``Corpus.from_table``. After storing the data and repopulating the
     attribute combo, either ``on_attr_change`` is queued (data present) or
     the region selection is cleared and a map-clearing script is queued.
     """
     must_convert = bool(data) and not isinstance(data, Corpus)
     if must_convert:
         data = Corpus.from_table(data.domain, data)

     self.data = data
     self._repopulate_attr_combo(data)

     if data:
         deferred = self.on_attr_change
     else:
         self.region_selected('')
         # Evaluated later by the timer, hence wrapped in a callable.
         deferred = lambda: self.webview.evalJS('DATA = {}; renderMap();')
     QTimer.singleShot(0, deferred)
示例#18
0
    def test_from_table_renamed(self):
        """A renamed text meta must still be the text feature after from_table."""
        original = Corpus.from_file('book-excerpts')
        renamed_meta = original.domain.metas[0].renamed("text1")
        new_domain = Domain(original.domain.attributes, metas=[renamed_meta])

        # when text feature renamed
        converted = Corpus.from_table(new_domain, original)
        self.assertIsInstance(converted, Corpus)
        self.assertEqual(len(original), len(converted))
        np.testing.assert_equal(original.metas, converted.metas)

        text_features = converted.text_features
        self.assertEqual(1, len(text_features))
        self.assertEqual("text1", text_features[0].name)
    def test_tfidf_correctness(self):
        """
        Test if computed tf-idfs are correct for the train and test dataset.
        When tf-idf is computed for other data through compute values, the
        weights (idf) must be based on the numbers from the training dataset.
        """
        vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)
        train_bow = vectorizer.transform(self.small_corpus_train)

        # IDF statistics come from the training counts only.
        doc_frequency = (self.train_counts != 0).sum(0)
        n_train_docs = len(self.train_counts)
        idf_weights = np.log(n_train_docs / doc_frequency)

        expected_train = self.train_counts * idf_weights
        self.assert_bow_same(train_bow, expected_train, self.terms)

        test_bow = Corpus.from_table(train_bow.domain, self.small_corpus_test)
        # weights computed based on numbers from training dataset
        expected_test = self.test_counts * idf_weights
        self.assert_bow_same(test_bow, expected_test, self.terms)
    def test_no_overlapping(self):
        """The no_words_overlap error is shown for non-overlapping inputs and
        cleared once overlapping inputs are sent."""
        w = self.widget
        full_domain = self.corpus_vect.domain

        # with one column bow it is easier
        single_column_domain = Domain(
            full_domain.attributes[:1],
            full_domain.class_var,
            full_domain.metas,
        )
        corpus_vect = Corpus.from_table(single_column_domain, self.corpus_vect)

        data_rows = corpus_vect[10:15]
        selected_rows = corpus_vect[4:5]
        self.send_signal(w.Inputs.data, data_rows)
        self.send_signal(w.Inputs.selected_data, selected_rows)
        self.assertTrue(w.Error.no_words_overlap.is_shown())

        # when commands changed on non-valid data
        w.controls.filter_by_p.click()

        self.send_signal(w.Inputs.selected_data, self.subset_corpus)
        self.send_signal(w.Inputs.data, self.corpus_vect)
        self.assertFalse(w.Error.no_words_overlap.is_shown())
示例#21
0
    def test_documents_from_sparse_features(self):
        """documents_from_features must work when the corpus X is sparse."""
        table = Table.from_file('brown-selected')
        corpus = Corpus.from_table(table.domain, table)
        corpus.X = csr_matrix(corpus.X)

        attr_var = corpus.domain.attributes[0]
        class_var = corpus.domain.class_var
        meta_var = corpus.domain.metas[0]

        # docs from X, Y and metas
        docs = corpus.documents_from_features(
            [table.domain.attributes[0], table.domain.class_var, table.domain.metas[0]])
        self.assertEqual(len(docs), len(table))
        rows = zip(table.X[:, 0], corpus.Y, corpus.metas[:, 0], docs)
        for attr_raw, class_raw, meta_raw, doc in rows:
            attr_text = attr_var.str_val(attr_raw)
            class_text = class_var.str_val(class_raw)
            meta_text = meta_var.str_val(meta_raw)
            # Every source value must appear in the generated document.
            for expected in (class_text, attr_text, meta_text):
                self.assertIn(expected, doc)

        # docs only from sparse X
        docs = corpus.documents_from_features([table.domain.attributes[0]])
        self.assertEqual(len(docs), len(table))
        for attr_raw, doc in zip(table.X[:, 0], docs):
            self.assertIn(attr_var.str_val(attr_raw), doc)
示例#22
0
    def test_documents_from_sparse_features(self):
        """Documents built from features contain the values of a sparse corpus."""
        source = Table.from_file('brown-selected')
        sparse_corpus = Corpus.from_table(source.domain, source)
        sparse_corpus.X = csr_matrix(sparse_corpus.X)
        first_attribute = source.domain.attributes[0]

        # docs from X, Y and metas
        mixed_features = [first_attribute, source.domain.class_var, source.domain.metas[0]]
        docs = sparse_corpus.documents_from_features(mixed_features)
        self.assertEqual(len(docs), len(source))
        first_column = source.X[:, 0]
        for x_val, y_val, m_val, document in zip(
                first_column, sparse_corpus.Y, sparse_corpus.metas[:, 0], docs):
            # Compare on the string representation each variable produces.
            x_str = sparse_corpus.domain.attributes[0].str_val(x_val)
            y_str = sparse_corpus.domain.class_var.str_val(y_val)
            m_str = sparse_corpus.domain.metas[0].str_val(m_val)
            self.assertIn(y_str, document)
            self.assertIn(x_str, document)
            self.assertIn(m_str, document)

        # docs only from sparse X
        docs = sparse_corpus.documents_from_features([first_attribute])
        self.assertEqual(len(docs), len(source))
        for x_val, document in zip(source.X[:, 0], docs):
            x_str = sparse_corpus.domain.attributes[0].str_val(x_val)
            self.assertIn(x_str, document)
def addMetaDataColumn(corpus, columnData, columnDomain):
    """Return a new Corpus built from ``corpus`` plus the given column.

    The domain comes from ``addMetaDomain(corpus.domain, columnDomain)`` and
    the rows from ``addMetaData(corpus, columnData)``. They are assembled into
    a ``Table`` and then converted with ``Corpus.from_table``.
    """
    extended_domain = addMetaDomain(corpus.domain, columnDomain)
    extended_rows = addMetaData(corpus, columnData)
    intermediate_table = Table.from_list(extended_domain, extended_rows)
    return Corpus.from_table(extended_domain, intermediate_table)
示例#24
0
 def commit(self):
     """Send the rows selected by ``output_mask`` as a corpus.

     Nothing is sent while ``output_mask`` is ``None``. Otherwise a corpus
     restricted to those row indices is built from the current corpus and
     sent on ``Output.CORPUS``.
     """
     mask = self.output_mask
     if mask is None:
         return

     source = self.corpus
     selected = Corpus.from_table(source.domain, source, row_indices=mask)
     self.send(Output.CORPUS, selected)
示例#25
0
    def test_compute_values(self):
        """Features recomputed through the domain must match the transform output."""
        transformed = self.method.transform(self.corpus)
        # Map the untransformed corpus into the transformed domain.
        recomputed = Corpus.from_table(transformed.domain, self.corpus)

        self.assertEqual(transformed.domain, recomputed.domain)
        values_match = (transformed.X == recomputed.X).all()
        self.assertTrue(values_match)
示例#26
0
    def test_compute_values(self):
        """The sentiment domain applied to the raw corpus reproduces the same X."""
        sentiment = self.method.transform(self.corpus)
        sentiment_domain = sentiment.domain
        # from_table with the sentiment domain recomputes the features.
        computed = Corpus.from_table(sentiment_domain, self.corpus)

        self.assertEqual(sentiment_domain, computed.domain)
        same_everywhere = (sentiment.X == computed.X).all()
        self.assertTrue(same_everywhere)