def test_args(self):
        """
        Check the ``norm``, ``wlocal`` and ``wglobal`` constructor arguments.

        A custom global weight that always returns 1, combined with no
        normalization, must give the same corpus as a vectorizer built with
        only the matching ``wlocal``. With L1 normalization every row of the
        resulting matrix must sum to 1.
        """
        corpus = Corpus.from_file('deerwester')

        # ``wglobals`` lives on the class, so the custom entry would leak into
        # every test that runs afterwards; drop it again once this test ends.
        BowVectorizer.wglobals['const'] = lambda df, N: 1
        self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')

        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.BINARY,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        # Total absolute deviation of the row sums from 1 must vanish.
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
    def test_transform(self):
        """
        Transforming 'deerwester' with default settings returns a Corpus
        whose domain has length 43.
        """
        bow = BowVectorizer().transform(Corpus.from_file('deerwester'))

        self.assertIsInstance(bow, Corpus)
        self.assertEqual(len(bow.domain), 43)
    def test_transform(self):
        """
        A default vectorizer turns 'deerwester' into a Corpus with
        43 domain variables.
        """
        vectorizer = BowVectorizer()
        documents = Corpus.from_file('deerwester')

        transformed = vectorizer.transform(documents)
        n_variables = len(transformed.domain.variables)

        self.assertIsInstance(transformed, Corpus)
        self.assertEqual(n_variables, 43)
 def test_empty_corpus(self):
     """
     Vectorizing a corpus sliced down to zero documents must give back a
     corpus equal to the input.
     GH-247
     """
     empty = Corpus.from_file("deerwester")[:0]
     transformed = BowVectorizer(norm=BowVectorizer.L1).transform(empty)
     self.assertEqual(transformed, empty)
    def test_compute_values(self):
        """
        Converting the raw corpus into the bag-of-words domain must reproduce
        both the domain and the matrix obtained by ``transform``.
        """
        corpus = Corpus.from_file('deerwester')
        bow = BowVectorizer().transform(corpus)

        recomputed = Corpus.from_table(bow.domain, corpus)

        self.assertEqual(bow.domain, recomputed.domain)
        # No stored entry may differ between the two matrices.
        mismatches = bow.X != recomputed.X
        self.assertEqual(mismatches.nnz, 0)
    def test_compute_values(self):
        """
        ``Corpus.from_table`` with the transformed domain must yield the same
        domain and the same X as the direct transformation.
        """
        source = Corpus.from_file('deerwester')
        vectorizer = BowVectorizer()

        direct = vectorizer.transform(source)
        via_domain = Corpus.from_table(direct.domain, source)

        self.assertEqual(direct.domain, via_domain.domain)
        n_different = (direct.X != via_domain.X).nnz
        self.assertEqual(n_different, 0)
 def test_empty_corpus(self):
     """
     An L1-normalizing vectorizer applied to an empty slice of the corpus
     returns a corpus equal to that empty slice.
     GH-247
     """
     no_documents = Corpus.from_file("deerwester")[:0]
     vectorizer = BowVectorizer(norm=BowVectorizer.L1)
     self.assertEqual(vectorizer.transform(no_documents), no_documents)
示例#8
0
    def test_callback(self):
        """
        ``transform`` must report progress through ``callback`` with the
        values 0.3, 0.6, 0.9 and 1, in that order, and still return the
        usual 43-variable Corpus.
        """
        vectorizer = BowVectorizer()
        documents = Corpus.from_file("deerwester")
        progress = MagicMock()

        transformed = vectorizer.transform(documents, callback=progress)

        self.assertIsInstance(transformed, Corpus)
        self.assertEqual(len(transformed.domain.variables), 43)
        expected_calls = [call(step) for step in (0.3, 0.6, 0.9, 1)]
        progress.assert_has_calls(expected_calls)
 def test_ngrams(self):
     """
     After tokenizing and building 1- to 3-grams, a unigram, the first
     bigram and the first trigram of the first document must all appear
     among the bag-of-words attribute names.
     """
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: '\w' in a plain literal is an invalid escape sequence.
     corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
     corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     self.assertIn(corpus.tokens[0][1], attrs)
     self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
     self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)
示例#10
0
 def test_ngrams(self):
     """
     With a preprocessor producing 1- to 3-grams applied in place, a
     unigram, the first bigram and the first trigram of the first document
     must all appear among the bag-of-words attribute names.
     """
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: '\w' in a plain literal is an invalid escape sequence.
     pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                                  ngrams_range=(1, 3))
     pr(corpus, inplace=True)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     self.assertIn(corpus.tokens[0][1], attrs)
     self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
     self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)
示例#11
0
def main():
    """
    Show the OWWordEnrichment widget on a bag-of-words of 'book-excerpts',
    using the first ten documents as the selected data.
    """
    documents = Corpus.from_file('book-excerpts')
    bow = BowVectorizer().transform(documents)

    # The application object is created before the widget is constructed.
    application = QApplication([])
    enrichment = OWWordEnrichment()

    enrichment.set_data(bow)
    enrichment.set_data_selected(bow[:10])
    enrichment.handleNewSignals()

    enrichment.show()
    application.exec()
    def test_domain(self):
        """
        Attribute names must come out sorted, and every cell with a weight
        above 0.001 must belong to a token present in that document.
        """
        vectorizer = BowVectorizer()
        documents = Corpus.from_file('deerwester')

        transformed = vectorizer.transform(documents)
        names = [variable.name for variable in transformed.domain.attributes]
        self.assertEqual(names, sorted(names))

        dense = transformed.X.toarray()
        for doc_index in range(len(documents)):
            weights = dense[doc_index]
            for weight, name in zip(weights, names):
                if weight > .001:
                    self.assertIn(name, documents.tokens[doc_index])
示例#13
0
    def test_domain(self):
        """
        The bag-of-words attributes are ordered alphabetically and a word
        with weight above 0.001 in a row occurs in that document's tokens.
        """
        bow_vectorizer = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        bow = bow_vectorizer.transform(corpus)
        words = [a.name for a in bow.domain.attributes]
        self.assertEqual(words, sorted(words))

        matrix = bow.X.toarray()
        for row in range(len(corpus)):
            # Collect the words with a noticeable weight, then check each one.
            present = [word for value, word in zip(matrix[row], words)
                       if value > .001]
            for word in present:
                self.assertIn(word, corpus.tokens[row])
示例#14
0
    def test_args(self):
        """
        Check the ``norm``, ``wlocal`` and ``wglobal`` constructor arguments.

        A constant global weight of 1 without normalization must match a
        vectorizer configured with only the same ``wlocal``; with L1
        normalization each row of the matrix must sum to 1.
        """
        corpus = Corpus.from_file('deerwester')

        # The registry is a class attribute shared by all tests; undo the
        # registration when this test finishes so it cannot leak.
        BowVectorizer.wglobals['const'] = lambda df, N: 1
        self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')

        self.assertEqualCorpus(vect.transform(corpus),
                               BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.BINARY,
                             wglobal='const')
        self.assertEqualCorpus(vect.transform(corpus),
                               BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        # Summed absolute deviation of the row sums from 1 must be ~0.
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
示例#15
0
 def tests_duplicated_names(self):
     """
     BOW adds words to the domain; when an attribute with the same name is
     already present, the existing attribute is renamed by appending a
     number to its name.
     """
     corpus = Corpus.from_file("deerwester")
     for extra_name in ("human", "testtest"):
         ones_column = np.ones((len(corpus), 1))
         corpus = corpus.extend_attributes(ones_column, [extra_name])

     transformed = BowVectorizer().transform(corpus)

     # "human" was in the data before BOW ran, so it is the one renamed;
     # "testtest" does not clash and keeps its name
     self.assertEqual("human (1)", transformed.domain[0].name)
     self.assertEqual("testtest", transformed.domain[1].name)
     # the attributes from index 1 onwards must contain the "human" word
     later_names = [v.name for v in transformed.domain.attributes[1:]]
     self.assertIn("human", later_names)
    def test_compute_values_same_tfidf_regardless_num_documents(self):
        """
        TF-IDF obtained through compute values must not depend on how many
        documents the new corpus holds - the IDF part should use only the
        counts of the corpus the vectorizer was fitted on.
        """
        documents = Corpus.from_file('deerwester')
        train, test = documents[:5], documents[5:]
        vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)

        fitted = vectorizer.transform(train)
        # Same test documents, once without the first one and once complete.
        shorter = Corpus.from_table(fitted.domain, test[1:])
        complete = Corpus.from_table(fitted.domain, test)

        self.assertEqual(shorter.domain, complete.domain)
        self.assertEqual(fitted.domain, complete.domain)
        differing = shorter.X != complete.X[1:]
        self.assertEqual(differing.nnz, 0)
示例#17
0
def tfidf_keywords(
        corpus: Corpus,
        progress_callback: Callable = None) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using TF-IDF.

    Parameters
    ----------
    corpus : Corpus
        Corpus to extract keywords from. Its existing attributes are dropped
        before vectorization; class variables and metas are kept.
    progress_callback : callable
        Function for reporting progress. It is called once per document
        with ``index / number_of_documents``. When None, ``dummy_callback``
        is used.

    Returns
    -------
    keywords : list
        One list per document, holding ``(word, weight)`` tuples for the
        entries that ``nonzero()`` reports for that document's row.
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    # empty X part - to know that every feature of X is bag of words
    domain = Domain([],
                    class_vars=corpus.domain.class_vars,
                    metas=corpus.domain.metas)
    corpus = corpus.from_table(domain, corpus)

    # IDF is only requested when there is more than one document.
    vectorizer = BowVectorizer(
        wlocal=BowVectorizer.COUNT,
        wglobal=BowVectorizer.IDF if len(corpus) > 1 else BowVectorizer.NONE,
        norm=BowVectorizer.L2,
    )
    res = vectorizer.transform(corpus)
    X, words = res.X, [a.name for a in res.domain.attributes]

    keywords = []
    n_docs = X.shape[0]
    for doc_index, row in enumerate(X):
        progress_callback(doc_index / n_docs)
        nonzero = row.nonzero()
        # NOTE(review): a 2-D row yields a (row_indices, column_indices)
        # pair; any other result is treated as "no keywords" - confirm
        # whether X can ever be something other than a 2-D sparse matrix.
        if len(nonzero) > 1:
            keywords.append(
                [(words[col], row[0, col]) for col in nonzero[1]])
        else:
            keywords.append([])
    return keywords
    def test_result(self):
        """
        Send a six-word bag-of-words of every third 'book-excerpts' document
        to the widget, select the first document, and compare the resulting
        p-values and FDR values with fixed expected numbers.
        """
        preprocessors = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
        documents = preprocessors(Corpus.from_file("book-excerpts")[::3])
        bow = BowVectorizer().transform(documents)

        # Restrict the domain to a handful of words.
        words = ["beheld", "events", "dragged", "basin", "visit", "have"]
        reduced_domain = Domain([bow.domain[word] for word in words])
        bow = bow.transform(reduced_domain)

        self.send_signal(self.widget.Inputs.data, bow)
        self.send_signal(self.widget.Inputs.selected_data, bow[:1])
        self.wait_until_finished(timeout=100000)

        expected_p = [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872]
        expected_fdr = [0.12766, 1, 0.12766, 0.12766, 0.12766, 1]
        results = self.widget.results
        np.testing.assert_array_almost_equal(
            results.p_values, expected_p, decimal=5)
        np.testing.assert_array_almost_equal(
            results.fdr_values, expected_fdr, decimal=5)
示例#19
0

def fp(score):
    """
    Format a score for display: five decimals when it is greater than 0.01,
    otherwise scientific notation with one decimal.
    """
    if score > 10e-3:
        return "%0.5f" % score
    return "%0.1e" % score


def fpt(score):
    """
    Format a score with more precision: nine decimals when it is greater
    than 0.01, otherwise scientific notation with five decimals.
    """
    if score > 10e-3:
        return "%0.9f" % score
    return "%0.5e" % score


class EATreeWidgetItem(QTreeWidgetItem):
    """
    Tree item with three columns: a word, its p-value and its f-value.

    The raw values are kept in ``self.data`` so that sorting compares them
    rather than the formatted cell text.
    """

    def __init__(self, word, p_value, f_value, parent):
        super().__init__(parent)
        self.data = [word, p_value, f_value]
        self.setText(0, word)
        # Columns 1 and 2 show the short format, with the longer format
        # available as a tooltip.
        for column, value in enumerate((p_value, f_value), start=1):
            self.setText(column, fp(value))
            self.setToolTip(column, fpt(value))

    def __lt__(self, other):
        # Compare on whichever column the owning tree is sorted by.
        sort_column = self.treeWidget().sortColumn()
        mine, theirs = self.data[sort_column], other.data[sort_column]
        return mine < theirs


if __name__ == '__main__':
    from orangewidget.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.vectorization import BowVectorizer

    # Preview the widget on a bag-of-words of 'book-excerpts', with the
    # first ten documents passed as the selected data.
    documents = Corpus.from_file('book-excerpts')
    bow_corpus = BowVectorizer().transform(documents)
    selected_documents = bow_corpus[:10]
    WidgetPreview(OWWordEnrichment).run(set_data_selected=selected_documents,
                                        set_data=bow_corpus)
 def setUp(self):
     """
     Create the widget, vectorize every third 'book-excerpts' document and
     keep the first five vectorized documents as a subset.
     """
     self.widget = self.create_widget(OWWordEnrichment)
     every_third = Corpus.from_file('book-excerpts')[::3]
     self.corpus_vect = BowVectorizer().transform(every_third)
     self.subset_corpus = self.corpus_vect[:5]
示例#21
0
 def setUp(self):
     """
     Create the widget and store both the raw 'book-excerpts' corpus and
     its bag-of-words transformation on the test case.
     """
     # type: OWWordEnrichment
     self.widget = self.create_widget(OWWordEnrichment)
     self.corpus = Corpus.from_file('book-excerpts')
     self.corpus_vect = BowVectorizer().transform(self.corpus)
示例#22
0
 def test_binary(self):
     """
     With BINARY local weighting the largest value in X must be 1.
     """
     binary_vectorizer = BowVectorizer(wlocal=BowVectorizer.BINARY)
     documents = Corpus.from_file('deerwester')
     largest = binary_vectorizer.transform(documents).X.max()
     self.assertEqual(largest, 1.)
 def test_binary(self):
     """
     BINARY local weights: the maximum entry of the transformed matrix
     equals 1.
     """
     vectorizer = BowVectorizer(wlocal=BowVectorizer.BINARY)
     transformed = vectorizer.transform(Corpus.from_file('deerwester'))
     matrix = transformed.X
     self.assertEqual(matrix.max(), 1.)