Пример #1
0
    def test_ngrams_iter(self):
        """Check the output of ``Corpus.ngrams`` and ``Corpus.ngrams_iterator``.

        Three configurations are exercised on the 'deerwester' corpus:
        unigrams (plain and as 1-tuples), 2- to 3-grams (as tuples and joined
        with '-'), and unigrams joined with '_' with POS tags included.
        """
        c = Corpus.from_file('deerwester')
        c.ngram_range = (1, 1)
        self.assertEqual(list(c.ngrams), [doc.lower().split() for doc in c.documents])
        expected = [[(token.lower(), ) for token in doc.split()] for doc in c.documents]
        self.assertEqual(list(c.ngrams_iterator(join_with=None)), expected)
        c.ngram_range = (2, 3)

        expected_ngrams = [('machine', 'interface'), ('for', 'lab'),
                           ('machine', 'interface', 'for'), ('abc', 'computer', 'applications')]

        # The n-grams of the first document do not depend on the loop variable,
        # so build both representations once instead of on every iteration.
        first_doc_tuples = list(c.ngrams_iterator(join_with=None))[0]
        first_doc_joined = list(c.ngrams_iterator(join_with='-'))[0]
        for ngram in expected_ngrams:
            self.assertIn(ngram, first_doc_tuples)
            self.assertIn('-'.join(ngram), first_doc_joined)

        pos_tagger.tag_corpus(c)
        c.ngram_range = (1, 1)
        for doc in c.ngrams_iterator(join_with='_', include_postags=True):
            for token in doc:
                # assertRegex replaces the removed assertRegexpMatches alias;
                # the raw string avoids the invalid '\w' escape in a normal literal.
                self.assertRegex(token, r'\w+_[A-Z]+')
Пример #2
0
    def test_ngrams_iter(self):
        """Check the output of ``Corpus.ngrams`` and ``Corpus.ngrams_iterator``.

        Three configurations are exercised on the 'deerwester' corpus:
        unigrams (plain and as 1-tuples), 2- to 3-grams (as tuples and joined
        with '-'), and unigrams joined with '_' with POS tags included.
        """
        c = Corpus.from_file('deerwester')
        c.ngram_range = (1, 1)
        self.assertEqual(list(c.ngrams),
                         [doc.lower().split() for doc in c.documents])
        expected = [[(token.lower(), ) for token in doc.split()]
                    for doc in c.documents]
        self.assertEqual(list(c.ngrams_iterator(join_with=None)), expected)
        c.ngram_range = (2, 3)

        expected_ngrams = [('machine', 'interface'), ('for', 'lab'),
                           ('machine', 'interface', 'for'),
                           ('abc', 'computer', 'applications')]

        # The n-grams of the first document do not depend on the loop
        # variable, so build both representations once up front.
        first_doc_tuples = list(c.ngrams_iterator(join_with=None))[0]
        first_doc_joined = list(c.ngrams_iterator(join_with='-'))[0]
        for ngram in expected_ngrams:
            self.assertIn(ngram, first_doc_tuples)
            self.assertIn('-'.join(ngram), first_doc_joined)

        pos_tagger.tag_corpus(c)
        c.ngram_range = (1, 1)
        for doc in c.ngrams_iterator(join_with='_', include_postags=True):
            for token in doc:
                # assertRegex replaces the removed assertRegexpMatches alias;
                # the raw string avoids the invalid '\w' escape sequence.
                self.assertRegex(token, r'\w+_[A-Z]+')
Пример #3
0
    def test_extend(self):
        """Check corpus length, tokens and POS tags across ``Corpus.extend``.

        First a tagged corpus is extended with an untagged slice: the length
        grows by five and both ``_tokens`` and ``pos_tags`` are expected to be
        ``None`` afterwards. Then both corpora are tagged and extended again:
        length, tokens and POS tags are all expected to cover every document.
        """
        corpus = Corpus.from_file('deerwester')
        head = corpus[:5]
        self.assertEqual(len(head), 5)
        original_size = len(corpus)

        # Tag only the full corpus; the slice taken earlier stays untagged.
        pos_tagger.tag_corpus(corpus)
        self.assertIsNotNone(corpus._tokens)
        self.assertIsNotNone(corpus.pos_tags)
        self.assertIsNone(head._tokens)
        self.assertIsNone(head.pos_tags)

        # Tagged + untagged.
        corpus.extend(head)
        self.assertEqual(len(corpus), original_size + 5)
        self.assertIsNone(corpus._tokens)
        self.assertIsNone(corpus.pos_tags)

        # Tagged + tagged.
        pos_tagger.tag_corpus(corpus)
        pos_tagger.tag_corpus(head)
        corpus.extend(head)
        grown_size = original_size + 10
        self.assertEqual(len(corpus), grown_size)
        self.assertEqual(len(corpus._tokens), grown_size)
        self.assertEqual(len(corpus.pos_tags), grown_size)
Пример #4
0
            self.is_preprocessed = ''
            self.is_pos_tagged = ''
            self.ngram_range = ''

    def commit(self):
        """Send the matching and the remaining documents to the outputs.

        With no corpus set, ``None`` is sent on both outputs. Otherwise the
        corpus is indexed with ``self.output_mask`` for the matching output,
        and with every other index (in ascending order) for the other output.
        """
        if self.corpus is None:
            self.Outputs.matching_docs.send(None)
            self.Outputs.other_docs.send(None)
            return

        matching = self.corpus[self.output_mask]
        # Set membership keeps the complement computation linear.
        selected = set(self.output_mask)
        remaining_indices = [
            index
            for index in range(len(self.corpus))
            if index not in selected
        ]
        remaining = self.corpus[remaining_indices]

        self.Outputs.matching_docs.send(matching)
        self.Outputs.other_docs.send(remaining)


if __name__ == '__main__':
    # Manual run: show the viewer with the first three documents of a
    # POS-tagged corpus and an n-gram range of (1, 2).
    from orangecontrib.text.tag import pos_tagger

    application = QApplication([])
    viewer = OWCorpusViewer()
    viewer.show()

    sample = Corpus.from_file('book-excerpts')[:3]
    sample = pos_tagger.tag_corpus(sample)
    sample.ngram_range = (1, 2)

    viewer.set_data(sample)
    application.exec()
Пример #5
0
            self.n_matching = ''
            self.n_tokens = ''
            self.n_types = ''
            self.is_preprocessed = ''
            self.is_pos_tagged = ''
            self.ngram_range = ''

    def commit(self):
        """Send the matching and the remaining documents on the two outputs.

        With no corpus set, ``None`` is sent on both ``IO.MATCHED`` and
        ``IO.UNMATCHED``. Otherwise the corpus is indexed with
        ``self.output_mask`` for the matched output, and with every other
        index (in ascending order) for the unmatched output.
        """
        if self.corpus is None:
            self.send(IO.MATCHED, None)
            self.send(IO.UNMATCHED, None)
            return

        matching = self.corpus[self.output_mask]
        # Set membership keeps the complement computation linear.
        selected = set(self.output_mask)
        remaining_indices = [
            index
            for index in range(len(self.corpus))
            if index not in selected
        ]
        remaining = self.corpus[remaining_indices]

        self.send(IO.MATCHED, matching)
        self.send(IO.UNMATCHED, remaining)

if __name__ == '__main__':
    # Manual run: show the viewer with the first three documents of a
    # POS-tagged corpus and an n-gram range of (1, 2).
    from orangecontrib.text.tag import pos_tagger

    application = QApplication([])
    viewer = OWCorpusViewer()
    viewer.show()

    sample = Corpus.from_file('bookexcerpts')[:3]
    sample = pos_tagger.tag_corpus(sample)
    sample.ngram_range = (1, 2)

    viewer.set_data(sample)
    application.exec()