def test_ngrams_iter(self):
    """Check ``ngrams`` / ``ngrams_iterator`` for several n-gram ranges.

    Covers three cases on the 'deerwester' corpus:
    unigrams (as plain tokens and as 1-tuples), 2- and 3-grams (as tuples
    and joined with '-'), and unigrams joined with their POS tags.
    """
    c = Corpus.from_file('deerwester')

    # Unigrams: `ngrams` yields lower-cased whitespace tokens per document.
    c.ngram_range = (1, 1)
    self.assertEqual(list(c.ngrams),
                     [doc.lower().split() for doc in c.documents])
    # With join_with=None every n-gram is a tuple, so unigrams are 1-tuples.
    expected = [[(token.lower(), ) for token in doc.split()]
                for doc in c.documents]
    self.assertEqual(list(c.ngrams_iterator(join_with=None)), expected)

    # Bigrams and trigrams expected in the first document.
    c.ngram_range = (2, 3)
    expected_ngrams = [('machine', 'interface'), ('for', 'lab'),
                       ('machine', 'interface', 'for'),
                       ('abc', 'computer', 'applications')]
    # The iterator output does not depend on the loop variable, so build
    # the first document's n-grams once instead of once per assertion.
    first_doc_tuples = list(c.ngrams_iterator(join_with=None))[0]
    first_doc_joined = list(c.ngrams_iterator(join_with='-'))[0]
    for ngram in expected_ngrams:
        self.assertIn(ngram, first_doc_tuples)
        self.assertIn('-'.join(ngram), first_doc_joined)

    # After POS tagging, each unigram is expected as "<token>_<TAG>".
    pos_tagger.tag_corpus(c)
    c.ngram_range = (1, 1)
    for doc in c.ngrams_iterator(join_with='_', include_postags=True):
        for token in doc:
            # assertRegex replaces the deprecated assertRegexpMatches alias;
            # the raw string avoids an invalid "\w" escape in the literal.
            self.assertRegex(token, r'\w+_[A-Z]+')
def test_extend(self):
    """Check how ``Corpus.extend`` affects length, tokens and POS tags."""
    corpus = Corpus.from_file('deerwester')
    head = corpus[:5]
    self.assertEqual(len(head), 5)

    original_size = len(corpus)

    # Tag only the full corpus; the slice taken earlier stays untagged.
    pos_tagger.tag_corpus(corpus)
    self.assertIsNot(corpus._tokens, None)
    self.assertIsNot(corpus.pos_tags, None)
    self.assertIs(head._tokens, None)
    self.assertIs(head.pos_tags, None)

    # Extending with an untagged corpus: tokens and tags are reset to None.
    corpus.extend(head)
    self.assertEqual(len(corpus), original_size + 5)
    self.assertIs(corpus._tokens, None)
    self.assertIs(corpus.pos_tags, None)

    # Extending when both corpora are tagged: tokens and tags cover all docs.
    pos_tagger.tag_corpus(corpus)
    pos_tagger.tag_corpus(head)
    corpus.extend(head)
    self.assertEqual(len(corpus), original_size + 10)
    self.assertEqual(len(corpus._tokens), original_size + 10)
    self.assertEqual(len(corpus.pos_tags), original_size + 10)
# NOTE(review): these three assignments are the tail of a method whose header
# lies outside this excerpt; they are kept verbatim. The original indentation
# was lost, so nesting below is reconstructed from syntax only.
self.is_preprocessed = ''
self.is_pos_tagged = ''
self.ngram_range = ''


def commit(self):
    """Send the documents selected by ``output_mask`` and the remaining ones.

    Documents indexed by ``self.output_mask`` go to ``Outputs.matching_docs``;
    all other documents go to ``Outputs.other_docs``. When no corpus is set,
    ``None`` is sent on both outputs.
    """
    if self.corpus is None:
        self.Outputs.matching_docs.send(None)
        self.Outputs.other_docs.send(None)
        return

    matched = self.corpus[self.output_mask]
    # A set makes the per-index membership test below cheap.
    selected = set(self.output_mask)
    remaining = [
        idx for idx in range(len(self.corpus)) if idx not in selected
    ]
    unmatched = self.corpus[remaining]
    self.Outputs.matching_docs.send(matched)
    self.Outputs.other_docs.send(unmatched)


if __name__ == '__main__':
    # Manual demo: show the viewer with a small POS-tagged corpus.
    from orangecontrib.text.tag import pos_tagger

    app = QApplication([])
    widget = OWCorpusViewer()
    widget.show()

    corpus = Corpus.from_file('book-excerpts')[:3]
    corpus = pos_tagger.tag_corpus(corpus)
    corpus.ngram_range = (1, 2)
    widget.set_data(corpus)

    app.exec()
# NOTE(review): these six assignments are the tail of a method whose header
# lies outside this excerpt; they are kept verbatim. The original indentation
# was lost, so nesting below is reconstructed from syntax only.
self.n_matching = ''
self.n_tokens = ''
self.n_types = ''
self.is_preprocessed = ''
self.is_pos_tagged = ''
self.ngram_range = ''


def commit(self):
    """Send the documents selected by ``output_mask`` and the remaining ones.

    Documents indexed by ``self.output_mask`` are sent on ``IO.MATCHED``;
    all other documents are sent on ``IO.UNMATCHED``. When no corpus is set,
    ``None`` is sent on both channels.
    """
    if self.corpus is None:
        self.send(IO.MATCHED, None)
        self.send(IO.UNMATCHED, None)
        return

    matched = self.corpus[self.output_mask]
    # A set makes the per-index membership test below cheap.
    selected = set(self.output_mask)
    remaining = [
        idx for idx in range(len(self.corpus)) if idx not in selected
    ]
    unmatched = self.corpus[remaining]
    self.send(IO.MATCHED, matched)
    self.send(IO.UNMATCHED, unmatched)


if __name__ == '__main__':
    # Manual demo: show the viewer with a small POS-tagged corpus.
    from orangecontrib.text.tag import pos_tagger

    app = QApplication([])
    widget = OWCorpusViewer()
    widget.show()

    corpus = Corpus.from_file('bookexcerpts')[:3]
    corpus = pos_tagger.tag_corpus(corpus)
    corpus.ngram_range = (1, 2)
    widget.set_data(corpus)

    app.exec()