def setUp(self):
    """Prepare the shared fixtures: the 'deerwester' corpus and a preprocessor list."""
    self.corpus = Corpus.from_file("deerwester")
    # Stages are instantiated in the same order as they appear in the list.
    pipeline = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.NGrams(),
        tag.AveragedPerceptronTagger(),
    ]
    self.pp_list = pipeline
def test_preprocess(self):
    """A Preprocessor configured with a POS tagger must populate ``pos_tags``.

    The preprocessor is applied in place to a freshly loaded 'deerwester'
    corpus, after which ``corpus.pos_tags`` is expected to be set.
    """
    pr = preprocess.Preprocessor(
        # Raw string: '\w' in a plain literal is an invalid escape sequence
        # and triggers a warning on modern Python versions.
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        pos_tagger=tag.AveragedPerceptronTagger(),
    )
    corpus = Corpus.from_file('deerwester')
    pr(corpus, inplace=True)
    self.assertIsNotNone(corpus.pos_tags)
def test_reset_pos_tags(self):
    """Tagging yields non-empty ``pos_tags``; a tokenized corpus has falsy ``pos_tags``."""
    source = Corpus.from_file('deerwester')

    pos_tagger = tag.AveragedPerceptronTagger()
    with_tags = pos_tagger(source)
    self.assertTrue(len(with_tags.pos_tags))

    # NOTE(review): the tokenizer is applied to the originally loaded corpus,
    # not to the tagged result above — confirm this is the intended input.
    word_tokenizer = preprocess.RegexpTokenizer(pattern=r'\w')
    retokenized = word_tokenizer(source)
    self.assertFalse(retokenized.pos_tags)
def test_POSTagger(self):
    """``tag_corpus`` must expose ``pos_tags`` with one tag per token in every document."""
    tagged = tag.AveragedPerceptronTagger().tag_corpus(
        Corpus.from_file('deerwester')
    )

    self.assertTrue(hasattr(tagged, 'pos_tags'))
    # A per-token regex check ('[a-z]+_[A-Z]+') used to be sketched here; it
    # is disabled, so only the token/tag length parity is verified.
    for doc_tokens, doc_tags in zip(tagged.tokens, tagged.pos_tags):
        self.assertEqual(len(doc_tokens), len(doc_tags))
def test_pos_filter(self):
    """Filtering on the "NN" tag keeps 5 tokens and 5 tags in the first document."""
    noun_filter = preprocess.PosTagFilter("NN")

    # Tokenize and tag first so the filter has POS tags to work with.
    processed = self.corpus
    for step in (preprocess.WordPunctTokenizer(),
                 tag.AveragedPerceptronTagger()):
        processed = step(processed)

    filtered = noun_filter(processed)
    self.assertTrue(len(filtered.pos_tags))
    self.assertEqual(len(filtered.pos_tags[0]), 5)
    self.assertEqual(len(filtered.tokens[0]), 5)
def test_filter_pos_tags(self):
    """Tokens and POS tags must stay aligned after a stopword filter runs.

    The first document's meta text is replaced with a known sentence, then
    LowercaseTransformer, WordPunctTokenizer, AveragedPerceptronTagger and
    StopwordsFilter are applied in that order.
    """
    steps = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        tag.AveragedPerceptronTagger(),
        preprocess.StopwordsFilter(),
    ]

    processed = self.corpus
    # The corpus has to be unlocked before its metas can be overwritten.
    with processed.unlocked():
        processed.metas[0, 0] = "This is the most beautiful day in the world"

    for step in steps:
        processed = step(processed)

    all_tokens = processed.tokens
    all_tags = processed.pos_tags
    self.assertEqual(len(all_tokens), len(all_tags))
    self.assertEqual(len(processed.tokens[0]), len(processed.pos_tags[0]))
    self.assertEqual(processed.tokens[0], ["beautiful", "day", "world"])
    self.assertEqual(processed.pos_tags[0], ["JJ", "NN", "NN"])
def setUp(self):
    """Create the per-test fixtures: a perceptron POS tagger and the 'deerwester' corpus."""
    # Build both objects first (tagger, then corpus), then attach them.
    pos_tagger = tag.AveragedPerceptronTagger()
    deerwester = Corpus.from_file('deerwester')
    self.tagger, self.corpus = pos_tagger, deerwester