def test_get_term_idx_and_x(self):
    """Check the term index store and per-document counts of a tiny corpus.

    Builds a two-document corpus (categories 'a' and 'b') and verifies
    that the factory's term index store holds exactly seven entries with
    indices 0..6, and that the matrix ``_X`` holds the expected count for
    each (document, term) pair listed below.
    """
    parsed_docs = [whitespace_nlp('aa aa bb.'), whitespace_nlp('bb aa a.')]
    df = pd.DataFrame({'category': ['a', 'b'], 'parsed': parsed_docs})
    corpus_fact = CorpusFromParsedDocuments(df,
                                            category_col='category',
                                            parsed_col='parsed')
    corpus = corpus_fact.build()

    # The store's items() yields pairs whose first element is the integer
    # index and whose second element is the term string.
    index_term_pairs = list(corpus_fact._term_idx_store.items())
    indices = [pair[0] for pair in index_term_pairs]
    terms = [pair[1] for pair in index_term_pairs]
    self.assertEqual(sorted(indices), list(range(7)))
    self.assertEqual(sorted(terms),
                     ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

    # (document row, term, expected count in that document)
    expected_counts = [
        (0, 'aa', 2),
        (0, 'bb', 1),
        (0, 'aa aa', 1),
        (0, 'aa bb', 1),
        (0, 'bb aa', 0),
        (1, 'bb', 1),
        (1, 'aa', 1),
        (1, 'a', 1),
        (1, 'bb aa', 1),
        (1, 'aa aa', 0),
        (1, 'aa a', 1),
    ]
    for doc_row, term, expected in expected_counts:
        term_col = corpus_fact._term_idx_store.getidx(term)
        self.assertEqual(corpus_fact._X[doc_row, term_col], expected)

    self.assertTrue(isinstance(corpus, ParsedCorpus))
def test_hamlet(self):
    """Build a corpus from the Hamlet snippets and check the term 'play'.

    Verifies that the 'play' row of the term-frequency frame is
    ``[37, 5]``, that every search hit for 'play' contains that string,
    and that the 'play' column of the factory's matrix ``_X`` sums to
    the same total.
    """
    raw_docs = get_hamlet_docs()
    categories = [get_hamlet_snippet_binary_category(doc) for doc in raw_docs]
    docs = [whitespace_nlp(doc) for doc in raw_docs]
    df = pd.DataFrame({'category': categories, 'parsed': docs})
    corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
    corpus = corpus_fact.build()

    tdf = corpus.get_term_freq_df()
    # `.ix` was removed in pandas 1.0; `.loc` is the label-based
    # equivalent for looking up the 'play' row.
    self.assertEqual(list(tdf.loc['play']), [37, 5])

    play_hits = corpus.search('play')
    # Sanity check: a deliberately misspelled token must not appear in any hit.
    self.assertFalse(
        any(play_hits.apply(lambda row: 'plfay' in str(row['parsed']), axis=1)))
    self.assertTrue(
        all(play_hits.apply(lambda row: 'play' in str(row['parsed']), axis=1)))

    # TODO: verify the full term-document matrix, not just the 'play' column.
    play_term_idx = corpus_fact._term_idx_store.getidx('play')
    play_X = corpus_fact._X.todok()[:, play_term_idx]
    self.assertEqual(play_X.sum(), 37 + 5)
def test_get_term_idx_and_x(self):
    """Verify term indexing and term counts for a two-document corpus.

    The corpus is built from two whitespace-parsed documents, one per
    category. The test asserts that the term index store exposes seven
    entries indexed 0 through 6, and that ``_X`` contains the expected
    count for every (document, term) pair checked.
    """
    df = pd.DataFrame({
        'category': ['a', 'b'],
        'parsed': [whitespace_nlp('aa aa bb.'), whitespace_nlp('bb aa a.')],
    })
    corpus_fact = CorpusFromParsedDocuments(
        df, category_col='category', parsed_col='parsed')
    corpus = corpus_fact.build()

    store_items = list(corpus_fact._term_idx_store.items())
    found_indices = sorted(idx for idx, _ in store_items)
    found_terms = sorted(term for _, term in store_items)
    self.assertEqual(found_indices, list(range(7)))
    self.assertEqual(
        found_terms,
        ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

    # Expected term counts, grouped by document row and checked in order.
    expected_by_doc = (
        (0, (('aa', 2), ('bb', 1), ('aa aa', 1), ('aa bb', 1), ('bb aa', 0))),
        (1, (('bb', 1), ('aa', 1), ('a', 1), ('bb aa', 1), ('aa aa', 0),
             ('aa a', 1))),
    )
    for doc_row, term_counts in expected_by_doc:
        for term, expected in term_counts:
            term_col = corpus_fact._term_idx_store.getidx(term)
            self.assertEqual(corpus_fact._X[doc_row, term_col], expected)

    self.assertTrue(isinstance(corpus, ParsedCorpus))
def test_hamlet(self):
    """Check term frequencies and search results for 'play' in Hamlet.

    Builds a corpus from the Hamlet snippets, then asserts that the
    'play' row of the term-frequency frame equals ``[37, 5]``, that all
    rows returned by ``corpus.search('play')`` contain 'play', and that
    the matching column of the factory's matrix ``_X`` sums to 42.
    """
    raw_docs = get_hamlet_docs()
    categories = [get_hamlet_snippet_binary_category(doc) for doc in raw_docs]
    docs = [whitespace_nlp(doc) for doc in raw_docs]
    df = pd.DataFrame({'category': categories, 'parsed': docs})
    corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
    corpus = corpus_fact.build()

    tdf = corpus.get_term_freq_df()
    # Use label-based `.loc`; the `.ix` indexer no longer exists in
    # pandas >= 1.0 and would raise AttributeError here.
    self.assertEqual(list(tdf.loc['play']), [37, 5])

    search_results = corpus.search('play')
    # A misspelled token should never appear in the hits for 'play'.
    self.assertFalse(any(
        search_results.apply(lambda x: 'plfay' in str(x['parsed']), axis=1)))
    self.assertTrue(all(
        search_results.apply(lambda x: 'play' in str(x['parsed']), axis=1)))

    # TODO: verify the whole term-document matrix, not only the 'play' column.
    play_term_idx = corpus_fact._term_idx_store.getidx('play')
    play_X = corpus_fact._X.todok()[:, play_term_idx]
    self.assertEqual(play_X.sum(), 37 + 5)