Пример #1
0
	def test_get_term_idx_and_x(self):
		"""Check the term index store and per-document counts of a tiny corpus.

		Two whitespace-parsed documents are fed through
		CorpusFromParsedDocuments; the factory's term index store must hold
		exactly seven terms, and the factory's `_X` matrix must hold the
		expected count for each (document, term) pair.
		"""
		parsed_docs = [whitespace_nlp(text) for text in ('aa aa bb.', 'bb aa a.')]
		df = pd.DataFrame({'category': ['a', 'b'], 'parsed': parsed_docs})
		corpus_fact = CorpusFromParsedDocuments(
			df, category_col='category', parsed_col='parsed')
		corpus = corpus_fact.build()

		# Split the store's item pairs into their first and second components.
		store_items = list(corpus_fact._term_idx_store.items())
		first_components = sorted(first for first, _ in store_items)
		second_components = sorted(second for _, second in store_items)
		self.assertEqual(first_components, list(range(7)))
		self.assertEqual(
			second_components,
			['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

		# (document row, term, expected count), checked in this order.
		expected_counts = (
			(0, 'aa', 2),
			(0, 'bb', 1),
			(0, 'aa aa', 1),
			(0, 'aa bb', 1),
			(0, 'bb aa', 0),
			(1, 'bb', 1),
			(1, 'aa', 1),
			(1, 'a', 1),
			(1, 'bb aa', 1),
			(1, 'aa aa', 0),
			(1, 'aa a', 1),
		)
		for doc_row, term, expected in expected_counts:
			term_col = corpus_fact._term_idx_store.getidx(term)
			self.assertEqual(corpus_fact._X[doc_row, term_col], expected)

		self.assertTrue(isinstance(corpus, ParsedCorpus))
    def test_hamlet(self):
        """Build a corpus from the Hamlet documents and check counts of 'play'.

        The documents come from get_hamlet_docs(), are labelled with
        get_hamlet_snippet_binary_category() and parsed with whitespace_nlp().
        The test then checks the per-category frequency of 'play', that
        searching for 'play' only returns rows whose parsed text contains it,
        and that the 'play' column of the factory's term-document matrix sums
        to the same total.
        """
        raw_docs = get_hamlet_docs()
        categories = [
            get_hamlet_snippet_binary_category(doc) for doc in raw_docs
        ]
        docs = [whitespace_nlp(doc) for doc in raw_docs]
        df = pd.DataFrame({'category': categories, 'parsed': docs})
        corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
        corpus = corpus_fact.build()
        tdf = corpus.get_term_freq_df()

        # `DataFrame.ix` was removed in pandas 1.0; `.loc` is the label-based
        # lookup that works on both old and current pandas versions.
        self.assertEqual(list(tdf.loc['play']), [37, 5])

        play_hits = corpus.search('play')
        # 'plfay' is a deliberate non-word: no matching row may contain it.
        self.assertFalse(
            any(play_hits.apply(
                lambda row: 'plfay' in str(row['parsed']), axis=1)))
        self.assertTrue(
            all(play_hits.apply(
                lambda row: 'play' in str(row['parsed']), axis=1)))

        # TODO: verify the whole term-document matrix, not just this column.
        play_term_idx = corpus_fact._term_idx_store.getidx('play')
        play_X = corpus_fact._X.todok()[:, play_term_idx]

        self.assertEqual(play_X.sum(), 37 + 5)
	def test_get_term_idx_and_x(self):
		"""Verify term indexing and term counts for a two-document corpus.

		Builds a corpus from two whitespace-parsed documents, then asserts
		that the factory's term index store contains seven entries and that
		the factory's `_X` matrix holds the expected counts per document.
		"""
		first_doc = whitespace_nlp('aa aa bb.')
		second_doc = whitespace_nlp('bb aa a.')
		frame = pd.DataFrame({
			'category': ['a', 'b'],
			'parsed': [first_doc, second_doc],
		})
		corpus_fact = CorpusFromParsedDocuments(
			frame, category_col='category', parsed_col='parsed')
		corpus = corpus_fact.build()

		pairs = list(corpus_fact._term_idx_store.items())
		lefts = []
		rights = []
		for left, right in pairs:
			lefts.append(left)
			rights.append(right)
		self.assertEqual(sorted(lefts), list(range(7)))
		self.assertEqual(
			sorted(rights),
			['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

		# Expected (term, count) pairs, grouped by document row.
		expectations = (
			(0, (('aa', 2), ('bb', 1), ('aa aa', 1), ('aa bb', 1),
			     ('bb aa', 0))),
			(1, (('bb', 1), ('aa', 1), ('a', 1), ('bb aa', 1),
			     ('aa aa', 0), ('aa a', 1))),
		)
		for row, term_counts in expectations:
			for term, wanted in term_counts:
				column = corpus_fact._term_idx_store.getidx(term)
				self.assertEqual(corpus_fact._X[row, column], wanted)

		self.assertTrue(isinstance(corpus, ParsedCorpus))
	def test_hamlet(self):
		"""Build a corpus from the Hamlet documents and check counts of 'play'.

		The documents come from get_hamlet_docs(), are labelled with
		get_hamlet_snippet_binary_category() and parsed with whitespace_nlp().
		The test then checks the per-category frequency of 'play', that
		searching for 'play' only returns rows whose parsed text contains it,
		and that the 'play' column of the factory's term-document matrix sums
		to the same total.
		"""
		raw_docs = get_hamlet_docs()
		categories = [get_hamlet_snippet_binary_category(doc) for doc in raw_docs]
		docs = [whitespace_nlp(doc) for doc in raw_docs]
		df = pd.DataFrame({'category': categories,
		                   'parsed': docs})
		corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
		corpus = corpus_fact.build()
		tdf = corpus.get_term_freq_df()

		# `DataFrame.ix` was removed in pandas 1.0; `.loc` is the label-based
		# lookup that works on both old and current pandas versions.
		self.assertEqual(list(tdf.loc['play']), [37, 5])

		play_hits = corpus.search('play')
		# 'plfay' is a deliberate non-word: no matching row may contain it.
		self.assertFalse(any(play_hits.apply(lambda row: 'plfay' in str(row['parsed']), axis=1)))
		self.assertTrue(all(play_hits.apply(lambda row: 'play' in str(row['parsed']), axis=1)))

		# TODO: verify the whole term-document matrix, not just this column.
		play_term_idx = corpus_fact._term_idx_store.getidx('play')
		play_X = corpus_fact._X.todok()[:, play_term_idx]

		self.assertEqual(play_X.sum(), 37 + 5)