def test_word_analyzer_unigrams_and_bigrams(): wa = WordNGramAnalyzer(min_n=1, max_n=2) text = u"J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon." expected = [u'ai', u'mange', u'du', u'kangourou', u'ce', u'midi', u'etait', u'pas', u'tres', u'bon', u'ai mange', u'mange du', u'du kangourou', u'kangourou ce', u'ce midi', u'midi etait', u'etait pas', u'pas tres', u'tres bon'] assert_equal(wa.analyze(text), expected)
def test_word_analyzer_unigrams(): wa = WordNGramAnalyzer(min_n=1, max_n=1) text = u"J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon." expected = [u'ai', u'mange', u'du', u'kangourou', u'ce', u'midi', u'etait', u'pas', u'tres', u'bon'] assert_equal(wa.analyze(text), expected) text = "This is a test, really.\n\n I met Harry yesterday." expected = [u'this', u'is', u'test', u'really', u'met', u'harry', u'yesterday'] assert_equal(wa.analyze(text), expected)
def test_dense_vectorizer(): wa = WordNGramAnalyzer() train_data = [wa.analyze(d) for d in JUNK_FOOD_DOCS[:-1]] test_data = [wa.analyze(JUNK_FOOD_DOCS[-1])] # test without vocabulary v1 = TermCountVectorizer() counts_train = v1.transform(train_data) assert_equal(counts_train[0, v1.vocabulary["pizza"]], 2) v2 = TermCountVectorizer(vocabulary=v1.vocabulary) # test with a pre-existing vocabulary for v in (v1, v2): counts_test = v.transform(test_data) assert_equal(counts_test[0, v.vocabulary["coke"]], 1) # test tf-idf t1 = TfidfTransformer() tfidf = t1.fit(counts_train).transform(counts_train) assert_equal(len(t1.idf), len(v1.vocabulary)) assert_equal(tfidf.shape, (len(train_data), len(v1.vocabulary))) # test tf-idf with new data tfidf_test = t1.transform(counts_test) assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary))) # test tf alone t2 = TfidfTransformer(use_idf=False) tf = t2.fit(counts_train).transform(counts_train) assert_equal(t2.idf, None) assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * len(train_data)) # test the direct tfidf vectorizer # (equivalent to term count vectorizer + tfidf transformer) tv = TfidfVectorizer() tfidf2 = tv.fit(train_data).transform(train_data) assert_array_almost_equal(tfidf, tfidf2) # test the direct tfidf vectorizer with new data tfidf_test2 = tv.transform(test_data) assert_array_almost_equal(tfidf_test, tfidf_test2)