def tf_idf(tag_matrix):
    """Return the TF-IDF weighted version of ``tag_matrix`` as a dense matrix.

    A ``TfidfTransformer`` is fitted on ``tag_matrix`` and then applied to the
    same data, with IDF re-weighting enabled and row normalization disabled.

    Parameters
    ----------
    tag_matrix
        Count matrix accepted by ``TfidfTransformer.fit`` / ``transform``
        (presumably documents x tags -- TODO confirm against the caller).

    Returns
    -------
    The result of calling ``.todense()`` on the transformed matrix.
    """
    # ``norm`` must be given by keyword: the constructor arguments of
    # TfidfTransformer are keyword-only in current scikit-learn releases, so
    # the former positional ``None`` raises TypeError there.  ``norm=None``
    # keeps the original meaning (no row normalization).
    transformer = TfidfTransformer(norm=None, use_idf=True)
    weighted = transformer.fit(tag_matrix).transform(tag_matrix)
    return weighted.todense()
def test_dense_vectorizer():
    """Check term counting and TF-IDF weighting on pre-analyzed documents.

    Every entry of ``JUNK_FOOD_DOCS`` except the last is analyzed and used as
    training data; the last entry is the held-out test sample.
    """
    analyzer = WordNGramAnalyzer()
    train_data = [analyzer.analyze(doc) for doc in JUNK_FOOD_DOCS[:-1]]
    test_data = [analyzer.analyze(JUNK_FOOD_DOCS[-1])]

    # Vectorizer created without a vocabulary.
    v1 = TermCountVectorizer()
    counts_train = v1.transform(train_data)
    pizza_column = v1.vocabulary["pizza"]
    assert_equal(counts_train[0, pizza_column], 2)

    # Second vectorizer that is handed the vocabulary held by the first one.
    v2 = TermCountVectorizer(vocabulary=v1.vocabulary)

    # Both vectorizers must count "coke" once in the test sample.  Note that
    # ``counts_test`` from the last iteration (v2) is reused further down.
    for vectorizer in (v1, v2):
        counts_test = vectorizer.transform(test_data)
        coke_column = vectorizer.vocabulary["coke"]
        assert_equal(counts_test[0, coke_column], 1)

    # TF-IDF fitted on, and applied to, the training counts.
    t1 = TfidfTransformer()
    fitted_t1 = t1.fit(counts_train)
    tfidf = fitted_t1.transform(counts_train)
    assert_equal(len(t1.idf), len(v1.vocabulary))
    expected_train_shape = (len(train_data), len(v1.vocabulary))
    assert_equal(tfidf.shape, expected_train_shape)

    # The fitted transformer applied to unseen counts.
    tfidf_test = t1.transform(counts_test)
    expected_test_shape = (len(test_data), len(v1.vocabulary))
    assert_equal(tfidf_test.shape, expected_test_shape)

    # Term frequencies only: no IDF vector is stored and rows sum to one.
    t2 = TfidfTransformer(use_idf=False)
    fitted_t2 = t2.fit(counts_train)
    tf = fitted_t2.transform(counts_train)
    assert_equal(t2.idf, None)
    row_totals = np.sum(tf, axis=1)
    assert_array_almost_equal(row_totals, [1.0] * len(train_data))

    # The one-step TfidfVectorizer must match the two-step pipeline above
    # (term count vectorizer followed by tfidf transformer).
    tv = TfidfVectorizer()
    fitted_tv = tv.fit(train_data)
    tfidf2 = fitted_tv.transform(train_data)
    assert_array_almost_equal(tfidf, tfidf2)

    # ... and also on the held-out sample.
    tfidf_test2 = tv.transform(test_data)
    assert_array_almost_equal(tfidf_test, tfidf_test2)
def test_vectorizer():
    """Check CountVectorizer, TfidfTransformer and Vectorizer on raw documents.

    Every entry of ``ALL_FOOD_DOCS`` except the last is used for training and
    the last one is the test sample.

    Returns
    -------
    list
        ``[tfidf, t1.idf_]``: the L1-normalized TF-IDF array of the training
        data and the fitted IDF vector, kept for comparison by the caller.
    """

    def _as_csr(matrix):
        # Convert to CSR only when the object offers the conversion.
        if hasattr(matrix, 'tocsr'):
            return matrix.tocsr()
        return matrix

    # Values handed back to the caller.
    collected = []

    # The raw training documents are supplied as an iterator.
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # Vectorizer created without a vocabulary.
    v1 = CountVectorizer(max_df=0.5)
    counts_train = _as_csr(v1.fit_transform(train_data))
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # Build a second vectorizer, v2, with the vocabulary fitted by v1.
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # Both vectorizers must produce the same counts for the test sample.
    # ``counts_test`` from the last iteration (v2) is reused further down.
    for vectorizer in (v1, v2):
        counts_test = _as_csr(vectorizer.transform(test_data))
        vocabulary = vectorizer.vocabulary

        # Terms that occur once in the test sample.
        for term in (u"salad", u"tomato", u"water"):
            assert_equal(counts_test[0, vocabulary[term]], 1)

        # Stop word coming from the fixed list.
        assert_false(u"the" in vocabulary)

        # Stop word detected through document-frequency thresholding: words
        # that are frequent across the whole corpus are likely to be
        # uninformative (real stop words or extraction artifacts).
        assert_false(u"copyright" in vocabulary)

        # Terms of the vocabulary that are absent from the test sample.
        for term in (u"coke", u"burger", u"beer", u"pizza"):
            assert_equal(counts_test[0, vocabulary[term]], 0)

    # TF-IDF fitted on, and applied to, the training counts.
    t1 = TfidfTransformer(norm='l1')
    fitted_t1 = t1.fit(counts_train)
    tfidf = toarray(fitted_t1.transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    collected.append(tfidf)
    collected.append(t1.idf_)

    # The fitted transformer applied to unseen counts.
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # Term frequencies only: no IDF vector is stored.
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    fitted_t2 = t2.fit(counts_train)
    tf = toarray(fitted_t2.transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one.
    row_totals = np.sum(tf, axis=1)
    assert_array_almost_equal(row_totals, [1.0] * n_train)

    # The one-step Vectorizer must match the two-step pipeline above (term
    # count vectorizer followed by tfidf transformer).  A fresh iterator is
    # created because the first one was already given to ``v1``.
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # ... and also on the test sample.
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    return collected
def test_vectorizer():
    """Check CountVectorizer, TfidfTransformer and Vectorizer on raw documents.

    Every entry of ``ALL_FOOD_DOCS`` except the last is used for training and
    the last one is the test sample.  The test ends by checking that a
    vectorizer built with ``vocabulary=None`` raises ``ValueError`` from
    ``transform``.
    """

    def _as_csr(matrix):
        # Convert to CSR only when the object offers the conversion.
        if hasattr(matrix, 'tocsr'):
            return matrix.tocsr()
        return matrix

    # The raw training documents are supplied as an iterator.
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # Vectorizer created without a vocabulary.
    v1 = CountVectorizer(max_df=0.5)
    counts_train = _as_csr(v1.fit_transform(train_data))
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # Build a second vectorizer, v2, with the vocabulary fitted by v1.
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # Both vectorizers must produce the same counts for the test sample.
    # ``counts_test`` from the last iteration (v2) is reused further down.
    for vectorizer in (v1, v2):
        counts_test = _as_csr(vectorizer.transform(test_data))
        vocabulary = vectorizer.vocabulary

        # Terms that occur once in the test sample.
        for term in (u"salad", u"tomato", u"water"):
            assert_equal(counts_test[0, vocabulary[term]], 1)

        # Stop word coming from the fixed list.
        assert_false(u"the" in vocabulary)

        # Stop word detected through document-frequency thresholding: words
        # that are frequent across the whole corpus are likely to be
        # uninformative (real stop words or extraction artifacts).
        assert_false(u"copyright" in vocabulary)

        # Terms of the vocabulary that are absent from the test sample.
        for term in (u"coke", u"burger", u"beer", u"pizza"):
            assert_equal(counts_test[0, vocabulary[term]], 0)

    # TF-IDF fitted on, and applied to, the training counts.
    t1 = TfidfTransformer(norm='l1')
    fitted_t1 = t1.fit(counts_train)
    tfidf = toarray(fitted_t1.transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # The fitted transformer applied to unseen counts.
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # Term frequencies only: no IDF vector is stored.
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    fitted_t2 = t2.fit(counts_train)
    tf = toarray(fitted_t2.transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one.
    row_totals = np.sum(tf, axis=1)
    assert_array_almost_equal(row_totals, [1.0] * n_train)

    # The one-step Vectorizer must match the two-step pipeline above (term
    # count vectorizer followed by tfidf transformer).  A fresh iterator is
    # created because the first one was already given to ``v1``.
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # ... and also on the test sample.
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # A vectorizer without any vocabulary must refuse to transform.
    # NOTE(review): ``train_data`` is the iterator already passed to
    # ``tv.fit_transform`` above, so it is presumably exhausted here.
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)