if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
t0 = time()
news_test = load_mlcomp('20news-18828', 'test')
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)

print "Extracting features from the dataset using the same vectorizer"
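# For context: the fragment above is an excerpt and does not show its
# imports. A plausible preamble is sketched below; the exact module
# paths for load_mlcomp and Vectorizer are an assumption based on the
# scikit-learn release this example targets, not something shown in the
# source.
import os
import sys
from time import time

import scipy.sparse as sp

from sklearn.datasets import load_mlcomp                # assumed location
from sklearn.feature_extraction.text import Vectorizer  # assumed location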
print 'data loaded'

categories = data_train.target_names    # for the case categories == None

print "%d documents (training set)" % len(data_train.data)
print "%d documents (testing set)" % len(data_test.data)
print "%d categories" % len(categories)
print

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(data_train.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform(data_test.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print

if opts.select_chi2:
    print ("Extracting %d best features by a chi-squared test" %
           opts.select_chi2)
    t0 = time()
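    # The branch is truncated here after starting the timer. A minimal
    # sketch of how such a chi-squared selection step typically
    # completes, assuming SelectKBest and chi2 are imported from
    # sklearn.feature_selection (an assumption; the original
    # continuation is not shown):
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print "done in %fs" % (time() - t0)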
def test_vectorizer():
    # results to be compared
    res = []

    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()
        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are
        # likely to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))
    res.append(tfidf)
    res.append(t1.idf_)

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    return res
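# To make the L1-normalization assertion above concrete: dividing each
# row of a count matrix by its row sum yields term frequencies whose
# rows sum to one. A tiny standalone check on a hypothetical 2-document,
# 3-term count matrix (illustration only, not part of the test suite):
import numpy as np

counts = np.array([[2., 1., 0.],
                   [0., 3., 1.]])
tf = counts / counts.sum(axis=1)[:, np.newaxis]
print tf.sum(axis=1)  # -> [ 1.  1.]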
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)

print "%d documents (training set)" % len(data_train.filenames)
print "%d documents (testing set)" % len(data_test.filenames)
print "%d categories" % len(data_train.target_names)
print

# split a training set and a test set
filenames_train, filenames_test = data_train.filenames, data_test.filenames
y_train, y_test = data_train.target, data_test.target

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read() for f in filenames_train))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
print

print "Extracting features from the test dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in filenames_test))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape
print


################################################################################
# Benchmark classifiers
def benchmark(clf):
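    # The original body of benchmark() is truncated here; what follows
    # is a minimal sketch of the usual shape of such a helper, assuming
    # sklearn.metrics is available as `metrics` (an assumption, not the
    # original code).
    print 80 * '_'
    print clf
    t0 = time()
    clf.fit(X_train, y_train)
    print "train time: %0.3fs" % (time() - t0)

    t0 = time()
    pred = clf.predict(X_test)
    print "test time:  %0.3fs" % (time() - t0)

    score = metrics.f1_score(y_test, pred)
    print "f1-score:   %0.3f" % score
    return score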
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()
        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are
        # likely to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
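# The tests above call a small `toarray` helper to densify matrices that
# may be either scipy.sparse or already dense. Its definition is not
# shown; a plausible sketch (an assumption, not the project's exact
# code):
def toarray(a):
    # scipy sparse matrices expose .toarray(); dense arrays pass through
    if hasattr(a, 'toarray'):
        a = a.toarray()
    return a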
filenames = np.concatenate((data_train.filenames, data_test.filenames))
target_names = set(data_train.target_names + data_test.target_names)

print "%d documents" % len(filenames)
print "%d categories" % len(target_names)
print

# merge the labels of the training and test sets
labels = np.concatenate((data_train.target, data_test.target))
true_k = np.unique(labels).shape[0]

print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))
X = Normalizer(norm="l2", copy=False).transform(X)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print


###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80
mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000, tol=0.0, n_init=1)
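# The clustering step itself is cut off above. A minimal sketch of how
# it typically proceeds from here (the reporting lines are an
# assumption; fit() is the documented MiniBatchKMeans entry point):
print "Clustering sparse data with %s" % mbkm
t0 = time()
mbkm.fit(X)
print "done in %0.3fs" % (time() - t0)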