# Fit a linear SVM on the training features and report how sparse the learned
# coefficients are.
# NOTE(review): t0 is read below but assigned before this chunk -- presumably
# right before the fit; confirm against the preceding lines.
clf = LinearSVC(**parameters).fit(X_train, y_train)
print("done in %fs" % (time() - t0))
print("Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100))

print("Loading 20 newsgroups test set... ")
# Start the clock *before* loading; the original reset t0 after the load, so
# the reported duration was always ~0s.
t0 = time()
news_test = load_mlcomp('20news-18828', 'test')
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))


def _read_file(path):
    """Return the whole content of the file at *path* and close the handle."""
    with open(path) as handle:
        return handle.read()


print("Extracting features from the dataset using the same vectorizer")
t0 = time()
# The generator keeps the reads lazy: one document is held at a time.
X_test = vectorizer.transform(_read_file(f) for f in news_test.filenames)
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

# NOTE(review): this region contained unresolved merge-conflict markers
# ("<<<<<<< HEAD" / "======="), which are a syntax error. The markers were
# dropped and the visible statements of both sides kept in their original
# order. The incoming side is cut off after its header line in this view, so
# any remaining statements and the closing ">>>>>>>" marker still need to be
# reconciled by hand -- confirm which reporting style should win.
print("precision: %0.3f" % precision(y_test, pred))
print("recall: %0.3f" % recall(y_test, pred))
print("f1_score: %0.3f" % f1_score(y_test, pred))
print("Classification report on test set:")
# split a training set and a test set
filenames_train, filenames_test = data_train.filenames, data_test.filenames
y_train, y_test = data_train.target, data_test.target


def _read_file(path):
    """Return the whole content of the file at *path* and close the handle."""
    with open(path) as handle:
        return handle.read()


print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = Vectorizer()
# The generator keeps the reads lazy; _read_file closes each file as soon as
# it has been read instead of leaving the handle to the garbage collector.
X_train = vectorizer.fit_transform(_read_file(f) for f in filenames_train)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")

print("Extracting features from the test dataset using the same vectorizer")
t0 = time()
# transform (not fit_transform): the vectorizer fitted on the training files
# above is reused for the test files.
X_test = vectorizer.transform(_read_file(f) for f in filenames_test)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)
print("")

################################################################################
# Benchmark classifiers
# NOTE(review): only the beginning of benchmark() is visible in this chunk; the
# statements below are reproduced unchanged and anything that follows the
# train-time report is outside this view.
def benchmark(clf):
    print(80 * '_')
    print("Training: ")
    print(clf)
    # Time only the fit on the module-level training data.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
# Fit an SGD-trained linear classifier on the training features and report how
# sparse the learned coefficients are.
# NOTE(review): t0 is read below but assigned before this chunk -- presumably
# right before the fit; confirm against the preceding lines.
clf = SGDClassifier(**parameters).fit(X_train, y_train)
print("done in %fs" % (time() - t0))
print("Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100))

print("Loading 20 newsgroups test set... ")
# Start the clock *before* loading; the original reset t0 after the load, so
# the reported duration was always ~0s.
t0 = time()
news_test = load_mlcomp('20news-18828', 'test')
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))


def _read_file(path):
    """Return the whole content of the file at *path* and close the handle."""
    with open(path) as handle:
        return handle.read()


print("Extracting features from the dataset using the same vectorizer")
t0 = time()
# The generator keeps the reads lazy: one document is held at a time.
X_test = vectorizer.transform(_read_file(f) for f in news_test.filenames)
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

print("Classification report on test set for classifier:")
print(clf)
print("")
print(classification_report(y_test, pred,
                            class_names=news_test.target_names))

cm = confusion_matrix(y_test, pred)