from scikits.learn.metrics import classification_report


def _read_file(path):
    """Return the entire content of the file at ``path``.

    The handle is closed explicitly rather than left to the garbage
    collector, so streaming many documents through the vectorizer does not
    keep file descriptors open longer than needed.
    """
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()


# NOTE: every print below uses the single-argument ``print(...)`` form, which
# produces the same output under Python 2 and Python 3.

# The dataset location is read from the environment; without it nothing can
# be loaded, so print the module docstring (expected to hold the setup
# instructions the message refers to) and stop.
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("Please follow those instructions to get started:")
    print(__doc__)
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

# Turn the raw documents into a feature matrix; the files are read lazily,
# one at a time, while the vectorizer consumes the generator.
print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(
    _read_file(f) for f in news_train.filenames)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

# Hyper-parameters for the classifier built further down in the script.
print("Training a linear classifier...")
parameters = {
    'loss': 'hinge',
    'penalty': 'l2',
    'n_iter': 50,
    'alpha': 0.00001,
    'fit_intercept': True,
}
def _read_file(path):
    """Return the entire content of the file at ``path``.

    The handle is closed explicitly rather than left to the garbage
    collector, so streaming many documents through the vectorizer does not
    keep file descriptors open longer than needed.
    """
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()


# NOTE: every print below uses the single-argument ``print(...)`` form, which
# produces the same output under Python 2 and Python 3.

# The dataset location is read from the environment; without it nothing can
# be loaded, so print the module docstring and stop.
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("Please follow those instructions to get started:")
    print(__doc__)
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

# Turn the raw documents into a feature matrix; the files are read lazily,
# one at a time, while the vectorizer consumes the generator.
print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(
    _read_file(f) for f in news_train.filenames)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

# NOTE(review): the message below announces a "hinge loss" while 'loss' is
# set to 'l2'; the classifier is constructed outside this block, so confirm
# which loss it really applies before rewording the message.
print("Training a linear SVM (hinge loss and L2 regularizer)...")
parameters = {
    'loss': 'l2',
    'penalty': 'l2',
    'C': 10,
    'dual': False,
    'eps': 1e-4,
}
# "%s" formatting yields the same "parameters: {...}" line that the
# two-argument print statement produced.
print("parameters: %s" % (parameters,))
def _read_file(path):
    """Return the entire content of the file at ``path``.

    The handle is closed explicitly rather than left to the garbage
    collector, so streaming many documents through the vectorizer does not
    keep file descriptors open longer than needed.
    """
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()


# NOTE: every print below uses the single-argument ``print(...)`` form, which
# produces the same output under Python 2 and Python 3; ``print("")`` emits
# the same blank line as a bare ``print`` statement.

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, rng=42)
print("%d documents (training set)" % len(data_train.filenames))
print("%d documents (testing set)" % len(data_test.filenames))
print("%d categories" % len(data_train.target_names))
print("")

# split a training set and a test set
filenames_train, filenames_test = data_train.filenames, data_test.filenames
y_train, y_test = data_train.target, data_test.target

# Fit the vectorizer on the training documents only; the files are read
# lazily, one at a time, while the vectorizer consumes the generator.
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform(_read_file(f) for f in filenames_train)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")

# Reuse the already fitted vectorizer (transform, not fit_transform) for the
# test documents.
print("Extracting features from the test dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(_read_file(f) for f in filenames_test)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)
print("")


################################################################################
# Benchmark classifiers