def test_dense_vectorizer_pipeline_grid_selection():
    """Grid-search a CountVectorizer + LinearSVC pipeline on the toy corpus.

    The grid covers the analyzer's ``max_n`` and the SVC ``loss``.  The two
    held-out documents must be predicted correctly and the best score must
    be exactly 1.0.
    """
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # simulate iterables: the first and last documents are held out
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    steps = [('vect', CountVectorizer()), ('svc', LinearSVC())]
    pipeline = Pipeline(steps)
    parameters = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    fitted = grid_search.fit(list(train_data), y_train)
    pred = fitted.predict(list(test_data))
    assert_array_equal(pred, y_test)

    # On this toy dataset every grid point converges to a 100% accuracy
    # model, so the best score is a tie; the assertion pins the unigram
    # setting (max_n == 1) as the retained one.
    # NOTE(review): the tie-breaking rule lives in GridSearchCV and is not
    # visible here -- confirm it matches this expectation.
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    candidates = [1, 2, 3]
    cross_validation = GridSearchCV(MockClassifier(),
                                    {'foo_param': [1, 2, 3]})

    # make sure it selects the smallest parameter in case of ties
    fitted = cross_validation.fit(X, y)
    assert_equal(fitted.best_estimator.foo_param, 2)

    # grid_scores_ must list the candidates in the order they were given
    for position, foo_value in enumerate(candidates):
        recorded_params = cross_validation.grid_scores_[position][0]
        assert recorded_params == {'foo_param': foo_value}
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    candidates = [1, 2, 3]
    cross_validation = GridSearchCV(MockClassifier(),
                                    {'foo_param': [1, 2, 3]})

    # make sure it selects the smallest parameter in case of ties
    fitted = cross_validation.fit(X, y)
    assert_equal(fitted.best_estimator.foo_param, 2)

    # grid_scores_ must list the candidates in the order they were given
    for position, foo_value in enumerate(candidates):
        recorded_params = cross_validation.grid_scores_[position][0]
        assert recorded_params == {'foo_param': foo_value}
def do_grid_search(X, Y, gs_params):
    """
    Grid-search a one-step LogisticRegression pipeline on (X, Y).

    Parameters
    ----------
    X, Y : training data and targets, passed unchanged to GridSearchCV.fit.
    gs_params : parameter grid handed to GridSearchCV.  The only pipeline
        step is named 'logreg' (presumably keys look like
        'logreg__<param>' -- verify against callers).

    Returns
    -------
    The parameter setting of the grid_scores_ entry with the highest score.
    """
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    gs = gs.fit(X, Y)

    # Index instead of unpacking so that entries carrying more than
    # (parameters, score) do not raise a ValueError.
    best_entry = max(gs.grid_scores_, key=lambda entry: entry[1])
    best_parameters, score = best_entry[0], best_entry[1]

    # Lazy %-style arguments: same message, formatted only if emitted.
    logger.info("best_parameters: %s", best_parameters)
    logger.info("expected score: %s", score)
    return best_parameters
def do_grid_search(X, Y, gs_params):
    """
    Grid-search a one-step LogisticRegression pipeline on (X, Y).

    Parameters
    ----------
    X, Y : training data and targets, passed unchanged to GridSearchCV.fit.
    gs_params : parameter grid handed to GridSearchCV.  The only pipeline
        step is named 'logreg' (presumably keys look like
        'logreg__<param>' -- verify against callers).

    Returns
    -------
    The parameter setting of the grid_scores_ entry with the highest score.
    """
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    gs = gs.fit(X, Y)

    # Index instead of unpacking so that entries carrying more than
    # (parameters, score) do not raise a ValueError.
    best_entry = max(gs.grid_scores_, key=lambda entry: entry[1])
    best_parameters, score = best_entry[0], best_entry[1]

    # Lazy %-style arguments: same message, formatted only if emitted.
    logger.info("best_parameters: %s", best_parameters)
    logger.info("expected score: %s", score)
    return best_parameters
def test_grid_search_error():
    """Test that grid search will capture errors on data with different
    length"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)
    search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})

    # Only the first 180 rows of X_ are passed, against the unsliced y_.
    shortened_X = X_[:180]
    assert_raises(ValueError, search.fit, shortened_X, y_)
def train_svm_crossvalidated(X, y, tuned_parameters=None):
    """
    Grid-search an SVC with stratified K-fold cross validation.

    Parameters
    ----------
    X : observations; ``np.size(X, 0)`` is taken as the number of samples.
    y : true labels for X.
    tuned_parameters : dict or None
        Parameter grid handed to GridSearchCV.  When None, an RBF grid is
        built with gamma = 2**-15 .. 2**2 and C = 2**-5 .. 2**14.

    Returns
    -------
    clf : the fitted GridSearchCV object (scored with recall_score).

    Note: the recall printed below is computed by predicting on the
    training data X itself, not on held-out data.
    """
    if tuned_parameters is None:
        # Built per call so the default grid is never shared between calls.
        tuned_parameters = {
            'kernel': ['rbf'],
            'gamma': 2.0 ** np.arange(-15, 3),
            'C': 2.0 ** np.arange(-5, 15),
        }

    # Number of folds is chosen by a helper from the number of samples.
    k_fold = _size_dependent_k_split(np.size(X, 0))
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=recall_score)
    clf.fit(X, y, cv=StratifiedKFold(y, k_fold))

    y_true, y_pred = y, clf.predict(X)
    print("Tuned with optimal value: %0.3f" % recall_score(y_true, y_pred))
    return clf
def ParameterGridSearch(self, callback=None, nValidation=5):
    '''
    Grid search for the best C and gamma parameters for the RBF Kernel.

    Each (C, gamma) pair is scored with precision_score using stratified
    nValidation-fold cross-validation of self.svm_train_values /
    self.svm_train_labels.

    callback is accepted but not used by this method.

    The search currently runs with a single worker (see the XXX note
    below), although the work is parallelizable.

    Returns the tuple (bestC, bestGamma) taken from the grid_scores_ entry
    with the highest score.
    '''
    from scikits.learn.grid_search import GridSearchCV
    from scikits.learn.metrics import precision_score
    from scikits.learn.cross_val import StratifiedKFold

    #
    # XXX: program crashes with >1 worker when running cpa.py
    #      No crash when running from classifier.py. Why?
    #      Until resolved, multiprocessing.cpu_count() is not used.
    #
    n_workers = 1

    # Define the parameter ranges for C and gamma and perform a grid search
    # for the optimal setting
    parameters = {
        'C': 2 ** np.arange(-5, 11, 2, dtype=float),
        'gamma': 2 ** np.arange(3, -11, -2, dtype=float),
    }
    clf = GridSearchCV(SVC(kernel='rbf'), parameters, n_jobs=n_workers,
                       score_func=precision_score)
    clf.fit(self.svm_train_values, self.svm_train_labels,
            cv=StratifiedKFold(self.svm_train_labels, nValidation))

    # Pick the best parameters as the ones with the maximum
    # cross-validation rate
    bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
    bestC = bestParameters[0]['C']
    bestGamma = bestParameters[0]['gamma']
    # Lazy %-style arguments: same message, formatted only if emitted.
    logging.info('Optimal values: C=%s g=%s rate=%s',
                 bestC, bestGamma, bestParameters[1])
    return bestC, bestGamma
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run: fit on the first 180 samples, predict the remainder.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    dense_search.fit(X_[:180], y_[:180])
    y_pred = dense_search.predict(X_[180:])
    C = dense_search.best_estimator.C

    # Sparse run: same data as a CSR matrix, sparse estimator.
    X_sparse = sp.csr_matrix(X_)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]})
    sparse_search.fit(X_sparse[:180], y_[:180])
    y_pred2 = sparse_search.predict(X_sparse[180:])
    C2 = sparse_search.best_estimator.C

    # At least 90% of the predictions must agree, and the same C must win.
    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
def do_grid_search(X, Y, gs_params=None):
    """
    Grid-search a one-step SVC pipeline (step name 'rbfsvm') on (X, Y).

    Parameters
    ----------
    X, Y : training data and targets, passed unchanged to GridSearchCV.fit.
    gs_params : parameter grid keyed as 'rbfsvm__<param>'.  When omitted
        or empty, a default grid over 'rbfsvm__C' and 'rbfsvm__gamma' is
        used.

    Returns
    -------
    The parameter setting of the grid_scores_ entry with the highest score.
    """
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    gs = gs.fit(X, Y)

    # Index instead of unpacking so that entries carrying more than
    # (parameters, score) do not raise a ValueError.
    best_entry = max(gs.grid_scores_, key=lambda entry: entry[1])
    best_parameters, score = best_entry[0], best_entry[1]

    # Lazy %-style arguments: same message, formatted only if emitted.
    logger.info("best_parameters: %s", best_parameters)
    logger.info("expected score: %s", score)
    return best_parameters
def test_grid_search_sparse_score_func():
    """Grid search scored with f1_score must agree on dense and sparse
    input."""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run: fit on the first 180 samples, predict the remainder.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    dense_search.fit(X_[:180], y_[:180], refit=False)
    y_pred = dense_search.predict(X_[180:])
    C = dense_search.best_estimator.C

    # Sparse run: same data as a CSR matrix, sparse estimator.
    X_sparse = sp.csr_matrix(X_)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]},
                                 score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    sparse_search.fit(X_sparse[:180], y_[:180], refit=False)
    y_pred2 = sparse_search.predict(X_sparse[180:])
    C2 = sparse_search.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
def do_grid_search(X, Y, gs_params=None):
    """
    Grid-search a one-step SVC pipeline (step name 'rbfsvm') on (X, Y).

    Parameters
    ----------
    X, Y : training data and targets, passed unchanged to GridSearchCV.fit.
    gs_params : parameter grid keyed as 'rbfsvm__<param>'.  When omitted
        or empty, a default grid over 'rbfsvm__C' and 'rbfsvm__gamma' is
        used.

    Returns
    -------
    The parameter setting of the grid_scores_ entry with the highest score.
    """
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    gs = gs.fit(X, Y)

    # Index instead of unpacking so that entries carrying more than
    # (parameters, score) do not raise a ValueError.
    best_entry = max(gs.grid_scores_, key=lambda entry: entry[1])
    best_parameters, score = best_entry[0], best_entry[1]

    # Lazy %-style arguments: same message, formatted only if emitted.
    logger.info("best_parameters: %s", best_parameters)
    logger.info("expected score: %s", score)
    return best_parameters
def ParameterGridSearch(self, callback = None, nValidation = 5):
    '''
    Grid search for the best C and gamma parameters for the RBF Kernel.

    Each (C, gamma) pair is scored with precision_score using stratified
    nValidation-fold cross-validation of self.svm_train_values /
    self.svm_train_labels.

    callback is accepted but not used by this method.

    The search currently runs with a single worker (see the XXX note
    below), although the work is parallelizable.

    Returns the tuple (bestC, bestGamma) taken from the grid_scores_ entry
    with the highest score.
    '''
    from scikits.learn.grid_search import GridSearchCV
    from scikits.learn.metrics import precision_score
    from scikits.learn.cross_val import StratifiedKFold

    #
    # XXX: program crashes with >1 worker when running cpa.py
    #      No crash when running from classifier.py. Why?
    #      Until resolved, multiprocessing.cpu_count() is not used.
    #
    n_workers = 1

    # Define the parameter ranges for C and gamma and perform a grid search
    # for the optimal setting
    parameters = {
        'C': 2 ** np.arange(-5, 11, 2, dtype=float),
        'gamma': 2 ** np.arange(3, -11, -2, dtype=float),
    }
    clf = GridSearchCV(SVC(kernel='rbf'), parameters, n_jobs=n_workers,
                       score_func=precision_score)
    clf.fit(self.svm_train_values, self.svm_train_labels,
            cv=StratifiedKFold(self.svm_train_labels, nValidation))

    # Pick the best parameters as the ones with the maximum
    # cross-validation rate
    bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
    bestC = bestParameters[0]['C']
    bestGamma = bestParameters[0]['gamma']
    # Lazy %-style arguments: same message, formatted only if emitted.
    logging.info('Optimal values: C=%s g=%s rate=%s',
                 bestC, bestGamma, bestParameters[1])
    return bestC, bestGamma
def test_dense_vectorizer_pipeline_grid_selection():
    """Grid-search a CountVectorizer + DenseLinearSVC pipeline on the toy
    corpus.

    The grid covers two WordNGramAnalyzer settings (max_n 1 and 2) and the
    SVC ``loss``.  The two held-out documents must be predicted correctly
    and the best score must be exactly 1.0.
    """
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # simulate iterables: the first and last documents are held out
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    steps = [('vect', CountVectorizer()), ('svc', DenseLinearSVC())]
    pipeline = Pipeline(steps)
    parameters = {
        'vect__analyzer': (WordNGramAnalyzer(min_n=1, max_n=1),
                           WordNGramAnalyzer(min_n=1, max_n=2)),
        'svc__loss': ('l1', 'l2'),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    fitted = grid_search.fit(list(train_data), y_train)
    pred = fitted.predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset bigram representation which is used in the last
    # of the grid_search is considered the best estimator since they all
    # converge to 100% accuracy models
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 2)
def test_grid_search_sparse_score_func():
    """Grid search scored with f1_score must agree on dense and sparse
    input."""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run: fit on the first 180 samples, predict the remainder.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                score_func=f1_score)
    dense_search.fit(X_[:180], y_[:180])
    y_pred = dense_search.predict(X_[180:])
    C = dense_search.best_estimator.C

    # Sparse run: same data as a CSR matrix, sparse estimator.
    X_sparse = sp.csr_matrix(X_)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]},
                                 score_func=f1_score)
    sparse_search.fit(X_sparse[:180], y_[:180])
    y_pred2 = sparse_search.predict(X_sparse[180:])
    C2 = sparse_search.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
def test_grid_search_sparse():
    """Test that grid search works with both dense and sparse matrices"""
    X_, y_ = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run: fit on the first 180 samples, predict the remainder.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    dense_search.fit(X_[:180], y_[:180])
    y_pred = dense_search.predict(X_[180:])
    C = dense_search.best_estimator.C

    # Sparse run: same data as a CSR matrix, sparse estimator.
    X_sparse = sp.csr_matrix(X_)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]})
    sparse_search.fit(X_sparse[:180], y_[:180])
    y_pred2 = sparse_search.predict(X_sparse[180:])
    C2 = sparse_search.best_estimator.C

    # At least 90% of the predictions must agree, and the same C must win.
    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
def test_grid_search_sparse_score_func():
    """Grid search scored with f1_score must agree on dense and sparse
    input."""
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)

    # Dense run: fit on the first 180 samples, predict the remainder.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    dense_search.set_params(refit=False).fit(X_[:180], y_[:180])
    y_pred = dense_search.predict(X_[180:])
    C = dense_search.best_estimator.C

    # Sparse run: same data as a CSR matrix, sparse estimator.
    X_sparse = sp.csr_matrix(X_)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]},
                                 score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    sparse_search.set_params(refit=False).fit(X_sparse[:180], y_[:180])
    y_pred2 = sparse_search.predict(X_sparse[180:])
    C2 = sparse_search.best_estimator.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
moto_vq_train, plane_vq_train = [np.load(file) for file in ['moto_vq_train.npy','plane_vq_train.npy']] labels = [0]* moto_vq_train.shape[0] + [1]* plane_vq_train.shape[0] ############################################################################### # Train SVM param_grid = { 'C': [1, 5, 10, 50, 100], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf'), param_grid, fit_params={'class_weight': 'auto'}) #clf = SVC(kernel='rbf') #clf = SVC(kernel='linear') clf.fit(np.vstack([moto_vq_train,plane_vq_train]), np.array(labels)) print "Best estimator found by grid search:" #print clf.best_estimator ############################################################################### # Evaluation moto_vq_eval, plane_vq_eval = [np.load(file) for file
# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings: feature extraction followed by a linear SVC
pipeline = Pipeline([
    ('extr', InfinitivesExtractor()),
    ('svc', LinearSVC(multi_class=True)),
])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9),
}
grid_search = GridSearchCV(pipeline, parameters)

print("Loading data...")
X, y = load_data()

print("Searching for the best model...")
t0 = time()
grid_search.fit(X, y)
print("Done in %0.3f" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score)

# Report on the best estimator; note the predictions are made on the same
# X that was used for the search.
clf = grid_search.best_estimator
print(clf)
yp = clf.predict(X)
print(classification_report(y, yp, targets, target_names))

#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")
parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    # 'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__n_iter': (10, 50, 80),
}

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)

# cross-validation doesn't work if the length of the data is not known,
# hence use lists instead of iterators.  Each file is read inside a `with`
# block so its handle is closed immediately instead of being leaked.
text_docs = []
for filename in data.filenames:
    with open(filename) as doc_file:
        text_docs.append(doc_file.read())

print("Performing grid search...")
print("pipeline: %s" % ([name for name, _ in pipeline.steps],))
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(text_docs, data.target)
print("done in %0.3fs" % (time() - t0))
print("")

print("Best score: %0.3f" % grid_search.best_score)
# split the dataset in two equal parts respecting label proportions;
# only the first (train, test) pair of the 2-fold split is used.
# The builtin next() replaces the Python 2-only iterator .next() method.
train, test = next(iter(StratifiedKFold(y, 2)))

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

# Run one grid search per scoring function and report on the held-out half.
for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print("Classification report for the best estimator: ")
    print(clf.best_estimator)
    print("Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred)))
    print(classification_report(y_true, y_pred))
    print("Grid scores:")
    pprint(clf.grid_scores_)
    print("")

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
def test_GridSearch():
    # The grid search over foo_param in [1, 2, 3] is expected to retain 2.
    param_grid = {'foo_param': [1, 2, 3]}
    cross_validation = GridSearchCV(MockClassifier(), param_grid)
    fitted = cross_validation.fit(X, y)
    assert_equal(fitted.best_estimator.foo_param, 2)
print("Extracting the top %d eigenfaces" % n_components)
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

# View each principal component as a 64x64 image
eigenfaces = pca.components_.T.reshape((n_components, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a SVM classification model
print("Fitting the classifier to the training set")
param_grid = {
    "C": [1, 5, 10, 100],
    "gamma": [0.0001, 0.001, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel="rbf"), param_grid,
                   fit_params={"class_weight": "auto"}, n_jobs=-1)
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)

# Quantitative evaluation of the model quality on the test set
y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred, labels=selected_target,
                            target_names=target_names[selected_target]))
print(confusion_matrix(y_test, y_pred, labels=selected_target))

# Qualitative evaluation of the predictions using matplotlib
from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings: feature extraction followed by a linear SVC
pipeline = Pipeline([
    ('extr', InfinitivesExtractor()),
    ('svc', LinearSVC(multi_class=True)),
])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9),
}
grid_search = GridSearchCV(pipeline, parameters)

print("Loading data...")
X, y = load_data()

print("Searching for the best model...")
t0 = time()
grid_search.fit(X, y)
print("Done in %0.3f" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score)

# Report on the best estimator; note the predictions are made on the same
# X that was used for the search.
clf = grid_search.best_estimator
print(clf)
yp = clf.predict(X)
print(classification_report(y, yp, targets, target_names))

#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")
###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
parameters = {'ward__n_clusters': [10, 20, 30]}

# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters

# Take the coefficients of the last step (the ridge) and map them back
# through the first step (the agglomeration).
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
parameters = {'anova__percentile': [5, 10, 20]}

# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
clf.fit(X, y, cv=cv)  # set the best parameters

# Same extraction as above: ridge coefficients mapped back through the
# first step (here the feature selection).
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)

from scikits.learn.grid_search import GridSearchCV

# Candidate values for the two searched pipeline parameters
n_components = [10, 15, 20, 30, 40, 50, 64]
Cs = np.logspace(-4, 4, 16)

estimator = GridSearchCV(
    pipe,
    dict(pca__n_components=n_components, logistic__C=Cs),
    n_jobs=-1,
)
estimator.fit(X_digits, y_digits)
from scikits.learn import datasets
from scikits.learn.metrics import zero_one

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [
    {"kernel": ("rbf",), "gamma": [1e-3, 1e-4]},
    {"kernel": ("linear",)},
]

clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2)

# Outer 2-fold split; within each training half the grid search runs its
# own stratified 5-fold cross-validation.
y_pred = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    cv = StratifiedKFold(y[train], 5)
    fitted = clf.fit(X[train], y[train], cv=cv)
    y_pred.append(fitted.predict(X[test]))
    y_true.append(y[test])

y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

classif_rate = np.mean(y_pred == y_true) * 100
print("Classification rate : %f" % classif_rate)
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    param_grid = {'foo_param': [1, 2, 3]}
    cross_validation = GridSearchCV(MockClassifier(), param_grid)
    fitted = cross_validation.fit(X, y)
    assert_equal(fitted.best_estimator.foo_param, 2)
from scikits.learn.grid_search import GridSearchCV
from scikits.learn import datasets
from scikits.learn.metrics import zero_one

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ('rbf', ), 'gamma': [1e-3, 1e-4]},
    {'kernel': ('linear', )},
]

clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2)

# Outer 2-fold split; within each training half the grid search runs its
# own stratified 5-fold cross-validation.  Per-fold results are collected
# in lists and concatenated once: np.append onto an empty list would copy
# the accumulated data on every fold and upcast the labels to float.
y_pred = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_pred.append(clf.predict(X[test]))
    y_true.append(y[test])

y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

classif_rate = np.mean(y_pred == y_true) * 100
print("Classification rate : %f" % classif_rate)
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline:
# counts -> tf-idf weighting -> linear SVC
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline on the training set using grid search for the parameters;
# only the first 200 training documents are used for the search itself.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names))

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline:
# counts -> tf-idf weighting -> linear SVC
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Fit the pipeline on the training set using grid search for the parameters;
# only the first 200 training documents are used for the search itself.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names))

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

################################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator)

################################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
n_components = 150

print("Extracting the top %d eigenfaces" % n_components)
pca_sl = RandomizedPCA(n_components=n_components, whiten=True)
pca_sl.fit(X_train)
#components, mean = pca.pca(X_train, n_components)

#print "PCA components shape", pca.components_.T.shape
#eigenfaces = pca.components_.T.reshape((-1, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca_sl.transform(X_train)
#X_train_pca = pca.transform(X_train, mean, components)

################################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

# Run one grid search per scoring function and report on the test indices.
for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print("Classification report for the best estimator: ")
    print(clf.best_estimator)
    print("Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred)))
    print(classification_report(y_true, y_pred))
    print("Grid scores:")
    pprint(clf.grid_scores_)
    print("")

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
moto_vq_train, plane_vq_train = [ np.load(file) for file in ['moto_vq_train.npy', 'plane_vq_train.npy'] ] labels = [0] * moto_vq_train.shape[0] + [1] * plane_vq_train.shape[0] ############################################################################### # Train SVM param_grid = { 'C': [1, 5, 10, 50, 100], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf'), param_grid, fit_params={'class_weight': 'auto'}) #clf = SVC(kernel='rbf') #clf = SVC(kernel='linear') clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels)) print "Best estimator found by grid search:" #print clf.best_estimator ############################################################################### # Evaluation moto_vq_eval, plane_vq_eval = [ np.load(file) for file in ['moto_vq_eval.npy', 'plane_vq_eval.npy']
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    param_grid = {'foo_param': [1, 2, 3]}
    cross_validation = GridSearchCV(MockClassifier(), param_grid)
    fitted = cross_validation.fit(X, y)
    assert_equal(fitted.best_estimator.foo_param, 2)