def test_grid_search_sparse_score_func():
    """Dense and sparse grid searches scored with f1_score must agree.

    A LinearSVC search on the dense data and a SparseLinearSVC search on the
    CSR version of the same data are expected to produce identical
    predictions on the held-out rows and to select the same ``C``.
    """
    n_train = 180
    data, target = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run: fit on the first 180 rows, predict the remaining ones.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                score_func=f1_score)
    dense_search.fit(data[:n_train], target[:n_train])
    dense_pred = dense_search.predict(data[n_train:])
    dense_C = dense_search.best_estimator.C

    # Sparse run: same split, with the data converted to CSR format.
    sparse_data = sp.csr_matrix(data)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]},
                                 score_func=f1_score)
    sparse_search.fit(sparse_data[:n_train], target[:n_train])
    sparse_pred = sparse_search.predict(sparse_data[n_train:])
    sparse_C = sparse_search.best_estimator.C

    assert_array_equal(dense_pred, sparse_pred)
    assert_equal(dense_C, sparse_C)
def test_grid_search_sparse():
    """Check that grid search gives consistent results on dense and sparse input.

    The dense (LinearSVC) and sparse (SparseLinearSVC) searches must agree on
    at least 90% of the held-out predictions and pick the same ``C``.
    """
    n_train = 180
    data, target = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run: fit on the first 180 rows, predict the remaining ones.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    dense_search.fit(data[:n_train], target[:n_train])
    dense_pred = dense_search.predict(data[n_train:])
    dense_C = dense_search.best_estimator.C

    # Sparse run: same split, with the data converted to CSR format.
    sparse_data = sp.csr_matrix(data)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]})
    sparse_search.fit(sparse_data[:n_train], target[:n_train])
    sparse_pred = sparse_search.predict(sparse_data[n_train:])
    sparse_C = sparse_search.best_estimator.C

    # Predictions only need to match on most samples, not all of them.
    agreement = np.mean(dense_pred == sparse_pred)
    assert agreement >= .9
    assert_equal(dense_C, sparse_C)
def test_grid_search_sparse():
    """Check that grid search gives consistent results on dense and sparse input.

    The dense (LinearSVC) and sparse (SparseLinearSVC) searches must agree on
    at least 90% of the held-out predictions and pick the same ``C``.
    """
    split = 180
    features, labels = test_dataset_classif(n_samples=200, n_features=100,
                                            seed=0)

    # Search over C on the dense representation.
    search_dense = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    search_dense.fit(features[:split], labels[:split])
    pred_dense = search_dense.predict(features[split:])
    best_C_dense = search_dense.best_estimator.C

    # Repeat the search on the CSR representation of the same data.
    features_csr = sp.csr_matrix(features)
    search_sparse = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]})
    search_sparse.fit(features_csr[:split], labels[:split])
    pred_sparse = search_sparse.predict(features_csr[split:])
    best_C_sparse = search_sparse.best_estimator.C

    # Predictions only need to match on most samples, not all of them.
    match_rate = np.mean(pred_dense == pred_sparse)
    assert match_rate >= .9
    assert_equal(best_C_dense, best_C_sparse)
def test_grid_search_sparse_score_func():
    """Dense and sparse grid searches scored with f1_score must agree.

    Data comes from ``make_classification``; both searches are run with
    ``refit=False`` and must give identical held-out predictions and the
    same selected ``C``.
    """
    n_train = 180
    data, target = make_classification(n_samples=200, n_features=100,
                                       random_state=0)

    # Dense run on the first 180 rows.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    dense_search.set_params(refit=False).fit(data[:n_train], target[:n_train])
    dense_pred = dense_search.predict(data[n_train:])
    dense_C = dense_search.best_estimator.C

    # Sparse run on the CSR version of the same data.
    sparse_data = sp.csr_matrix(data)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]},
                                 score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    sparse_search.set_params(refit=False).fit(sparse_data[:n_train],
                                              target[:n_train])
    sparse_pred = sparse_search.predict(sparse_data[n_train:])
    sparse_C = sparse_search.best_estimator.C

    assert_array_equal(dense_pred, sparse_pred)
    assert_equal(dense_C, sparse_C)
def test_grid_search_sparse_score_func():
    """Dense and sparse grid searches scored with f1_score must agree.

    Both searches pass ``refit=False`` to ``fit`` and must give identical
    held-out predictions and the same selected ``C``.
    """
    n_train = 180
    data, target = test_dataset_classif(n_samples=200, n_features=100, seed=0)

    # Dense run on the first 180 rows.
    dense_search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]},
                                score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    dense_search.fit(data[:n_train], target[:n_train], refit=False)
    dense_pred = dense_search.predict(data[n_train:])
    dense_C = dense_search.best_estimator.C

    # Sparse run on the CSR version of the same data.
    sparse_data = sp.csr_matrix(data)
    sparse_search = GridSearchCV(SparseLinearSVC(), {'C': [0.1, 1.0]},
                                 score_func=f1_score)
    # XXX: set refit to False due to a random bug when True (default)
    sparse_search.fit(sparse_data[:n_train], target[:n_train], refit=False)
    sparse_pred = sparse_search.predict(sparse_data[n_train:])
    sparse_C = sparse_search.best_estimator.C

    assert_array_equal(dense_pred, sparse_pred)
    assert_equal(dense_C, sparse_C)
def train_svm_crossvalidated(X, y, tuned_parameters=None):
    """Grid-search an SVC with stratified K-fold cross validation.

    Parameters
    ----------
    X : array-like
        Observations. The number of rows (``np.size(X, 0)``) is passed to
        ``_size_dependent_k_split`` to choose the number of folds.
    y : array-like
        True labels for the observations in ``X``.
    tuned_parameters : dict or None, optional
        Parameter grid handed to ``GridSearchCV``. When None, an RBF-kernel
        grid is used with ``gamma`` in ``2.0**-15 .. 2.0**2`` and ``C`` in
        ``2.0**-5 .. 2.0**14``.

    Returns
    -------
    clf : GridSearchCV
        The fitted grid-search object, scored with ``recall_score``.
    """
    # Build the default grid per call instead of using a mutable default
    # argument, so one caller cannot alter the grid seen by later calls.
    if tuned_parameters is None:
        tuned_parameters = {
            'kernel': ['rbf'],
            'gamma': 2.0 ** np.arange(-15, 3),
            'C': 2.0 ** np.arange(-5, 15),
        }

    k_fold = _size_dependent_k_split(np.size(X, 0))

    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=recall_score)
    clf.fit(X, y, cv=StratifiedKFold(y, k_fold))

    # The reported recall is computed on the same data used for fitting.
    y_true, y_pred = y, clf.predict(X)
    print("Tuned with optimal value: %0.3f" % recall_score(y_true, y_pred))

    return clf
X_test_pca = pca.transform(X_test)

# Fit an RBF-kernel SVM, selecting C and gamma by grid search
print("Fitting the classifier to the training set")
param_grid = {
    "C": [1, 5, 10, 100],
    "gamma": [0.0001, 0.001, 0.01, 0.1],
}
clf = GridSearchCV(
    SVC(kernel="rbf"),
    param_grid,
    fit_params={"class_weight": "auto"},
    n_jobs=-1,
)
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)

# Score the fitted model on the test set
y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred, labels=selected_target,
                            target_names=target_names[selected_target]))
print(confusion_matrix(y_test, y_pred, labels=selected_target))

# Settings and helper for the qualitative (matplotlib) evaluation
n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    """Return a two-line caption with the predicted and true name of sample i.

    Only the part of each target name after its last underscore is kept.
    """
    def short_name(label):
        return target_names[label].rsplit("_", 1)[-1]

    return "predicted: %s\ntrue: %s" % (short_name(y_pred[i]),
                                        short_name(y_test[i]))
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf'), param_grid, fit_params={'class_weight': 'auto'}) clf = clf.fit(X_train_pca, y_train) print "done in %0.3fs" % (time() - t0) print "Best estimator found by grid search:" print clf.best_estimator ################################################################################ # Quantitative evaluation of the model quality on the test set print "Predicting the people names on the testing set" t0 = time() y_pred = clf.predict(X_test_pca) print "done in %0.3fs" % (time() - t0) print classification_report(y_test, y_pred, target_names=target_names) print confusion_matrix(y_test, y_pred, labels=range(n_classes)) ################################################################################ # Qualitative evaluation of the predictions using matplotlib n_row = 3 n_col = 4 def title(y_pred, y_test, target_names, i): pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1] true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
# Split the dataset into two equal parts respecting label proportions.
# The builtin next() is used instead of the Python-2-only .next() method.
train, test = next(iter(StratifiedKFold(y, 2)))

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Each entry pairs a display name with the scoring function to optimise.
scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    # Tune on the training half with an inner 5-fold stratified split.
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))

    # Evaluate the selected model on the held-out half.
    y_true, y_pred = y[test], clf.predict(X[test])

    print("Classification report for the best estimator: ")
    print(clf.best_estimator)
    print("Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred)))
    print(classification_report(y_true, y_pred))
    print("Grid scores:")
    pprint(clf.grid_scores_)
    print("")

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf'), param_grid, fit_params={'class_weight': 'auto'}) #clf = SVC(kernel='rbf') #clf = SVC(kernel='linear') clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels)) print "Best estimator found by grid search:" #print clf.best_estimator ############################################################################### # Evaluation moto_vq_eval, plane_vq_eval = [ np.load(file) for file in ['moto_vq_eval.npy', 'plane_vq_eval.npy'] ] y_name = ['moto'] * moto_vq_eval.shape[0] + ['plane'] * plane_vq_eval.shape[0] y_test = [0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0] y_test = np.array(y_test) y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval])) print classification_report(y_test, y_pred, labels=labels, class_names=y_name) print confusion_matrix(y_test, y_pred)
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')
clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels))
print("Best estimator found by grid search:")
# NOTE(review): the header above is printed but the estimator print below is
# disabled; confirm whether it should be re-enabled.
#print clf.best_estimator

###############################################################################
# Evaluation

# Load each array directly rather than through a comprehension over `file`:
# that name shadows the builtin and, on Python 2, leaks into module scope.
moto_vq_eval = np.load('moto_vq_eval.npy')
plane_vq_eval = np.load('plane_vq_eval.npy')

# One name / integer label per evaluation row: motos first, then planes,
# matching the stacking order used for prediction below.
y_name = ['moto'] * moto_vq_eval.shape[0] + ['plane'] * plane_vq_eval.shape[0]
y_test = np.array([0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0])

y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval]))

# NOTE(review): `labels` is the per-sample training target passed to fit()
# above and `y_name` has one entry per evaluation row; confirm this is what
# classification_report expects for `labels` / `class_names`.
print(classification_report(y_test, y_pred, labels=labels, class_names=y_name))
print(confusion_matrix(y_test, y_pred))
# Two candidate grids: an RBF kernel (gamma and C) and a linear kernel (C)
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Each entry pairs a display name with the scoring function to optimise
scores = [('precision', precision_score), ('recall', recall_score)]

for score_name, score_func in scores:
    # Tune on the training indices with an inner 5-fold stratified split
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))

    # Evaluate the selected model on the test indices
    y_true = y[test]
    y_pred = clf.predict(X[test])

    print("Classification report for the best estimator: ")
    print(clf.best_estimator)
    print("Tuned for '%s' with optimal value: %0.3f"
          % (score_name, score_func(y_true, y_pred)))
    print(classification_report(y_true, y_pred))
    print("Grid scores:")
    pprint(clf.grid_scores_)
    print("")

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
from scikits.learn.grid_search import GridSearchCV
from scikits.learn import datasets
from scikits.learn.metrics import zero_one

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier to this data, each image is flattened so that the
# data becomes a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ('rbf', ), 'gamma': [1e-3, 1e-4]},
    {'kernel': ('linear', )},
]

clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2)

# Outer 2-fold stratified split; the grid search runs an inner 5-fold
# stratified split on each training part. Predictions and true labels of
# the held-out parts are accumulated across folds.
y_pred = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_pred = np.append(y_pred, clf.predict(X[test]))
    y_true = np.append(y_true, y[test])

# Percentage of held-out samples predicted correctly
classif_rate = np.mean(y_pred == y_true) * 100
print("Classification rate : %f" % classif_rate)