def plot_validation_curve(estimator, param_name, param_range, x, y, cv,
                          x_train, y_train, cv_train, title, svm=0):
    plt.figure()
    plt.grid()
    plt.axvline(x=estimator.get_params()[param_name])
    train_scores, test_scores = validation_curve(
        estimator, x, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=1)
    train_scores_mean = 1 - np.mean(train_scores, axis=1)
    test_scores_mean = 1 - np.mean(test_scores, axis=1)
    _, cv_scores = validation_curve(
        estimator, x_train, y_train, param_name=param_name,
        param_range=param_range, cv=cv_train, n_jobs=1)
    cv_scores_mean = 1 - np.mean(cv_scores, axis=1)
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Error")
    if svm == 0:
        plt.plot(param_range, train_scores_mean, label="Training error", color="r")
        plt.plot(param_range, test_scores_mean, label="Testing error", color="g")
        plt.plot(param_range, cv_scores_mean, label="Cross-validation error", color="b")
    else:
        plt.semilogx(param_range, train_scores_mean, label="Training error", color="r")
        plt.semilogx(param_range, test_scores_mean, label="Testing error", color="g")
        plt.semilogx(param_range, cv_scores_mean, label="Cross-validation error", color="b")
    plt.legend(loc="best")
    plt.savefig(title)
    plt.close()
def test_validation_curve():
    '''
    Test validation_curve with LinearSVC and different values of C.
    :return: None
    '''
    digits = load_digits()
    X, y = digits.data, digits.target
    param_name = "C"
    param_range = np.logspace(-2, 2)
    train_scores, test_scores = validation_curve(
        LinearSVC(), X, y, param_name=param_name, param_range=param_range,
        cv=10, scoring="accuracy")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.semilogx(param_range, train_scores_mean, label="Training Accuracy", color="r")
    ax.fill_between(param_range, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.2, color="r")
    ax.semilogx(param_range, test_scores_mean, label="Testing Accuracy", color="g")
    ax.fill_between(param_range, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.2, color="g")
    ax.set_title("Validation Curve with LinearSVC")
    ax.set_xlabel("C")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1.1)
    ax.legend(loc='best')
    plt.show()
def plot_validation_curve(clf, X_train, Y_train):
    param_range = list(range(1, 21))
    train_scores, test_scores = validation_curve(
        KNeighborsClassifier(algorithm='ball_tree', weights='uniform'),
        X_train, Y_train, param_name="n_neighbors", param_range=param_range,
        cv=7, scoring="accuracy", n_jobs=-1, verbose=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve with KNN", size=15)
    plt.xlabel("n_neighbors", size=15)
    plt.ylabel("Score", size=15)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.ylim(0.5, 1.0)
    plt.plot(param_range, train_scores_mean, label="Training Score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-Validation Score", color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.savefig('plot_validation_curve_rf_asis')
    plt.show()
def plot_validation_curve(self, estimator, params, param_name, X, y, title,
                          xticks=None, x_label=None, ylim=None, cv=None,
                          n_jobs=1):
    plt.figure(figsize=(6, 8))
    plt.title(title)  # e.g. "Validation Curve with SVM"
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(x_label)
    plt.ylabel("Accuracy")
    # param_range = np.logspace(-6, -1, 5)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=params,
        cv=cv, scoring="accuracy", n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.plot(xticks, train_scores_mean, "o-", label="Training accuracy", color="r")
    plt.fill_between(xticks, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(xticks, test_scores_mean, "o-", label="Cross-validation accuracy", color="g")
    plt.fill_between(xticks, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.grid()
    plt.legend(loc="best")
    plt.savefig("../Figures/" + title + ".png", bbox_inches="tight")
    return plt
def plot_validation_curve(model, X, y, scorer, param_name,
                          param_range=np.linspace(0.1, 1, 5), cv=None,
                          n_jobs=5, ylim=None, title="Xval. validation curve"):
    '''
    Plot the validation curve for model on data.
    '''
    df = pd.DataFrame()
    df['param_range'] = param_range
    train_scores, test_scores = validation_curve(
        model, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring=scorer, n_jobs=n_jobs)
    df['train_mean'] = 1 - np.mean(train_scores, axis=1)
    df['train_std'] = np.std(train_scores, axis=1)
    df['test_mean'] = 1 - np.mean(test_scores, axis=1)
    df['test_std'] = np.std(test_scores, axis=1)
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Parameter value")
    plt.ylabel("Error (1 - score)")
    plt.grid()
    plt.semilogx(param_range, df.train_mean, color="r", label="Training")
    plt.fill_between(param_range, df.train_mean - df.train_std,
                     df.train_mean + df.train_std, alpha=0.1, color="r")
    plt.semilogx(param_range, df.test_mean, color="g", label="Test")
    plt.fill_between(param_range, df.test_mean - df.test_std,
                     df.test_mean + df.test_std, alpha=0.1, color="g")
    plt.legend(loc="best")
    plt.show()
    return df, plt
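# Usage sketch for the helper above -- a minimal example, assuming
# scikit-learn is available; the SVC estimator, gamma range, and "accuracy"
# scorer are illustrative choices, not taken from the original code.
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
df_scores, _ = plot_validation_curve(
    SVC(), iris.data, iris.target, scorer="accuracy", param_name="gamma",
    param_range=np.logspace(-4, 1, 6),  # log-spaced range suits the semilogx axes
    cv=5)
print(df_scores)  # per-parameter mean/std of the (1 - score) errors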
def plot_validation_curve(estimator, X, y, param_name, param_range):
    train_scores, test_scores = validation_curve(estimator, X, y,
                                                 param_name, param_range)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter C')
    plt.ylabel('Accuracy')
    plt.ylim([0., 1.0])
    plt.tight_layout()
    plt.show()
def RR_validationcurve(sspacing, tspacing, RR_lambda_opt, lambdas_range):
    """
    Reconstruct all fields using Ridge Regression (RR) and save to a .mat file.

    Parameters
    ----------
    sspacing : 2D subsampling ratio in space (in one direction)
    tspacing : 1D subsampling ratio in time
    RR_lambda_opt : optimal regularization parameter given by
        RR_cv_estimate_alpha(sspacing, tspacing, alphas)
    lambdas_range : range of regularization parameters to evaluate,
        e.g. np.logspace(-2, 4, 28)
    """
    # Load all training data
    (Xl_tr, mea_l, sig_l, Xh_tr, mea_h, sig_h) = data_preprocess(sspacing, tspacing)

    # Validation curve
    from sklearn.linear_model import Ridge
    from sklearn.learning_curve import validation_curve
    train_MSE, test_MSE = validation_curve(
        Ridge(), Xl_tr, Xh_tr, param_name="alpha", param_range=lambdas_range,
        scoring="mean_squared_error", cv=10)

    # The scoring API always maximizes, so the MSE comes back with its sign flipped
    train_MSE = -train_MSE
    test_MSE = -test_MSE

    # Save to a .mat file
    import scipy.io as sio
    sio.savemat('/data/PhDworks/isotropic/regerssion/RR_crossvalidation.mat',
                dict(lambdas_range=lambdas_range, train_MSE=train_MSE,
                     test_MSE=test_MSE))

    return (train_MSE, test_MSE)
def plot_validation_curve(classifier, X, y, param_name="gamma", param_range=None):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title("Validation Curve")
    plt.ylim((0, 1))
    plt.xlabel(param_name)
    plt.ylabel("Score")
    if param_range is None:
        param_range = np.logspace(-6, 0, 5)
    train_scores, validation_scores = validation_curve(
        classifier, X, y, param_name, param_range=param_range)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)
    plt.semilogx(param_range, train_scores_mean, label="Training Score", color="g")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="g")
    plt.semilogx(param_range, validation_scores_mean,
                 label="Cross-validation Score", color="r")
    plt.fill_between(param_range, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std,
                     alpha=0.2, color="r")
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.65, box.height])
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.grid()
    plt.show()
def plot_validation_curve(clf, X, y, param, name=None):
    try:
        name = clf.__class__.__name__ if name is None else name
        if param is None:
            return
        scorer = metrics.make_scorer(metrics.average_precision_score)
        train_scores, test_scores = validation_curve(
            clf, X, y, cv=5, scoring=scorer, n_jobs=-1,
            param_name=param['name'], param_range=param['range'])
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.title('Validation Curve of {} varying {}'.format(name, param['name']))
        plt.xlabel(param['name'])
        plt.ylabel("Score")
        plt.ylim(-0.05, 1.05)
        plt.xlim(min(param['range']), max(param['range']))
        plt.plot(param['range'], train_scores_mean, label='Training score', color='r')
        plt.fill_between(param['range'], train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.2, color='r')
        plt.plot(param['range'], test_scores_mean, label='Cross-validation score',
                 color="g")
        plt.fill_between(param['range'], test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2, color='g')
        plt.legend(loc='lower right')
        plt.savefig(name + '_' + param['name'] + '_validationcurve.png')
        plt.clf()
    except Exception as e:
        print('ERROR: {}, {}'.format(name, str(e)))
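# A sketch of the param-dict interface expected by the function above; the
# gradient-boosting estimator and max_depth range are assumptions for
# illustration. Note the scorer is average precision, so y should be binary.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
plot_validation_curve(GradientBoostingClassifier(), X_demo, y_demo,
                      param={'name': 'max_depth', 'range': [1, 2, 3, 4, 5]})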
def plot_validation_curve(estimator, title, X, y, param_name, param_range,
                          cv=10, scoring='accuracy', n_jobs=2):
    from sklearn.learning_curve import validation_curve
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name, param_range,
        cv=cv, scoring=scoring, n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
def plot_validation_curve(clf, cv, X, y, param_name, param_range):
    train_scores, valid_scores = validation_curve(
        clf, X, y, param_name=param_name, param_range=param_range, cv=cv,
        scoring=my_pipeline_deviance_function, n_jobs=14, verbose=2)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    # Plot parameter vs. estimated error
    fig, ax = plt.subplots()
    ax.set_title('Validation curve')
    ax.set_xlabel(param_name, fontsize=14)
    ax.set_ylabel("Loss (deviance)", fontsize=14)
    ax.set_xlim(min(param_range), max(param_range))
    ax.plot(param_range, train_scores_mean, color="red", label="Training")
    ax.fill_between(param_range, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1, color="red")
    ax.plot(param_range, valid_scores_mean, color="green", label="CV")
    ax.fill_between(param_range, valid_scores_mean - valid_scores_std,
                    valid_scores_mean + valid_scores_std, alpha=0.1, color="green")
    ax.legend(loc="best")
    plt.savefig('validation_curve' + param_name + '.png')
def plot_validation_curve(estimator, X, y, param_name, param_range, title=None,
                          ylim=None, cv=10, n_jobs=1, scoring="accuracy"):
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring=scoring, n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title(title if title else "Validation Curve")
    plt.xlabel(param_name)
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    else:
        plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
    return plt
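# Minimal usage sketch for the generic helper above; the kNN estimator and
# neighbor range are illustrative assumptions (any estimator exposing the
# named parameter works).
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
plot_validation_curve(KNeighborsClassifier(), iris.data, iris.target,
                      param_name="n_neighbors", param_range=list(range(1, 16)),
                      title="Validation Curve with kNN", cv=5)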
def ModelComplexity(X, y):
    """ Calculates the performance of the model as model complexity increases.
        The training and testing error rates are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    # Calculate the training and testing scores
    # alpha_range = np.logspace(0.1, 1, num=10, base=0.1)
    alpha_range = np.arange(0.1, 1, 0.1)
    train_scores, test_scores = curves.validation_curve(
        Ridge(), X, y, param_name="alpha", param_range=alpha_range,
        cv=cv, scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(3)
    pl.title('Ridge Regression Complexity Performance')
    pl.plot(alpha_range, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(alpha_range, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(alpha_range, train_mean - train_std,
                    train_mean + train_std, alpha=0.15, color='r')
    pl.fill_between(alpha_range, test_mean - test_std,
                    test_mean + test_std, alpha=0.15, color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('alpha')
    pl.ylabel('Score')
    pl.ylim([0.5, 1.0])
    pl.show()
def ModelComplexity(X, y):
    """ Calculates the performance of the model as model complexity increases.
        The training and testing error rates are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1, 11)

    # Calculate the training and testing scores
    train_scores, test_scores = curves.validation_curve(
        DecisionTreeRegressor(), X, y, param_name="max_depth",
        param_range=max_depth, cv=cv, scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(max_depth, train_mean - train_std,
                    train_mean + train_std, alpha=0.15, color='r')
    pl.fill_between(max_depth, test_mean - test_std,
                    test_mean + test_std, alpha=0.15, color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
def plot_validation_curve(logreg, X_train, Y_train):
    param_range = [0.05, 0.1, 0.5, 1, 5, 10, 20, 50, 100, 250, 500, 1000,
                   2500, 5000, 7500, 10000]
    train_scores, test_scores = validation_curve(
        LogisticRegression(solver='newton-cg', fit_intercept=True,
                           class_weight=None),
        X_train, Y_train, param_name="C", param_range=param_range,
        cv=7, scoring="accuracy", n_jobs=-1, verbose=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve with Logistic Regression", size=15)
    plt.xlabel("C", size=15)
    plt.ylabel("Score", size=15)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.ylim(0.94, 0.946)
    plt.plot(param_range, train_scores_mean, label="Training Score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-Validation Score",
             color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.savefig('plot_validation_curve_rf_asis')
    plt.show()
def plot_val_curve(features, labels, model):
    p_range = np.logspace(-5, 5, 5)
    train_scores, test_scores = validation_curve(
        model, features, labels, param_name="gamma", param_range=p_range,
        cv=6, scoring="accuracy", n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve")
    plt.xlabel(r"$\gamma$")
    plt.ylabel("Score")
    plt.semilogx(p_range, train_scores_mean, label="Training score",
                 color="#E29539")
    plt.semilogx(p_range, test_scores_mean, label="Cross-validation score",
                 color="#94BA65")
    plt.legend(loc="best")
    plt.show()
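# Example call for plot_val_curve above -- a sketch assuming an RBF-kernel
# SVC, whose `gamma` matches the param_name hard-coded in the helper.
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC

cancer = load_breast_cancer()
plot_val_curve(cancer.data, cancer.target, SVC(kernel="rbf"))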
def plot_training_curve(model, X, y):
    params = ["min_samples_leaf", "min_samples_split"]
    p_range = [2, 4, 8, 10, 12, 14, 16, 18, 20]
    # [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    for param in params:
        print("plotting validation curve...")
        train_scores, valid_scores = validation_curve(
            model, X, y, param_name=param, param_range=p_range,
            cv=3, scoring='mean_absolute_error')
        train_scores_mean = np.absolute(np.mean(train_scores, axis=1))
        valid_scores_mean = np.absolute(np.mean(valid_scores, axis=1))
        plt.title("Validation Curve with GBM")
        plt.xlabel(param)
        plt.ylabel("MAE")
        plt.plot(p_range, train_scores_mean, label="Training Error",
                 color="r", marker='o')
        plt.plot(p_range, valid_scores_mean, label="Cross-validation Error",
                 color="g", marker='s')
        plt.legend(loc="best")
        plt.show()
def plot_validation_curve(estimator, X, y, title, param_name, param_range, cv=10):
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring="accuracy", n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure()
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    # plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    # plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.plot(param_range, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(param_range, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
def plot_validation_curve(clf, X_train, Y_train):
    param_range = list(range(1, 21))
    train_scores, test_scores = validation_curve(
        KNeighborsClassifier(algorithm='ball_tree', weights='uniform'),
        X_train, Y_train, param_name="n_neighbors", param_range=param_range,
        cv=7, scoring="accuracy", n_jobs=-1, verbose=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve with KNN", size=15)
    plt.xlabel("n_neighbors", size=15)
    plt.ylabel("Score", size=15)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.ylim(0.8, 1.0)
    plt.plot(param_range, train_scores_mean, label="Training Score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-Validation Score", color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.savefig('plot_validation_curve_rf_asis')
    plt.show()
def plot_validation_curve(estimator, X, y, param_name, param_range,
                          ylim=(0, 1.1), cv=None, n_jobs=-1, scoring=None,
                          filename=None):
    plt.clf()
    estimator_name = type(estimator).__name__
    plt.title("Validation curves for %s on %s" % (param_name, estimator_name))
    plt.grid()
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.xlim(min(param_range), max(param_range))
    plt.ylim(*ylim)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name, param_range,
        cv=cv, n_jobs=n_jobs, scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.semilogx(param_range, train_scores_mean, 'o-', color="r",
                 label="Training score")
    plt.semilogx(param_range, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    plt.legend(loc="best")
    print("Best test score: {:.4f}".format(test_scores_mean.max()))
    save_plot("validation_curve_" + str(filename) + ".png")
def main():
    S, col_names_S = load_data(config.paths.training_data,
                               config.paths.cache_folder)
    Xs, Ys, col_names_S = extract_xy(S, col_names_S)
    a = RandomForestClassifier(n_estimators=1)
    a.fit(Xs.toarray(), Ys.toarray().ravel())
    best_features = a.feature_importances_
    max_ind, max_val = max(enumerate(best_features), key=operator.itemgetter(1))
    print best_features
    print max_ind, max_val
    print Xs.shape
    print Ys.shape
    param_range = [1, 3, 5, 7, 10, 15, 20, 30, 60, 80]
    train_scores, test_scores = validation_curve(
        RandomForestClassifier(criterion='entropy'), Xs,
        Ys.toarray().ravel(), 'n_estimators', param_range)
    print train_scores
    print test_scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve for Random Forest")
    plt.xlabel("Number of Trees")
    plt.ylabel("Score")
    plt.plot(param_range, train_mean, label="Training Score", color='r')
    plt.fill_between(param_range, train_mean - train_std,
                     train_mean + train_std, alpha=0.2, color='r')
    plt.plot(param_range, test_mean, label="Test Score", color='b')
    plt.fill_between(param_range, test_mean - test_std,
                     test_mean + test_std, alpha=0.2, color='b')
    plt.legend(loc="best")
    plt.show()
def validation_curve_analysis(estimator=None, param_name=None, param_range=None,
                              issues_train=None, priority_train=None):
    """
    Generates the validation curve for a specific estimator.

    :param estimator: Estimator.
    :param param_name: Name of the parameter.
    :param param_range: Range of the parameters to consider.
    :param issues_train: Train issues.
    :param priority_train: Train priorities.
    :return: None.
    """
    train_scores, test_scores = validation_curve(
        estimator=estimator, X=issues_train, y=priority_train,
        param_name=param_name, param_range=param_range, cv=10)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    _, _ = plt.subplots(figsize=(2.5, 2.5))
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.xlabel('Parameter ' + param_name)
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.show()
def deviance_curve(classifier, features, labels, metaparameter_name, param_range,
                   metric='accuracy', n_folds=4, njobs=-1, fig_size=(16, 9)):
    training_scores, validation_scores = validation_curve(
        classifier, features, labels, metaparameter_name, param_range,
        n_jobs=njobs, cv=n_folds, scoring=metric)
    training_scores_mean = np.mean(training_scores, axis=1)
    training_scores_std = np.std(training_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)
    sns.set_style("darkgrid")
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
    plt.figure(num=None, figsize=fig_size, dpi=600, facecolor='w', edgecolor='k')
    plt.title("Validation Curve")
    plt.xlabel(metaparameter_name)
    plt.ylabel(metric)
    plt.xlim(np.min(param_range), np.max(param_range))
    plt.plot(param_range, training_scores_mean, label="Training " + metric,
             color="mediumblue")
    plt.fill_between(param_range, training_scores_mean - training_scores_std,
                     training_scores_mean + training_scores_std,
                     alpha=0.2, color="lightskyblue")
    plt.plot(param_range, validation_scores_mean, label="Validation " + metric,
             color="coral")
    plt.fill_between(param_range, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std,
                     alpha=0.2, color="lightcoral")
    plt.legend(loc="best")
    plt.show()
def plot_validation_curve(self, estimator, X_train, y_train, param_range,
                          param_name, cv=None, ylim=None,
                          figure_title="Validation Curve"):
    """
    :param estimator: estimator to evaluate
    :param X_train: inputs
    :param y_train: class labels
    :param param_range: list, e.g. [1, 2, 3, 4]
    :param param_name: String
    :param cv: integer, number of folds
    :param ylim: list for the y-axis limits, e.g. [0.8, 1.0]
    :return: plot figure
    """
    train_scores, test_scores = validation_curve(estimator, X_train, y_train,
                                                 param_name, param_range, cv)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.title(figure_title)
    plt.grid()
    # plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel(param_name)  # or replace with a more descriptive name
    plt.ylabel('Accuracy')
    if ylim is not None:
        plt.ylim(ylim)
    plt.show()
def plot_validation_curve(classifier, xTrain, yTrain, paramName, paramRange):
    train_scores, test_scores = validation_curve(
        classifier, xTrain, yTrain, param_name=paramName,
        param_range=paramRange, cv=3, scoring="log_loss", n_jobs=-1, verbose=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve")
    plt.xlabel(paramName)
    plt.ylabel("Score")
    plt.plot(paramRange, train_scores_mean, 'o-', label="Training score",
             color="r")
    plt.fill_between(paramRange, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(paramRange, test_scores_mean, 'o-', label="Cross-validation score",
             color="g")
    plt.fill_between(paramRange, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
def tune_parameter():
    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('clf', LogisticRegression(penalty='l2'))])
    # print pipe_lr.get_params().keys()
    param_range = np.arange(0.0001, 0.1, 0.001)
    train_scores, test_scores = validation_curve(
        estimator=pipe_lr, X=df_train, y=y_train, param_name='clf__C',
        param_range=param_range, cv=10)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='red', marker='s',
             markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='red')
    plt.grid()
    # plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter C')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.show()
def main():
    # Load the training data
    data = pd.read_csv('../input/train.csv')
    X_tr = data.values[:, 1:].astype(float)
    X_tr = normalizeX(X_tr)
    y_tr = data.values[:, 0]
    param_range = np.array([0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
                            0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0])
    train_scores, test_scores = validation_curve(
        digit_recognizer(maxiter=100), X_tr, y_tr, param_name='lambda_',
        param_range=param_range, cv=3, scoring='accuracy', n_jobs=2)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve with Neural Network")
    plt.xlabel(r"$\lambda$")
    plt.ylabel("Score")
    plt.ylim(0.85, 1.05)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
def plot_validation_curve(estimator, X, y, param_name, param_range,
                          addition_graph_points, graph_title, graph_xlabel,
                          graph_ylabel, ylim, cv=5, scoring="accuracy"):
    cv_train_scores, cv_test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring=scoring)
    cv_train_scores_mean = np.mean(cv_train_scores, axis=1)
    cv_train_scores_std = np.std(cv_train_scores, axis=1)
    cv_test_scores_mean = np.mean(cv_test_scores, axis=1)
    cv_test_scores_std = np.std(cv_test_scores, axis=1)
    plt.title(graph_title)
    plt.xlabel(graph_xlabel)
    plt.ylabel(graph_ylabel)
    plt.ylim(*ylim)
    plt.fill_between(param_range, cv_train_scores_mean - cv_train_scores_std,
                     cv_train_scores_mean + cv_train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(param_range, cv_test_scores_mean - cv_test_scores_std,
                     cv_test_scores_mean + cv_test_scores_std,
                     alpha=0.1, color="b")
    plt.plot(param_range, cv_train_scores_mean, 'o-', color="r",
             label="Cross Validation Training score")
    plt.plot(param_range, cv_test_scores_mean, 'o-', color="b",
             label="Cross Validation Test Score")
    for gp in addition_graph_points:
        plt.plot(param_range, gp['data'], 'o-', color=gp['color'],
                 label=gp['label'])
    plt.legend(loc="best")
    plt.savefig('plots/' + graph_title + '.png')
    plt.close()
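# Sketch of the addition_graph_points contract used above: each entry is a
# dict with 'data' (one value per parameter), 'color', and 'label'. The
# estimator, ranges, and flat baseline values below are hypothetical
# placeholders; the function also assumes a 'plots/' directory exists.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
baseline = {'data': [0.7, 0.7, 0.7, 0.7], 'color': 'k',
            'label': 'hypothetical baseline'}
plot_validation_curve(DecisionTreeClassifier(), X_demo, y_demo,
                      param_name='max_depth', param_range=[2, 4, 8, 16],
                      addition_graph_points=[baseline],
                      graph_title='Tree depth validation curve',
                      graph_xlabel='max_depth', graph_ylabel='Accuracy',
                      ylim=(0.5, 1.05))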
def plot_validation_curve(estimator, X, y, param_name,
                          param_range=np.logspace(-6, -1, 5), title=None,
                          ylim=None, cv=None, n_jobs=1, scoring=None):
    plt.figure(figsize=(20, 10))
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring=scoring, n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    if ylim is not None:
        plt.ylim(*ylim)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
    print("Cross-validation (cv={})\ntrain_scores_mean:".format(cv))
    print(train_scores_mean)
    print("test_scores_mean:")
    print(test_scores_mean)
    return plt
def plot_val_curve(features, labels, model):
    p_range = np.logspace(-5, 5, 5)
    train_scores, test_scores = validation_curve(
        model, features, labels, param_name='gamma', param_range=p_range,
        cv=6, scoring='accuracy', n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title('Validation Curve')
    plt.xlabel(r'$\gamma$')
    plt.ylabel('Score')
    plt.semilogx(p_range, train_scores_mean, label='Training score',
                 color='#E29539')
    plt.semilogx(p_range, test_scores_mean, label='Cross-validation score',
                 color='#94BA65')
    plt.legend(loc='best')
    plt.savefig('figures/val_curve.png', transparent=True)
def addressing_over_under_fitting():
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
                     '/breast-cancer-wisconsin/wdbc.data', header=None)
    X = df.loc[:, 2:].values
    y = df.loc[:, 1].values
    le = LabelEncoder()
    y = le.fit_transform(y)
    le.transform(['M', 'B'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)
    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('clf', LogisticRegression(penalty='l2', random_state=0))])
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    train_scores, test_scores = validation_curve(
        estimator=pipe_lr, X=X_train, y=y_train, param_name='clf__C',
        param_range=param_range, cv=10)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter C')
    plt.ylabel('Accuracy')
    plt.ylim([0.8, 1.0])
    plt.tight_layout()
    plt.savefig(PL6 + 'validation_curve.png', dpi=300)
def validation_curves(model, X, y, n_iter, test_size):
    n_Cs = 10
    Cs = np.logspace(-5, 5, n_Cs)
    cv = ShuffleSplit(X.shape[0], n_iter=n_iter, test_size=test_size,
                      random_state=0)
    train_scores, test_scores = validation_curve(model, X, y, "C", Cs, cv=cv)
    return (Cs, train_scores, test_scores)
def model_complexity(features, targets, cv_data):
    '''Produces a figure displaying the model complexity curve -- model score
    performance as a function of parameter tuning.
    '''
    from predictor import performance_metric
    params = ['base_estimator', 'n_estimators', 'learning_rate']
    params_range = [
        [DecisionTreeRegressor(max_depth=2), DecisionTreeRegressor(max_depth=4),
         DecisionTreeRegressor(max_depth=8), DecisionTreeRegressor(max_depth=16),
         DecisionTreeRegressor(max_depth=32)],
        np.arange(25, 250, 25),
        np.arange(0.5, 3, 0.5)]
    pcurve = zip(params, params_range)
    scoring_fnc = make_scorer(performance_metric, greater_is_better=False)
    k = 0
    com_fig, com_ax = plt.subplots(1, 3, figsize=(12, 4), dpi=50)
    for pname, prange in pcurve:
        train_scores, test_scores = curves.validation_curve(
            AdaBoostRegressor(), features, targets, param_name=pname,
            param_range=prange, cv=cv_data, scoring=scoring_fnc)
        train_mean = np.mean(abs(train_scores), axis=1)
        test_mean = np.mean(abs(test_scores), axis=1)
        train_std = np.std(abs(train_scores), axis=1)
        test_std = np.std(abs(test_scores), axis=1)
        if k == 0:
            # Plot the base-estimator curve against the tree depths
            prange = [2, 4, 8, 16, 32]
        com_ax[k].plot(prange, train_mean, 'o-', color='r', label='Training Score')
        com_ax[k].plot(prange, test_mean, 'o-', color='g', label='Validation Score')
        com_ax[k].set_ylim((0, 14))
        com_ax[k].fill_between(prange, train_mean - train_std,
                               train_mean + train_std, alpha=0.15, color='r')
        com_ax[k].fill_between(prange, test_mean - test_std,
                               test_mean + test_std, alpha=0.15, color='g')
        if k == 0:
            com_ax[k].set_xlabel('{} max_depth'.format(pname))
            com_ax[k].legend(['Training Score', 'Testing Score'],
                             bbox_to_anchor=(-.16, 1.05))
        else:
            com_ax[k].set_xlabel(pname)
        com_ax[k].set_ylabel('Adjusted Close Price Relative Difference (%)')
        k += 1
    com_fig.savefig('img/' + 'col_plots.png')
    return com_fig, com_ax
def my_visual_parameter_tuning(x_train, y_train, pipe, param_name, n_fold,
                               param_range):
    """Shows a plot to see which value for a parameter is the best.

    Args:
        x_train (DataSeries): X variable
        y_train (DataSeries): y variable
        pipe (Pipeline): Pipeline with the input set
        param_name (String): The name of the parameter that should be tested
        n_fold (int): How often the cross validation is done
        param_range (List): List of a range of parameter values

    Returns:
        bool: True if the function worked
    """
    # Compute the validation curve
    train_scores, test_scores = validation_curve(
        estimator=pipe, X=x_train, y=y_train, param_name=param_name,
        param_range=param_range, cv=n_fold, n_jobs=1, verbose=1)
    # Get mean and SD
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # Plot training and test group accuracy
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.legend(loc='best')
    plt.xlabel('Parameter')
    plt.ylabel('Score')
    plt.show()
    return True
def valid_curve(self, col2fit, score='accuracy', verbose=0):
    """
    Plots the validation curve.
    """
    self.prepare_data(self.df_full, True, col2fit)
    train_values = self.df_full[col2fit].values
    target_values = self.df_full['rain'].values

    ## Number of cpus to use
    ## Making sure there is one free unless there is only one
    njobs = max(1, int(0.75 * multiprocessing.cpu_count()))
    print '\n\nValidating with njobs = {}\n...\n'.format(njobs)

    ## Parameter info is hard-coded for now, should be improved...
    paramater4validation = "n_estimators"
    maxdepth = 15
    param_range = [10, 50, 100, 150, 200, 250, 300, 400, 600, 800, 1000, 1500]
    #paramater4validation = "max_depth"
    #nestimators = 150
    #param_range = [8, 10, 12, 14, 15, 16, 17, 18, 20, 24]

    print '\nValidating on {} with ranges:'.format(paramater4validation)
    print param_range
    print 'validating...'
    train_scores, test_scores = validation_curve(
        RandomForestClassifier(max_depth=maxdepth), train_values, target_values,
        param_name=paramater4validation, param_range=param_range, cv=10,
        scoring=score, verbose=verbose, n_jobs=njobs)
    #train_scores, test_scores = validation_curve(
    #    RandomForestClassifier(n_estimators=nestimators), train_values,
    #    target_values, param_name=paramater4validation,
    #    param_range=param_range, cv=10, scoring=score, verbose=verbose,
    #    n_jobs=njobs)

    ## Plotting
    train_scores_mean = N.mean(train_scores, axis=1)
    train_scores_std = N.std(train_scores, axis=1)
    test_scores_mean = N.mean(test_scores, axis=1)
    test_scores_std = N.std(test_scores, axis=1)
    fig = plt.figure()
    plt.title("Validation Curve")
    plt.xlabel(paramater4validation)
    plt.ylabel(score)
    plt.plot(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-validation score",
             color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.grid()
    plt.legend(loc='best')
    fig.show()
    raw_input('press enter when finished...')
def validationCurves(model, X_test, y_test, web):
    from sklearn.learning_curve import validation_curve
    import numpy as np
    param_range = np.arange(0, 5)
    # Generate the validation curve: try each of the model's parameters
    # until validation_curve accepts one of them.
    success = False
    used_param = None
    v_train_scores, v_test_scores = None, None
    for params in model.get_params().keys():
        try:
            v_train_scores, v_test_scores = validation_curve(
                model, X_test, y_test, param_name=params,
                param_range=param_range)
            used_param = params
            success = True
        except Exception:
            pass
        if success:
            break
    fig, ax = plt.subplots()
    ax.set_xlabel(used_param)
    ax.set_ylabel("Score")
    print(v_train_scores)
    v_train_scores_mean = np.mean(v_train_scores, axis=1)
    v_train_scores_std = np.std(v_train_scores, axis=1)
    v_test_scores_mean = np.mean(v_test_scores, axis=1)
    v_test_scores_std = np.std(v_test_scores, axis=1)
    ax.fill_between(param_range, v_train_scores_mean - v_train_scores_std,
                    v_train_scores_mean + v_train_scores_std,
                    alpha=0.1, color="orange")
    ax.fill_between(param_range, v_test_scores_mean - v_test_scores_std,
                    v_test_scores_mean + v_test_scores_std,
                    alpha=0.1, color="purple")
    ax.plot(param_range, v_train_scores_mean, 'o-', color="orange",
            label="Training score")
    ax.plot(param_range, v_test_scores_mean, 'o-', color="purple",
            label="Cross-validation score")
    ax.legend(loc="best")
    if web:
        return fig
    plt.show()
    plt.close()
def validate(est, X, y, pname, prange):
    est_cp = deepcopy(est)
    return validation_curve(est_cp, X, y, param_name=pname,
                            param_range=prange, n_jobs=1)
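# Usage sketch for validate above: deepcopy leaves the caller's estimator
# untouched. The logistic-regression estimator and C range are assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
tr_scores, te_scores = validate(LogisticRegression(), X_demo, y_demo,
                                'C', np.logspace(-3, 3, 7))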
def drawValidationCurve(self):
    """
    Draws the validation curve.
    :return: NA
    """
    X, y = self.X_train, self.y_train.ravel()
    indices = np.arange(y.shape[0])
    # np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    param_range = range(2, 100, 2)
    train_scores, valid_scores = validation_curve(
        self.regr, X, y, "n_neighbors", param_range, cv=5,
        scoring='mean_squared_error')
    # The scoring API returns negative MSE; flip the sign (and scale by 1/5)
    train_scores = -1.0 / 5 * train_scores
    valid_scores = -1.0 / 5 * valid_scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(param_range, train_scores_mean, 'o-', color="r",
             label="Training MSE")
    plt.plot(param_range, valid_scores_mean, '*-', color="g",
             label="Cross-validation MSE")
    plt.legend(loc="best")
    plt.xlabel('K Neighbors')
    plt.ylabel('MSE')
    plt.title('Validation Curve with KNN Regression on the parameter of '
              'K Neighbors')
    plt.grid(True)
    plt.show()
def drawValidationCurve_maxdepth(self):
    """
    Draws the validation curve.
    :return: NA
    """
    X, y = self.X_train, self.y_train.ravel()
    indices = np.arange(y.shape[0])
    # np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    param_range = range(1, 60)
    train_scores, valid_scores = validation_curve(
        self.model, X, y, "max_depth", param_range, cv=5,
        scoring='mean_squared_error')
    # The scoring API returns negative MSE; flip the sign (and scale by 1/5)
    train_scores = -1.0 / 5 * train_scores
    valid_scores = -1.0 / 5 * valid_scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(param_range, train_scores_mean, 'o-', color="r",
             label="Training MSE")
    plt.plot(param_range, valid_scores_mean, '*-', color="g",
             label="Cross-validation MSE")
    plt.legend(loc="best")
    plt.xlabel('Max Depth')
    plt.ylabel('MSE')
    plt.title('Validation Curve with Random Forest Regression\n'
              'on the parameter of Max Depth when n_estimators=32')
    plt.grid(True)
    plt.show()
def test_validation_curve_clone_estimator():
    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    param_range = np.linspace(1, 0, 10)
    _, _ = validation_curve(
        MockEstimatorWithSingleFitCallAllowed(), X, y,
        param_name="param", param_range=param_range, cv=2)
def test_validation_curve():
    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    param_range = np.linspace(0, 1, 10)
    train_scores, test_scores = validation_curve(
        MockEstimatorWithParameter(), X, y,
        param_name="param", param_range=param_range, cv=2)
    assert_array_almost_equal(train_scores.mean(axis=1), param_range)
    assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range)
def validation_curve_plot(clf_class, x, y, parameter_range, parameter_name):
    """
    Returns a validation curve plot.

    clf_class = any estimator can be used, e.g. RandomForestClassifier().
    For many algorithms it is better to standardize features before training.
    """
    clf = clf_class
    param_range = parameter_range
    train_scores, test_scores = validation_curve(
        estimator=clf, X=x, y=y, param_name=parameter_name,
        param_range=param_range, cv=10)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter ' + parameter_name)
    plt.ylabel('Accuracy')
    plt.ylim([0.8, 1.0])
    plt.tight_layout()
    plt.show()
def plot_validation_curve(estimator, title, X, y, param_name, param_range,
                          ylim=None):
    """
    :param estimator: sklearn regressor object
    :param title: title of the curve
    :param X: predictors
    :param y: response
    :param param_name: parameter of the regression object to cross-validate
        on, e.g. number of trees for RF
    :param param_range: range of values of the parameter
    :param ylim: optional y-axis limits
    :return: plots the validation curve for the parameter
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(param_name)
    plt.ylabel("RMSLE")
    rsmle_score = make_scorer(rsmle_, greater_is_better=True)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=5, scoring=rsmle_score, n_jobs=1)
    print "cross validation done...plotting the graph"
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.plot(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-validation score",
             color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
def test_validation_curve():
    '''
    Tests the usage of validation_curve: examines how the C parameter of a
    LinearSVC classifier affects prediction accuracy.
    :return: None
    '''
    # Load the data
    digits = load_digits()
    X, y = digits.data, digits.target
    # Compute the validation curve
    param_name = "C"
    param_range = np.logspace(-2, 2)
    train_scores, test_scores = validation_curve(
        LinearSVC(), X, y, param_name=param_name, param_range=param_range,
        cv=10, scoring="accuracy")
    # For each C, take the mean and standard deviation of the 10-fold
    # cross-validation scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.semilogx(param_range, train_scores_mean, label="Training Accuracy",
                color="r")
    ax.fill_between(param_range, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.2, color="r")
    ax.semilogx(param_range, test_scores_mean, label="Testing Accuracy",
                color="g")
    ax.fill_between(param_range, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.2, color="g")
    ax.set_title("Validation Curve with LinearSVC")
    ax.set_xlabel("C")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1.1)
    ax.legend(loc='best')
    plt.show()
def drawValidationCurve(self):
    """
    Draws the validation curve.
    :return: NA
    """
    X, y = self.X_train, self.y_train.ravel()
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    param_range = range(2, 60)
    train_scores, valid_scores = validation_curve(self.clf, X, y, "max_depth",
                                                  param_range, cv=5)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(param_range, train_scores_mean, 'o-', color="r",
             label="Training Precision")
    plt.plot(param_range, valid_scores_mean, '*-', color="g",
             label="Cross-validation Precision")
    plt.legend(loc="best")
    plt.xlabel('Max Depth (log2(all features) considered)')
    plt.ylabel('Precision')
    plt.title('Validation Curve with Decision Tree on the parameter of Max Depth')
    plt.grid(True)
    plt.show()
def drawValidationCurve(self):
    """
    Draws the validation curve.
    :return: NA
    """
    X, y = self.X_train, self.y_train.ravel()
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    param_range = range(2, 75)
    train_scores, valid_scores = validation_curve(self.ada, X, y,
                                                  "n_estimators",
                                                  param_range, cv=5)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(param_range, train_scores_mean, 'o-', color="r",
             label="Training Precision")
    plt.plot(param_range, valid_scores_mean, '*-', color="g",
             label="Cross-validation Precision")
    plt.legend(loc="best")
    plt.xlabel('Estimators')
    plt.ylabel('Precision')
    plt.title('Validation Curve with AdaBoost-DecisionTree on the parameter '
              'of Estimators')
    plt.grid(True)
    plt.show()
def test_validation_curve():
    X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    param_range = np.linspace(0, 1, 10)
    with warnings.catch_warnings(record=True) as w:
        train_scores, test_scores = validation_curve(
            MockEstimatorWithParameter(), X, y,
            param_name="param", param_range=param_range, cv=2)
    if len(w) > 0:
        raise RuntimeError("Unexpected warning: %r" % w[0].message)
    assert_array_almost_equal(train_scores.mean(axis=1), param_range)
    assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range)
def test_validation_curve(self):
    digits = datasets.load_digits()
    df = pdml.ModelFrame(digits)
    param_range = np.logspace(-2, -1, 2)
    svc = df.svm.SVC(random_state=self.random_state)
    result = df.learning_curve.validation_curve(svc, 'gamma', param_range)
    expected = lc.validation_curve(svm.SVC(random_state=self.random_state),
                                   digits.data, digits.target, 'gamma',
                                   param_range)
    self.assertEqual(len(result), 2)
    self.assert_numpy_array_almost_equal(result[0], expected[0])
    self.assert_numpy_array_almost_equal(result[1], expected[1])
def plot_validation_curve(estimator, X, y, param_name, param_range,
                          title="Validation Curve", ylim=None, semilog=False,
                          cv=None, n_jobs=1, scoring=None, ax=None):
    # param_range = np.logspace(-6, -1, 5)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring=scoring, n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if ax is None:
        fig, ax1 = plt.subplots()
    else:
        ax1 = ax
    ax1.set_title(title)
    ax1.set_xlabel(param_name)
    ax1.set_ylabel("Score")
    ax1.grid()
    if ylim is not None:
        ax1.set_ylim(ylim)
    if semilog:
        ax1.semilogx(param_range, train_scores_mean, 'o-',
                     label="Training score", color="r")
    else:
        ax1.plot(param_range, train_scores_mean, 'o-',
                 label="Training score", color="r")
    ax1.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    if semilog:
        ax1.semilogx(param_range, test_scores_mean, 'o-',
                     label="Cross-validation score", color="g")
    else:
        ax1.plot(param_range, test_scores_mean, 'o-',
                 label="Cross-validation score", color="g")
    ax1.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    ax1.legend(loc="best")
    return ax1
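# Sketch showing how the `ax` parameter of the helper above lets two curves
# share one figure; the SVC estimators and parameter ranges are assumptions.
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
fig, (left, right) = plt.subplots(1, 2, figsize=(10, 4))
plot_validation_curve(SVC(), iris.data, iris.target, 'gamma',
                      np.logspace(-4, 1, 6), semilog=True, cv=5, ax=left)
plot_validation_curve(SVC(), iris.data, iris.target, 'C',
                      np.logspace(-2, 3, 6), semilog=True, cv=5, ax=right)
plt.show()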
def plotValidationCurve(estimator, title, X, y, param_name, param_range, cv=5):
    trainScores, testScores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, scoring="accuracy")
    trainScoresMean = np.mean(trainScores, axis=1)
    trainScoresStd = np.std(trainScores, axis=1)
    testScoresMean = np.mean(testScores, axis=1)
    testScoresStd = np.std(testScores, axis=1)
    sns.plt.title(title)
    sns.plt.xlabel(param_name)
    sns.plt.ylabel("Accuracy Score")
    sns.plt.ylim(0.0, 1.1)
    sns.plt.semilogx(param_range, trainScoresMean, label="Training score",
                     color="r")
    sns.plt.fill_between(param_range, trainScoresMean - trainScoresStd,
                         trainScoresMean + trainScoresStd, alpha=0.2, color="r")
    sns.plt.semilogx(param_range, testScoresMean, label="Cross-validation score",
                     color="b")
    sns.plt.fill_between(param_range, testScoresMean - testScoresStd,
                         testScoresMean + testScoresStd, alpha=0.2, color="b")
    sns.plt.legend(loc="best")
    return sns.plt
def clf_validation_curve(self):
    # Plot of accuracy vs. regularisation
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    pipe_lr = Pipeline([
        ('scl', StandardScaler()),
        # ('pca', PCA(n_components=2)),
        ('clf', LogisticRegression(penalty='l2', random_state=0))])
    train_scores, test_scores = validation_curve(
        estimator=pipe_lr, X=self.X_train, y=self.y_train,
        param_name='clf__C', param_range=param_range, cv=10)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, color='blue', marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(param_range, test_mean, color='green', linestyle='--',
             marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, alpha=0.15, color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Parameter C')
    plt.ylabel('Accuracy')
    plt.ylim([0.8, 1.0])
    plt.show()