def run(dbdata, dbdata2, knn, param_range, title):
    global counter
    X_train = dbdata.iloc[:, np.arange(32)]
    y_train = dbdata.iloc[:, 32]
    X_test = dbdata2.iloc[:, np.arange(32)]
    y_test = dbdata2.iloc[:, 32]

    # Graphs the best results obtained from the gridsearch
    knn.fit(X_train, y_train)
    X_pred = knn.predict(X_train)
    y_pred = knn.predict(X_test)

    ###############################
    ######## Validation Curve #####
    ###############################
    # train_scores, test_scores = validation_curve(
    #     knn, X_train, y_train, param_name="n_neighbors", param_range=param_range,
    #     cv=5, scoring="neg_brier_score", n_jobs=-1)
    # train_scores_mean = np.mean(train_scores, axis=1)
    # train_scores_std = np.std(train_scores, axis=1)
    # test_scores_mean = np.mean(test_scores, axis=1)
    # test_scores_std = np.std(test_scores, axis=1)
    # plt.title("Brier Score for KNN (Monks%d)" % counter)
    # plt.xlabel('N-neighbours')
    # plt.ylabel('Loss')
    # lw = 2
    # #plt.ylim(0, 1)
    # plt.plot(param_range, train_scores_mean, label="Training score",
    #          color="darkorange", lw=lw)
    # plt.plot(param_range, test_scores_mean, label="Testing score",
    #          color="navy", lw=lw)
    # plt.legend(loc="best")

    ########################################
    ############# Learning curve ###########
    ########################################
    plot_learning_curve(knn, title, X_train, y_train, ylim=(0, 1.01), cv=5, n_jobs=4)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('Training Score is ', accuracy_score(y_train, X_pred))
    print('Testing Score is ', accuracy_score(y_test, y_pred))
    print('Training Error is ', mean_squared_error(y_train, X_pred))
    print('Testing Error is ', mean_squared_error(y_test, y_pred))
    plt.savefig('img/KNN/Student%d_learning' % counter)
    counter += 1
    plt.clf()
def run(dbdata, dbdata2, DTclassifier, param_range, title):
    global counter
    X_train = dbdata.iloc[:, [1, 2, 3, 4, 5, 6]]  # np.arange(32)
    y_train = dbdata.iloc[:, 0]                   # 32
    X_test = dbdata2.iloc[:, [1, 2, 3, 4, 5, 6]]  # np.arange(32)
    y_test = dbdata2.iloc[:, 0]                   # 32

    DTclassifier.fit(X_train, y_train)
    X_pred = DTclassifier.predict(X_train)
    y_pred = DTclassifier.predict(X_test)

    ###############################
    ####### VALIDATION CURVE ######
    ###############################
    # train_scores, test_scores = validation_curve(
    #     DTclassifier, X_train, y_train, param_name="random_state", param_range=param_range,
    #     cv=3, scoring="neg_brier_score", n_jobs=-1)
    # train_scores_mean = np.mean(train_scores, axis=1)
    # train_scores_std = np.std(train_scores, axis=1)
    # test_scores_mean = np.mean(test_scores, axis=1)
    # test_scores_std = np.std(test_scores, axis=1)
    # plt.title("Brier Score of DTC(MONKS%d)" % counter)
    # plt.xlabel('Random State')
    # plt.ylabel("Loss")
    # lw = 2
    # #plt.ylim(0, 1)
    # plt.plot(param_range, train_scores_mean, label="Training score",
    #          color="darkorange", lw=lw)
    # plt.plot(param_range, test_scores_mean, label="Testing score",
    #          color="navy", lw=lw)
    # plt.legend(loc="best")

    ########################################
    ############ LEARNING CURVE ############
    ########################################
    plot_learning_curve(DTclassifier, title, X_train, y_train, ylim=(0, 1.01), cv=5, n_jobs=4)

    print('Training Score is ', accuracy_score(y_train, X_pred))
    print('Testing Score is ', accuracy_score(y_test, y_pred))
    print('Training Error is ', mean_squared_error(y_train, X_pred))
    print('Testing Error is ', mean_squared_error(y_test, y_pred))
    plt.savefig('img/DTC/Students%d' % counter)
    counter += 1
    plt.clf()
def run(dbdata, dbdata2, svclassifier, param_range, title):
    global counter
    X_train = dbdata.iloc[:, np.arange(32)]
    y_train = dbdata.iloc[:, 32]
    X_test = dbdata2.iloc[:, np.arange(32)]
    y_test = dbdata2.iloc[:, 32]

    # Graphs the best results obtained from the gridsearch
    svclassifier.fit(X_train, y_train)
    X_pred = svclassifier.predict(X_train)
    y_pred = svclassifier.predict(X_test)

    ###############################
    ####### VALIDATION CURVE ######
    ###############################
    # train_scores, test_scores = validation_curve(
    #     svclassifier, X_train, y_train, param_name="C", param_range=param_range,
    #     cv=3, scoring="accuracy", n_jobs=-1)
    # train_scores_mean = np.mean(train_scores, axis=1)
    # train_scores_std = np.std(train_scores, axis=1)
    # test_scores_mean = np.mean(test_scores, axis=1)
    # test_scores_std = np.std(test_scores, axis=1)
    # plt.title("Accuracy Score of SVM(MONKS%d)" % counter)
    # plt.xlabel('C')
    # plt.ylabel("Accuracy")
    # lw = 2
    # plt.ylim(0, 1)
    # plt.plot(param_range, train_scores_mean, label="Training score",
    #          color="darkorange", lw=lw)
    # plt.plot(param_range, test_scores_mean, label="Testing score",
    #          color="navy", lw=lw)
    # plt.legend(loc="best")

    ########################################
    ############ LEARNING CURVE ############
    ########################################
    plot_learning_curve(svclassifier, title, X_train, y_train, ylim=(0, 1.01), cv=5, n_jobs=4)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('Training Score is ', accuracy_score(y_train, X_pred))
    print('Testing Score is ', accuracy_score(y_test, y_pred))
    print('Training Error is ', mean_squared_error(y_train, X_pred))
    print('Testing Error is ', mean_squared_error(y_test, y_pred))
    plt.savefig('img/SVM/Students%d' % counter)
    counter += 1
    plt.clf()
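# The three run() variants above assume module-level imports, a global plot
# counter, and data frames loaded elsewhere; none of that scaffolding appears
# in this section. A minimal sketch of the assumed setup for the KNN variant
# follows (file names and hyperparameters are placeholders, not original code;
# plot_learning_curve is the project helper sketched at the end of this section):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (confusion_matrix, classification_report,
                             accuracy_score, mean_squared_error)

counter = 1  # suffixes the saved figure file names

if __name__ == '__main__':
    # 32 feature columns followed by the label in column 32, as indexed above
    train_df = pd.read_csv('train.csv', header=None)  # hypothetical paths
    test_df = pd.read_csv('test.csv', header=None)
    run(train_df, test_df, KNeighborsClassifier(n_neighbors=5),
        param_range=np.arange(1, 31), title="Learning Curve (KNN)")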
def mlp_regression(X, y, cv):
    parameters = {'alpha': 10.0 ** -np.arange(1, 7)}
    score_func = make_scorer(pearson_cor, greater_is_better=True)
    mlp = MLPRegressor(max_iter=800, hidden_layer_sizes=(200, 200), activation='tanh')
    best_mlp, best_params_mlp = cross_val(mlp, params=parameters, X_train=X, y_train=y,
                                          score=score_func, cv=cv, n_jobs=-1)
    title = r"Learning curves (MLP regression)"
    plt, test_scores = plot_learning_curve(best_mlp, title, X, y, ylim=(0.0, 1.0),
                                           cv=cv, n_jobs=4, scoring=score_func)
    plt.show()
    print("best mlp:", best_mlp)
    print("best para:", best_params_mlp)
    return best_mlp, test_scores
def support_machine_classification(X, y, cv):
    parameters = [
        {'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]},
        {'kernel': ['rbf'],
         'C': [0.1, 0.2, 0.25, 0.35, 0.5, 1, 10, 100, 400, 1000, 2500],
         'gamma': [0.01, 0.5, 1, 5, 10, 100]},
        # {'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree': [3, 4], 'gamma': [0.01, 1, 5, 10, 100]}
    ]
    # Define a scoring function
    score_func = make_scorer(pearson_cor, greater_is_better=True)
    svc = SVC()
    # Get the best model through CV
    best_svc, best_params_clf = cross_val(svc, params=parameters, X_train=X, y_train=y,
                                          score=score_func, cv=cv, n_jobs=-1)
    title_clf = r"Learning Curves (SVC, rbf kernel)"
    plot_learning_curve(best_svc, title_clf, X, y, ylim=(0.0, 1.0),
                        cv=cv, n_jobs=4, scoring=score_func)
    plt.show()
    print("best svc:", best_svc)
    print("best para:", best_params_clf)
    return best_svc
def random_forest(X, y, cv):
    score_func = make_scorer(pearson_cor, greater_is_better=True)
    # Despite the function name, this uses ExtraTreesRegressor (extremely randomized trees)
    rfg = ExtraTreesRegressor(max_features=8)
    title = r"Learning curves (Random Forest)"
    plt, test_scores = plot_learning_curve(rfg, title, X, y, ylim=(0.0, 1.0),
                                           cv=cv, n_jobs=4, scoring=score_func)
    plt.show()
    return rfg, test_scores
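# The three helpers above lean on two project-specific functions that are not
# shown in this section: pearson_cor (the metric passed to make_scorer) and
# cross_val (a grid-search wrapper). Below is a minimal sketch under assumed
# signatures inferred from the call sites; the originals may differ.
import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV


def pearson_cor(y_true, y_pred):
    # Pearson correlation between targets and predictions; higher is better,
    # hence make_scorer(pearson_cor, greater_is_better=True) at the call sites.
    return pearsonr(np.ravel(y_true), np.ravel(y_pred))[0]


def cross_val(estimator, params, X_train, y_train, score, cv, n_jobs=-1):
    # Hypothetical grid-search wrapper: returns the refitted best estimator
    # and the winning parameter setting.
    search = GridSearchCV(estimator, params, scoring=score, cv=cv, n_jobs=n_jobs)
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_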
# Use parameters from either the hyperparameter optimization, or manually selected parameters...
# NOTE: this and the similar snippets below target Python 2 and the
# pre-0.18 scikit-learn cross_validation API.
params = best_params

print "Generating SGDClassifier model with parameters: ", params
sgd = SGDClassifier(**params)

print 'Plot learning curve...'
cv = ShuffleSplit(X.shape[0], n_iter=25, test_size=0.2,
                  random_state=np.random.randint(0, 123456789))
title = "SGDClassifier: %s" % (params,)  # was a tuple, which renders poorly as a plot title
learningcurve.plot_learning_curve(sgd, title, X, y, ylim=(0.5, 1.0), cv=cv, n_jobs=-1)

test_data = X_test.values
Xt = test_data[:, 1::]
yt = test_data[:, 0]

print "Training model with", train_data.shape[0], "examples"
print "Testing model with", test_data.shape[0], "examples"
print "Submitting predicted labels for", submit_df.shape[0], "records"

test_scores = []
# Using the optimal parameters, predict the survival of the labeled test set
for i in range(5):
#==============================================================================================================
# Use parameters from either the hyperparameter optimization, or manually selected parameters...
params = params_score
#params = best_params

#############################################################################################################
# Model generation/validation
#
print "Generating RandomForestClassifier model with parameters: ", params
forest = RandomForestClassifier(n_jobs=-1, oob_score=True, **params)

print "\nCalculating Learning Curve..."
title = "RandomForestClassifier with hyperparams: %s" % (params,)
midpoint, diff = \
    learningcurve.plot_learning_curve(forest, title, X, y, (0.6, 1.01), cv=8, n_jobs=-1, plot=True)
#print "Midpoint:", midpoint
#print "Diff:", diff

print "\nGenerating ROC curve 5 times to get mean AUC with class weights..."
aucs = []
for i in range(5):
    aucs.append(roc_auc.generate_roc_curve(forest, X, y, survived_weight))
auc_mean = ("%.3f" % (np.mean(aucs))).lstrip('0')
auc_std = ("%.3f" % (np.std(aucs))).lstrip('0')
auc_lower = ("%.3f" % (np.mean(aucs) - np.std(aucs))).lstrip('0')
print "ROC - Area under curve:", auc_mean, "and stddev:", auc_std

print "\nFitting model 5 times to get mean OOB score using full training data with class weights..."
test_scores = []
# Using the optimal parameters, predict the survival of the labeled test set 10 times
# random_search.fit(train_data[0::, 1::], train_data[0::, 0])
# best_params = report(random_search.grid_scores_)
#==================================================================================================================

# Plot the learning curve for the model
cv = sklearn.cross_validation.ShuffleSplit(X.shape[0], n_iter=10, train_size=0.7, test_size=0.3,
                                           random_state=np.random.randint(0, 123456789))
title = "Learning Curves (BernoulliNB)"
bnb = naive_bayes.BernoulliNB()
learningcurve.plot_learning_curve(bnb, title, X, y, (0.6, 0.9), cv=cv, n_jobs=1)

# Using the optimal parameters, predict the survival of the test set
print 'Predicting...'
bnb = naive_bayes.BernoulliNB()
bnb.fit(train_data[0::, 1::], train_data[0::, 0])
output = bnb.predict(test_data).astype(int)

# write results
predictions_file = open("data/results/naivebayes_bernoulli" + str(int(time.time())) + ".csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId", "Survived"])
open_file_object.writerows(zip(ids, output))
sys.exit()

#==============================================================================================================
# print 'Hyperparameter optimization via GridSearchCV...'
# grid_search = GridSearchCV(svc, rbf_params, cv=20, n_jobs=-1, verbose=2)
# grid_search.fit(X, y)
# best_params = report(grid_search.grid_scores_)
#==============================================================================================================

# Plot the learning curve for the model with the best parameters
print 'Plotting learning curve...'
cv = ShuffleSplit(X.shape[0], n_iter=20, test_size=0.33,
                  random_state=np.random.randint(0, 123456789))
title = "SVC(RBF): %s" % (best_params,)  # was a tuple, which renders poorly as a plot title
svc = SVC(**best_params)
learningcurve.plot_learning_curve(svc, title, X, y, ylim=(0.5, 1.0), cv=cv, n_jobs=-1)
sys.exit()

# Using the optimal parameters, predict the survival of the test set
print 'Predicting test set...'
#==================================================================================================================
# for train_ix, val_ix in cv:
#     sgd.fit(X[train_ix], y[train_ix])
#     val_pred = sgd.predict(X[val_ix])
#     print "cross val accuracy score: ", metrics.accuracy_score(y[val_ix], val_pred)
#==================================================================================================================
svc.fit(X, y)
output = svc.predict(test_data).astype(int)
grid_search = GridSearchCV(forest, params, n_jobs=-1)
grid_search.fit(X, y)
best_params = scorereport.report(grid_search.grid_scores_)

# Use parameters from either the hyperparameter optimization, or manually selected parameters...
params = best_params

print "Generating RandomForestClassifier model with parameters: ", params
forest = RandomForestClassifier(n_jobs=-1, **params)

print "Plot Learning Curve..."
cv = cross_validation.ShuffleSplit(train_data.shape[0], n_iter=5, test_size=0.25,
                                   random_state=np.random.randint(0, 123456789))
title = "RandomForestClassifier with hyperparams: %s" % (params,)
learningcurve.plot_learning_curve(forest, title, X, y, (0.6, 1.01), cv=cv, n_jobs=-1)

test_data = X_test[X_test.Gender == gender].drop('Gender', axis=1).values
Xt = test_data[:, 1::]
yt = test_data[:, 0]

print "Training", gender, "model with", train_data.shape[0], "examples"
print "Testing", gender, "model with", test_data.shape[0], "examples"
print "Submitting predicted labels for", submit_df.shape[0], "records"

test_scores = []
# Using the optimal parameters, predict the survival of the labeled test set
for i in range(5):
    print "Predicting test set for submission..."
    forest.fit(X, y)
from lassocv_util import lasso_cv_x, load_data
from learningcurve import plot_learning_curve
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LassoCV, Lasso, RidgeCV
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    clf_lr = LogisticRegression(max_iter=5000, solver="liblinear", C=1)
    x, y, df = load_data('merged_file_f.csv')
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1215)
    # Select features via LassoCV before plotting the learning curve
    lasso_x_train, lasso_x_test, lasso_f_columns = lasso_cv_x(x_train, x_test, y_train, y_test)
    plot_learning_curve(clf_lr, lasso_x_train, y_train, model_name='Logistic Regression')
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
model.fit(X_train, y_train)
t = model.predict(X_test)
print(msg + " Test score: " + str(r2_score(y_test, t)))

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax = sns.boxplot(x=['KNN', 'SVR', 'GDB', 'RF', 'ETC', 'BC', 'STREGR'], y=results)
ax.set_xticklabels(names)
plt.show()
# fig.savefig('plot/algorithm_compare.png', format='png', dpi=1200)

# Draw the learning curve of each model
import learningcurve
for name, model in models:
    title = "Learning Curves (" + name + ")"
    estimator = model
    p1 = learningcurve.plot_learning_curve(estimator, title, X_train, y_train,
                                           ylim=(0.1, 1.01), cv=kfold, n_jobs=4)
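# The comparison above assumes a `models` list, a `kfold` splitter, and the
# `results`/`names` accumulators created earlier in the script. A plausible
# reconstruction on synthetic data (the estimator choices and data below are
# illustrative assumptions, not the original setup):
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

X_train, y_train = make_regression(n_samples=200, n_features=10, random_state=7)
models = [('KNN', KNeighborsRegressor()), ('SVR', SVR())]
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results, names = [], []
for name, model in models:
    # r2 scoring matches the r2_score reporting used above
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2')
    results.append(cv_results)
    names.append(name)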
import learningcurve
from sklearn.model_selection import ShuffleSplit

title = "Learning Curves (RandomForestRegressor)"
# Cross-validation with 10 iterations to get smoother mean test and train
# score curves, each time with 10% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
estimator = rf = RandomForestRegressor(n_estimators=best_n, max_features=best_f)
p1 = learningcurve.plot_learning_curve(estimator, title, X_train, y_train,
                                       ylim=(0.1, 1.01), cv=cv, n_jobs=4)  # pass the splitter built above
p1.grid()
p1.savefig('LearningCurve_rf.png', format='png', dpi=1200)

title = "Learning Curves (SVM, RBF, C = 100)"
# SVC is more expensive, so we use a lower number of CV iterations:
estimator = svm.SVR(C=best_c, epsilon=best_e, kernel='rbf')
p2 = learningcurve.plot_learning_curve(estimator, title, X_train, y_train,
                                       (0.5, 1.01), cv=10,
# "binarize": np.random.rand()} # # run randomized search to find the optimal parameters # n_iter_search = 50 # bnb = naive_bayes.BernoulliNB() # random_search = RandomizedSearchCV(bnb, param_distributions=params, n_iter=n_iter_search) # random_search.fit(train_data[0::,1::], train_data[0::,0]) # best_params = report(random_search.grid_scores_) #================================================================================================================== # Plot the learning curve for the model cv = sklearn.cross_validation.ShuffleSplit(X.shape[0], n_iter=100, train_size=0.7, test_size=0.3, random_state=np.random.randint(0,123456789)) title = "Learning Curves (GaussianNB)" gnb = naive_bayes.GaussianNB() learningcurve.plot_learning_curve(gnb, title, X, y, (0.6, 0.9), cv=cv, n_jobs=1) # Using the optimal parameters, predict the survival of the test set print 'Predicting...' bnb = naive_bayes.BernoulliNB() bnb.fit(train_data[0::,1::], train_data[0::,0]) output = bnb.predict(test_data).astype(int) # write results predictions_file = open("data/results/naivebayes_gaussian" + str(int(time.time())) + ".csv", "wb") open_file_object = csv.writer(predictions_file) open_file_object.writerow(["PassengerId","Survived"]) open_file_object.writerows(zip(ids, output)) predictions_file.close() print 'Done.'
# Get the best model through CV
best_svr, best_params_rg = cross_val(svr, params=parameters, X_train=X_rg_scaled, y_train=y_rg,
                                     score=score_func, cv=cv, n_jobs=-1)
print("best svr:", best_svr)
print("best para:", best_params_rg)

title_rg = r"Learning Curves (SVR, rbf kernel)"
plot_learning_curve(best_svr, title_rg, X_rg_scaled, y_rg, ylim=(0.0, 0.8),
                    cv=cv, n_jobs=4, scoring=score_func)
plt.show()

# X_train, X_test, y_train, y_test = train_test_split(X_rg_scaled, y_rg, test_size=0.33, random_state=42)
# Hold out the first 100 samples for testing, train on the rest
X_train = X_rg_scaled[100:]
y_train = y_rg[100:]
X_test = X_rg_scaled[:100]
y_test = y_rg[:100]
best_svr.fit(X_train, y_train)
y_predict = best_svr.predict(X_test)
print(pearson_cor(y_test, y_predict))
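# Every snippet in this section delegates plotting to a local
# plot_learning_curve helper whose definition is not included here, and whose
# signature and return values clearly vary between projects. As a rough
# reference only, here is a minimal sketch in the style of the scikit-learn
# documentation example; the (plt, mean test scores) return matches the MLP
# and random-forest snippets above, not necessarily the others.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, scoring=None,
                        train_sizes=np.linspace(0.1, 1.0, 5)):
    # Cross-validated train/test scores at increasing training-set sizes
    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring,
        train_sizes=train_sizes)
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.plot(sizes, train_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, test_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt, test_mean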