def gbm():
    #exhaustive grid search over gradient-boosting hyper-parameters;
    #relies on module-level train/test arrays set elsewhere in the script
    global features_train, labels_train, features_test
    from sklearn.ensemble import GradientBoostingClassifier as gbc
    from sklearn.grid_search import GridSearchCV as gscv
    param = {'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
             'n_estimators': [10, 50, 100],
             'min_samples_split': [2, 5, 10, 15, 20, 25, 30]}
    clf = gscv(gbc(), param)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    return pred
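#Minimal usage sketch for gbm() above (added here; the synthetic data and the
#70/30 split are illustrative assumptions, not part of the original script).
#gbm() reads the module-level globals set below, and its sklearn.grid_search
#import assumes an older scikit-learn release.
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split  #sklearn.model_selection in newer releases

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
features_train, features_test, labels_train, labels_test = train_test_split(
    X_demo, y_demo, test_size=0.3, random_state=42)
predictions = gbm()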
def support_vector_classifier(f_train, l_train, f_test):
    from sklearn.grid_search import GridSearchCV as gscv
    from sklearn.svm import SVC
    import time
    param = {'kernel': ('linear', 'rbf'), 'C': [1, 2, 5, 10, 15, 20]}
    svr = SVC()
    clf = gscv(svr, param_grid=param)
    #clf = SVC(kernel='linear')
    start_time = time.time()
    clf.fit(f_train, l_train)
    print("Training Time: %s seconds" % (time.time() - start_time))
    start_time = time.time()
    pred = clf.predict(f_test)
    print("Predicting Time: %s seconds" % (time.time() - start_time))
    print(clf.best_params_)
    return pred
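#Illustrative call to support_vector_classifier() above; the toy data, split,
#and accuracy check are assumptions added here, not part of the original snippet.
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

X_toy, y_toy = make_classification(n_samples=300, n_features=10, random_state=1)
f_tr, f_te, l_tr, l_te = train_test_split(X_toy, y_toy, test_size=0.25, random_state=1)
svc_pred = support_vector_classifier(f_tr, l_tr, f_te)
print("Held-out accuracy: %.3f" % accuracy_score(l_te, svc_pred))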
#assumes: import numpy as np; from sklearn import linear_model as lm, neighbors, tree;
#         from sklearn.grid_search import GridSearchCV as gscv; plus a getAbaData() helper
def regressAba(data):
    datasetVariants = getAbaData(data)
    target = datasetVariants[1]
    datasetVariants = datasetVariants[0]
    exhaustiveCVPipeline = {
        'OLS': {
            'model': lm.LinearRegression(),
            'parameters': {}
        },
        'Ridge': {
            'model': lm.Ridge(),
            'parameters': {'alpha': np.arange(0.05, 1, 0.05)}
        },
        # 'Lasso': {
        #     'model': lm.Lasso(),
        #     'parameters': {'alpha': np.arange(0.05, 1, 0.05)}
        # },
        # 'k-NN': {
        #     'model': neighbors.KNeighborsRegressor(),
        #     'parameters': {'weights': ['uniform', 'distance'],
        #                    'leaf_size': np.arange(5, 100, 1),
        #                    'n_neighbors': np.arange(3, 100, 1)}
        # },
        # 'D-Trees': {
        #     'model': tree.DecisionTreeRegressor(),
        #     'parameters': {'max_depth': np.arange(5, 100, 1)}
        # },
    }
    print 'Starting Exhaustive Regression Search on Abalone Data:\n'
    for description, variant in datasetVariants.items():
        print '\nUsing dataset: ' + description + '\n'
        for modelName, attributes in exhaustiveCVPipeline.items():
            gscv_instance = gscv(attributes['model'], attributes['parameters'], cv=10)
            copy = variant.copy()
            gscv_instance.fit(copy, target)
            print modelName, gscv_instance.best_score_, gscv_instance.best_params_
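#Self-contained sketch of the same exhaustive-CV pattern used by regressAba(),
#run on synthetic data so it does not depend on getAbaData(); every name below
#is illustrative, and sklearn.grid_search again assumes an older scikit-learn.
import numpy as np
from sklearn import linear_model as lm
from sklearn.datasets import make_regression
from sklearn.grid_search import GridSearchCV as gscv

X_syn, y_syn = make_regression(n_samples=200, n_features=8, noise=5.0, random_state=0)
pipeline = {
    'OLS':   {'model': lm.LinearRegression(), 'parameters': {}},
    'Ridge': {'model': lm.Ridge(),            'parameters': {'alpha': np.arange(0.05, 1, 0.05)}},
}
for name, spec in pipeline.items():
    search = gscv(spec['model'], spec['parameters'], cv=10)
    search.fit(X_syn, y_syn)
    print('%s  %.4f  %s' % (name, search.best_score_, search.best_params_))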
#RF PARAMETER TUNING: RED------------------------------------------------------

#instantiate model
rf = rfc()

#specify parameter options for number of trees, max number of features and criterion
tree_range = range(20, 1520, 20)
feature_list = ['sqrt', 5, 7, 9, 11]
criterion_list = ['gini', 'entropy']
param_grid = dict(n_estimators=tree_range, max_features=feature_list, criterion=criterion_list)
iterations = len(tree_range) * len(feature_list) * len(criterion_list)

#run grid search with F1 scoring
rfgrid_red_unscaled_f1 = gscv(rf, param_grid, cv=5, verbose=5, scoring='f1')
rfgrid_red_unscaled_f1.fit(dfr_exp, dfr_res)

#store results in data frame
rfgrid_red_unscaled_f1_results = pd.DataFrame(index=range(0, iterations),
    columns=['n_estimators', 'max_features', 'criterion', 'mean_score', 'all_scores'])
rfgrid_red_unscaled_f1_results.mean_score = [result[1] for result in rfgrid_red_unscaled_f1.grid_scores_]
rfgrid_red_unscaled_f1_results.all_scores = [result[2] for result in rfgrid_red_unscaled_f1.grid_scores_]
rfgrid_red_unscaled_f1_params = [result[0] for result in rfgrid_red_unscaled_f1.grid_scores_]
for i in range(0, iterations):
    rfgrid_red_unscaled_f1_results.n_estimators[i] = rfgrid_red_unscaled_f1_params[i]['n_estimators']
    rfgrid_red_unscaled_f1_results.max_features[i] = rfgrid_red_unscaled_f1_params[i]['max_features']
    rfgrid_red_unscaled_f1_results.criterion[i] = rfgrid_red_unscaled_f1_params[i]['criterion']

#save results to file; retrieve when needed
rfgrid_red_unscaled_f1_results.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/rfgrid_red_unscaled_f1_results.txt', sep='\t', header=True)
#rfgrid_red_unscaled_f1_results=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/rfgrid_red_unscaled_f1_results.txt', sep='\t', header=False, names=['n_estimators','max_features','criterion','mean_score','all_scores'])
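#Follow-up sketch (added here, not in the original script): the fitted
#GridSearchCV object already exposes the winning combination directly, which is
#often easier than rebuilding it from the results frame.
print(rfgrid_red_unscaled_f1.best_params_)
print(rfgrid_red_unscaled_f1.best_score_)

#equivalently, the top rows of the stored results frame, sorted by mean F1
#(sort_values requires pandas >= 0.17)
print(rfgrid_red_unscaled_f1_results.sort_values(by='mean_score', ascending=False).head())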
#identify best estimator
dt_estimator = dt_rfe_cv.estimator_

#predict classes
dt_predictions = pd.Series(dt_estimator.predict(post2000_exp_scaled[dt_features]))

#cross predicted vs actual
post2000_res.index = dt_predictions.index
dt_crosstab = pd.crosstab(post2000_res, dt_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print dt_crosstab

#combine RFE with grid search to find optimal tuning parameter and features
depth_range = range(2, 10)
param_grid = dict(estimator__max_depth=depth_range)
dt_rfe_gs = gscv(dt_rfe_cv, param_grid, cv=10, scoring='roc_auc')
dt_rfe_gs.fit(pre2000_exp_scaled, pre2000_res)

#show and plot results (optimal max depth is 5)
print dt_rfe_gs.best_params_
print dt_rfe_gs.grid_scores_
dt_grid_mean_scores = [score[1] for score in dt_rfe_gs.grid_scores_]
plt.figure()
plt.plot(depth_range, dt_grid_mean_scores)
plt.hold(True)
plt.plot(dt_rfe_gs.best_params_['estimator__max_depth'], dt_rfe_gs.best_score_, 'ro',
         markersize=12, markeredgewidth=1.5, markerfacecolor='None', markeredgecolor='r')
plt.grid(True)

#identify best estimator
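#Optional finishing touches for the plot above (a sketch added here, not in the
#original): label the axes and render the figure.
plt.xlabel('max_depth')
plt.ylabel('Mean cross-validated ROC AUC')
plt.title('RFE + grid search over decision-tree depth')
plt.show()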
df.shape #players=1130

#set missings to 0
df.fillna(value=0, inplace=True)
df.isnull().sum()

#set explanatory and response variables
explanatory = [col for col in df.columns if col not in ['playerid', 'inducted', 'year']]
df_exp = df[explanatory]
df_res = df.inducted

#KNN
knn = knc(p=2) #specify Euclidean distance
param_grid = dict(n_neighbors=range(1, 30, 2)) #set up grid for results
kn_accuracy = gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

#Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')

#Decision Tree
dtree = tr.DecisionTreeClassifier(criterion='gini', splitter='best', max_features=None,
                                  max_depth=None, min_samples_split=2, min_samples_leaf=2,
                                  max_leaf_nodes=None)
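#Summary sketch (added here): compare the tuned KNN to Naive Bayes on each
#metric; GridSearchCV.fit() returns the fitted search, so best_score_ is
#available, while cross_val_score returns an array of per-fold scores.
print('KNN  acc=%.3f  f1=%.3f  auc=%.3f' % (kn_accuracy.best_score_, kn_f1.best_score_, kn_auc.best_score_))
print('NB   acc=%.3f  f1=%.3f  auc=%.3f' % (nb_accuracy.mean(), nb_f1.mean(), nb_auc.mean()))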
#FIRST MODEL FIT: CATEGORICAL POSITION VARIABLES

#select explanatory variables
explanatory_train1 = sql_train[['pitcher', 'catcher', 'dhitter']]
explanatory_test1 = sql_test[['pitcher', 'catcher', 'dhitter']]

#run KNN
from sklearn.neighbors import KNeighborsClassifier as knc
knn = knc(p=2) #specify Euclidean distance
k_range = range(1, 30, 2) #number of neighbors to test: every second value from 1 to 29
param_grid = dict(n_neighbors=k_range) #set up grid for results
from sklearn.grid_search import GridSearchCV as gscv
grid = gscv(knn, param_grid, cv=10, scoring='accuracy') #instantiate model
grid.fit(explanatory_train1, response_train) #fit model
grid_mean_scores_1 = [result[1] for result in grid.grid_scores_]
best_score_1 = grid.best_score_
best_param_1 = grid.best_params_
knn_optimal_1 = grid.best_estimator_
best_score_1
best_param_1
knn_optimal_1
knn_optimal_pred_1 = knn_optimal_1.predict(explanatory_test1)
#use float() so the ratio is not truncated by Python 2 integer division
accuracy_1 = float(len(response_test[response_test == knn_optimal_pred_1])) / len(response_test)
accuracy_1
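#Cross-check sketch (added here): sklearn.metrics.accuracy_score should give the
#same held-out accuracy as the manual boolean-indexing calculation above.
from sklearn.metrics import accuracy_score
print('Held-out accuracy: %.3f' % accuracy_score(response_test, knn_optimal_pred_1))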