Example #1
File: script.py  Project: apb7/Kaggle
def gbm():
    #features_train/labels_train/features_test are module-level arrays prepared elsewhere in script.py
    global features_train, labels_train, features_test
    from sklearn.ensemble import GradientBoostingClassifier as gbc
    from sklearn.model_selection import GridSearchCV as gscv  #sklearn.grid_search was removed in 0.20
    param = {
        'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
        'n_estimators': [10, 50, 100],
        'min_samples_split': [2, 5, 10, 15, 20, 25, 30]
    }
    clf = gscv(gbc(), param)  #exhaustive search over all 126 parameter combinations
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)  #predicts with the refit best estimator
    return pred
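
#usage sketch, not part of the original project: gbm() reads the three module
#globals, so a quick smoke test can wire them up from a toy dataset first
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
print(gbm()[:10])  #predicted labels for the first ten test rows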
Example #2
def support_vector_classifier(f_train, l_train, f_test):
    from sklearn.model_selection import GridSearchCV as gscv  #sklearn.grid_search was removed in 0.20
    from sklearn.svm import SVC
    import time
    param = {'kernel': ['linear', 'rbf'], 'C': [1, 2, 5, 10, 15, 20]}
    svr = SVC()
    clf = gscv(svr, param_grid=param)
    #clf = SVC(kernel='linear')  #untuned alternative kept for comparison
    start_time = time.time()
    clf.fit(f_train, l_train)
    print("Training Time: %s seconds" % (time.time() - start_time))
    start_time = time.time()
    pred = clf.predict(f_test)
    print("Predicting Time: %s seconds" % (time.time() - start_time))
    print(clf.best_params_)

    return pred
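
#usage sketch, not from the original file: f_train, l_train, f_test and l_test
#are assumed to be train/test splits prepared elsewhere
from sklearn.metrics import accuracy_score

pred = support_vector_classifier(f_train, l_train, f_test)
print("Accuracy: %.3f" % accuracy_score(l_test, pred))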
Example #3
File: assigment.py  Project: radubl/CS342
def regressAba(data):

    import numpy as np
    from sklearn import linear_model as lm
    from sklearn.model_selection import GridSearchCV as gscv
    #(sklearn's neighbors and tree modules are also needed if the commented-out
    #pipeline entries below are re-enabled)

    datasetVariants = getAbaData(data)  #defined elsewhere in assigment.py
    target = datasetVariants[1]
    datasetVariants = datasetVariants[0]

    exhaustiveCVPipeline = {
        'OLS': {
            'model': lm.LinearRegression(),
            'parameters': {}
        },
        'Ridge': {
            'model': lm.Ridge(),
            'parameters': {'alpha': np.arange(0.05, 1, 0.05)}
        # },
        # 'Lasso': {
        #     'model': lm.Lasso(),
        #     'parameters': {'alpha': np.arange(0.05, 1, 0.05)}
        # },
        # 'k-NN': {
        #     'model': neighbors.KNeighborsRegressor(),
        #     'parameters': {'weights': ['uniform', 'distance'],
        #                    'leaf_size': np.arange(5, 100, 1),
        #                    'n_neighbors': np.arange(3, 100, 1)}
        # },
        # 'D-Trees': {
        #     'model': tree.DecisionTreeRegressor(),
        #     'parameters': {'max_depth': np.arange(5, 100, 1)}
        }}

    print('Starting Exhaustive Regression Search on Abalone Data:\n')

    for description, variant in datasetVariants.items():
        print('\nUsing dataset: ' + description + '\n')

        for modelName, attributes in exhaustiveCVPipeline.items():

            gscv_instance = gscv(attributes['model'], attributes['parameters'], cv=10)

            gscv_instance.fit(variant.copy(), target)

            print(modelName, gscv_instance.best_score_, gscv_instance.best_params_)
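
#getAbaData() is defined elsewhere in assigment.py; judging from the usage
#above, it returns [{description: feature matrix}, target vector]. A
#hypothetical stub illustrating that assumed contract ('rings' as the target
#column is a guess based on the standard abalone dataset, not the project's code):
def getAbaDataStub(data):
    return [{'raw features': data.drop(columns=['rings'])}, data['rings']]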

Example #4

#RF PARAMETER TUNING: RED------------------------------------------------------

#assumed module-level imports (the excerpt starts mid-script):
#import pandas as pd
#from sklearn.ensemble import RandomForestClassifier as rfc
#from sklearn.model_selection import GridSearchCV as gscv

#instantiate model
rf = rfc()

#specify parameter options for number of trees, max number of features and criterion
tree_range = range(20, 1520, 20)
feature_list = ['sqrt', 5, 7, 9, 11]
criterion_list = ['gini', 'entropy']
param_grid = dict(n_estimators=tree_range, max_features=feature_list, criterion=criterion_list)
iterations = len(tree_range) * len(feature_list) * len(criterion_list)

#run grid search with F1 scoring (dfr_exp/dfr_res are prepared earlier in the script)
rfgrid_red_unscaled_f1 = gscv(rf, param_grid, cv=5, verbose=5, scoring='f1')
rfgrid_red_unscaled_f1.fit(dfr_exp, dfr_res)

#store results in a data frame via cv_results_ (the grid_scores_ attribute was removed from sklearn)
cv_results = rfgrid_red_unscaled_f1.cv_results_
rfgrid_red_unscaled_f1_results = pd.DataFrame({
    'n_estimators': cv_results['param_n_estimators'],
    'max_features': cv_results['param_max_features'],
    'criterion': cv_results['param_criterion'],
    'mean_score': cv_results['mean_test_score'],
    'all_scores': [[cv_results['split%d_test_score' % s][i] for s in range(5)]  #five folds (cv=5)
                   for i in range(iterations)],
})

#save results to file; retrieve when needed
rfgrid_red_unscaled_f1_results.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/rfgrid_red_unscaled_f1_results.txt', sep='\t', header=True)
#rfgrid_red_unscaled_f1_results=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/rfgrid_red_unscaled_f1_results.txt', sep='\t', header=None, names=['n_estimators','max_features','criterion','mean_score','all_scores'])
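
#not shown in the original: the winning configuration can be read directly off
#the fitted search object
print(rfgrid_red_unscaled_f1.best_params_)
print(rfgrid_red_unscaled_f1.best_score_)
rf_best = rfgrid_red_unscaled_f1.best_estimator_  #refit on the full data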
#identify best estimator (dt_rfe_cv is an RFECV object fitted earlier in the
#script; see the hedged sketch at the end of this example)
dt_estimator = dt_rfe_cv.estimator_

#predict classes
dt_predictions = pd.Series(dt_estimator.predict(post2000_exp_scaled[dt_features]))

#cross predicted vs actual
post2000_res.index = dt_predictions.index
dt_crosstab = pd.crosstab(post2000_res, dt_predictions, rownames=['Actual'],
                          colnames=['Predicted'], margins=True)
print(dt_crosstab)

#combine RFE with grid search to find optimal tuning parameter and features
depth_range = range(2, 10)
param_grid = dict(estimator__max_depth=depth_range)
dt_rfe_gs = gscv(dt_rfe_cv, param_grid, cv=10, scoring='roc_auc')
dt_rfe_gs.fit(pre2000_exp_scaled, pre2000_res)

#show and plot results (optimal max depth is 5)
print(dt_rfe_gs.best_params_)
dt_grid_mean_scores = dt_rfe_gs.cv_results_['mean_test_score']  #grid_scores_ no longer exists
print(dt_grid_mean_scores)

plt.figure()
plt.plot(depth_range, dt_grid_mean_scores)  #plt.hold() was removed; axes accumulate by default
plt.plot(dt_rfe_gs.best_params_['estimator__max_depth'], dt_rfe_gs.best_score_, 'ro',
         markersize=12, markeredgewidth=1.5, markerfacecolor='None', markeredgecolor='r')
plt.grid(True)
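
#dt_rfe_cv and dt_features are not defined in this excerpt; given how they are
#used above (estimator_, an estimator__max_depth parameter, a feature subset),
#a plausible construction is an RFECV around a decision tree. A hedged sketch
#of that assumed setup, not the project's actual code:
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

dt_rfe_cv = RFECV(estimator=DecisionTreeClassifier(random_state=1),
                  step=1, cv=10, scoring='roc_auc')
dt_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)
dt_features = pre2000_exp_scaled.columns[dt_rfe_cv.support_]  #selected columns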

Example #6
#assumed module-level imports (the excerpt starts mid-script):
#import pandas as pd
#from sklearn.neighbors import KNeighborsClassifier as knc
#from sklearn.naive_bayes import MultinomialNB as mnb
#from sklearn import tree as tr
#from sklearn.model_selection import GridSearchCV as gscv, cross_val_score as cvs

df.shape  #players=1130

#set missings to 0
df.fillna(value=0, inplace=True)
df.isnull().sum()

#set explanatory and response variables
explanatory = [
    col for col in df.columns if col not in ['playerid', 'inducted', 'year']
]
df_exp = df[explanatory]
df_res = df.inducted

#KNN: tune n_neighbors under three scoring metrics
knn = knc(p=2)  #specify Euclidean distance
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results

kn_accuracy = gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

#Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')

#Decision Tree
dtree = tr.DecisionTreeClassifier(criterion='gini', splitter='best',
                                  max_features=None, max_depth=None,
                                  min_samples_split=2, min_samples_leaf=2,
                                  max_leaf_nodes=None)
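
#a small addition, not in the original: summarize the three KNN searches and
#the Naive Bayes fold scores side by side
for name, search in [('accuracy', kn_accuracy), ('f1', kn_f1), ('roc_auc', kn_auc)]:
    print(name, search.best_params_, round(search.best_score_, 3))
print('NB mean accuracy: %.3f' % nb_accuracy.mean())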
Example #10
#FIRST MODEL FIT: CATEGORICAL POSITION VARIABLES

#select explanatory variables (sql_train/sql_test and response_train/response_test
#are prepared earlier in the script)
explanatory_train1 = sql_train[['pitcher', 'catcher', 'dhitter']]
explanatory_test1 = sql_test[['pitcher', 'catcher', 'dhitter']]

#run KNN

from sklearn.neighbors import KNeighborsClassifier as knc
knn = knc(p=2)  #specify Euclidean distance
k_range = range(1, 30, 2)  #number of neighbors to test: every second value from 1 to 29
param_grid = dict(n_neighbors=k_range)  #set up grid for results

from sklearn.model_selection import GridSearchCV as gscv
grid = gscv(knn, param_grid, cv=10, scoring='accuracy')  #instantiate model
grid.fit(explanatory_train1, response_train)  #fit model

grid_mean_scores_1 = grid.cv_results_['mean_test_score']  #grid_scores_ no longer exists
best_score_1 = grid.best_score_
best_param_1 = grid.best_params_
knn_optimal_1 = grid.best_estimator_

print(best_score_1)
print(best_param_1)
print(knn_optimal_1)

knn_optimal_pred_1 = knn_optimal_1.predict(explanatory_test1)
accuracy_1 = (response_test == knn_optimal_pred_1).mean()

print(accuracy_1)
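
#hedged follow-up, not in the original: sklearn can compute the same accuracy
#plus a confusion matrix for the held-out players
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(response_test, knn_optimal_pred_1))
print(confusion_matrix(response_test, knn_optimal_pred_1))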