# Load the held-out test set and drop its identifier column.
test = pd.read_csv('./../data/testDatafinalData.csv')
x_test = test.drop(['id'], axis=1)

# Candidate hyper-parameter values for the decision tree.
min_samples = range(2, 19, 4)
max_depth = range(1, 20, 4)
max_feature = ['auto', 'sqrt', 'log2']

parameters = dict(
    min_samples_split=min_samples,
    max_depth=max_depth,
    max_features=max_feature,
)

# Exhaustive search over the grid, scored by ROC AUC.
clf_tree = tree.DecisionTreeClassifier()
model = GridSearchCV(clf_tree, parameters, scoring='roc_auc')
model.fit(x, y)
print(model.best_params_)
print(model.grid_scores_)

# Mean CV score of every grid point, bucketed by its max_features setting.
# score_sqrt is initialised here but is not filled in this section.
score_sqrt = []
score_auto = []
score_log2 = []
for a in model.grid_scores_:
    feature_rule = a[0]['max_features']
    if feature_rule == 'log2':
        score_log2.append(a[1])
    elif feature_rule == 'auto':
        score_auto.append(a[1])
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100]},
    {'kernel': ['linear'], 'C': [1, 10, 100]},
]

# Metrics to optimise; each one gets its own grid search below.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # '<metric>_weighted' averages the metric over classes, weighted by support.
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # The per-fold scores are unpacked into `fold_scores`, not `scores`, so the
    # list of metric names driving the outer loop is not overwritten.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
### Hyper-parameter tuning of a feature-selection + random-forest pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# Two-step pipeline: univariate feature selection feeding the existing `clf`.
select = SelectKBest()
steps = [('feature_selection', select), ('random_forest', clf)]
pipeline = Pipeline(steps)

# Grid keys follow the '<step name>__<parameter>' convention.
parameters = {
    'feature_selection__k': [10, 15, 'all'],
    'random_forest__n_estimators': [5, 10, 15, 20],
    'random_forest__criterion': ['gini', 'entropy'],
    'random_forest__max_features': [1, 2, 3, 4],
    'random_forest__min_samples_split': [2, 3, 4, 5],
}

# 10-fold stratified cross-validation over the labels.
cv = StratifiedKFold(labels, n_folds=10)
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv)
grid_search.fit(features, labels)

print('Best score: {}'.format(grid_search.best_score_))
print('best parameters: {}'.format(grid_search.best_params_))

# Recorded output of a previous run:
#   Best score: 0.861
#   best parameters: {'random_forest__min_samples_split': 4,
#                     'random_forest__n_estimators': 20,
#                     'feature_selection__k': 'all',
#                     'random_forest__max_features': 4,
#                     'random_forest__criterion': 'entropy'}

# Finalize the model: rebuild the forest with the parameters recorded above.
clf = RandomForestClassifier(n_estimators=20,
                             criterion='entropy',
                             max_features=4,
                             min_samples_split=4)
def reg_run(): parent_data_folder = '' import getpass user = getpass.getuser() if user == 'igor': parent_data_folder = '/home/igor/ML/data_1/' elif user == 'pesici': parent_data_folder = os.environ['HOME'] + '/' else: os.path.realpath(__file__) simple = False train_data_path = '' test_data_path = '' X_file_search = '' X_nd_file_search = '' Y_file_search = '' vSize = 8 if not simple: train_data_path = parent_data_folder + 'set_train_gray_matter_maps/' test_data_path = parent_data_folder + 'set_test_gray_matter_maps/' X_file_search = 'X_compact' + str(vSize) + '.mtx' X_nd_file_search = 'X_compact' + str(vSize) + '.npy' Y_file_search = 'y' + str(vSize) + '.mtx.npy' else: train_data_path = parent_data_folder + 'set_train_simple/' test_data_path = parent_data_folder + 'set_test_simple/' X_file_search = 'X_simple.mtx' X_nd_file_search = 'X_simple.npy' Y_file_search = 'y_simple.mtx.npy' idxSlice = 85 targets_file = parent_data_folder + 'targets.csv' X_file = parent_data_folder + X_file_search X_nd_file = parent_data_folder + X_nd_file_search Y_file = parent_data_folder + Y_file_search targets = [0] with open(targets_file, 'rb') as csvfile: for line in csvfile: targets.append(int(line)) iters = 300 ys = [] X = [] method = 'SVR pipeline' if (method == 'SVR pipeline' or True) and (X_nd_file_search in os.listdir(parent_data_folder) and Y_file_search in os.listdir(parent_data_folder)): print 'Existing X and Y ARRAY files were found!' ys = np.load(Y_file) X = np.load(X_nd_file) #elif method <> 'SVR pipeline' and (X_file_search in os.listdir(parent_data_folder) and Y_file_search in os.listdir(parent_data_folder)): # print 'Existing X and Y files were found!' 
# ys = np.load(Y_file) # X = io.mmread(X_file) else: for f in os.listdir(train_data_path): if ('train_' in f and f.endswith('.mtx') and not simple) or ('train_' in f and f.endswith('.npy') and simple): iters = iters - 1 #print dirpath+f pic_id = int(f.split('.')[0].split('_')[1]) #segmented_img = np.load(train_data_path+'/'+f) #segmented_img = None if not simple: img = io.mmread(train_data_path + '/' + f) img = img.toarray() img = np.reshape(img, (176, 208, 176)) # sum up voxels of size 8x8x8 or 4x4x4 of img x = smooth_img(img, vSize) if X == []: X = x else: X = np.vstack((X, x)) y = targets[pic_id] ys.append(y) else: segmented_img = np.load(train_data_path + '/' + f) x = segmented_img if X == []: X = x else: X = np.vstack((X, x)) y = targets[pic_id] ys.append(y) #new_size = segmented_img.shape[0] *segmented_img.shape[1]*segmented_img.shape[2] #x = np.reshape(segmented_img, (new_size,1)) #x = x.astype(int) #x = sparse.coo_matrix(x) #nda_show(segmented_img[:,:,idxSlice], title=str(y)) gc.collect() print 'iters = ', iters if iters < 0: break print 'Saving X...' if not simple: np.save(X_nd_file, X) else: np.save(X_nd_file, X) #io.mmwrite(X_file, X) np.save(Y_file, ys) print 'X has size: ', X.shape try: print 'ys has size: ', ys.shape except: print 'ys has size: ', len(ys) # Dimension reduction from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression from sklearn import preprocessing, decomposition from sklearn.decomposition import PCA from sklearn.svm import SVR from sklearn.pipeline import Pipeline from sklearn.grid_search import GridSearchCV from sklearn.ensemble import RandomForestRegressor # Regression model reg = '' # Remove features with too low between-subject variance e.g. 
nulls variance_threshold = VarianceThreshold(threshold=.01) # Normalize the data so it has 0 mean normal variance scaler = preprocessing.StandardScaler() min_max_scaler = preprocessing.MinMaxScaler() #if method == 'Lasso': # print 'Running ', method # reg = Lasso() # reg.fit(X ,ys) # print("Data fitted with Lasso Regression") # w = reg.coef_ # np.save('W_Lasso', w) if method == 'LassoCV': print 'Running ', method #variance_threshold = VarianceThreshold(threshold=0.01) lasso = LassoCV() reg = Pipeline([('variance_threshold', variance_threshold), ('lasso', lasso)]) reg.fit(X, ys) print("Data fitted with Lasso CV Regression") #w = reg.coef_ #np.save('W_LassoCV', w) if method == 'SVR pipeline': print 'Running ', method #X = X.toarray() #print 'Converted sparse to dense nd array' #variance_threshold = VarianceThreshold(threshold=.01) # Here we use a classical univariate feature selection: removes all but the k highest scoring features #feature_selection = SelectKBest(f_regression, k=2000) # ('feature_selection', feature_selection), # PCA #pca = PCA(n_components=1000) # SVM regression svrLinear = SVR(kernel='linear', C=1e-4) #svrPloy2 = SVR(kernel='poly', degree=2) #svrSigmoid = SVR(kernel='sigmoid') svrRBF = SVR() svr = svrLinear #rForest = RandomForestRegressor() regs = [svrLinear] Cs = [1e-3, 1e-2] #gammas = [1e-8, 1e-7, 1e-6] pipe = Pipeline([ ('variance_threshold', variance_threshold), ('scaler', scaler), ('svr', svrLinear), ]) params = dict( variance_threshold=[variance_threshold], scaler=[scaler], svr=regs, svr__C=Cs, ) # does cross-validation with 3-fold for each combination of kernels and Cs reg = GridSearchCV(pipe, param_grid=params, n_jobs=4, cv=5) #reg = pipe reg.fit(X, ys) print 'Data fitted with ', method print "Best parameters set found on development set:" print print(reg.best_params_) print #w = reg.coef_ #np.save('W_LassoCV', w) prediction = [] iters = 0 test_files = sorted(os.listdir(test_data_path)) smoothed_y_folder = parent_data_folder + 
'set_test_smooth' + str( vSize) + '/' for f in test_files: if ('test_' in f and f.endswith('.mtx') and not simple) or ('test_' in f and f.endswith('.npy') and simple): iters = iters + 1 pic_id = int(f.split('.')[0].split('_')[1]) segmented_img = None if not simple: output_file = smoothed_y_folder + f + '.npy' if os.path.isfile(output_file): print 'Compressed file %s found!!' % output_file segmented_img = np.load(output_file) else: segmented_img = io.mmread(test_data_path + f) if method == 'SVR pipeline' or method == 'LassoCV': segmented_img = segmented_img.toarray() segmented_img = np.reshape(segmented_img, (176, 208, 176)) segmented_img = smooth_img(segmented_img, vSize) #np.save(output_file, segmented_img) else: segmented_img = np.load(test_data_path + f) res = reg.predict(segmented_img) prediction.append([pic_id, res[0]]) gc.collect() print "Age prediction for image %d completed " % (pic_id) #print '' #print 'iters = ', iters with open(parent_data_folder + "predictions.csv", "wb") as f: f.write(b'ID,Prediction\n') for pred in prediction: f.write(str(pred[0]) + ',' + str(pred[1]) + '\n') print 'Done.'
'vect__stop_words': ('english', None), 'vect__max_features': (2500, 5000, 10000, None), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'vect__norm': ('l1', 'l2'), 'clf__penalty': ('l1', 'l2'), 'clf__C': (0.01, 0.1, 1, 10), } ''' To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'" ''' if __name__ == '__main__': grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3) df = pd.read_csv('sms.csv') X, y, = df['message'], df['label'] X_train, X_test, y_train, y_test = train_test_split(X, y) grid_search.fit(X_train, y_train) print('最佳效果:%0.3f' % grid_search.best_score_) print('最优参数组合:') best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print('\t%s: %r' % (param_name, best_parameters[param_name])) predictions = grid_search.predict(X_test) print('准确率:', accuracy_score(y_test, predictions)) print('精确率:', precision_score(y_test, predictions)) print('召回率:', recall_score(y_test, predictions))
y_cv_train = y_cv_train.values.flatten() y_sep_test = y_val.values.flatten() y_cv_train[y_cv_train == 2] = 0 y_sep_test[y_sep_test == 2] = 0 X_train = X_cv_train X_test = X_sep_test y_train = y_cv_train y_test = y_sep_test # Using GridSearchCV to find the best values for C and gamma C_range = 10.0**np.arange(-4, 4) gamma_range = 10.0**np.arange(-10, 1) param_grid = dict(gamma=gamma_range, C=C_range) skf = cv.StratifiedKFold(y=y_train, n_folds=3) grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=skf) grid.fit(X_train, y_train) # Print out parameters crossclf = svm.SVC(probability=True, **grid.best_params_) y_pred = crossclf.fit(X_train, y_train).predict(X_test) print crossclf print 'y_pred: ', y_train print 'y_pred: ', y_pred print "Best parameter", grid.best_params_ # {'C': 10.0, 'gamma': 0.001} print "Cross-Validation score", cv.cross_val_score(crossclf, X_train, y_train).mean() print "Independent accuracy score", accuracy_score(y_test, y_pred) print "Independent precision score", precision_score(y_test, y_pred) print "Independent recall score", recall_score(y_test, y_pred) print "Independent f1 score", f1_score(y_test, y_pred)
# Hyper-parameter grid explored for the classifier.
parameters = dict(
    n_estimators=[10, 20, 30, 50, 100],
    max_features=[0.6, 0.2, 0.3],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 3, 4, 6],
)
# Alternative grid kept for reference:
#parameters = {'penalty':['l1', 'l2'],'C': np.logspace(0, 4, 10)}

# Candidates are ranked by plain accuracy.
acc_scorer = make_scorer(accuracy_score)

# Build the 5-fold grid search and fit it on the training data in one step;
# fit() returns the search object itself.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer, cv=5).fit(
    X_train, y_train)

# Take the best parameter combination and fit it on the training data again.
best_clf = grid_obj.best_estimator_
best_clf.fit(X_train, y_train)

# Predictions of the optimized model on the test split.
best_predictions = best_clf.predict(X_test)

# Header for the optimized-model report.
print("\nOptimized Model\n------")
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

# Grid for the boosted ensemble. 'base_estimator__*' keys address the decision
# tree wrapped by AdaBoost; the remaining keys address AdaBoost itself.
# NOTE(review): min_samples_split=1 is rejected by newer scikit-learn
# releases - confirm against the installed version.
param_dist = dict(
    base_estimator__max_depth=[1, 2, 3],
    base_estimator__min_samples_split=[1, 2],
    base_estimator__min_samples_leaf=[1, 2],
    n_estimators=[2, 3, 5],
    learning_rate=[0.4, 0.6, 0.8],
    algorithm=["SAMME", "SAMME.R"],
)

# Four stratified shuffle splits of the training labels, fixed seed.
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=4, random_state=9)

# F1 scorer that treats the "yes" label as the positive class.
f1score = make_scorer(f1_score, pos_label="yes")

# AdaBoost on top of a default decision tree.
dt_clf = DecisionTreeClassifier()
clf = AdaBoostClassifier(dt_clf)

# Exhaustive search; fit() returns the fitted search object.
grid_search = GridSearchCV(clf, param_grid=param_dist, cv=cv, scoring=f1score)
gs_estimator = grid_search.fit(X_train, y_train)
print("Best model parameter: " + str(gs_estimator.best_params_))

# Score the refitted best model on the held-out test split.
y_pred = grid_search.predict(X_test)
gs_f1score = f1_score(y_test, y_pred, pos_label="yes")
print("f1 score: {:.5f}".format(gs_f1score))
# ######
# best_score_     : metric value of the best-performing candidate
# best_params_    : parameters that achieved the best performance
# best_estimator_ : model configured with those best parameters
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# A Pipeline takes a list of the steps to chain together and run in one go.
pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10., 100., 1000.]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']},
]
# For a bare estimator the grid keys are just the parameter names (C, gamma,
# kernel); because this is a pipeline they carry the 'clf__' step prefix.
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                  scoring='accuracy', cv=10, n_jobs=1)

# The fit was written as the IPython magic "%time gs = gs.fit(X, y)", which is
# only a comment in a plain script. Run it explicitly so that best_score_ and
# best_params_ exist below.
gs = gs.fit(X, y)
print(gs.best_score_)
print(gs.best_params_)
gs.grid_scores_

# ParameterGrid builds the search grid by combining the parameter values and
# behaves as an iterator, so the combinations can also be searched manually
# with a for loop.
from sklearn.grid_search import ParameterGrid

param_grid = {'a': [1, 2], 'b': [True, False]}
list(ParameterGrid(param_grid))

param_grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
C['Class']=DF[i].iloc[:,c1] Classifier[i]=C.values[:,0] m=len(Classifier[i]) df=DF[i].drop(DF[i].columns[c1],axis=1) l=len(df.columns) #Create Features Array Features[i]=df.values[:,0:l] fld=5 state=12 kf=KFold(m,n_folds=fld,shuffle=True,random_state=state) C_range = np.logspace(-2, 10, 13) gamma_range = np.logspace(-9, 3, 13) param_grid = dict(gamma=gamma_range, C=C_range) grid = GridSearchCV(SVC(), param_grid=param_grid, cv=kf) grid.fit(Features[i], Classifier[i]) print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_)) Params[i]=grid.best_params_ print Params SVMA=0 SVMC=Params[1]['C'] SVMG=Params[1]['gamma'] start=time.time() kf=KFold(m,n_folds=fld,shuffle=True,random_state=state) for train_index, test_index in kf: Clf_Train=[] Clf_Test=[] m_train=len(train_index)
print("Precision:", precision) print("Recall:", recall) #对最大特征数max_features、最小样本数min_samples_split、叶子节点最少样本数min_samples_leaf、决策树最大深度max_depth、内部节点再划分所需最小样本数min_samples_split做调参: #param_test5= {'max_features':range(3,11,2),'min_samples_split':range(80,150,20), 'min_samples_leaf':range(10,60,10),'n_estimators':range(10,100,20),'max_depth':range(3,14,2)} param_test5 = { 'min_samples_leaf': range(10, 30, 10), 'n_estimators': range(50, 200, 50) } gsearch5 = GridSearchCV(estimator=RandomForestClassifier(max_depth=None, min_samples_split=2, max_features="auto", max_leaf_nodes=None, bootstrap=True), param_grid=param_test5, scoring='roc_auc', iid=False, cv=5) gsearch5.fit(X_train, y_train) print gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_ #0.990622704291 10棵树 #[[21721 84] # [ 133 1203]] #('Precision:', 0.96432304616161124) #('Recall:', 0.94829838717428705) #('Precision:', 0.99049738866427095) #('Recall:', 0.99062270429108512)
#print clf.predict(features_test) print clf.score(features_test, labels_test) print "training time:", round(time()-t0, 3), "s" ''' ######################################################### import numpy from sklearn.svm import SVC from sklearn.grid_search import GridSearchCV features_train = features_train[:len(features_train) / 100] labels_train = labels_train[:len(labels_train) / 100] t0 = time() clf = SVC(kernel='rbf', C=10000) clf.fit(features_train, labels_train) predictions = clf.predict(features_test) score = clf.score(features_test, labels_test) print "training time:", round(time() - t0, 3), "s" grid = GridSearchCV(clf, { 'kernel': ['linear', 'rbf'], 'C': [1, 10, 100, 1000, 10000] }, 'accuracy') grid.fit(features_train, labels_train) best_params = grid.best_params_ model = grid.best_estimator_ score = grid.best_score_ print best_params, model, score print numpy.count_nonzero(predictions)
import pandas as pd

# The first 14 columns are the features; column 14 is the target.
df = pd.read_csv("book2.csv")
X = df.iloc[:, 0:14]
y = df.iloc[:, 14]

xgb_model = xgb.XGBClassifier()

# Stage 1 grid: tree structure.
optimization_dict = {
    'max_depth': [1, 2, 3, 4, 5, 6],
    'n_estimators': [50, 100, 200],
}
# Stage 2 grid: sampling and step-size parameters.
optimization_dict1 = {
    'subsample': [0.8, 0.9, 1],
    'max_delta_step': [0, 1, 2, 4],
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
}

# Previously the search built from `optimization_dict` was overwritten by the
# second search before it was ever fitted, so the first grid was never
# explored. Run the two grids as consecutive stages instead.
# NOTE(review): staged tuning is the assumed intent - confirm.
structure_search = GridSearchCV(xgb_model, optimization_dict,
                                scoring='accuracy', verbose=1)
structure_search.fit(X, y)
print(structure_search.best_score_)
print(structure_search.best_params_)

# Stage 2 starts from the best stage-1 estimator; GridSearchCV clones it, so
# the tuned max_depth / n_estimators carry over into every candidate.
model = GridSearchCV(structure_search.best_estimator_, optimization_dict1,
                     scoring='accuracy', verbose=1)
model.fit(X, y)
print(model.best_score_)
print(model.best_params_)
# Rounded square root of the number of feature columns.
mtry = np.sqrt(X.shape[1]).round()
# mtry=np.sqrt(n_components).round()

# Base estimators.
rf = RandomForestClassifier(n_estimators=5000)
gbm = GradientBoostingClassifier(n_estimators=10000, learning_rate=0.001)

# Parameter Grids
# max_features candidates: every second integer in a window of half-width
# round(mtry / 2) around mtry.
mtry_low = int(mtry - round(mtry / 2))
mtry_high = int(mtry + round(mtry / 2))
param_grid_rf = {'max_features': np.arange(mtry_low, mtry_high, 2)}
param_grid_gbm = {'max_depth': range(1, 10)}
# param_grid=dict(max_features=range(5,100,5))
param_dist = {"max_features": sp_randint(5, 100)}

# Search objects: one randomized and one exhaustive search for the forest,
# and one exhaustive search for the boosted model.
random_search_rf = RandomizedSearchCV(rf,
                                      param_distributions=param_dist,
                                      n_iter=40)
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=10)
grid_search_gbm = GridSearchCV(estimator=gbm, param_grid=param_grid_gbm, cv=10)

# Each pipeline runs a feature-selection step and then a search object as the
# classification step.
pipe1 = Pipeline([
    ('feature_selection', feature_linearSVC),
    ('classification', grid_search_rf),
])
pipe2 = Pipeline([
    ('feature_selection', feature_RFECV),
    ('classification', random_search_rf),
])
# pipe3 = Pipeline([('feature_selection', feature_PCA),
#                   ('classification', grid_search_rf)])

#%%
#Nested cross-validation
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Metrics to optimise; each one gets its own grid search below.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    # Two fixes: the "+/-" band is two standard deviations of the per-fold
    # scores (it was std() / 2), and the per-fold scores are unpacked into
    # `fold_scores` so the outer `scores` list is not overwritten.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
y_train = final_train.pop('wage_class') y_test = final_test.pop('wage_class') cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]} ind_params = { 'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic' } optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring='accuracy', cv=5, n_jobs=-1) optimized_GBM.fit(final_train, y_train) GridSearchCV(cv=5, error_score='raise', estimator=xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
# Mean squared error of the remaining prediction vectors against testY.
for part, z in ((2, z2), (3, z3), (4, z4), (5, z5)):
    print("MSE for the test part%d is : " % part, np.mean((z - testY)**2))

#%% Using the Linear Regression
#################################### Linear Regression
print("================== Linear Regression...")
clf0 = LinearRegression()

# Options explored by the grid search.
param = dict(
    fit_intercept=[True, False],
    normalize=[False],
    copy_X=[True, False],
)
grid = GridSearchCV(clf0, param, n_jobs=1)

# First training set: search, then build a fresh model from the winning
# options. best_params_ holds exactly the three keys of `param`.
grid.fit(trainX1, trainY)
clf01 = LinearRegression(n_jobs=-1, **grid.best_params_)
print("================== LR1 Ends...")

# Second training set: same procedure, reusing the search object.
grid.fit(trainX2, trainY)
clf02 = LinearRegression(n_jobs=-1, **grid.best_params_)
print("================== LR2 Ends...")
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import make_pipeline

# Randomized PCA down to 150 whitened components, followed by an RBF SVC with
# balanced class weights.
pca = RandomizedPCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

# Reproducible train/test split of the face images and their labels.
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=42)

# Search over the SVC's C and gamma; make_pipeline names the step 'svc'.
param_grid = dict(
    svc__C=[1, 5, 10, 50],
    svc__gamma=[0.0001, 0.0005, 0.001, 0.005],
)
grid = GridSearchCV(model, param_grid)
get_ipython().run_line_magic("time", " grid.fit(Xtrain, ytrain)")
print(grid.best_params_)

# Predict the test faces with the refitted best pipeline.
model = grid.best_estimator_
yfit = model.predict(Xtest)

# Show the first 24 test faces in a 4 x 6 grid, labelled with the predicted
# surname: black when the prediction is correct, red otherwise.
fig, ax = plt.subplots(4, 6)
for axi, image, predicted, actual in zip(ax.flat, Xtest, yfit, ytest):
    axi.imshow(image.reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    label_color = 'black' if predicted == actual else 'red'
    axi.set_ylabel(faces.target_names[predicted].split()[-1],
                   color=label_color)
Decision Tree Regression -------------------------------------------------- """ from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor from sklearn.grid_search import GridSearchCV # Tune Hyperparameters of DecisionTreeClassifier parameters = { 'max_depth': [2, 3, 4, 5, 10], 'criterion': ["mse"], 'min_samples_split': [2, 3, 5], 'min_samples_leaf': [1, 5], 'max_leaf_nodes': [5, 7, 10, 12, 15] } grid_search_tree = GridSearchCV(DecisionTreeRegressor(), parameters, n_jobs=4) grid_search_tree.fit(regressors_train_pca, target_train) print(grid_search_tree.best_score_, grid_search_tree.best_params_) # Train Best Model regr_tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=1, criterion='mse', min_samples_split=2, max_leaf_nodes=10) regr_tree.fit(regressors_train_pca, target_train) predicted_tree = regr_tree.predict(regressors_test_pca) # RMSE math.sqrt(mean_squared_error(target_test, predicted_tree))
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
import pandas as pd
import numpy as np

df_train = pd.read_csv(
    "C:/Users/Shanu/PycharmProjects/Crime-data/communities.csv"
)  #skiprows=20,index_col=21)

# Replace the textual missing-value markers with 0, in place.
for placeholder in ('na', '?'):
    df_train.replace(placeholder, 0, inplace=True)

# Column 0 is the target (kept as a single-column 2-D slice); columns 1..170
# are the predictors.
X_train = df_train.values[:, 1:171]
Y_train = df_train.values[:, :1]

# 20 evenly spaced reg_alpha candidates between 10**-4 and 10**1.
alpha_candidates = np.linspace(np.float_power(10, -4),
                               np.float_power(10, 1),
                               20)

# 5-fold search scored by negative mean squared error; the best model is
# refitted on the full training data.
optimized_GBM = GridSearchCV(estimator=xgb.XGBRegressor(),
                             param_grid={'reg_alpha': alpha_candidates},
                             scoring='neg_mean_squared_error',
                             cv=5,
                             refit=True,
                             verbose=1)
optimized_GBM.fit(X_train, Y_train)
print(optimized_GBM.grid_scores_)
# Project both splits with the already-fitted PCA.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
svc_rbf = SVC(kernel='rbf', class_weight='balanced')
# fit() returns the fitted search object itself.
clf = GridSearchCV(svc_rbf, param_grid).fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
def gridsearchcv_train(self,alg,param_grid,train_predictor_set,train_target_set,cv,n_jobs): param_grid = {'max_depth':range(3,10,2),'min_child_weight':range(1,6,2)} gsearch = GridSearchCV(estimator=alg,param_grid = param_grid,scoring='roc_auc',n_jobs=24,iid=False, cv=10,verbose=1) gsearch.fit(train_predictor_set,train_target_set) print gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
scoring='roc_auc') print roc_scores_svm.mean() # Worse than PCA # try with PCA roc_scores_svm_pca = cross_val_score(svm, pca_df_small, response_series, cv=10, scoring='roc_auc') print roc_scores_svm_pca.mean() # let's do a grid search param_grid = dict(kernel=['linear', 'poly', 'rbf', 'sigmoid']) svm_grid = GridSearchCV(svm, param_grid, cv=10, scoring='roc_auc') svm_grid.fit(explanatory_df, response_series) best_estimator = svm_grid.best_estimator_ print best_estimator.kernel # Linear is the best estimator score won print svm_grid.best_score_ # best estimator was 77% - just below RFs # Note: SVMs are more accurate than RFs with trending data! #################################################### ############# Out of Sample Testing ################ #################################################### conn = sqlite3.connect('C:\Users\garauste\Documents\SQLite\lahman2013.sqlite') # new query to pull data post 2000
def train(args):
    """Train the classifier selected by ``args.classifier`` and pickle it.

    Reads ``labels.csv`` and ``reps.csv`` from ``args.workDir``, encodes the
    labels, fits the chosen classifier (optionally behind an LDA step when
    ``args.ldaDim > 0``) and writes ``(label_encoder, classifier)`` to
    ``classifier.pkl`` in the same directory.

    Raises:
        ValueError: if ``args.classifier`` is not one of the supported names.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    # `.values` replaces `.as_matrix()`, which current pandas no longer has.
    paths = pd.read_csv(fname, header=None).values[:, 1]
    # The label is the name of the directory that contains each file. A real
    # list is built because it is consumed twice below (fit and transform);
    # a lazy `map` object would be exhausted after the first pass.
    labels = [os.path.split(os.path.dirname(path))[1] for path in paths]
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [
            {'C': [1, 10, 100, 1000],
             'kernel': ['linear']},
            {'C': [1, 10, 100, 1000],
             'gamma': [0.001, 0.0001],
             'kernel': ['rbf']}
        ]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()
    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            # i/p nodes, hidden nodes, o/p nodes. The output size is the
            # number of classes; it used to be derived from the last label
            # (labelsNum[-1] + 1), which is right only if the last sample
            # belongs to the highest-numbered class.
            [embeddings.shape[1], 500, nClasses],
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer.
            learn_rates=0.3,
            # A factor the initial learning rate will be multiplied by
            # after each iteration of the training.
            learn_rate_decays=0.9,
            epochs=300,  # number of iterations
            # dropouts = 0.25, # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)
    else:
        # Previously an unknown name fell through to an unbound `clf`.
        raise ValueError(
            "Unknown classifier: {!r}".format(args.classifier))

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    # Pickle data is binary: text mode fails on Python 3 and can corrupt the
    # file on platforms that translate newlines.
    with open(fName, 'wb') as f:
        pickle.dump((le, clf), f)
# Project both splits with the already-fitted PCA.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# If you are running sklearn version 0.17 or later, the expected argument is "balanced".
svc_rbf = SVC(kernel='rbf', class_weight='auto')
# fit() returns the fitted search object itself.
clf = GridSearchCV(svc_rbf, param_grid).fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
'NVC0905_22_002_Ecog_c015_f1', 'NVC0905_22_002_Ecog_c015_f2', 'NVC0905_22_002_Ecog_c015_f8', 'NVC0905_22_002_Ecog_c015_f9', 'NVC0905_22_002_Ecog_c016_f1', 'NVC0905_22_002_Ecog_c016_f4', 'NVC0905_22_002_Ecog_c016_f13', 'NVC0905_22_002_Ecog_c016_f16', 'NVC0905_22_002_Ecog_c016_f17', 'NVC0905_22_002_Ecog_c016_f21', 'NVC0905_22_002_Ecog_c016_f23' ] X_train, X_test, y_train, y_test = train_test_split(df[X_cols], df['ictal_ind'], test_size=0.3, random_state=1) rf = RandomForestClassifier(random_state=1) rf.fit(X_train, y_train) probs = rf.predict_proba(X_test)[:, 1] print metrics.roc_auc_score(y_test, probs) print probs list_estimators = list(xrange(1, 30, 2)) + list(xrange(30, 101, 10)) param_grid = dict(n_estimators=list_estimators) grid = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc') grid.fit(df[X_cols], df['ictal_ind']) # Plot the results of the grid search grid_mean_scores = [result[1] for result in grid.grid_scores_] plt.xlim([0, 100]) plt.scatter(list_estimators, grid_mean_scores, s=40) plt.grid(True) plt.title('Tuning Random Forests for Dog 2') plt.ylabel('AUC for 5-fold CV') plt.xlabel('Number of Trees')
### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html ### Create cross-validation sss = StratifiedShuffleSplit(labels, 50, random_state=42) ### Naive Bayes optimization t0 = time() # PCA pca = PCA() #Pipeline pipeline = Pipeline([('scale', mm_scaler), ('SKB', SelectKBest()), ('PCA', PCA()), ('NB', GaussianNB())]) # clf's parameters parameters = {'SKB__k': [5, 6], 'PCA__n_components': [2, 3, 4]} #GridSearchCV gs = GridSearchCV(pipeline, parameters, cv=sss, scoring='f1') gs.fit(features, labels) clf_NB = gs.best_estimator_ tester.test_classifier(clf_NB, my_dataset, new_features_list) print "done in %0.3fs" % (time() - t0) ### Decision Tree optimization t0 = time() # PCA pca = PCA() #Pipeline pipeline = Pipeline([('scale', mm_scaler), ('SKB', SelectKBest()), ('PCA', PCA()), ('DT', DecisionTreeClassifier())]) # clf's parameters parameters = {
def _get_SVM():
    """Return an unfitted, F1-scored grid search over C for a LinearSVC."""
    c_candidates = [1, 5, 10, 100, 1000]
    search = GridSearchCV(LinearSVC(), [{"C": c_candidates}], scoring="f1")
    return search
#'union__summary__tfidf__max_df': (0.8, 1.0), #'union__summary__tfidf__max_features': (5000,50000), 'union__summary__best__n_components': (100, 200, 300), #'union__authors__countvec__max_features': (10, 50), #'clf__alpha': ( 0.000001, 0.0000001), #'clf__penalty': ('l2', 'l1'), #'clf__n_iter': (3, 5), 'clf__C': (1, 2), 'clf__solver': ('newton-cg', 'lbfgs'), 'clf__multi_class': ('ovr', 'multinomial'), } if __name__ == "__main__": grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1) print("Performing grid search...") print("pipeline:", [name for name, _ in pipeline.steps]) print("parameters:") pprint(parameters) t0 = time() grid_search.fit(data, y) print("done in %0.3fs" % (time() - t0)) print() print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name]))
# Encode the `correct` column as integer codes (kept in data["corr"]).
data["corr"], corr_lbl = pd.factorize(data.correct)

# Design matrix: dummy-coded categorical columns plus power and ITC.
data_dv = pd.get_dummies(
    data[["ROI", "condition_side", "condition_type", "phase"]])
data_dv["pow"] = data.power

data_itc = pd.read_csv(data_path +
                       "alpha_mean_itc_data_extracted_phase_target.csv")
data_itc = data_itc.drop("mean", 1)
data_dv["itc"] = data_itc["itc"]

y = data["corr"].get_values()
X = data_dv.get_values()

cv = StratifiedShuffleSplit(y, n_iter=10)

# make_pipeline names the step after the lower-cased class name, hence the
# 'adaboostclassifier__' prefix on the grid keys.
ada_params = {
    "adaboostclassifier__n_estimators": np.arange(1, 50, 1),
    "adaboostclassifier__learning_rate": np.arange(0.01, 1, 0.1)
}

scaler_pipe = make_pipeline(StandardScaler(), AdaBoostClassifier())

# The search object was bound to `grid` while the next line fitted the
# undefined name `ada_grid` (NameError). Bind it to `ada_grid`, and keep
# `grid` as an alias in case later code refers to it.
ada_grid = GridSearchCV(scaler_pipe, param_grid=ada_params, cv=cv)
grid = ada_grid
ada_grid.fit(X, y)

# Best pipeline found by the search, scored by ROC AUC over the same splits.
ada = ada_grid.best_estimator_
scores = cross_val_score(ada, X, y, cv=cv, scoring="roc_auc")