def run_mod(train_X, test_X, train_Y):
    reg = GB(max_features="auto", n_estimators=300, random_state=1)
    reg.fit(train_X, train_Y)
    pred = reg.predict_proba(test_X)
    # pred = reg.predict(test_X)  # predict class labels instead of probabilities
    imp = reg.feature_importances_
    return pred, imp
def getBoostingTree(data, target):
    Y = data[target]
    X = data.drop(target, axis=1)
    # aggressive pruning: depth-1 stumps only
    model = GB(max_depth=1)
    model.fit(X, Y)
    return model
def model_GB(X, y):
    parameters = {
        'n_estimators': [250],
        'max_features': ['sqrt'],
        'max_depth': [25],
        'min_samples_split': [40],
        'min_samples_leaf': [225],
    }
    f1_scorer = make_scorer(fbeta_score, beta=0.5, pos_label=1)  # currently unused; KS is used below
    ks_scorer = make_scorer(ks_score, needs_proba=True)
    gb = GB()
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2)
    grid_obj = GridSearchCV(gb, param_grid=parameters, scoring=ks_scorer,
                            n_jobs=3, cv=cv)
    grid_obj.fit(X, y)
    gb = grid_obj.best_estimator_

    title = "Learning Curves (GB)"
    # 3-fold cross-validation with 10 train-size steps to get smoother mean
    # test and train score curves.
    plot_learning_curve(gb, title, X, y, cv=3, n_jobs=3,
                        train_sizes=np.linspace(.1, 1.0, 10))
    plt.savefig('learning_curve.png')  # save before show(), otherwise the saved figure is blank
    plt.show()
    return gb
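# ks_score is referenced by the scorers above but not defined in this section.
# A minimal sketch of what such a helper usually looks like, assuming the
# Kolmogorov-Smirnov statistic on the positive-class probabilities; the exact
# definition used by the original code may differ.
import numpy as np
from sklearn.metrics import roc_curve

def ks_score(y_true, y_proba):
    # The KS statistic equals the maximum gap between TPR and FPR across all
    # thresholds of the ROC curve.
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    return np.max(tpr - fpr)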
def try_params(n_iterations, params):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = GB(n_estimators=n_estimators, verbose=0, **params)
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]
    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))
    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    p = clf.predict_proba(x_test)[:, 1]  # evaluate on the held-out test set
    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))
    print("# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
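# A hypothetical driver for the classifier variant of try_params above: it is
# written to be called repeatedly by a Hyperband/successive-halving loop with a
# growing budget. trees_per_iteration, the toy data, and the sampled candidate
# below are illustrative assumptions; they are defined elsewhere in the real script.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

trees_per_iteration = 5
X_all, Y_all = make_classification(n_samples=500, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X_all, Y_all, test_size=0.2, random_state=0)

candidate = {'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8}
for n_iterations in (1, 3, 9):  # growing resource budget for a surviving configuration
    print(try_params(n_iterations, candidate))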
def try_params(n_iterations, params, data):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)
    clf = GB(n_estimators=n_estimators, verbose=0, **params)
    return train_and_eval_sklearn_regressor(clf, data)
def GBClassifier(cls):
    n_estimators_range = np.arange(50, 500, 50)
    param_grid = {'n_estimators': n_estimators_range}
    # learning_rate_range = np.array([0.1])
    # param_grid['learning_rate'] = learning_rate_range
    return cls(GB(learning_rate=0.1), par_grid_dict=param_grid)
def opt_model_GB(X, y):
    parameters = opt_GB(X, y)
    # map() returns an iterator in Python 3, so materialise it before indexing
    parameters = [int(i) if i > 2 else 2 for i in parameters]
    gb = GB(max_depth=parameters[0],
            min_samples_split=parameters[1],
            min_samples_leaf=parameters[2])
    gb.fit(X, y)
    return gb
def gdbt_select(train, train_y, a, b, step, c):
    """Select features with SelectFromModel over a range of importance thresholds."""
    from sklearn.feature_selection import SelectFromModel
    score = 0
    index = 0
    # y should be 1-d for sklearn classifiers
    model1 = GB(random_state=0).fit(train.values, train_y.values.ravel())
    for i in range(a, b, step):
        model = SelectFromModel(model1, threshold=i / c)
        model.fit(train, train_y)
        train1 = model.transform(train)
        model = GB(random_state=0)
        cv_score = cross_val_score(model, train1, train_y, cv=cv, scoring='recall').mean()
        if score < cv_score:
            score = cv_score
            index = i / c
        print(i / c, cv_score)
    print()
    print(index, score)
def func_GB(parameters, *args):
    # Objective evaluated by the hyper-parameter optimiser (see opt_model_GB above).
    # map() returns an iterator in Python 3, so use a list comprehension instead.
    parameters = [int(i) if i > 2 else 2 for i in parameters]
    gb = GB(max_depth=parameters[0],
            min_samples_split=parameters[1],
            min_samples_leaf=parameters[2])
    X = args[0]
    y = args[1]
    gb.fit(X, y)
    y_pred = pd.DataFrame(gb.predict_proba(X), index=X.index)[1]
    return ks_score(y, y_pred)
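# opt_GB, used by opt_model_GB above, is not shown in this section. A minimal
# sketch of one plausible implementation that maximises the KS objective in
# func_GB; the choice of scipy's differential_evolution and these bounds are
# assumptions, not the original author's method.
from scipy.optimize import differential_evolution

def opt_GB(X, y):
    # assumed search ranges for (max_depth, min_samples_split, min_samples_leaf)
    bounds = [(2, 10), (2, 200), (1, 300)]
    result = differential_evolution(
        lambda p, *args: -func_GB(p, *args),  # negate: the optimiser minimises
        bounds, args=(X, y), maxiter=10, seed=0)
    return result.x  # floats; the caller rounds/clips them to valid integers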
def get_training_models():
    return [
        ("MLP_RELU", MLP(hidden_layer_sizes=(100,), alpha=0.0001, activation="relu",
                         learning_rate_init=0.001, tol=0.0001, max_iter=200)),
        ("GB_50", GB(n_estimators=250, learning_rate=0.1, subsample=1.0,
                     max_depth=3, min_samples_split=20)),
        ("RF_FINAL", RF(n_estimators=250, max_depth=None, min_samples_split=2,
                        bootstrap=True, n_jobs=-1)),
    ]
def test(self):
    """Test the model with the best parameters found in randomSearch() or gridSearch()."""
    # self.clf = GB(random_state=40, n_estimators=40, max_features='sqrt',
    #               learning_rate=0.8, criterion='friedman_mse')
    self.clf = GB()
    self.clf.set_params(**self.best_parameter)
    print("*** Test Result for Gradient Boosting ***")
    ModelEvaluation.evaluateModelWithCV(self.clf, self.dataset_x, self.dataset_y, cv=10)
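# ModelEvaluation.evaluateModelWithCV is an external helper whose implementation
# is not shown here; a rough sketch of the assumed behaviour (cross-validated
# accuracy printed for the supplied classifier), for illustration only.
from sklearn.model_selection import cross_val_score

class ModelEvaluation:
    @staticmethod
    def evaluateModelWithCV(clf, X, y, cv=10):
        scores = cross_val_score(clf, X, y, cv=cv)
        print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))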
def fit_models(data):
    features = data.drop('rings', axis=1)
    target = data.rings
    models = DT(), RF(), GB(max_depth=1)
    for model in models:
        cv_results = cross_val_score(model, features, target, cv=N_FOLDS)
        print('\n==========\n', model)
        print('\ncv results\n', cv_results)
        print('\nmean cv accuracy =', cv_results.mean())
        print('std cv accuracy = ', cv_results.std())
def GB_classif():
    # GradientBoostingClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
    # sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
    #     n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2,
    #     min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
    #     min_impurity_split=None, init=None, random_state=None, max_features=None,
    #     verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
    hypers = {
        'n_estimators': 400,
        'learning_rate': 0.05,
        # 'subsample': 0.4,
        'max_depth': 4,
    }
    return GB(**hypers)
def f_classif_select(train, train_y):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif
    score = 0
    index = 1
    for i in range(1, train.shape[1] + 1):
        model = SelectKBest(f_classif, k=i)
        train1 = model.fit_transform(train, train_y)
        model = GB(random_state=0)
        cv_score = cross_val_score(model, train1, train_y, cv=cv, scoring='recall').mean()
        if score < cv_score:
            score = cv_score
            index = i
        print(i, round(cv_score, 4))
    print("______________________")
    print(index, score)
    model = SelectKBest(f_classif, k=index).fit(train, train_y)
    # columns that were not selected at the best k
    return train.columns[~model.get_support()]
    data.Embarked = data.Embarked.fillna(data.Embarked.mode()[0])  # fill with the mode
    data.Age = data.Age.fillna(data.Age.mean())                    # fill missing ages with the mean
    data.Fare = data.Fare.fillna(data.Fare.mean())                 # fill missing fares with the mean
    return data

data = pd.read_csv(r'D:\[DataSet]\1_Titanic\train.csv')
data = dataProcess(data)
feature = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
X = data[feature]   # features
y = data.Survived   # label

modelDict = {'DT': DT(), 'SVC': SVC(), 'GNB': GNB(), 'KNN': KNN(n_neighbors=3),
             'MLP': MLP(hidden_layer_sizes=(500,)),
             'LogR': LogR(C=1.0, penalty='l1', tol=1e-6),
             'RF': RF(), 'GB': GB(n_estimators=500)}
for model in modelDict.keys():
    clf = modelDict.get(model)
    scores = cross_val_score(clf, X, y, cv=5)
    print(model + ' accuracy: ' + '%.3f' % (scores.mean() * 100) + '%')

clf_GB = GB(n_estimators=500)
clf_GB.fit(X, y)                                            # train the model
data_sub = pd.read_csv(r'D:\[DataSet]\1_Titanic\test.csv')  # load the test data
data_sub = dataProcess(data_sub)                            # preprocess the test data
X_sub = data_sub[feature]                                   # extract test features
y_sub = clf_GB.predict(X_sub)                               # predict with the model
result = pd.DataFrame({'PassengerId': data_sub['PassengerId'].values,  # .as_matrix() is removed in modern pandas
                       'Survived': y_sub})                  # build the required submission format
result.to_csv(r'D:\[DataSet]\1_Titanic\submission.csv', index=False)  # write to file
def __init__(self, dataset_x, dataset_y):
    self.dataset_x = dataset_x
    self.dataset_y = dataset_y
    self.clf = GB()
    self.best_parameter = {}
gsRF = GridSearchCV(clf_RF, param_grid=rf_param_grid, cv=kfold,
                    scoring="accuracy", n_jobs=4, verbose=1)
gsRF.fit(X, y)
rf_best = gsRF.best_estimator_

clf_SVC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'],
                  'gamma': [0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100, 200, 300, 1000]}
gsSVC = GridSearchCV(clf_SVC, param_grid=svc_param_grid, cv=kfold,
                     scoring="accuracy", n_jobs=4, verbose=1)
gsSVC.fit(X, y)
svm_best = gsSVC.best_estimator_

clf_GB = GB()
gb_param_grid = {'loss': ['deviance'],
                 'n_estimators': [100, 300, 500],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]}
gsGB = GridSearchCV(clf_GB, param_grid=gb_param_grid, cv=kfold,
                    scoring="accuracy", n_jobs=4, verbose=1)
gsGB.fit(X, y)
gb_best = gsGB.best_estimator_

clf_MLP = MLP()
mlp_param_grid = {'hidden_layer_sizes': [100, 200, 300, 400, 500],
                  'activation': ['relu'],
                  'solver': ['adam'],
test_data = test_data.dropna()
train_data = train_data.dropna()
# test_data["horsepower"] = test_data["horsepower"].apply(lambda x: 1 if x == "?" else 0)
# train_data["horsepower"] = train_data["horsepower"].apply(lambda x: 1 if x == "?" else 0)
train_X = train_data.drop(["id", "mpg", "horsepower", "car name"], axis=1)
train_y = train_data["mpg"]
val_X = test_data.drop(["id", "horsepower", "car name"], axis=1)
train_y = train_y.astype('int64')
print(train_y.dtype)  # check the dtype of the target

from sklearn.linear_model import LinearRegression as LR  # linear regression model
LR_model = LR()
LR_model.fit(train_X, train_y)
val_predictions = LR_model.predict(val_X)
print(LR_model.score(train_X, train_y))

from sklearn.ensemble import RandomForestClassifier as RF
RF_model = RF(n_estimators=1000, random_state=0)
RF_model.fit(train_X, train_y)
A_val_predictions = RF_model.predict(val_X)
print(RF_model.score(train_X, train_y))

from sklearn.ensemble import GradientBoostingClassifier as GB
GB_model = GB(random_state=0, learning_rate=0.01)
GB_model.fit(train_X, train_y)
val_predictions = GB_model.predict(val_X)
print(GB_model.score(train_X, train_y))

test_data["mpg"] = A_val_predictions
A_test = test_data[["id", "mpg"]]
A_test.to_csv("sample_submit.csv", index=False, header=False, encoding='cp932')
# coding=utf-8
from sklearn.ensemble import GradientBoostingClassifier as GB
# from sklearn.ensemble import GradientBoostingRegressor as GB
model = GB(random_state=0)

# Cross-validation
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=False)  # random_state has no effect when shuffle=False
cross_val_score(model, train, train_y, cv=cv, scoring='precision').mean()  # or 'neg_mean_squared_error'

# Classification and regression -- variance-based filtering (features should be discretised)
# var_cols: column names sorted by variance, smallest to largest
var_cols = train.var().sort_values().index
train1 = train.copy()
val1 = val.copy()
i = -1  # number of low-variance columns dropped so far
for col in var_cols:
    model = GB(random_state=0)
    model.fit(train1, train_y)
    pred = model.predict(val1)
    print(i, np.sqrt(metrics.mean_squared_error(val_y, pred)))
    print("_____________________________________")
    train1 = train1.drop(col, axis=1)
    val1 = val1.drop(col, axis=1)
    i = i + 1

# Classification -- chi-squared test
           'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3',
           'Embarked_C', 'Embarked_Q', 'Embarked_S']
X = data_all.loc[data.index][feature]
y = data.Survived

modelDict = {
    'DT': DT(),
    'SVC': SVC(),
    'GNB': GNB(),
    'KNN': KNN(n_neighbors=3),
    'MLP': MLP(hidden_layer_sizes=(500,)),
    'LogR': LogR(C=1.0, penalty='l1', tol=1e-6),
    'RF': RF(n_estimators=300),
    'GB': GB(n_estimators=500),
}
for model in modelDict.keys():
    clf = modelDict.get(model)
    scores = cross_val_score(clf, X, y, cv=5)
    print(model + ' accuracy: ' + '%.3f' % (scores.mean() * 100) + '%')

votingC = VotingClassifier(estimators=[('clf_GB', GB(n_estimators=500)),
                                       ('clf_RF', RF(n_estimators=300)),
                                       ('clf_SVC', SVC(probability=True)),
                                       ('clf_MLP', MLP(hidden_layer_sizes=(500,)))],
                           voting='soft', n_jobs=4)
# Random Forest with the tuned parameters
model = RF(n_estimators=gsearch1.best_params_['n_estimators'],
           max_depth=gsearch2.best_params_['max_depth'],
           min_samples_leaf=gsearch3.best_params_['min_samples_leaf'],
           min_samples_split=gsearch3.best_params_['min_samples_split'],
           max_features=gsearch4.best_params_['max_features'],
           random_state=0)
model.fit(train, train_y)
pred = model.predict(test)
metrics.recall_score(test_y, pred)

# GBDT parameter tuning
# scoring parameter: http://scikit-learn.org/0.18/modules/model_evaluation.html#scoring-parameter
from sklearn.model_selection import GridSearchCV

# Tune n_estimators
param_test1 = {'n_estimators': range(75, 90, 1)}
gsearch1 = GridSearchCV(estimator=GB(learning_rate=0.1, random_state=0),
                        param_grid=param_test1, scoring='recall', iid=False, cv=cv)
gsearch1.fit(train, train_y)
gsearch1.grid_scores_, gsearch1.best_score_, gsearch1.best_params_

# Tune max_depth and min_samples_split
param_test2 = {'max_depth': range(3, 9, 2), 'min_samples_split': range(2, 503, 100)}
gsearch2 = GridSearchCV(estimator=GB(learning_rate=0.1,
                                     n_estimators=gsearch1.best_params_['n_estimators'],
                                     random_state=0),
                        param_grid=param_test2, scoring='recall', iid=False, cv=cv)
gsearch2.fit(train, train_y)
gsearch2.grid_scores_, gsearch2.best_score_, gsearch2.best_params_

# Tune min_samples_split and min_samples_leaf
param_test3 = {'min_samples_split': range(2, 200, 50), 'min_samples_leaf': range(1, 100, 10)}
gsearch3 = GridSearchCV(estimator=GB(learning_rate=0.1,
                                     n_estimators=gsearch1.best_params_['n_estimators'],
                                     max_depth=gsearch2.best_params_['max_depth'],
                                     random_state=0),
                        param_grid=param_test3, scoring='recall', iid=False, cv=cv)
gsearch3.fit(train, train_y)
gsearch3.grid_scores_, gsearch3.best_score_, gsearch3.best_params_
def generateColumns(start, end):
    l = []  # the original relied on an undefined global list; build it locally instead
    for i in range(start, end + 1):
        l.extend([str(i) + 'X', str(i) + 'Y'])
    return l

req = generateColumns(1, 68)

import pandas as pd
df = pd.read_csv('merge-mix.csv')

# selecting features and label as X & y respectively
X = df[req]
y = df['emotion']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

from sklearn.ensemble import GradientBoostingRegressor as GB
gb = GB()
gb.fit(X_train, y_train.values.ravel())

import matplotlib.pyplot as plt
plt.bar(range(X_train.shape[1]), gb.feature_importances_)
plt.xticks(range(X_train.shape[1]), req)
plt.show()
def model_GB(X, y):
    gb = GB(n_estimators=300)
    gb.fit(X, y)
    return gb
def get_submission_models():
    return [
        ("GB_Final", GB(n_estimators=250, learning_rate=0.1, subsample=1.0,
                        max_depth=3, min_samples_split=20)),
    ]