def _search_gbc_grid(dataset_name, gbc, features, labels, param_grid, cv):
    """Score every parameter combination and return the best one.

    Parameters
    ----------
    dataset_name : str
        Label used in the progress messages (e.g. ``'dog'``).
    gbc : estimator
        Classifier that is reconfigured in place via ``set_params``.
    features, labels
        Training data forwarded to ``cross_val_score``.
    param_grid : list of (str, list) pairs
        Parameter names with their candidate values; combinations are visited
        in the same order as nested loops over the pairs would visit them.
    cv
        Cross-validation splitter passed to ``cross_val_score``.

    Returns
    -------
    (dict, float)
        The parameters with the lowest mean log loss and that loss.
    """
    from itertools import product  # function-scope import keeps the block standalone

    names = [name for name, _ in param_grid]
    best_params = dict.fromkeys(names, 0)
    min_score = np.inf
    for values in product(*(candidates for _, candidates in param_grid)):
        params = dict(zip(names, values))
        gbc.set_params(**params)
        fold_scores = cross_val_score(gbc, X=features, y=labels,
                                      scoring='neg_log_loss', cv=cv)
        # 'neg_log_loss' is negated log loss, so flip the sign to minimise it.
        score = -fold_scores.mean()
        print('For %s dataset:' % dataset_name)
        print('The n_estimators=%d, the learning_rate=%.3f, '
              'the min_samples_split=%d, the subsample=%.2f,'
              'the max_depth=%d give the score=%f'
              % (params['n_estimators'], params['learning_rate'],
                 params['min_samples_split'], params['subsample'],
                 params['max_depth'], score))
        if score < min_score:
            min_score = score
            best_params = params
        # Running best after each candidate.
        print('Best params: {} {} {} {} {}, score: {}'.format(
            best_params['n_estimators'], best_params['learning_rate'],
            best_params['min_samples_split'], best_params['subsample'],
            best_params['max_depth'], min_score))
    return best_params, min_score


def gradBosting_parameterTuning(train_dog, label_dog, train_cat, label_cat):
    """Grid-search gradient boosting hyper-parameters for two datasets.

    The same grid (n_estimators, learning_rate, min_samples_split, subsample,
    max_depth) is evaluated with 10-fold cross-validated log loss, first on the
    dog data and then on the cat data.

    Returns
    -------
    (dict, dict)
        Best parameters for the dog dataset and for the cat dataset.
    """
    seed = 123
    param_grid = [
        ('n_estimators', [800, 1000]),
        ('learning_rate', [0.01, 0.05]),
        ('min_samples_split', [50, 80, 100]),
        ('subsample', [0.4, 0.6, 0.8]),
        ('max_depth', [8, 10, 12]),
    ]
    gbc = GradientBoostingClassifier(max_features='sqrt', max_depth=6,
                                     min_samples_split=50, subsample=0.8)
    # random_state only has an effect (and is only accepted by recent
    # scikit-learn releases) when shuffle=True; the splitter is loop-invariant,
    # so it is built once.
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

    best_params_dog, _ = _search_gbc_grid('dog', gbc, train_dog, label_dog,
                                          param_grid, kfold)
    best_params_cat, _ = _search_gbc_grid('cat', gbc, train_cat, label_cat,
                                          param_grid, kfold)
    return best_params_dog, best_params_cat
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Configure ``estimator``, grid-searching its parameters when training.

    Parameters
    ----------
    estimator : ensemble learner that receives the best parameters.
    phase : str
        ``"train"`` runs the grid search; any other value keeps the
        estimator's current parameters.
    n_jobs, cv_k_fold, parameters
        Forwarded to ``GridSearchCV`` (parallelism, folds, parameter grid).
    X_train, y_train : data used for the search and the refit.
    X_test, y_test : data used for the per-stage test loss curve.

    In the train phase a train/test loss curve is written to
    ``loss_cv.png``.
    """
    # Previously a non-"train" phase reached code that needed the search
    # result and failed; now the estimator is simply stored unchanged.
    self.best_params = None
    if phase == "train":
        clf = GradientBoostingClassifier()
        gscv = GridSearchCV(clf, parameters, verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs, cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_

        # Refit a plain classifier with the winning parameters to obtain the
        # per-stage losses.
        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The loss function expects raw decision scores, not the class labels
        # produced by staged_predict.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()

        estimator.set_params(**gscv.best_params_)

    self.estimator = estimator
    self.one_hot_encoding = None
def train_gb(x_train, y_train, x_test, y_test, x_val, y_val, gb_gridsearch):
    """Train a gradient boosting classifier and print its evaluation.

    Parameters
    ----------
    x_train, y_train : training data.
    x_test, y_test : test data (confusion matrix and accuracy are printed).
    x_val, y_val : validation data (confusion matrix and accuracy are printed).
    gb_gridsearch : bool
        When truthy, the model is chosen by a 10-fold ``GridSearchCV`` scored
        with ``f1_weighted``; otherwise a fixed parameter set is fitted.

    Returns
    -------
    The fitted model.
    """
    print('Training model gradient boosting with sklearn...')
    cls = GradientBoostingClassifier()

    if gb_gridsearch:
        print('Tuning parameters...')
        grid_params_gb = [{
            'learning_rate': [0.05],
            'n_estimators': [1000],
            'max_depth': [6],
            'subsample': [1],
            'min_samples_split': [2],
            'min_samples_leaf': [1],
            'max_features': ['sqrt'],
            'verbose': [1],
        }]
        gs_gb = GridSearchCV(estimator=cls, param_grid=grid_params_gb,
                             scoring='f1_weighted', cv=10, verbose=10,
                             n_jobs=-1)
        gs_gb.fit(x_train, y_train)
        print('Best params: %s' % (gs_gb.best_params_,))
        # best_score_ is the mean cross-validated f1_weighted, not accuracy.
        print('Best cross-validated f1_weighted: %.3f' % gs_gb.best_score_)
        model = gs_gb.best_estimator_
    else:
        params_gb = {
            'learning_rate': 0.05,
            'n_estimators': 500,
            'max_depth': 3,
            'subsample': 1,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'verbose': 2,
        }
        cls.set_params(**params_gb)
        model = cls.fit(x_train, y_train)

    # Report the parameters of the model that is actually returned (the
    # original nested print() also emitted a stray "None").
    print(model.get_params())

    print('Test predictions with trained mode...')
    y_pred = model.predict(x_test)
    print('Train predictions with trained mode...')
    y_pred_t = model.predict(x_train)
    print('Validation predictions with trained mode...')
    y_pred_val = model.predict(x_val)

    print('Confussion matrix test:')
    print(confusion_matrix(y_test, y_pred))
    print('Confussion matrix validation:')
    print(confusion_matrix(y_val, y_pred_val))

    print('Prediction accuracy for test: %.3f ' % accuracy_score(y_test, y_pred))
    print('Prediction accuracy for train: %.3f ' % accuracy_score(y_train, y_pred_t))
    print('Prediction accuracy for validation: %.3f ' % accuracy_score(y_val, y_pred_val))
    return model
def trainModel(strModelName, dcModelParams, arrX_train, arrY_train):
    """Build the classifier named by ``strModelName`` and fit it.

    Supported names are 'GBRT', 'decision_tree', 'extra_trees',
    'random_forest' and 'SVM'; any other name raises ``KeyError``.
    ``dcModelParams`` (when not None) is applied through ``set_params``
    before fitting on ``arrX_train`` / ``arrY_train``.  Returns the fitted
    model.
    """
    # Factories are lazy so that only the requested class is instantiated.
    factories = (
        ('GBRT', lambda: GradientBoostingClassifier()),
        ('decision_tree', lambda: DecisionTreeClassifier()),
        ('extra_trees', lambda: ExtraTreesClassifier()),
        ('random_forest', lambda: RandomForestClassifier()),
        ('SVM', lambda: SVC()),
    )
    for label, build in factories:
        if strModelName == label:
            model = build()
            break
    else:
        raise KeyError("Unsupported model: %s" % strModelName)

    if dcModelParams is not None:
        model.set_params(**dcModelParams)

    model.fit(arrX_train, arrY_train)
    return model
def gradient_boosting_classifier(X_train_res, X_test, y_train_res):
    """Fit a gradient boosting classifier and evaluate it.

    The model (learning_rate=1, max_depth=3, n_estimators=30,
    min_samples_split=3) is fitted on the resampled training data and used to
    predict ``X_test``.

    NOTE(review): ``y_test`` is not a parameter; it is read from the enclosing
    (module) scope -- confirm it is defined before this is called.

    Returns
    -------
    (gb_accuracy, gb_f_score, gb_clf, gb_predict, gb_kappa)
        Mean 10-fold accuracy, mean 10-fold micro-F1 (both on the training
        data), the fitted classifier, its test predictions, and Cohen's kappa
        of those predictions against ``y_test``.
    """
    clf = GradientBoostingClassifier()
    tuned = {
        'learning_rate': 1,
        'max_depth': 3,
        'n_estimators': 30,
        'min_samples_split': 3,
    }
    clf.set_params(**tuned)

    gb_clf = clf.fit(X_train_res, y_train_res)
    gb_predict = gb_clf.predict(X_test)

    # The hold-out accuracy is evaluated but not part of the return value.
    accuracy_score(y_test, gb_predict)
    gb_kappa = cohen_kappa_score(y_test, gb_predict)

    # 10-fold cross-validated scores on the training data.
    fold_accuracy = cross_val_score(clf, X_train_res, y_train_res, cv=10,
                                    scoring='accuracy')
    fold_f1 = cross_val_score(clf, X_train_res, y_train_res, cv=10,
                              scoring='f1_micro')
    gb_accuracy = fold_accuracy.mean()
    gb_f_score = fold_f1.mean()

    return gb_accuracy, gb_f_score, gb_clf, gb_predict, gb_kappa
def getGradientBDTClassifier(options=None):
    """Return the standard BDT classifier based on gradient boosting.

    Parameters
    ----------
    options : dict, optional
        Parameter overrides applied with ``set_params`` on top of the
        defaults (120 estimators, learning rate 0.13, depth 5,
        min_weight_fraction_leaf 0.01, random_state 0).
    """
    bdt = GradientBoostingClassifier(n_estimators=120,
                                     learning_rate=0.13,
                                     max_depth=5,
                                     min_weight_fraction_leaf=0.01,
                                     random_state=0)
    # None instead of a shared mutable {} default.
    if options:
        bdt.set_params(**options)
    return bdt
def getGradBDT(options=None):
    """Return the standard BDT classifier based on gradient boosting.

    Parameters
    ----------
    options : dict, optional
        Parameter overrides applied with ``set_params`` on top of the
        defaults (20 estimators, learning rate 0.08, depth 6,
        min_weight_fraction_leaf 0.08, random_state 0, verbose 4).
    """
    bdt = GradientBoostingClassifier(n_estimators=20,
                                     learning_rate=0.08,
                                     max_depth=6,
                                     min_weight_fraction_leaf=0.08,
                                     random_state=0,
                                     verbose=4)
    # None instead of a shared mutable {} default.
    if options:
        bdt.set_params(**options)
    return bdt
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Configure ``estimator`` with searched or fallback parameters.

    Parameters
    ----------
    estimator : ensemble learner that receives the chosen parameters.
    phase : str
        ``"train"`` grid-searches ``parameters``; any other value applies a
        fixed fallback parameter set.
    n_jobs, cv_k_fold, parameters
        Forwarded to ``GridSearchCV`` (parallelism, folds, parameter grid).
    X_train, y_train : data used for the search and the refit.
    X_test, y_test : data used for the per-stage test loss curve.

    In the train phase a train/test loss curve is written to
    ``loss_cv.png``.
    """
    if phase == "train":
        gscv = GridSearchCV(GradientBoostingClassifier(), parameters,
                            verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs, cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        best_params = gscv.best_params_
        # Single pre-formatted argument: prints identically on Python 2 and 3.
        print("[GBDT's Best Parameter] %s" % (best_params,))

        clf = GradientBoostingClassifier()
        clf.set_params(**best_params)
        del gscv  # release the search object before the refit
        clf.fit(X_train, y_train)

        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The loss function expects raw decision scores, not the class labels
        # produced by staged_predict.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
    else:
        # Plain values: set_params takes the parameter itself, not the
        # list-wrapped form used for a search grid.
        best_params = {
            'loss': 'deviance',
            'learning_rate': 0.1,
            'max_depth': 2,
            'min_samples_leaf': 8,
            'max_features': 5,  # must be in (0, n_features]
            'max_leaf_nodes': 20,
            'subsample': 0.1,
            'n_estimators': 100,
            'random_state': 0,
        }

    estimator.set_params(**best_params)
    self.estimator = estimator
    self.one_hot_encoding = None
def GBC(self):
    """Randomised hyper-parameter search for a gradient boosting classifier.

    Runs a 5-iteration, 10-fold ``RandomizedSearchCV`` scored with ROC AUC on
    ``self.X`` / ``self.y`` and returns a ``GradientBoostingClassifier``
    configured (via ``set_params``) with the best estimator's parameters.
    """
    print("*********** Gradient Boosting Classifier ***********")
    Model = GradientBoostingClassifier()
    param_grid = [{
        'loss': ['deviance', 'exponential'],
        'n_estimators': np.arange(10, 200, 5),
        # 0.01 .. 0.99; a learning rate of 0 is not a valid setting, so the
        # grid must not start at 0.
        'learning_rate': np.arange(1, 100) / 100.0,
        'subsample': np.arange(0.1, 1, 0.05),
        'min_samples_split': [2, 4, 5, 7, 9, 10],
        'min_samples_leaf': [1, 2, 3, 4, 5],
        'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
        'max_features': ['auto', 'sqrt', 'log2'],
    }]
    clf = RandomizedSearchCV(Model,
                             param_distributions=param_grid,
                             n_iter=5,
                             scoring='roc_auc',
                             n_jobs=-1,
                             cv=10,
                             verbose=3)
    best_clf = clf.fit(self.X, self.y)
    # score() uses the search's scorer (roc_auc) on the data it was fitted on.
    print(f'ROC AUC on the search data - : {best_clf.score(self.X, self.y):.3f}')
    params = best_clf.best_estimator_.get_params()
    estimator = Model.set_params(**params)
    return estimator
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Configure ``estimator`` with searched or fallback parameters.

    Parameters
    ----------
    estimator : ensemble learner that receives the chosen parameters.
    phase : str
        ``"train"`` grid-searches ``parameters``; any other value applies a
        fixed fallback parameter set.
    n_jobs, cv_k_fold, parameters
        Forwarded to ``GridSearchCV`` (parallelism, folds, parameter grid).
    X_train, y_train : data used for the search and the refit.
    X_test, y_test : data used for the per-stage test loss curve.

    In the train phase a train/test loss curve is written to
    ``loss_cv.png``.
    """
    if phase == "train":
        gscv = GridSearchCV(GradientBoostingClassifier(), parameters,
                            verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs, cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        best_params = gscv.best_params_
        # Single pre-formatted argument: prints identically on Python 2 and 3.
        print("[GBDT's Best Parameter] %s" % (best_params,))

        clf = GradientBoostingClassifier()
        clf.set_params(**best_params)
        del gscv  # release the search object before the refit
        clf.fit(X_train, y_train)

        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The loss function expects raw decision scores, not the class labels
        # produced by staged_predict.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
    else:
        # Plain values: set_params takes the parameter itself, not the
        # list-wrapped form used for a search grid.
        best_params = {
            'loss': 'deviance',
            'learning_rate': 0.1,
            'max_depth': 2,
            'min_samples_leaf': 8,
            'max_features': 5,  # must be in (0, n_features]
            'max_leaf_nodes': 20,
            'subsample': 0.1,
            'n_estimators': 100,
            'random_state': 0,
        }

    estimator.set_params(**best_params)
    self.estimator = estimator
    self.one_hot_encoding = None
def grid_search(gbdt_model, param_search, dtrain):
    """Grid-search ``param_search`` starting from ``gbdt_model``'s parameters.

    A fresh classifier is created and given every parameter of
    ``gbdt_model``; a 5-fold ``GridSearchCV`` is then fitted on ``dtrain``,
    whose first column is used as the target and the remaining columns as
    features.  The CV results, best score and best parameters are printed and
    the best parameters are returned.
    """
    starting_point = {
        'learning_rate': 0.1,
        'n_estimators': 100,
        'subsample': 1.0,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_depth': 3,
        'max_features': 'sqrt',
    }
    gbdt_gs = GradientBoostingClassifier(**starting_point)
    # Overlay the caller's model configuration on the fresh estimator.
    gbdt_gs.set_params(**gbdt_model.get_params())

    gsearch = GridSearchCV(estimator=gbdt_gs, param_grid=param_search, cv=5)
    features = dtrain.values[:, 1:]
    target = dtrain.values[:, 0]
    gsearch.fit(features, target)

    for summary in (gsearch.cv_results_, gsearch.best_score_,
                    gsearch.best_params_):
        print(summary)
    return gsearch.best_params_
def _build_ml_model(self, param_model=None):
    """Instantiate the classifier selected by ``self.ml_model``.

    Parameters
    ----------
    param_model : dict, optional
        Estimator parameters applied with ``set_params``.

    Returns
    -------
    The (unfitted) model.

    Raises
    ------
    ValueError
        If ``self.ml_model`` is not one of 'gbm', 'adaboost', 'rf', 'svc'.
    """
    if self.ml_model == "gbm":
        model = GradientBoostingClassifier()
    elif self.ml_model == "adaboost":
        model = AdaBoostClassifier()
    elif self.ml_model == "rf":
        model = RandomForestClassifier()
    elif self.ml_model == "svc":
        model = SVC()
    else:
        raise ValueError(
            "Please use either one of the following values 'gbm', 'adaboost', 'rf', 'svc'"
        )
    if param_model is not None:
        # set_params only accepts keyword arguments; passing the dict
        # positionally raised TypeError.
        model.set_params(**param_model)
    return model
def model_fit(dtrain, param_set=None):
    """Fit a gradient boosting classifier on ``dtrain`` and report on it.

    The first column of ``dtrain`` is used as the target and the remaining
    columns as features.  ``param_set`` (when truthy) overrides the default
    parameters.  Feature importances and the training-set accuracy are
    printed; the fitted model is returned.
    """
    defaults = dict(learning_rate=0.1,
                    n_estimators=100,
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=3,
                    max_features='sqrt')
    gbdt_model = GradientBoostingClassifier(**defaults)
    if param_set:
        gbdt_model.set_params(**param_set)

    gbdt_model.fit(dtrain.values[:, 1:], dtrain.values[:, 0])
    dtrain_pred = gbdt_model.predict(dtrain.values[:, 1:])

    print(gbdt_model.feature_importances_)
    # The message below reports the accuracy on the training data itself.
    train_accuracy = metrics.accuracy_score(dtrain.values[:, 0], dtrain_pred)
    print('准确率 : %.4g' % train_accuracy)
    return gbdt_model
def gb_paramsearch(DEPTH, COLUMNS, COLNAME, START=0, ENDIX=11):
    """Grow a warm-started gradient boosting model and log its metrics.

    Starting from ``START`` trees (pre-trained when ``START > 0``), the model
    is extended to ``START + 20 * i`` trees for ``i`` in ``1 .. ENDIX - 1``.
    After each extension ``get_metrics`` is evaluated for the "train" and
    "val" splits and stored in the row for that tree count.  Rows whose
    ``unoC_val`` is still 0 are dropped and the rest is appended to
    ``./Performance_Metrics/metric_df.csv``.

    Relies on the module-level ``train``, ``tra_ix`` and ``get_metrics``.
    """
    columns = {"trees": range(START, 1600), "columns": COLNAME, "depth": DEPTH}
    for split in ("train", "val"):
        for metric in ("unoC", "acc", "aucPR", "aucROC"):
            columns["%s_%s" % (metric, split)] = 0
    gbt = pd.DataFrame(columns)

    gb = GradientBoostingClassifier(random_state=0,
                                    verbose=True,
                                    min_samples_leaf=5,
                                    max_depth=DEPTH,
                                    n_estimators=START,
                                    subsample=1,
                                    learning_rate=0.1)

    def refit():
        # Fit on the training rows with their IPCW sample weights.
        gb.fit(train.loc[tra_ix, COLUMNS],
               train.loc[tra_ix, "AnyOutcome"],
               sample_weight=train.loc[tra_ix, "IPCW"])

    if START > 0:
        print("pretraining!")
        refit()

    t0 = time.time()
    for step in range(1, ENDIX):
        if step % 10 == 0:
            # Progress report: step number and time since the last report.
            print(step)
            print(time.time() - t0)
            t0 = time.time()
        # warm_start keeps the existing trees and only adds the new ones.
        gb.set_params(n_estimators=START + 20 * step, warm_start=True)
        refit()
        print(gb.n_estimators_)
        for split in ["train", "val"]:
            gc.collect()
            ms = get_metrics(split, gb, COLUMNS)
            print(ms)
            row_mask = gbt.trees == gb.n_estimators_
            for key, value in ms.items():
                gbt.loc[row_mask, key] = value

    # Keep only the tree counts that were actually evaluated.
    evaluated = ~(gbt.unoC_val == 0)
    gbt = gbt[evaluated].reset_index(drop=True)
    previous = pd.read_csv("./Performance_Metrics/metric_df.csv")
    gbt = pd.concat([previous, gbt])
    print("Finished! We have this many rows in our data frame:", len(gbt))
    gbt.to_csv("./Performance_Metrics/metric_df.csv", index=False)
def runGradientBoostingClassifier(x_train, y_train, x_test, y_test, p):
    """Fit a gradient boosting classifier and report its accuracy.

    Parameters
    ----------
    x_train, y_train : training data.
    x_test, y_test : test data.
    p : dict
        Estimator parameters applied with ``set_params``.

    Returns
    -------
    (train_score, dt_score)
        Accuracy on the training data and on the test data; both are also
        written to stderr.
    """
    clf = GradientBoostingClassifier()
    clf.set_params(**p)
    clf.fit(x_train, y_train)

    # The test set is predicted and reported once; the original repeated this
    # step verbatim (a copy-paste leftover) with identical results.
    dt_score = accuracy_score(y_test, clf.predict(x_test))
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)

    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)
    return (train_score, dt_score)
class Hyperopt_gbc:
    """Hyperopt (TPE) search over gradient boosting hyper-parameters.

    Candidates are scored by mean 10-fold cross-validated ROC AUC on the data
    given to the constructor.
    """

    def __init__(self, X, y, seed):
        """Store the data and seed, and define the search space."""
        self.name = 'Gradient Boosting'
        self.name_short = 'GBC'
        self.X = X
        self.y = y
        self.seed = seed
        self.clf = None
        self.best_acc = 0
        self.space = {
            'max_depth': hp.choice('max_depth', range(1, 30)),
            'max_features': hp.choice('max_features', range(1, 5)),
            'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 50)),
            'min_samples_split': hp.choice('min_samples_split', range(10, 50, 10)),
            'max_leaf_nodes': hp.choice('max_leaf_nodes', range(2, 50)),
            'loss': hp.choice('loss', ['deviance', 'exponential']),
            'n_estimators': hp.choice('n_estimators', range(1, 500, 5)),
            'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
            'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        }
        self.max_evals = 50

    def train_test(self, params):
        """Fit a classifier with ``params`` and return its mean CV ROC AUC."""
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        self.clf = GradientBoostingClassifier(**params)
        self.clf.fit(self.X, self.y)
        return cross_val_score(self.clf, self.X, self.y, scoring='roc_auc',
                               cv=10).mean()

    def f(self, params):
        """Hyperopt objective: negated score, tracking the best score seen."""
        acc = self.train_test(params)
        if acc > self.best_acc:
            self.best_acc = acc
        return {'loss': -acc, 'status': STATUS_OK}

    def best(self):
        """Run the search and return the best model and its description.

        Returns
        -------
        (clf, name, name_short, best_params, best_acc)
            ``clf`` is fitted on the full data with ``best_params``.
        """
        trials = Trials()
        best = fmin(self.f, self.space, algo=tpe.suggest,
                    max_evals=self.max_evals,
                    rstate=np.random.RandomState(self.seed), trials=trials)
        # fmin reports hp.choice dimensions as indices into their option
        # lists; space_eval maps them back to real parameter values.  Applying
        # the raw result configured the model with indices (e.g. loss=0).
        best_params = space_eval(self.space, best)
        # self.clf still holds the last trial's model, so rebuild and refit
        # with the winning parameters.
        self.clf = GradientBoostingClassifier(**best_params)
        self.clf.fit(self.X, self.y)
        return self.clf, self.name, self.name_short, best_params, self.best_acc
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Configure ``estimator``, grid-searching its parameters when training.

    Parameters
    ----------
    estimator : ensemble learner that receives the best parameters.
    phase : str
        ``"train"`` runs the grid search; any other value keeps the
        estimator's current parameters.
    n_jobs, cv_k_fold, parameters
        Forwarded to ``GridSearchCV`` (parallelism, folds, parameter grid).
    X_train, y_train : data used for the search and the refit.
    X_test, y_test : data used for the per-stage test loss curve.

    In the train phase a train/test loss curve is written to
    ``loss_cv.png``.
    """
    # Previously a non-"train" phase reached code that needed the search
    # result and failed; now the estimator is simply stored unchanged.
    self.best_params = None
    if phase == "train":
        clf = GradientBoostingClassifier()
        gscv = GridSearchCV(clf, parameters, verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs, cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_

        # Refit a plain classifier with the winning parameters to obtain the
        # per-stage losses.
        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The loss function expects raw decision scores, not the class labels
        # produced by staged_predict.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()

        estimator.set_params(**gscv.best_params_)

    self.estimator = estimator
    self.one_hot_encoding = None
xytext=(150, 1.0), textcoords='data', arrowprops=dict(arrowstyle="-", connectionstyle="arc")) ax.annotate('', xy=(800, test_deviance[799]), xycoords='data', xytext=(800, est.train_score_[799]), textcoords='data', arrowprops=dict(arrowstyle="-")) ax.text(810, 0.25, 'train-test gap') def fmt_params(params): return ','.join("{0}={1}".format(key, val) for key, val in params.iteritems()) fig = plt.figure(figsize=(10, 10)) ax = plt.gca() for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')), # ({'min_samples_leaf': 3}, ('#fdae61', '#abd9e9')), ({'learning_rate': 0.1}, ('#bcbcbc', '#ccebc4')), ({'learning_rate': 0.1, 'subsample': 0.5}, ('#7A68A6', '#FFB5B8'))]: est = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=1, learning_rate=1.0) est.set_params(**params) est.fit(x_train, y_train) ax, test_dev = deviance_plot(est, x_test, y_test, ax=ax, label=fmt_params(params), train_color=train_color, test_color=test_color) ax.annotate('Higher bias', xy=(900, est.train_score_[899]), xycoords='data', xytext=(600, 0.3), textcoords='data', arrowprops=dict(arrowstyle="-", connectionstyle="arc")) ax.annotate('Lower variance', xy=(900, test_deviance[899]), xycoords='data', xytext=(600, 1.0), textcoords='data', arrowprops=dict(arrowstyle="-")) plt.legend(loc='upper right') from sklearn.grid_search import GridSearchCV param_grid = { 'learning_rate': [0.1, 0.05, 0.02, 0.01],
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # NOTE(review): unpickling executes arbitrary code; only load cache files
    # this project wrote itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _cached_grid_search(model_key, param_grid, xs, ys, cache_path):
    """Load grid-search results from ``cache_path`` or compute and cache them."""
    try:
        return _load_pickle(cache_path)
    except FileNotFoundError:
        results = optimize_hyper_params(model_key, param_grid, xs, ys)
        _dump_pickle(results, cache_path)
        return results


def main():
    """Run the pipeline: vectorise, tune, then compare RFC and GBC per zone.

    Every expensive step is cached as a pickle file and only recomputed when
    its cache file is missing.  The final box-plot comparison is saved to
    ``FINAL_RESULTS_GRAPH_PATH``.
    """
    ##########################
    # Dataset initialization #
    ##########################
    print('Dataset initialization')
    try:
        vectors = _load_pickle(VECTORS_PATH)
    except FileNotFoundError:
        xs, ys = vectorize(DATA_PATH, LABEL_PATH)
        _dump_pickle({'xs': xs, 'ys': ys}, VECTORS_PATH)
    else:
        xs, ys = vectors['xs'], vectors['ys']

    print('Class Distribution Bar Graph')
    class_dist_bar(LABEL_PATH)

    ##########################
    # Parameter Optimization #
    ##########################
    print('Parameter Optimization')

    # Random Forest Parameter Grid
    rfc_param_grid = [{
        'n_estimators': list(range(100, 1100, 100)),
        'max_depth': list(range(2, 22, 2)),
        # 'n_jobs': [NUM_CORES],
        'random_state': [RANDOM_STATE],
    }]
    # Gradient Boost Parameter Grid
    gbc_param_grid = [{
        'n_estimators': list(range(100, 1100, 100)),
        'max_depth': list(range(2, 22, 2)),
        'random_state': [RANDOM_STATE],
    }]
    # XGBoost Parameter Grid
    xgb_param_grid = [{
        'nthread': [NUM_CORES],
        'objective': ['binary:logistic'],
        'learning_rate': [0.05],
        'n_estimators': list(range(100, 1200, 100)),
        'max_depth': list(range(2, 22, 2)),
        'seed': [RANDOM_STATE_XGB],
    }]

    searches = [
        ('\tRandom Forest', 'rfc', rfc_param_grid, RFC_GRID_SEARCH_PATH,
         RFC_GRID_SEARCH_GRAPH_PATH,
         'Random Forest Classifier Parameter Selection'),
        ('\tGradient Boosted Trees', 'gbc', gbc_param_grid,
         GBC_GRID_SEARCH_PATH, GBC_GRID_SEARCH_GRAPH_PATH,
         'Gradient Boosted Trees Parameter Selection'),
        ('\tXGBoost', 'xgb', xgb_param_grid, XGB_GRID_SEARCH_PATH,
         XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Parameter Selection'),
    ]
    for label, model_key, grid, cache_path, graph_path, title in searches:
        print(label)
        results = _cached_grid_search(model_key, grid, xs, ys, cache_path)
        # The heat map is drawn whether the results were cached or computed.
        param_selection_heat_map(results,
                                 len(grid[0]['n_estimators']),
                                 len(grid[0]['max_depth']),
                                 GRID_SEARCH_CV_FOLDS, graph_path, title)

    ####################
    # Final Train/Test #
    ####################
    print('Final Train/Test')
    zones = range(1, 18)
    try:
        final_scores = _load_pickle(FINAL_RESULTS_PATH)
    except FileNotFoundError:
        # Hand-picked (n_estimators, max_depth) per body zone.
        rfc_overrides = {11: (100, 20), 16: (100, 16)}
        rfc_choice = {zone: rfc_overrides.get(zone, (500, 10)) for zone in zones}
        gbc_choice = {
            1: (100, 2), 2: (100, 2), 3: (600, 4), 4: (100, 2),
            5: (100, 10), 6: (100, 8), 7: (100, 4), 8: (200, 12),
            9: (100, 8), 10: (100, 10), 11: (900, 4), 12: (200, 2),
            13: (100, 2), 14: (300, 4), 15: (100, 2), 16: (300, 8),
            17: (100, 10),
        }

        def as_params(choice):
            n_estimators, max_depth = choice
            return {'n_estimators': n_estimators, 'max_depth': max_depth,
                    'random_state': RANDOM_STATE}

        final_scores = {}
        for i in zones:
            rfc = RandomForestClassifier()
            rfc.set_params(**as_params(rfc_choice[i]))
            gbc = GradientBoostingClassifier()
            gbc.set_params(**as_params(gbc_choice[i]))
            final_scores[i] = {
                'rfc': cross_val_score(rfc, xs[i], ys[i], cv=CV_FOLDS,
                                       n_jobs=NUM_CORES, scoring='f1'),
                'gbc': cross_val_score(gbc, xs[i], ys[i], cv=CV_FOLDS,
                                       n_jobs=NUM_CORES, scoring='f1'),
            }
        _dump_pickle(final_scores, FINAL_RESULTS_PATH)

    fig, axarr = plt.subplots(5, 4, figsize=(25, 25))
    for i in zones:
        row, col = divmod(i - 1, 4)
        axis = axarr[row][col]
        axis.boxplot([final_scores[i]['rfc'], final_scores[i]['gbc']])
        axis.set_title('Body Zone %s' % (i), fontsize=28)
        axis.set_xticklabels(['RFC', 'GBC'], fontsize=24)
        # The plotted values are cross-validated F1 scores.
        axis.set_ylabel('F1 Score', fontsize=24)
    # 17 zones fill a 5x4 grid except the last three cells of the last row.
    for i in range(1, 4):
        axarr[4][i].axis('off')
    plt.suptitle("Model Comparison", fontsize=30, fontweight='bold')
    plt.tight_layout()
    plt.subplots_adjust(top=.95)
    plt.savefig(FINAL_RESULTS_GRAPH_PATH)
# Soft-voting ensemble over the models in model_list, scored on the
# validation and test splits.
voting_clf_soft = VotingClassifier(estimators=model_list, voting='soft')
voting_clf_soft.fit(X_train, y_train)
esm_score_val = voting_clf_soft.score(X_val, y_val)
esm_score_test = voting_clf_soft.score(X_test, y_test)
#print(esm_score_val)
#print(esm_score_test)

#tune_parameters = {'n_estimators' : [50, 100]}
#gbm_clf = GridSearchCV(estimator = GradientBoostingClassifier(max_depth=6, random_state=0), param_grid=tune_parameters)
#gbm_clf.fit(X_train, y_train)

# Refit the same depth-6 model for each tree count and record its score()
# on every split (the *_errors lists hold scores).
trees = (10, 50, 100)
gbm_clf_final = GradientBoostingClassifier(max_depth=6)
training_errors = []
validation_errors = []
test_errors = []
for tree in trees:
    gbm_clf_final.set_params(n_estimators=tree)
    gbm_clf_final.fit(X_train, y_train)
    training_errors.append(gbm_clf_final.score(X_train, y_train))
    validation_errors.append(gbm_clf_final.score(X_val, y_val))
    test_errors.append(gbm_clf_final.score(X_test, y_test))

# One curve per split against the number of trees.
plt.plot(trees, training_errors, label='Train')
plt.plot(trees, test_errors, label='Test')
plt.plot(trees, validation_errors, label='Validation')
plt.xlabel('No. of Trees')
plt.ylabel('Performance Score')
plt.legend(loc='upper left')

#gbm_clf1 = GradientBoostingClassifier(n_estimators=50, max_depth=6)
#gbm_clf1.fit(X_train,y_train)
#gbm_clf2 = GradientBoostingClassifier(n_estimators=100, max_depth=6)
#gbm_clf2.fit(X_train,y_train)
#gbm_clf3 = GradientBoostingClassifier(n_estimators=10, max_depth=6)
class AnalyzeBoost:
    """Analyze the performance of three different boosting methods:

    - AdaBoost,
    - Gradient boost,
    - XGBoost.

    Parameters
    ----------
    X_train : array
        Features of the training set.
    X_test : array
        Features of the test set.
    y_train : array
        Targets of the training set.
    y_test : array
        Targets of the test set.
    method : str
        Boosting method: "adaboost", "gradientboost" or "xgboost".
    seed : int
        Random seed (stored on the instance; not passed to the classifier
        by this class).
    n_estimators : int
    learning_rate : float
    max_depth : int
    verbose : boolean
        If True, printouts from the process are provided.
    time_id : str, optional
        Identifier used as prefix of the output files.  Defaults to the
        time the instance is created, formatted as ``%Y%m%d-%H%M%S``.

    Attributes
    ----------
    clf : estimator
        The classifier selected by ``method``.
    max_depth_str : str
        Name of the parameter that controls tree depth for ``clf``.
    imp, idcs : arrays
        Feature importances and their ascending sort order (set by ``fit``).
    y_pred : array
        Test-set predictions (set by ``predict``).
    search : GridSearchCV
        The fitted or loaded grid search (set by ``gridsearch``).
    """

    def __init__(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        method="xgboost",
        seed=0,
        n_estimators=100,
        learning_rate=0.5,
        max_depth=3,
        verbose=True,
        time_id=None,
    ):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.method = method
        self.seed = seed
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.verbose = verbose
        # A default evaluated in the signature is computed once, when the
        # class is defined, so every instance shared one stale timestamp (and
        # the format had "&S" instead of "%S").  Compute it per instance.
        if time_id is None:
            time_id = time.strftime("%Y%m%d-%H%M%S")
        self.time_id = time_id

        if self.verbose:
            fprint("-----------------------")
            fprint(f"Time: {self.time_id}")
            fprint(f"Number of training samples: {np.shape(self.X_train)[0]}")
            fprint(f"Number of test samples: {np.shape(self.X_test)[0]}")
            fprint(f"Method: {method}")

        if self.method == "adaboost":
            self.base_estimator = DecisionTreeClassifier()
            self.clf = AdaBoostClassifier(base_estimator=self.base_estimator)
            # Depth belongs to the wrapped tree, hence the nested name.
            self.max_depth_str = "base_estimator__max_depth"
        elif self.method == "gradientboost":
            self.clf = GradientBoostingClassifier()
            self.max_depth_str = "max_depth"
        elif self.method == "xgboost":
            self.clf = xgb.XGBClassifier()
            self.max_depth_str = "max_depth"
        else:
            print("Provide boost method.")
            sys.exit(1)

    def fit(self):
        """Fit the classifier, then save feature importances and the model.

        Writes ``featimp-<method>.npy`` and ``<time_id>-<method>-fit.pkl``.
        """
        parameters = {
            "n_estimators": self.n_estimators,
            "learning_rate": self.learning_rate,
            self.max_depth_str: self.max_depth,
        }
        self.clf.set_params(**parameters)

        if self.verbose:
            fprint(f"Estimators: {self.n_estimators}")
            fprint(f"Learning rate: {self.learning_rate}")
            fprint(f"Max depth: {self.max_depth}")
            print("Making fit...")

        self.clf.fit(self.X_train, self.y_train)

        self.imp = self.clf.feature_importances_
        self.idcs = np.argsort(self.imp)
        np.save("featimp-" + self.method + ".npy", self.imp)

        if self.verbose:
            fprint("Feature importances:")
            # idcs is in ascending order of importance.
            for f in range(self.X_train.shape[1]):
                fprint(f"{f}. feat. {self.idcs[f]} ({self.imp[self.idcs[f]]})")

        # Save model
        model_path = self.time_id + "-" + self.method + "-fit.pkl"
        with open(model_path, "wb") as handle:
            pickle.dump(self.clf, handle)

    def predict(self):
        """Predict the test set, report accuracy and plot the confusion matrix."""
        if self.verbose:
            print("Making predictions...")

        self.y_pred = self.clf.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, self.y_pred)
        fprint(f"Test accuracy score: {np.around(accuracy, decimals=3)}")
        plot_confusion_matrix(self.y_test, self.y_pred, analysis_id=self.time_id)

    def gridsearch(self, parameters=None, cv=5, load_search=None):
        """Perform a grid search for optimal parameters.

        Parameters
        ----------
        parameters : dict
            Dictionary with the parameters to be tested in the grid search.
            A default grid is used when None.
        cv : int
            Number of folds in the cross-validation.
        load_search : str
            Path to the pickled search from a previous run, to avoid doing a
            new grid search. If None, a new grid search is performed and
            saved to ``<time_id>-<method>-search.pkl``.

        The results are written to ``<time_id>-gridsearch.csv`` and the
        instance's learning_rate, n_estimators and max_depth are overwritten
        with the best values found.
        """
        if load_search is None:
            if parameters is None:
                parameters = [{
                    "learning_rate": [1, 0.5, 0.1],
                    "n_estimators": [100, 150, 200],
                    self.max_depth_str: [5, 7, 9, 11],
                }]

            self.search = GridSearchCV(
                self.clf,
                param_grid=parameters,
                cv=cv,
                n_jobs=-1,
                verbose=6,
                return_train_score=True,
            )
            self.search.fit(self.X_train, self.y_train)

            # Save model
            search_path = self.time_id + "-" + self.method + "-search.pkl"
            with open(search_path, "wb") as handle:
                pickle.dump(self.search, handle)
        else:
            # NOTE(review): unpickling executes arbitrary code; only load
            # search files produced by this class.
            with open(load_search, "rb") as handle:
                self.search = pickle.load(handle)

        # Save results from grid search and print to terminal
        cv_results = pd.DataFrame(self.search.cv_results_)
        cv_results.to_csv(f"{self.time_id}-gridsearch.csv")
        report(self.search.cv_results_)

        # Overwrite parameters with the best ones found by the search; a
        # custom grid may not cover all three, so keep the current value for
        # any parameter it did not search.
        best = self.search.best_params_
        self.learning_rate = best.get("learning_rate", self.learning_rate)
        self.n_estimators = best.get("n_estimators", self.n_estimators)
        self.max_depth = best.get(self.max_depth_str, self.max_depth)
def main(
        trainXFile="/home/kiran/kdd/trainXall.csv",
        trainYFile="/home/kiran/kdd/trainY.csv",
        validXFile="/home/kiran/kdd/validXall.csv",
        validYFile="/home/kiran/kdd/validY.csv",
        testXFile="/home/kiran/kdd/testXall.csv",
        n_estimators=150,
        n_estimators_step=150,
        learning_rate=0.005,
        max_features=30,
        max_depth=11,
        verbose=0,
        dump_file="/home/kiran/kdd/pymodels/gbm_all_0.005_30_11.pkl",
        outputFile="prediction.txt",
        max_trees=300,
        random_state=11,
):
    """Train a GBM, grow it with warm starts while validation AUC improves,
    then write validation + test probabilities to ``outputFile``.

    The model is first fitted with ``n_estimators`` trees. It is then grown
    by ``n_estimators_step`` trees at a time; each time the validation AUC
    improves (and the tree budget ``max_trees`` is not exceeded) the model is
    dumped to ``dump_file`` and its validation predictions are remembered.

    Parameters
    ----------
    trainXFile, validXFile, testXFile : str
        CSV files with the feature matrices (read with ``pandas.read_csv``).
    trainYFile, validYFile : str
        Comma-delimited text files with the labels (read with ``np.loadtxt``).
    n_estimators : int
        Number of trees of the initial fit.
    n_estimators_step : int
        Number of trees added per warm-start round.
    learning_rate, max_features, max_depth, verbose, random_state
        Passed through to ``GradientBoostingClassifier``.
    dump_file : str
        Where the best model so far is stored with ``joblib.dump``.
    outputFile : str
        Destination of the concatenated validation and test probabilities.
    max_trees : int
        Growing stops once the tree count exceeds this value.
    """
    actual = np.loadtxt(validYFile, delimiter=",")
    trainY = np.loadtxt(trainYFile, delimiter=",")
    train = pd.read_csv(trainXFile)
    valid = pd.read_csv(validXFile)

    gbm = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_features=max_features,
        max_depth=max_depth,
        random_state=random_state,
        verbose=verbose,
    )
    gbm.fit(train, trainY)
    prediction_valid = gbm.predict_proba(valid)[:, 1]
    myAUC = metrics.roc_auc_score(actual, prediction_valid)

    bestAUC = 0
    # Seed these with the initial fit so they are always defined, even when
    # the loop below breaks during its first pass (e.g. max_trees is smaller
    # than n_estimators + n_estimators_step).
    bestPrediction = prediction_valid
    improvement = 0.0

    while myAUC >= bestAUC:
        n_estimators = n_estimators + n_estimators_step
        gbm.set_params(n_estimators=n_estimators, warm_start=True)
        gbm.fit(train, trainY)
        prediction_valid = gbm.predict_proba(valid)[:, 1]
        myAUC = metrics.roc_auc_score(actual, prediction_valid)
        # Single parenthesised argument: prints the same on Python 2 and 3.
        print("bestAUC: %f myAUC: %f" % (bestAUC, myAUC))

        improvement = myAUC - bestAUC
        if improvement < 0.0000000001:
            break
        if n_estimators > max_trees:
            break

        bestAUC = myAUC
        bestPrediction = prediction_valid
        joblib.dump(gbm, dump_file)
        print("bestAUC: %f improvement: %f" % (bestAUC, improvement))

    myAUC = metrics.roc_auc_score(actual, bestPrediction)
    print("AUC: %f improvement: %f" % (myAUC, improvement))

    # NOTE(review): after a break, `gbm` holds the trees of the last (rejected)
    # round, so the test predictions below come from a larger model than the
    # one stored in `dump_file` -- confirm whether that is intended.
    test = pd.read_csv(testXFile)
    prediction_test1 = gbm.predict_proba(test)[:, 1]
    prediction_total = np.concatenate((bestPrediction, prediction_test1),
                                      axis=0)
    np.savetxt(outputFile, prediction_total, delimiter=",")
class BADS(object):
    """End-to-end workflow for the return-customer prediction task.

    The object holds the train/validation/test arrays, the classifier and
    all switches (feature selection, oversampling, cross-validation). The
    intended call order, judging from the attributes each step reads, is
    ``create_datasets`` -> ``oversample`` -> ``automagic_feature_selection``
    -> ``run_model``.
    """

    def __init__(self):
        # Data
        self.X_train = None
        self.y_train = None
        self.X_train_cv = None
        self.X_valid_cv = None
        self.y_train_cv = None
        self.y_valid_cv = None
        self.column_names = None
        self.X_test = None
        self.yhat = None
        self.thresholds = None

        # Classifiers
        self.clf = None
        self.clf_cv = None

        # Cost matrix, multiplied element-wise with the confusion matrix
        self.cm = np.array([[3., 0.], [-10., 0.]])

        # variables to be set
        self.rs = 90049
        self.save_model = False

        ######### Feature Selection #########
        self.manual_features_to_remove = [
            "x_order_date_num", "x_account_creation_date_num",
            "x_deliverydate_estimated_num", "x_deliverydate_actual_num"
        ]
        self.feature_correlation_removal = False
        self.feature_correlation_threshold = 0.7
        self.automatic_feature_selection = False
        self.automatic_feature_threshold = 0.005

        ######### Oversampling #########
        # non-standard package: http://contrib.scikit-learn.org/imbalanced-learn/index.html
        self.oversample_method = "none"

        ######### Cross-Valdiation #########
        self.do_cv = False  # this takes a long time
        self.cv_num_folds = 4
        self.cv_validation_frac = 0.15
        self.cv_rs_iters = 20
        self.cost_func = self.bads_costs  # bads_costs, roc_auc_score
        self.score_func = self.bads_scorer  # bads_scorer, roc_auc_score

        self.set_model("rf")  # "rf" or "gbc" or "linear"

    def set_model(self, model_to_use=None):
        """Set the model to use from a pre-set list.

        One could set these variables manually but for ease of use, we have
        created a list of predefined models.

        Parameters
        ----------
        model_to_use: a string of the model to be used ("rf", "gbc" or
            "linear"). If None, the model chosen by the previous call is
            set up again.
        """
        if model_to_use is None:
            # Fall back to the last successfully selected model, if any.
            model_to_use = getattr(self, "model_to_use", None)

        ######### Model Selection #########
        if model_to_use == "rf":
            # Random Forest Classifier
            from sklearn.ensemble import RandomForestClassifier
            self.clf = RandomForestClassifier(random_state=self.rs)
            self.automatic_feature_selection_params = {
                'n_estimators': 250,
                'verbose': 0,
                'n_jobs': 3
            }
            self.clf_default_params = {
                'min_samples_split': 2,
                'n_estimators': 250,
                'min_samples_leaf': 9,
                'criterion': 'gini',
                'verbose': 0,
                'oob_score': True,
                'n_jobs': 3
            }
            self.cv_param_grid = {
                'n_estimators': [100, 250, 500],
                'min_samples_split': [2, 4, 8],
                'min_samples_leaf': [1, 3, 9],
                'n_jobs': [3]
            }
        elif model_to_use == "gbc":
            # Gradient Boosting Classifier
            from sklearn.ensemble import GradientBoostingClassifier
            self.clf = GradientBoostingClassifier(random_state=self.rs)
            self.automatic_feature_selection_params = {
                'n_estimators': 50,
                'verbose': 1
            }
            self.clf_default_params = {
                'learning_rate': 0.1,
                'max_depth': 3,
                'n_estimators': 100,
                'verbose': 1
            }
            self.cv_param_grid = {
                'n_estimators': [50, 100, 250, 500],
                'learning_rate': [0.05, 0.1, .25],
                'max_depth': [3, 5, 9]
            }
        elif model_to_use == "linear":
            # Logistic Regression Classifier
            from sklearn import linear_model
            self.clf = linear_model.LogisticRegression()
            self.clf_default_params = {'penalty': 'l1'}
            self.cv_param_grid = {
                'penalty': ['l1', 'l2'],
                'C': 2**np.linspace(-3, 5, 17),
                'n_jobs': [3]
            }
        else:
            print("Please Set The Model")
            return

        # Remember the choice so that a later set_model(None) can reuse it.
        self.model_to_use = model_to_use

    def simple_oversample_idx(self, y):
        """Simple oversample to equalize the two groups.

        Every row with ``y == 0`` is kept once; rows with ``y == 1`` are
        drawn at random with replacement until there are as many of them as
        there are zeros.

        Parameters
        ----------
        y: an array of the true target variable values.

        Returns
        -------
        A list of row indices into ``y`` (zeros first, then the sampled ones).
        """
        y_idx_0 = np.where(y == 0)[0]
        y_idx_1 = np.random.choice(np.where(y == 1)[0],
                                   size=y_idx_0.shape[0],
                                   replace=True)
        # Return the *positions*, not the label values found at them; the
        # caller uses the result to index rows of X and y.
        return np.concatenate((y_idx_0, y_idx_1)).tolist()

    def bads_costs(self, y_t, yhat):
        """Return the profit per customer.

        This function calculates the profit per customer based on the matrix
        given to us in the assignment.

        Parameters
        ----------
        y_t: an array of true target variable values.
        yhat: an array of binary predictions from our model.
        """
        N = yhat.shape[0]
        C = confusion_matrix(y_t, yhat)
        return np.multiply(C, self.cm).sum() / N

    def bads_scorer(self, y_t, yhat_prob):
        """Return the maximum profit per customer

        This function does a simple line search using the assignment
        cost/profit function. For each threshold, we create a vector of
        binary predictions and then we calculate the profit per customer
        using these binary predictions. The maximum value is returned.

        Note: The first threshold at which this maximum occurred is appended
        to ``self.thresholds``.

        Parameters
        ----------
        y_t: an array of true target variable values.
        yhat_prob: an array of probability predictions from our model.
        """
        thresholds = np.linspace(0.01, 0.99)
        costs = [
            self.bads_costs(y_t, yhat_prob[:, 1] > threshold)
            for threshold in thresholds
        ]
        if self.thresholds is None:
            # Allow scoring before run_model() has initialised the list.
            self.thresholds = []
        self.thresholds.append(thresholds[np.argmax(costs)])
        return np.max(costs)

    def find_corr_features(self, df, threshold=0.7):
        """Return list of column names.

        This function calculates the simple correlation matrix between
        features and based on a given threshold (default: abs(0.7)) removes
        the feature that comes later on in the feature list. This will
        prioritize original features over features we have created.

        Parameters
        ----------
        df: a pandas dataframe (either the train or test set)
        threshold: a scalar value above whose absolute value features will
            be considered "highly correllated".
        """
        cols = df.columns.values.tolist()
        corr_mat = df.corr()
        # Look the offending columns up through the correlation matrix's own
        # labels so the positions stay valid even if corr() left columns out.
        corr_cols = corr_mat.columns.tolist()
        rows, later = np.where(np.abs(np.triu(corr_mat, k=1)) > threshold)
        to_drop = {corr_cols[max(pair)] for pair in zip(rows, later)}

        cols_removed = [col for col in cols if col in to_drop]
        cols_kept = [col for col in cols if col not in to_drop]
        print("Removing Columns:", ", ".join(cols_removed))
        return cols_kept

    def loadDataset(self, df, date_to_int=True, use_woe=True):
        """Return pandas dataframe.

        The purpose of this function is to put our dataframe into a form
        that is as close as possible to our R dataframe that we've used in
        our data processing steps. Additionally, we have calculated dates,
        added an "is_weekday" dummy variable and chosen to use or not use
        the Weight of Evidence variables.

        The dataframe is modified in place and also returned.

        Parameters
        ----------
        df: a pandas dataframe (either the train or test set)
        date_to_int: a logical value to decide whether to convert dates to
            integers based on the epoch date of January 1st, 2013.
        use_woe: a logical value whether to use Weight of Evidence converted
            variables or use the original variables as k-1 dummies.
            (Currently not read by this method.)
        """
        # remove NA
        df.fillna(-99, inplace=True)

        # Convert Dates
        date_cols = [
            "order_date", "account_creation_date", "deliverydate_estimated",
            "deliverydate_actual"
        ]
        for date_col in date_cols:
            df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')

        # Create weekday dummy for order_date
        df['x_order_date_is_weekday'] = df.order_date.dt.dayofweek < 5

        if date_to_int:
            epoch_date = pd.Timestamp("2013-01-01")
            for date_col in date_cols:
                # `.dt.days` gives whole days since the epoch; casting to
                # 'timedelta64[D]' is not accepted by current pandas.
                df[date_col] = (df[date_col] - epoch_date).dt.days.astype(int)

        # Convert Categories (factors in R lingo)
        cols_to_categorize = [
            "model", "form_of_address", "email_domain", "postcode_invoice",
            "postcode_delivery", "payment", "advertising_code",
            "x_order_date_yearweek"
        ]

        # Categorize _bin columns
        cols = df.columns
        cols_to_categorize.extend(
            cols[cols.str.contains("_bin")].values.tolist())

        for col_to_cat in cols_to_categorize:
            if col_to_cat in df.columns.values:
                df[col_to_cat] = df[col_to_cat].astype('category')

        return df

    def create_datasets(self,
                        use_woe=False,
                        fp_train="output/train_cleaned_woe.csv",
                        fp_test="output/test_cleaned_woe.csv"):
        """Load datasets.

        This is a convenience function that loads both the training and
        testing datasets and implements any feature selection that we've
        decided to use. Additionally, we impose the column structure of the
        train set on the test set. Implicitly, this adds and removes
        appropriate "factor levels" and gives any added factor level a
        default of 0.

        Parameters
        ----------
        use_woe: a logical value whether to use Weight of Evidence converted
            variables or use the original variables as k-1 dummies.
        fp_train: a string of the train set CSV file
        fp_test: a string of the test set CSV file
        """
        train = pd.read_csv(fp_train, sep=";", decimal=',', index_col="ID")
        train = self.loadDataset(train)

        # Create Feature List
        features_to_use = train.columns.values.tolist()
        features_to_use.remove("return_customer")

        cols_woe_removal = [col for col in features_to_use if "x_woe_" in col]
        if use_woe:
            # Keep the WoE columns and drop their originals instead.
            cols_woe_removal = [
                col.replace("x_woe_", "") for col in cols_woe_removal
            ]
        self.manual_features_to_remove.extend(cols_woe_removal)

        for ftr in self.manual_features_to_remove:
            if ftr in features_to_use:
                features_to_use.remove(ftr)
            elif "x_" + ftr in features_to_use:
                features_to_use.remove("x_" + ftr)

        # remove dates if not converted to ints
        for date_feature, v in train.dtypes.items():
            if v == "datetime64[ns]" and date_feature in features_to_use:
                features_to_use.remove(date_feature)

        train = train[features_to_use + ["return_customer"]]

        # Visualize Correlation before splitting out dummy variables
        if self.feature_correlation_removal:
            sns.heatmap(train.drop("return_customer", axis=1).corr())
            plt.show()

        # Split out dummy variables
        train = pd.get_dummies(train)

        # feature Correlation Removal
        if self.feature_correlation_removal:
            print("Removing correlated features...")
            noncorr_cols = self.find_corr_features(
                train.drop("return_customer", axis=1),
                self.feature_correlation_threshold)
            train = train[noncorr_cols + ["return_customer"]]

        # set train datasets (axis given by keyword: the positional form is
        # not accepted by current pandas)
        self.X_train = train.drop("return_customer", axis=1).values
        self.y_train = train["return_customer"].values
        self.column_names = train.columns

        test = pd.read_csv(fp_test, sep=";", decimal=',', index_col="ID")
        test = self.loadDataset(test)
        test = pd.get_dummies(test)

        # The following line gives the test set the same columns as the training set.
        # This simultaneously adds columns to the test set and sets the values in those columns to 0 and
        # drops any columns in the test set that did not exist in the training set.
        print("Imposing train column structure on test...")
        test = test.reindex(columns=self.column_names, fill_value=0)
        test = test.drop("return_customer", axis=1)

        # set test dataset
        self.X_test = test.values

        (self.X_train_cv, self.X_valid_cv, self.y_train_cv,
         self.y_valid_cv) = train_test_split(
             self.X_train,
             self.y_train,
             test_size=self.cv_validation_frac,
             stratify=self.y_train,
             random_state=self.rs)

    def oversample(self):
        """Oversample datasets.

        Simple: This just normalizes the number of data points to make the
        two classes equal sizes. Samples are duplicated at random with
        replacement.

        SMOTE: SMOTE oversampling on the minority class to an equal weight
        as the majority class.

        SMOTE+Tomek: This option oversamples the minority class and then
        removes data points which are determined to be Tomek links.
        """
        # NOTE(review): `fit_sample` and SMOTE's `kind=` look like the older
        # imbalanced-learn API; newer releases use `fit_resample` -- confirm
        # against the version this project pins.
        if self.oversample_method == "simple":
            # oversampling with replacement of the minority group to equalize the size of the minority and
            # majority group
            print("Simple oversampling...")

            # Create the Hyper-Parameter Cross-Validation train and test sets
            ret_cust_idx_cv = self.simple_oversample_idx(self.y_train_cv)
            self.X_train_cv = self.X_train_cv[ret_cust_idx_cv, :]
            self.y_train_cv = self.y_train_cv[ret_cust_idx_cv]

            # Create the full train and test sets
            ret_cust_idx = self.simple_oversample_idx(self.y_train)
            self.X_train = self.X_train[ret_cust_idx, :]
            self.y_train = self.y_train[ret_cust_idx]
        elif self.oversample_method == "SMOTE":
            # https://www.jair.org/media/953/live-953-2037-jair.pdf
            from imblearn.over_sampling import SMOTE
            print("SMOTE oversampling...")
            sm = SMOTE(kind='regular', random_state=self.rs)

            # Create the Hyper-Parameter Cross-Validation train and test sets
            self.X_train_cv, self.y_train_cv = sm.fit_sample(
                self.X_train_cv, self.y_train_cv)

            # Create the full train and test sets
            self.X_train, self.y_train = sm.fit_sample(self.X_train,
                                                       self.y_train)
        elif self.oversample_method == "SMOTETomek":
            from imblearn.combine import SMOTETomek
            print("SMOTE + Tomek Links oversampling...")
            sm = SMOTETomek(random_state=self.rs)

            # Create the Hyper-Parameter Cross-Validation train and test sets
            self.X_train_cv, self.y_train_cv = sm.fit_sample(
                self.X_train_cv, self.y_train_cv)

            # Create the full train and test sets
            self.X_train, self.y_train = sm.fit_sample(self.X_train,
                                                       self.y_train)
        else:
            print("No oversampling...")

    def automagic_feature_selection(self):
        """Prune data sets based on algorithmic feature selection.

        We use a particular threshold to keep certain columns based on the
        "feature importances" of tree-based classifiers (i.e. random forest
        or gradient boosted trees)
        """
        if not self.automatic_feature_selection:
            print("No automatic feature selection...")
            return

        print("Starting automatic feature selection...")
        # this takes about 10 minutes to run
        self.clf.set_params(**self.automatic_feature_selection_params)
        self.clf.fit(self.X_train, self.y_train)

        important_features = np.where(
            self.clf.feature_importances_ >
            self.automatic_feature_threshold)[0].tolist()

        # `column_names` still contains the target column while X_train does
        # not; drop it so positions line up with the feature matrix.
        feature_names = self.column_names
        if "return_customer" in feature_names:
            feature_names = feature_names.drop("return_customer")
        important_features_labels = feature_names[important_features]

        print("High Importance Features:",
              ", ".join(important_features_labels.tolist()))
        np.savetxt("output/optimal_features.csv",
                   important_features_labels.values,
                   fmt="%s",
                   delimiter=";")

        self.X_train = self.X_train[:, important_features]
        self.X_test = self.X_test[:, important_features]
        self.X_train_cv = self.X_train_cv[:, important_features]
        self.X_valid_cv = self.X_valid_cv[:, important_features]

    def run_model(self, fp_output="output/test_return_customer.csv"):
        """Do hyperparameter search, if desired, and then make prediction on test set.

        We do our hyper parameter search and make our prediction on the test
        set. At this time, we print out diagnostics and results throughout
        the process.

        Parameters
        ----------
        fp_output: path of the CSV that receives the binary predictions; the
            probabilities go to the same path with "_probs.csv" replacing
            everything after the first ".".
        """
        self.thresholds = []

        if self.do_cv:
            # this can take a LONG time
            print("Searching for best parameters with CV search...")
            self.clf_cv = RandomizedSearchCV(self.clf,
                                             self.cv_param_grid,
                                             scoring=make_scorer(
                                                 self.score_func,
                                                 needs_proba=True),
                                             cv=self.cv_num_folds,
                                             n_iter=self.cv_rs_iters,
                                             random_state=self.rs,
                                             verbose=1)
            self.clf_cv.fit(self.X_train_cv, self.y_train_cv)
            cv_results = self.clf_cv.cv_results_
            joblib.dump(cv_results, 'output/clf_rf_cv.results.pkl')

            print("Cross Valdiation Report:")
            print("Best Params:", self.clf_cv.best_params_)
            print("Best Score:", self.clf_cv.best_score_)

            # Plot Expected ROI per Customer. The x-range follows the number
            # of candidates actually evaluated, which can be lower than
            # cv_rs_iters for a small grid.
            mean_scores = cv_results["mean_test_score"]
            plt.errorbar(range(len(mean_scores)),
                         mean_scores,
                         yerr=cv_results["std_test_score"],
                         fmt="o")
            plt.title("Errorbar Plot of Hyper Parameter Search")
            plt.ylabel("Average ROI")
            plt.xlabel("Iteration (See Table Below)")
            plt.margins(0.03)
            plt.show()
            print(pd.DataFrame(list(cv_results['params'])))

            # Validate with the refitted best estimator, then carry the best
            # parameters over to the classifier used for the final fit.
            yhat_valid_prob = self.clf_cv.predict_proba(self.X_valid_cv)
            params_star = self.clf_cv.best_params_
            self.clf.set_params(**params_star)
        else:
            self.clf.set_params(**self.clf_default_params)
            self.clf.fit(self.X_train_cv, self.y_train_cv)
            yhat_valid_prob = self.clf.predict_proba(self.X_valid_cv)

        print("Validation Summary:")
        print("Calculate Optimal Threshold")
        thresholds = np.linspace(0.01, 0.99, 197)
        costs = [
            self.bads_costs(self.y_valid_cv, yhat_valid_prob[:, 1] > threshold)
            for threshold in thresholds
        ]
        threshold_star = thresholds[np.argmax(costs)]

        # Plot
        plt.plot(thresholds, costs)
        plt.title("Threshold Search")
        plt.ylabel("Average ROI")
        plt.xlabel("Threshold of return_customer = 1")
        plt.show()

        print("Threshold:", threshold_star)
        yhat_valid = yhat_valid_prob[:, 1] > threshold_star
        print("Average ROI:", self.cost_func(self.y_valid_cv, yhat_valid))
        print("ROC Score:",
              roc_auc_score(self.y_valid_cv, yhat_valid_prob[:, 1]))
        print("Validation Return Customers: {} of {} ({}%)".format(
            np.sum(yhat_valid), len(yhat_valid),
            np.round(100 * np.sum(yhat_valid) / len(yhat_valid), 2)))
        print(confusion_matrix(self.y_valid_cv, yhat_valid))

        # Train model with all data and use on the Test set
        self.clf.fit(self.X_train, self.y_train)
        yhat_test_proba = self.clf.predict_proba(self.X_test)
        yhat_test = yhat_test_proba[:, 1] > threshold_star

        # Test IDs are numbered consecutively starting at 51885.
        test_ids = np.arange(51885, 51885 + yhat_test.shape[0])
        preds = pd.DataFrame(np.c_[test_ids, yhat_test.astype(int)],
                             columns=["ID", "return_customer"])
        preds.to_csv(fp_output, index=False)

        preds_probs = pd.DataFrame(np.c_[test_ids, yhat_test_proba[:, 1]],
                                   columns=["ID", "return_customer"])
        preds_probs.to_csv(fp_output.split(".")[0] + "_probs.csv",
                           index=False)

        print("Testing Return Customers: {} of {} ({}%)".format(
            np.sum(yhat_test), len(yhat_test),
            np.round(100 * np.sum(yhat_test) / len(yhat_test), 2)))
        self.yhat = yhat_test

        if self.save_model:
            joblib.dump(self.clf, 'output/model_final.pkl')

    def pca_analysis(self, X, y, num_PC=5, recalc_PC=True):
        """Visualize data with predictions with Principal Component Analysis.

        Due to the high dimentionality of our data, we found it hard to
        conceptualize without reducing the dimensionality. We do a principal
        component analysis and create a scatter matrix of the results.
        Importantly, we allow the PCA to be fitted with the training data
        and then applied on the testing data, so we will see the two data
        sets rotated in the same manner.

        Parameters
        ----------
        X: numpy array of train or test data
        y: numpy vector of true target values or model predictions
        num_PC: number of principal components to use
        recalc_PC: a logical value used to decide whether to recalculate the
            principal components including the rotation vectors. If false,
            one will rotate the data-based on the rotations of the
            previously calculated principal components.
        """
        # PCA Analysis of Results
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import scale

        train_scaled = scale(X)
        if recalc_PC:
            self.pca = PCA(n_components=num_PC)
            self.pca.fit(train_scaled)
        else:
            print("Using previous eigenvectors to rotate data...")

        train_rotated = self.pca.transform(train_scaled)
        df_train = pd.DataFrame(train_rotated)
        df_train["colors"] = [
            "returning" if y_i else "non-returning" for y_i in y
        ]
        sns.pairplot(df_train,
                     hue="colors",
                     diag_kind="kde",
                     vars=range(num_PC))
        plt.show()
gc.collect()

# Initial hyper-parameters of the GBDT.
param = {
    'learning_rate': 0.2,
    'n_estimators': 100,  # number of trees here
    'max_depth': 8,  # set max_depth of a tree
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'subsample': 0.01,
    'max_leaf_nodes': None,  # set max leaf nodes of a tree
    'random_state': 1,
    'verbose': 0
}
gbdt_model = GradientBoostingClassifier()
gbdt_model.set_params(**param)

## fitting
gbdt_model.fit(X_train_gbdt, y_train_gbdt)

## log-loss of training
y_pred_gbdt = gbdt_model.predict_proba(X_train_gbdt)[:, 1]
log_loss_gbdt = log_loss(y_train_gbdt, y_pred_gbdt)
print('log loss of GBDT on train set: %.5f' % log_loss_gbdt)

## log-loss of validation
y_pred_gbdt = gbdt_model.predict_proba(X_valid)[:, 1]
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model; the context manager makes sure the file
## is flushed and closed even if pickling raises
with open(fp_gbdt_model, 'wb') as model_file:
    pickle.dump(gbdt_model, model_file)
# Split into the sampled training rows and the remaining hold-out rows.
train_data = data.loc[rindex, :]
test_data = data.drop(rindex)

train_label = train_data.target
train_data = train_data.drop(['target', 'id'], axis=1)
test_label = test_data.target
test_data = test_data.drop(['target', 'id'], axis=1)

# warm_start must be a real boolean; with it enabled every fit() below only
# adds the trees that are missing to reach the new n_estimators.
gb_otto = GradientBoostingClassifier(n_estimators=100,
                                     verbose=1,
                                     warm_start=True)

# Grow the ensemble step by step and report the log loss after each step.
for n_trees in (100, 150, 200):
    gb_otto.set_params(n_estimators=n_trees)
    gb_otto.fit(train_data, train_label)
    test_prob = gb_otto.predict_proba(test_data)
    train_prob = gb_otto.predict_proba(train_data)
    # One %-formatted argument prints identically on Python 2 and Python 3.
    print('The logloss score of test data: %s' % logloss(test_label, test_prob))
    print('The logloss score of train data: %s' %
          logloss(train_label, train_prob))

gb_otto.set_params(n_estimators=250)
gb_otto.fit(train_data, train_label)
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class

    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If
                        exp_rand_tree_size=True, this will be the mean number
                        of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations
                        used to produce each tree. FP 2004 (Sec. 2)
        max_rules:      approximate total number of rules generated for
                        fitting. Note that actual number of rules will
                        usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each
                        new tree when sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary
                        classification.
        lin_standardise: If True, the linear terms will be standardised as
                        per Friedman Sec 3.2 by multiplying the winsorised
                        variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be
                        used to trim linear terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different
                        maximum number of terminal nodes based on an
                        exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both
                        rules and linear terms
        random_state:   Integer to initialise random objects and provide
                        repeatability.
        tree_generator: Optional: this object will be used as provided to
                        generate the rules. This will override almost all the
                        other properties above. Must be
                        GradientBoostingRegressor or
                        GradientBoostingClassifier, optional (default=None)
        Cs:             Regularisation path. None uses the defaults; a
                        sequence is used as the ``Cs`` of
                        LogisticRegressionCV or, inverted, as the ``alphas``
                        of LassoCV; a scalar is the number of values to try.
        cv:             Cross-validation setting handed to LassoCV /
                        LogisticRegressionCV.
        tol:            The tolerance for the optimization for LassoCV or
                        LogisticRegressionCV: if the updates are smaller than
                        `tol`, the optimization code checks the dual gap for
                        optimality and continues until it is smaller than
                        `tol`.
        max_iter:       The maximum number of iterations for LassoCV or
                        LogisticRegressionCV. None selects 1000 for
                        'regress' and 100 otherwise.
        n_jobs:         Number of CPUs to use during the cross validation in
                        LassoCV or LogisticRegressionCV. None means 1 unless
                        in a joblib.parallel_backend context. -1 means using
                        all processors.

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """

    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 tol=0.0001,
                 max_iter=None,
                 n_jobs=None,
                 random_state=None):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.winsorizer = Winsorizer(trim_quantile=lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        self.stddev = None
        self.mean = None
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.tol = tol
        # Honour an explicit max_iter; otherwise mirror the library defaults
        # (LassoCV: 1000, LogisticRegressionCV: 100).
        if max_iter is None:
            max_iter = 1000 if rfmode == 'regress' else 100
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.Cs = Cs

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Training data.
        y : array-like, shape=(n_samples,)
            Target values.
        feature_names : list of strings, optional
            Column names; ``feature_<i>`` names are generated when omitted.

        Returns
        -------
        self
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names

        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:
                # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:
                # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = (2 + np.floor(tree_sizes)).astype(int)

                # Keep the shortest prefix whose sizes add up to max_rules.
                # The length bound stops the search when even all sampled
                # trees together stay below max_rules.
                i = int(len(tree_sizes) / 4)
                while (i < len(tree_sizes)
                       and np.sum(tree_sizes[0:i]) < self.max_rules):
                    i = i + 1
                tree_sizes = tree_sizes[0:i]

                # Grow the ensemble one tree at a time so every tree can get
                # its own max_leaf_nodes.
                self.tree_generator.set_params(warm_start=True)
                random_state_add = self.random_state if self.random_state else 0
                # Copy once up front; the inputs do not change between rounds.
                X_c = np.copy(X, order='C')
                y_c = np.copy(y, order='C')
                for i_size, size in enumerate(tree_sizes):
                    # warm_start=True seems to reset random_state, such that
                    # the trees are highly correlated, unless we manually
                    # change the random_state here.
                    self.tree_generator.set_params(
                        n_estimators=i_size + 1,
                        max_leaf_nodes=int(size),
                        random_state=i_size + random_state_add)
                    self.tree_generator.fit(X_c, y_c)
                self.tree_generator.set_params(warm_start=False)

            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          (RandomForestRegressor, RandomForestClassifier)):
                # Forests expose a flat list of trees; wrap each one so the
                # layout matches the boosting estimators.
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:
                # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                # Explicit grid: alpha is the inverse of C. n_alphas is not
                # used once alphas are given, so keep its default instead of
                # passing None.
                n_alphas = 100
                alphas = 1. / np.asarray(self.Cs, dtype=float)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                max_iter=self.max_iter,
                                tol=self.tol,
                                n_jobs=self.n_jobs,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             max_iter=self.max_iter,
                                             tol=self.tol,
                                             n_jobs=self.n_jobs,
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def _design_matrix(self, X):
        """Build the [linear terms | rule terms] matrix used for prediction."""
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            n_rules = len(self.rule_ensemble.rules)
            # Guard n_rules == 0 explicitly: coef_[-0:] would select *all*
            # coefficients instead of none.
            rule_coefs = self.coef_[-n_rules:] if n_rules else self.coef_[:0]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return X_concat

    def predict(self, X):
        """Predict outcome for X

        """
        return self.lscv.predict(self._design_matrix(X))

    def predict_proba(self, X):
        """Predict outcome probability for X, if model type supports
        probability prediction method

        Raises
        ------
        ValueError
            If the fitted linear model has no ``predict_proba``.
        """
        if not hasattr(self.lscv, 'predict_proba'):
            error_message = '''
            Probability prediction using predict_proba not available for
            model type {lscv}
            '''.format(lscv=self.lscv)
            raise ValueError(error_message)
        return self.lscv.predict_proba(self._design_matrix(X))

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for
            maximum efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero. (default=False)

        subregion: If None (default) returns global importances (FP 2004
                   eq. 28/29), else returns importance over subregion of
                   inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the
               rule, 'coef' holds the coefficients and 'support' the support
               of the rule in the training data set (X)
        """
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []

        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                subregion = np.array(subregion)
                importance = sum(
                    abs(coef) *
                    abs([x[i] for x in self.winsorizer.trim(subregion)] -
                        self.mean[i])) / len(subregion)
            output_rules += [(self.feature_names[i], 'linear', coef, 1,
                              importance)]

        ## Add rules
        for i, rule in enumerate(rule_ensemble):
            coef = self.coef_[i + n_features]
            if subregion is None:
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support))**(1 / 2)
            else:
                rkx = rule.transform(subregion)
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)
            output_rules += [(rule.__str__(), 'rule', coef, rule.support,
                              importance)]

        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            # `.ix` no longer exists in pandas; boolean masks go through .loc.
            rules = rules.loc[rules.coef != 0]
        return rules
#----- parameters tuning of GBDT -----# param = { # init the hyperparams of GBDT 'learning_rate': 0.2, 'n_estimators': 100, # number of trees here 'max_depth': 8, # set max_depth of a tree 'min_samples_split': 20, 'min_samples_leaf': 10, 'subsample': 0.01, 'max_leaf_nodes': None, # set max leaf nodes of a tree 'random_state': 1, 'verbose': 0 } gbdt_model = GradientBoostingClassifier() gbdt_model.set_params(**param) ''' #----- parameters tuning of GBDT -----# ### n_estimators log_loss_train = [] log_loss_valid = [] n_estimators = [10,20,30,40,50,60,70,80,90,100,120,140] for nt in n_estimators: print('training: n_estimators = ', nt) param['n_estimators'] = nt gbdt_model.set_params(**param) gbdt_model.fit(X_train_gbdt, y_train_gbdt) # scores
return gsearch.best_params_ if __name__ == '__main__': train, test = load_data() gbdt_model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features='sqrt') model_fit(train, {'n_estimators': 100}) param_search = {'n_estimators': range(50, 220, 30)} param_find = grid_search(gbdt_model, param_search, train) gbdt_model.set_params(**param_find) param_search = { 'max_depth': range(3, 14, 2), 'min_samples_split': range(1, 301, 50) } param_find = grid_search(gbdt_model, param_search, train) gbdt_model.set_params(**param_find) param_search = {'min_samples_leaf': range(1, 101, 20)} param_find = grid_search(gbdt_model, param_search, train) gbdt_model.set_params(**param_find) gbdt_model = model_fit(train, gbdt_model.get_params())
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# --- Gradient boosting: grid search over the tree depth ---------------------
gbmodel = GradientBoostingClassifier(random_state=0)

param_grid = {
    # max_depth must be integral; float candidates are rejected by
    # scikit-learn's parameter validation.
    'max_depth': [2, 3, 4],
    'subsample': [0.8],
    'n_estimators': [100],
    'learning_rate': [0.2]
}
CV_gbmodel = GridSearchCV(estimator=gbmodel,
                          param_grid=param_grid,
                          cv=cross_validation_number_of_folds)
CV_gbmodel.fit(X_train, Y_train)
print("\n \n \n Gradient Boosting Classifier \n")
print(CV_gbmodel.best_params_)

# Refit on the whole training set with the best parameters, then score on test.
gbmodel = gbmodel.set_params(**CV_gbmodel.best_params_)
gbmodel.fit(X_train, Y_train)
Y_test_pred = gbmodel.predict(X_test)
accte = accuracy_score(Y_test, Y_test_pred)

# Append one summary row: label, CV mean, CV std, test accuracy.
report_x.loc[len(report_x)] = [
    'Gradient Boosting (grid)',
    CV_gbmodel.cv_results_['mean_test_score'][CV_gbmodel.best_index_],
    CV_gbmodel.cv_results_['std_test_score'][CV_gbmodel.best_index_],
    accte
]
print(report_x.loc[len(report_x) - 1])
# Report this model's CV scores (the original printed CV_rfmodel's scores,
# which belong to a different model).
print(CV_gbmodel.cv_results_['mean_test_score'])

# Accuracy versus max_depth. max_depth is the only parameter with more than
# one candidate, so cv_results_ is ordered exactly like this list.
depths = [int(d) for d in param_grid['max_depth']]
plt.plot(depths, CV_gbmodel.cv_results_['mean_test_score'])
plt.xlim(depths[0], depths[-1])  # include the deepest candidate
plt.xticks(depths)
plt.xlabel('Max depth')
plt.ylabel('Accuracy')
class RuleFitCustom(BaseEstimator, TransformerMixin):
    """Rulefit class

    Fits a tree ensemble, turns its trees into rules and then fits an
    L1-penalised linear model on the rules and/or the (optionally
    standardised) original features.

    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree.
                        FP 2004 (Sec. 2). 'default' uses min(0.5, (100 + 6*sqrt(N)) / N).
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of
                        terminal nodes based on an exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        Cs:             Regularisation grid. Regression: None -> 100 automatic alphas, a
                        sequence -> alphas = 1 / Cs, an int -> number of alphas.
                        Classification: passed to LogisticRegressionCV (10 when None).
        cv:             Cross-validation setting passed to LassoCV / LogisticRegressionCV.
        random_state:   Integer to initialise random objects and provide repeatability.
        simple_rules:   Passed as ``weigh_rules`` to ``RuleEnsemble.transform``.
        tree_generator: Optional: this object will be used as provided to generate the rules.
                        This will override almost all the other properties above.
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """

    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 random_state=None,
                 simple_rules=False):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.friedscale = FriedScale(trim_quantile=lin_trim_quantile)
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.Cs = Cs
        # TODO: make this a global parameter; we want to be able to set it in
        # fit so the same nodes are kept, which makes comparison easier.
        self.simple_rules = simple_rules

    def _default_tree_generator(self, n_samples):
        """Build the default boosting model used to generate rules."""
        n_estimators_default = int(np.ceil(self.max_rules / self.tree_size))
        # Honour an explicit sample_fract; it used to be silently ignored.
        if self.sample_fract == 'default':
            self.sample_fract_ = min(
                0.5, (100 + 6 * np.sqrt(n_samples)) / n_samples)
        else:
            self.sample_fract_ = self.sample_fract
        if self.rfmode == 'regress':
            generator_cls = GradientBoostingRegressor
        else:
            generator_cls = GradientBoostingClassifier
        return generator_cls(n_estimators=n_estimators_default,
                             max_leaf_nodes=self.tree_size,
                             learning_rate=self.memory_par,
                             subsample=self.sample_fract_,
                             random_state=self.random_state,
                             max_depth=100)

    def _check_tree_generator(self):
        """Raise ValueError if the tree generator type does not match rfmode."""
        if self.rfmode == 'regress':
            if type(self.tree_generator) not in [
                    GradientBoostingRegressor, RandomForestRegressor
            ]:
                raise ValueError(
                    "RuleFit only works with RandomForest and BoostingRegressor"
                )
        else:
            if type(self.tree_generator) not in [
                    GradientBoostingClassifier, RandomForestClassifier
            ]:
                raise ValueError(
                    "RuleFit only works with RandomForest and BoostingClassifier"
                )

    def _fit_with_random_tree_sizes(self, X, y):
        """Grow the ensemble one tree at a time with randomised tree sizes.

        Tree sizes follow Friedman 2005 Sec 3.3: 2 + floor(Exp(tree_size - 2)).
        """
        # A private RandomState yields the same draws as seeding the global
        # numpy RNG, without mutating global state.
        rng = np.random.RandomState(self.random_state)
        n_draws = int(np.ceil(self.max_rules * 2 / self.tree_size))
        draws = rng.exponential(scale=self.tree_size - 2, size=n_draws)
        tree_sizes = (2 + np.floor(draws)).astype(int)

        # Keep just enough trees to reach max_rules terminal nodes, never
        # fewer than a quarter of the draws. The bound on i prevents an
        # endless loop if all draws together fall short of max_rules.
        i = int(len(tree_sizes) / 4)
        while i < len(tree_sizes) and np.sum(tree_sizes[0:i]) < self.max_rules:
            i = i + 1
        tree_sizes = tree_sizes[0:i]

        seed_offset = self.random_state if self.random_state else 0
        # The C-ordered copies do not change between iterations.
        X_c = np.copy(X, order='C')
        y_c = np.copy(y, order='C')

        self.tree_generator.set_params(warm_start=True)
        for i_size, size in enumerate(tree_sizes):
            # warm_start=True seems to reset random_state, such that the trees
            # are highly correlated, unless we manually change the
            # random_state here.
            self.tree_generator.set_params(n_estimators=i_size + 1,
                                           max_leaf_nodes=int(size),
                                           random_state=i_size + seed_offset)
            self.tree_generator.fit(X_c, y_c)
        self.tree_generator.set_params(warm_start=False)

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Parameters
        ----------
        X: array-like matrix, shape=(n_samples, n_features)
        y: target values passed to the tree generator and the linear model.
        feature_names: optional list of column names; defaults to
            'feature_0', 'feature_1', ...

        Returns
        -------
        self

        Raises
        ------
        ValueError: if the tree generator type does not match ``rfmode``.
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names

        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                self.tree_generator = self._default_tree_generator(N)
            self._check_tree_generator()

            ## fit tree generator
            if not self.exp_rand_tree_size:
                # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:
                self._fit_with_random_tree_sizes(X, y)

            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          (RandomForestRegressor, RandomForestClassifier)):
                # Match the nested layout of boosting estimators_.
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(
                tree_list=tree_list, feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(
                X, weigh_rules=self.simple_rules)
            self.X_rules = X_rules

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)
        self.X_concat = X_concat

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                # asarray so that plain lists/tuples of Cs work as well
                alphas = 1. / np.asarray(self.Cs, dtype=float)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        Builds the same feature layout as in ``fit`` (linear terms first,
        then rules) and delegates to the fitted linear model.
        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            # Rule coefficients are the trailing entries of coef_. Slice from
            # an explicit start index: coef_[-0:] would select everything when
            # there are no rules.
            # (bug correction upstreamed at
            # https://github.com/christophM/rulefit/issues/23)
            n_rules = len(self.rule_ensemble.rules)
            rule_coefs = self.coef_[len(self.coef_) - n_rules:]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(
                    X, coefs=rule_coefs, weigh_rules=self.simple_rules)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.
        y : ignored; accepted only for API compatibility.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero. Defaults to False.

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule,
               'type' is 'linear' or 'rule', 'coef' holds the coefficients,
               'support' the support of the rule in the training data set (X)
               and 'rule_number' the index of the rule in the ensemble
               (0 for linear terms).
        """
        rule_ensemble = list(self.rule_ensemble.rules)
        n_features = len(self.coef_) - len(rule_ensemble)
        output_rules = []

        ## Add coefficients for linear effects
        for i in range(n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            # TODO remove: trailing 0 is a debugging placeholder
            output_rules.append((self.feature_names[i], 'linear', coef, 1, 0))

        ## Add rules
        for i, rule in enumerate(rule_ensemble):
            coef = self.coef_[i + n_features]
            # TODO remove: trailing rule index is for debugging
            output_rules.append(
                (rule.__str__(), 'rule', coef, rule.support, i))

        # TODO remove: 'rule_number' column is for debugging
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "rule_number"])
        if exclude_zero_coef:
            # .loc instead of .ix: the .ix indexer was removed in pandas 1.0.
            rules = rules.loc[rules.coef != 0]
        return rules

    def rules_complexity(self):
        """Return the total number of conditions over all rules with a
        non-zero coefficient."""
        rule_ensemble = list(self.rule_ensemble.rules)
        n_features = len(self.coef_) - len(rule_ensemble)
        res = 0
        for rule, coef in zip(rule_ensemble, self.coef_[n_features:]):
            if coef != 0:
                res += len(rule.conditions)
        return res
pickle.dump(feature_map, f, pickle.HIGHEST_PROTOCOL) del feature_map label_train = ds_train['click'] label_valid = ds_valid['click'] ds_train = ds_train.drop(['click'], axis=1).values ds_valid = ds_valid.drop(['click'], axis=1).values print('build gbdt model ...') gbdt = GradientBoostingClassifier(loss='deviance', n_estimators=1000, learning_rate=0.1, max_depth=10, subsample=0.8, min_samples_split=2000, min_samples_leaf=1000, random_state=0, verbose=1, warm_start=True) for i in range(200): gbdt.set_params(n_estimators=(i + 1) * 100) print('fit model ...', i) gbdt.fit(ds_train, label_train) print('predict...') proba = gbdt.predict_proba(ds_valid) print('valid score', log_loss(label_valid, proba)) print('dump model to output') joblib.dump(gbdt, '/output/gbdt' + str(i) + '.pkl')
class Model:
    """Gradient-boosting baseline trained on the numerical features only.

    ``fit`` may be called repeatedly on new batches; after the first call the
    classifier is warm-started and one estimator is added per call.
    """

    # Fraction of every incoming batch that fit() discards before training.
    REMOVE_FRACTION = 0.9

    def __init__(self, datainfo, timeinfo):
        '''
        Initialize the data members and create the (untrained) classifier.

        Args:
            datainfo: dict; reads 'time_budget' and 'loaded_feat_types'
                (four counts: time, numerical, categorical, multi-valued).
            timeinfo: sequence whose first two items are the overall and the
                dataset start times (seconds, as returned by time.time()).
        '''
        # Just print some info from the datainfo variable
        print("The Budget for this data set is: %d seconds" %
              datainfo['time_budget'])
        print(
            "Loaded %d time features, %d numerical Features, %d categorical features and %d multi valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        self._report_time(timeinfo)

        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = GradientBoostingClassifier(n_estimators=5,
                                              verbose=1,
                                              random_state=1,
                                              min_samples_split=10,
                                              warm_start=False)

    @staticmethod
    def _report_time(timeinfo):
        """Print the time elapsed since the overall and dataset start times."""
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    def fit(self, F, y, datainfo, timeinfo):
        '''
        Train (or warm-start) the classifier on a random subsample of the batch.

        Args:
            F: dict of feature blocks; only F['numerical'] (matrix of dim
               num_train_samples * num_feat) is used. The categorical and
               multi-valued blocks are ignored by this baseline.
            y: Training label matrix of dim num_train_samples * num_labels
               (a 1-D label vector is accepted as well).
            datainfo: unused here.
            timeinfo: start times, see __init__.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1,
        test2, etc.) training is warm-started from the pre-trained model.
        Past data will NOT be available for re-training.
        '''
        # only get numerical variables
        X = F['numerical']
        self._report_time(timeinfo)

        # convert NaN to zeros
        X = data_converter.replace_missing(X)

        self.num_train_samples = X.shape[0]
        if X.ndim > 1:
            self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]
        if y.ndim > 1:
            self.num_labels = y.shape[1]

        # subsample the data for efficient processing
        if self.REMOVE_FRACTION > 0:
            rem_samples = int(num_train_samples * self.REMOVE_FRACTION)
            keep = sorted(
                random.sample(range(num_train_samples),
                              num_train_samples - rem_samples))
            num_train_samples = num_train_samples - rem_samples
            # Row indexing without an explicit column slice works for both
            # 1-D and 2-D arrays (y[keep, :] fails on a 1-D label vector).
            X = X[keep]
            y = y[keep]
            self.num_train_samples = X.shape[0]

        if self.is_trained:
            # Keep the existing trees and grow one more on the new batch.
            self.clf.set_params(n_estimators=self.clf.n_estimators + 1,
                                warm_start=True)
        self.DataX = X
        self.DataY = y

        print("The whole available data is: ")
        print(("Real-FIT: dim(X)= [{:d}, {:d}]").format(
            self.DataX.shape[0], self.num_feat))
        print(("Real-FIT: dim(y)= [{:d}, {:d}]").format(
            self.DataY.shape[0], self.num_labels))

        self.clf.fit(self.DataX, np.ravel(self.DataY))

        if (self.num_train_samples != num_train_samples):
            print("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        Return the classifier's decision values for the numerical features.

        Discriminant values rather than class labels are returned, which is
        what ranking metrics such as the area under the ROC curve expect.

        Args:
            F: dict of feature blocks; only F['numerical'] is used.
            datainfo: unused here.
            timeinfo: start times, see __init__.
        Returns:
            The transposed output of ``clf.decision_function``.
        '''
        self._report_time(timeinfo)

        # only get numerical variables
        X = F['numerical']

        # convert NaN to zeros
        X = data_converter.replace_missing(X)
        num_test_samples = X.shape[0]
        # Default to a single feature so num_feat is defined for 1-D input.
        num_feat = X.shape[1] if X.ndim > 1 else 1
        print(("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                       num_feat))
        if (self.num_feat != num_feat):
            print(
                "ARRGH: number of features in X does not match training data!")
        print(("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                       self.num_labels))
        y = self.clf.decision_function(X)
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        """Pickle this model to ``path + '_model.pickle'``."""
        # Pickle data is binary: open in "wb" and close the handle reliably.
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        """Return the model stored at ``path + '_model.pickle'``.

        If no such file exists, the current instance is returned unchanged.
        """
        modelfile = path + '_model.pickle'
        model = self
        if isfile(modelfile):
            # NOTE: pickle.load executes arbitrary code from the file; only
            # load model files from a trusted location.
            with open(modelfile, "rb") as f:
                model = pickle.load(f)
            print("Model reloaded from: " + modelfile)
        return model
def _to_float32_matrix(data):
    """Return ``data`` (pandas DataFrame or array-like) as a float32 ndarray."""
    if isinstance(data, pd.DataFrame):
        # .values works on every pandas version; as_matrix() was removed in 1.0
        return data.values.astype(np.float32)
    return np.asarray(data).astype(np.float32)


def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None,
                                           upper_param_vals=None,
                                           lower_param_keys=None,
                                           lower_param_vals=None,
                                           num_proc=None):
    """Grid-search a two-stage "GBDT features + linear classifier" model.

    upper model is GBDT or Random Forest
    lower model is Linear Classifier

    Stage 1 tunes a GradientBoostingClassifier, plots its loss curves and
    feature importances, and dumps the tree-transformed train/test features.
    It is skipped when both dumped feature files already exist. Stage 2
    grid-searches the lower (linear) model on those features and writes its
    meta features.

    Parameters
    ----------
    stack_setting_ : nested dict of stacking settings (folders, file names).
        The process exits if it is None.
    upper_param_keys, upper_param_vals : parallel lists describing the upper
        grid; the first entry is 'model_type'. Defaults are used when None.
    lower_param_keys, lower_param_vals : parallel lists describing the lower
        grid. Defaults are used when None.
    num_proc : number of parallel jobs for the upper grid search (default 6).

    Returns
    -------
    (upper_best_params, lower_best_param); ``upper_best_params`` is None when
    stage 1 was skipped because the feature files already existed.
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()
    if num_proc is None:
        num_proc = 6

    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state',
                            'subsample', 'max_features', 'max_leaf_nodes',
                            'learning_rate', 'max_depth', 'min_samples_leaf']
    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0],
                            [0.1], [5], [20], [0.1], [2], [8]]

    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 is model-agnostic; the data, on the other hand, is fixed.
    exp = ExperimentL1(data_folder=stack_setting_['0-Level']['folder'],
                       train_fname=stack_setting_['0-Level']['train'],
                       test_fname=stack_setting_['0-Level']['test'])

    upper_gbdt = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']
    model_folder = upper_gbdt['folder']
    model_train_fname = os.path.join(Config.get_string('data.path'),
                                     model_folder,
                                     upper_gbdt['train'])
    model_test_fname = os.path.join(Config.get_string('data.path'),
                                    model_folder,
                                    upper_gbdt['test'])
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))

    # Stays None when the cached feature files are reused, so that the final
    # return statement never hits an unbound name.
    upper_best_params = None

    # Regenerate when either file is missing: the lower stage needs both.
    if not (os.path.isfile(model_train_fname) and
            os.path.isfile(model_test_fname)):
        # 'model_type' is not an estimator parameter; drop it before the search.
        del upper_param_dict['model_type']
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict,
                              verbose=10,
                              scoring="f1",  # scoring = "precision" or "recall"
                              n_jobs=num_proc, cv=5)

        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print(upper_best_params)
        del clf_cv

        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        # The loss is defined on raw decision values, not on predicted labels.
        for i, pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        upper_graph = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']
        graph_fname = os.path.join(Config.get_string('data.path'),
                                   upper_graph['folder'],
                                   upper_graph['name'])

        gs = GridSpec(2, 2)
        ax1 = plt.subplot(gs[0, 1])
        ax2 = plt.subplot(gs[1, 1])
        ax3 = plt.subplot(gs[:, 0])

        iterations = np.arange(len(clf.estimators_)) + 1
        ax1.plot(iterations, test_loss, label='Test')
        ax1.plot(iterations, train_loss, label='Train')
        ax1.set_xlabel('the number of weak learner:Boosting Iterations')
        ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss', 'RMSE')))
        ax1.legend(loc="best")

        # dump for the transformed feature
        X_train_mat = _to_float32_matrix(X_train)
        X_test_mat = _to_float32_matrix(X_test)
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_=upper_best_params)
        clf.fit(X_train_mat, y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),),
                             dtype=np.float32)
        staged = clf.estimator_.staged_decision_function(X_test_mat)
        for stage, y_pred in enumerate(staged):
            test_loss[stage] = clf.estimator_.loss_(y_test, y_pred)

        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss', 'RMSE')))
        ax2.legend(loc="best")

        # tree ensembles: plot the features whose importance lies at or above
        # the 0.8 quantile of the non-zero importances
        score_threshold = 0.8
        if isinstance(X_train, pd.DataFrame):
            feature_names = list(X_train.columns.values)
        else:
            # plain arrays carry no names; fall back to the column index
            feature_names = list(range(X_train_mat.shape[1]))
        index2feature = dict(enumerate(feature_names))
        importances = clf.estimator_.feature_importances_
        order = importances.argsort()[::-1]
        fis = pd.DataFrame(
            {'name': [index2feature.get(int(j), 'Null') for j in order],
             'score': [importances[int(j)] for j in order]}
        )
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x='score', y='name',
                    data=fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        ## feature transformation : get test data from train trees
        transformated_train_features = clf.one_hot_encoding
        transformated_test_features = clf.transform(X_test_mat, y_test)

        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64],
                            ['uniform', 'distance'], ['ball_tree'], [30],
                            ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # NOTE(review): the default lower 'model_type' is KNeighborsClassifier,
    # which matches neither branch below and therefore exits - confirm intent.
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'
    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'
    else:
        sys.stderr.write("You should input lower liner model\n")
        sys.exit()

    # ExperimentL1_1 is model-agnostic; the data, on the other hand, is fixed.
    model_train_fname = upper_gbdt['train']
    model_test_fname = upper_gbdt['test']
    exp = ExperimentL1_1(data_folder=upper_gbdt['folder'],
                         train_fname=model_train_fname,
                         test_fname=model_test_fname)

    # GridSearch has a single model. The model is determined by param.
    lower_cv = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder=lower_cv['folder'],
                    cv_out=lower_cv['cv_out'],
                    cv_pred_out=lower_cv['cv_pred_out'],
                    refit_pred_out=lower_cv['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print(lower_best_param)

    # get meta_feature: insert the model name before the file extension,
    # e.g. "meta.csv" -> "meta_LR.csv"
    lower_meta = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(lower_meta['train'].split(".")[:-1]),
        clf_lower_mname,
        lower_meta['train'].split(".")[-1]
    )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(lower_meta['test'].split(".")[:-1]),
        clf_lower_mname,
        lower_meta['test'].split(".")[-1]
    )
    exp.write2csv_meta_feature(
        model=clf_lower_model,
        meta_folder=lower_meta['folder'],
        meta_train_fname=meta_train_fname_,
        meta_test_fname=meta_test_fname_,
        meta_header=lower_meta['header'],
        best_param_=lower_best_param
    )

    ## best parameter for GBDT and another sklearn classifier
    return upper_best_params, lower_best_param
# estimator : ensemble学習器 ======= # estimator : ensemble蟄ヲ鄙貞勣 >>>>>>> a476ecf10868a68d67e3d992fef72bc4370722a8 # cv : if train : get best parameter if phase == "train": clf = GradientBoostingClassifier() gscv = GridSearchCV(clf, parameters, verbose = 10, scoring = "f1",#scoring = "precision" or "recall" n_jobs = n_jobs, cv = cv_k_fold) gscv.fit(X_train, y_train) self.best_params = gscv.best_params_ clf.set_params(**gscv.best_params_) clf.fit(X_train, y_train) train_loss = clf.train_score_ test_loss = np.empty(len(clf.estimators_)) for i, pred in enumerate(clf.staged_predict(X_test)): test_loss[i] = clf.loss_(y_test, pred) plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test') plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train') plt.xlabel('the number of weak learner:Boosting Iterations') plt.ylabel('Loss') plt.legend(loc="best") plt.savefig("loss_cv.png") plt.close() estimator.set_params(**gscv.best_params_) self.estimator = estimator
def main(trainXFile='/home/kiran/kdd/trainXall.csv',
         trainYFile='/home/kiran/kdd/trainY.csv',
         validXFile='/home/kiran/kdd/validXall.csv',
         validYFile='/home/kiran/kdd/validY.csv',
         testXFile='/home/kiran/kdd/testXall.csv',
         n_estimators=150,
         n_estimators_step=150,
         learning_rate=0.005,
         max_features=30,
         max_depth=11,
         verbose=0,
         dump_file='/home/kiran/kdd/pymodels/gbm_all_0.005_30_11.pkl',
         outputFile='prediction.txt',
         max_trees=300,
         random_state=11):
    """Grow a gradient-boosting classifier until validation AUC stops improving.

    The model is first fitted with ``n_estimators`` trees and then grown in
    increments of ``n_estimators_step`` (using ``warm_start``) for as long as
    the validation ROC AUC improves and the ensemble stays within
    ``max_trees``.  The best model seen is dumped to ``dump_file``; its
    validation predictions followed by its test predictions are written to
    ``outputFile``.

    Args:
        trainXFile: CSV file with the training features.
        trainYFile: Comma-delimited text file with the training labels.
        validXFile: CSV file with the validation features.
        validYFile: Comma-delimited text file with the validation labels.
        testXFile: CSV file with the test features.
        n_estimators: Number of trees in the initial fit.
        n_estimators_step: Number of trees added per growth step.
        learning_rate: Passed to ``GradientBoostingClassifier``.
        max_features: Passed to ``GradientBoostingClassifier``.
        max_depth: Passed to ``GradientBoostingClassifier``.
        verbose: Passed to ``GradientBoostingClassifier``.
        dump_file: Path the best model is written to with ``joblib.dump``.
        outputFile: Path the concatenated predictions are written to.
        max_trees: Upper bound on the ensemble size; no larger model is fitted.
        random_state: Passed to ``GradientBoostingClassifier``.
    """
    # Smallest AUC gain that still counts as an improvement.
    min_improvement = 0.0000000001

    actual = np.loadtxt(validYFile, delimiter=",")
    trainY = np.loadtxt(trainYFile, delimiter=",")
    train = pd.read_csv(trainXFile)
    valid = pd.read_csv(validXFile)

    gbm = GradientBoostingClassifier(n_estimators=n_estimators,
                                     learning_rate=learning_rate,
                                     max_features=max_features,
                                     max_depth=max_depth,
                                     random_state=random_state,
                                     verbose=verbose)
    gbm.fit(train, trainY)

    # The initial fit is the baseline: it guarantees that a best prediction
    # and a dumped model exist even if no growth step is accepted.
    bestPrediction = gbm.predict_proba(valid)[:, 1]
    bestAUC = metrics.roc_auc_score(actual, bestPrediction)
    best_n_estimators = n_estimators
    joblib.dump(gbm, dump_file)
    print("n_estimators: %d bestAUC: %f" % (n_estimators, bestAUC))

    # With warm_start the next fit() keeps the existing trees and only adds
    # the missing ones instead of retraining from scratch.
    gbm.set_params(warm_start=True)
    while n_estimators + n_estimators_step <= max_trees:
        n_estimators = n_estimators + n_estimators_step
        gbm.set_params(n_estimators=n_estimators)
        gbm.fit(train, trainY)
        prediction_valid = gbm.predict_proba(valid)[:, 1]
        myAUC = metrics.roc_auc_score(actual, prediction_valid)
        print("bestAUC: %f myAUC: %f" % (bestAUC, myAUC))

        improvement = myAUC - bestAUC
        if improvement < min_improvement:
            break

        bestAUC = myAUC
        bestPrediction = prediction_valid
        best_n_estimators = n_estimators
        joblib.dump(gbm, dump_file)
        print("bestAUC: %f improvement: %f" % (bestAUC, improvement))

    # A rejected growth step leaves `gbm` larger than the best model; restore
    # the dumped one so validation and test predictions come from the same
    # model.
    if n_estimators != best_n_estimators:
        gbm = joblib.load(dump_file)

    print("AUC: %f n_estimators: %d" % (bestAUC, best_n_estimators))

    test = pd.read_csv(testXFile)
    prediction_test1 = gbm.predict_proba(test)[:, 1]
    # Output layout: validation predictions first, then test predictions.
    prediction_total = np.concatenate((bestPrediction, prediction_test1), axis=0)
    np.savetxt(outputFile, prediction_total, delimiter=",")
subsample=0.8, random_state=42 ) parameter_grid = { 'learning_rate': np.arange(0.001, 0.003, 0.0005), 'n_estimators': np.arange(1000, 3000, 500) } grid_gradient = GridSearchCV(gb, parameter_grid, cv=cv_splitter, n_jobs=-1) grid_gradient.fit(X_1, y) gradient_best_param = grid_gradient.best_params_ gradient_best_param # best parameter values to be used in the stack model # update gb with the optimal parameters gb.set_params(**gradient_best_param) # #### 2. Tune max_depth and min_sample_split parameter_grid = { 'max_depth': np.arange(1, 5), 'min_samples_split': np.arange(2, 6, 1) } grid_gradient = GridSearchCV(gb, parameter_grid, cv=cv_splitter, n_jobs=-1) grid_gradient.fit(X_1, y) grid_gradient.best_params_ gradient_best_param.update(grid_gradient.best_params_) gradient_best_param # update best parameter values to be used in the stack model
arrowprops=dict(arrowstyle="<->")) ax.text(810, 0.25, 'train-test gap') # regularization def fmt_params(params): return ", ".join("{0}={1}".format(key, val) for key, val in params.items()) fig = plt.figure(figsize=(8, 5)) ax = plt.gca() for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')), ({'min_samples_leaf': 3}, ('#fdae61', '#abd9e9'))]: est = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=1, learning_rate=1.0) est.set_params(**params) est.fit(X_train, y_train) test_dev, ax = deviance_plot(est, X_test, y_test, ax=ax, label=fmt_params(params), train_color=train_color, test_color=test_color) ax.annotate('Higher bias', xy=(900, est.train_score_[899]), xycoords='data', xytext=(600, 0.3), textcoords='data', arrowprops=dict(arrowstyle="->", connectionstyle="arc"), ) ax.annotate('Lower variance', xy=(900, test_dev[899]), xycoords='data', xytext=(600, 0.4), textcoords='data', arrowprops=dict(arrowstyle="->", connectionstyle="arc"), ) plt.legend(loc='upper right')
# Split the labelled frame: rows selected by `rindex` are used for training,
# all remaining rows are held out for evaluation.
train_data = data.loc[rindex, :]
test_data = data.drop(rindex)

# Separate the label column from the features; `id` is dropped as well.
train_label = train_data.target
train_data = train_data.drop(['target', 'id'], axis=1)
test_label = test_data.target
test_data = test_data.drop(['target', 'id'], axis=1)

# warm_start=True: after n_estimators is raised, fit() adds trees to the
# already fitted ensemble instead of training a new one.
gb_otto = GradientBoostingClassifier(n_estimators=100, verbose=1,
                                     warm_start=True)

# Grow the ensemble in stages and report the log loss after each stage.
for n_trees in (100, 150, 200):
    gb_otto.set_params(n_estimators=n_trees)
    gb_otto.fit(train_data, train_label)
    test_prob = gb_otto.predict_proba(test_data)
    train_prob = gb_otto.predict_proba(train_data)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('The logloss score of test data: %s'
          % (logloss(test_label, test_prob),))
    print('The logloss score of train data: %s'
          % (logloss(train_label, train_prob),))

# Next stage size; the matching fit() is not part of this excerpt.
gb_otto.set_params(n_estimators=250)
train_coup3_y = train_df_coup3_y.values test_coup3_X = test_df_coup3.values std= StandardScaler() train_coup3_X= std.fit_transform(train_coup3_X) test_coup3_X= std.fit_transform(test_coup3_X) X_train14,X_test14,y_train14,y_test14 = train_test_split(train_coup3_X,train_coup3_y,test_size = 0.20) gbc14 = GradientBoostingClassifier(n_estimators=3000) param_grid14 = {'max_depth': [3,4,6],#tree depths 'min_samples_leaf': [5,9,12], #no. of samples to be at leaf nodes 'learning_rate': [0.1,0.01,0.05,0.001]## Shrinkage #'max_features': [1.0, 0.3] #no.of features before finding best split node #stochastic gradient } #loss gs_cv14 = GridSearchCV(gbc14, param_grid14, cv=3,scoring='accuracy',n_jobs=-1).fit(X_train14, y_train14) print('Best hyperparameters: %r' % gs_cv14.best_params_) gbc14.set_params(**gs_cv14.best_params_) gbc14.fit(X_train14,y_train14) #Models Ensambling def ensambling(X,Y): # The DEV SET will be used for all training and validation purposes # The TEST SET will never be used for training, it is the unseen set. dev_cutoff = len(Y) * 4/5 X_dev = X[:dev_cutoff] Y_dev = Y[:dev_cutoff] X_test = X[dev_cutoff:] Y_test = Y[dev_cutoff:]