def deserialize_gradient_boosting(model_dict):
    """Rebuild a fitted GradientBoostingClassifier from its serialized dict form."""
    model = GradientBoostingClassifier(**model_dict['params'])

    # Restore the per-stage regression trees and shape them like the original ensemble.
    trees = [regression.deserialize_decision_tree_regressor(tree)
             for tree in model_dict['estimators_']]
    model.estimators_ = np.array(trees).reshape(model_dict['estimators_shape'])

    # Restore the init estimator; only the dummy variant is supported here.
    if 'init_' in model_dict and model_dict['init_']['meta'] == 'dummy':
        model.init_ = dummy.DummyClassifier()
        model.init_.__dict__ = model_dict['init_']
        model.init_.__dict__.pop('meta')

    # Fitted attributes that serialize directly.
    model.classes_ = np.array(model_dict['classes_'])
    model.train_score_ = np.array(model_dict['train_score_'])
    model.max_features_ = model_dict['max_features_']
    model.n_classes_ = model_dict['n_classes_']
    model.n_features_ = model_dict['n_features_']

    # Recreate the loss object from its serialized name.
    loss_name = model_dict['loss_']
    if loss_name == 'deviance':
        model.loss_ = _gb_losses.BinomialDeviance(model.n_classes_)
    elif loss_name == 'exponential':
        model.loss_ = _gb_losses.ExponentialLoss(model.n_classes_)
    elif loss_name == 'multinomial':
        model.loss_ = _gb_losses.MultinomialDeviance(model.n_classes_)

    if 'priors' in model_dict:
        model.init_.priors = np.array(model_dict['priors'])
    return model
def strategyGBDT(X_train, y_train, X_test, y_test):
    """Compare GBDT regularization settings by plotting held-out deviance.

    Fits one GradientBoostingClassifier per (learning_rate, subsample,
    max_features) setting and plots test-set deviance per boosting stage.
    """
    print('strategy result ...')
    original_params = {
        'n_estimators': 100,
        'max_leaf_nodes': 4,
        'max_depth': 11,
        'random_state': 10,
        'min_samples_split': 60
    }
    plt.figure()
    # FIX: the original legend labels were out of sync with the settings they
    # annotate ('learning_rate=1.0' actually plotted learning_rate=0.1, and
    # 'max_features=1' actually plotted max_features=2).
    for label, color, setting in [
            ('No shrinkage', 'orange', {'learning_rate': 1.0, 'subsample': 1.0}),
            ('learning_rate=0.1', 'turquoise', {'learning_rate': 0.1, 'subsample': 1.0}),
            ('subsample=0.5', 'blue', {'learning_rate': 1.0, 'subsample': 0.5}),
            ('learning_rate=0.1, subsample=0.5', 'gray', {'learning_rate': 0.1, 'subsample': 0.5}),
            ('learning_rate=0.1, max_features=2', 'magenta', {'learning_rate': 0.1, 'max_features': 2})]:
        params = dict(original_params)
        params.update(setting)
        print('model start')
        clf = GradientBoostingClassifier(**params)
        clf.fit(X_train, y_train)

        # Deviance on the held-out set after each boosting stage.
        test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)
        for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
            # clf.loss_ assumes that y_test[i] in {0, 1}
            test_deviance[i] = clf.loss_(y_test, y_pred)

        plt.plot((np.arange(test_deviance.shape[0]) + 1),
                 test_deviance,
                 color=color,
                 label=label)
        print('model finished')
    plt.legend(loc='upper left')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Test Set Deviance')
    plt.show()
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
             X_train, y_train, X_test, y_test):
    """Wrap an ensemble estimator.

    In the "train" phase: grid-search GradientBoostingClassifier
    hyper-parameters (F1-scored), plot per-stage train/test loss to
    loss_cv.png, and copy the winning parameters onto `estimator`.
    """
    # estimator : the ensemble learner to configure
    # cv : if train : get best parameter
    if phase == "train":
        clf = GradientBoostingClassifier()
        gscv = GridSearchCV(clf, parameters,
                            verbose=10,
                            scoring="f1",  # alternatives: "precision", "recall"
                            n_jobs=n_jobs,
                            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_

        # Refit a standalone model with the winning parameters.
        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)

        train_loss = clf.train_score_
        n_stages = len(clf.estimators_)
        test_loss = np.empty(n_stages)
        for stage, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[stage] = clf.loss_(y_test, pred)

        iterations = np.arange(n_stages) + 1
        plt.plot(iterations, test_loss, label='Test')
        plt.plot(iterations, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()

        estimator.set_params(**gscv.best_params_)
    self.estimator = estimator
    self.one_hot_encoding = None
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
             X_train, y_train, X_test, y_test):
    """Configure `estimator` with GBDT hyper-parameters (Python 2 code).

    In the "train" phase the parameters come from an F1-scored grid
    search and the per-stage train/test loss curves are written to
    loss_cv.png; otherwise a fixed parameter dict is applied.
    """
    # estimator : ensemble learner
    # cv : if train : get best parameter
    if phase == "train":
        gscv = GridSearchCV(
            GradientBoostingClassifier(),
            parameters,
            verbose=10,
            scoring="f1",  #scoring = "precision" or "recall"
            n_jobs=n_jobs,
            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        best_params = gscv.best_params_
        print "[GBDT's Best Parameter]", gscv.best_params_
        # Refit a fresh model with the winning parameters.
        clf = GradientBoostingClassifier()
        clf.set_params(**gscv.best_params_)
        del gscv  # drop the search object before the (memory-heavy) refit
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # staged_predict yields predictions after each boosting stage.
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)
        plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
    else:
        # NOTE(review): these values are list-wrapped (grid-search style), so
        # set_params(**best_params) below sets list-valued parameters on the
        # estimator — presumably a latent bug; verify against callers.
        best_params = {
            'loss': ['deviance'],
            'learning_rate': [0.1],
            'max_depth': [2],
            'min_samples_leaf': [8],
            'max_features': [5],  #max_features must be in (0, n_features]
            'max_leaf_nodes': [20],
            'subsample': [0.1],
            'n_estimators': [100],
            'random_state': [0]
        }
    estimator.set_params(**best_params)
    self.estimator = estimator
    self.one_hot_encoding = None
def test_max_feature_regression():
    """Check that the seeded GBDT reaches a small held-out deviance."""
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, y_train = X[:2000], y[:2000]
    X_test, y_test = X[2000:], y[2000:]
    gbrt = GradientBoostingClassifier(
        n_estimators=100,
        min_samples_split=5,
        max_depth=2,
        learning_rate=0.1,
        max_features=2,
        random_state=1,
    )
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
def test_max_feature_regression():
    """Regression test: with a fixed random_state the held-out deviance
    on the Hastie data must stay below 0.5."""
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]
    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                      max_depth=2, learning_rate=.1,
                                      max_features=2, random_state=1)
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    # FIX: assert_true came from the deprecated/removed sklearn.utils.testing
    # module; a plain assert is the supported equivalent and matches the
    # sibling test in this file.
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
             X_train, y_train, X_test, y_test):
    """Configure `estimator` with GBDT hyper-parameters (Python 2 code).

    "train" phase: F1-scored grid search over `parameters`, loss curves
    saved to loss_cv.png.  Other phases: a fixed parameter dict is used.
    """
    # estimator : ensemble learner
    # cv : if train : get best parameter
    if phase == "train":
        gscv = GridSearchCV(GradientBoostingClassifier(), parameters,
                            verbose = 10,
                            scoring = "f1",#scoring = "precision" or "recall"
                            n_jobs = n_jobs, cv = cv_k_fold)
        gscv.fit(X_train, y_train)
        best_params = gscv.best_params_
        print "[GBDT's Best Parameter]", gscv.best_params_
        # Refit a fresh model with the winning parameters.
        clf = GradientBoostingClassifier()
        clf.set_params(**gscv.best_params_)
        del gscv  # drop the search object before the refit
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # staged_predict yields predictions after each boosting stage.
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)
        plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
    else:
        # NOTE(review): list-wrapped values (grid-search style); set_params
        # below therefore sets list-valued parameters — presumably a latent
        # bug, verify against callers.
        best_params = {'loss' : ['deviance'],
                       'learning_rate' : [0.1],
                       'max_depth': [2],
                       'min_samples_leaf': [8],
                       'max_features': [5],#max_features must be in (0, n_features]
                       'max_leaf_nodes' : [20],
                       'subsample' : [0.1],
                       'n_estimators' : [100],
                       'random_state' : [0]}
    estimator.set_params(**best_params)
    self.estimator = estimator
    self.one_hot_encoding = None
def main():
    """Gradient-boosting learning-rate study on gbm-data.csv.

    For each learning rate, tracks per-stage deviance and log-loss on the
    held-out split, saves a curve plot per rate, writes the minimum
    log-loss for rate 0.2 to task_2.txt, then fits a RandomForest with
    the overall-best stage count and writes its log-loss to task_3.txt.
    """
    data = pd.read_csv('gbm-data.csv')
    values = data.values
    # First column is the label, the rest are features.
    X = values[:, 1:]
    y = values[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,
                                                        random_state=241)
    min_losses = []
    for i in [1, 0.5, 0.3, 0.2, 0.1]:
        clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                         random_state=241, learning_rate=i)
        clf.fit(X_train, y_train)
        test_deviance = np.zeros(250, dtype=np.float64)
        losses = np.zeros(250, dtype=np.float64)
        min_loss = float('Inf')
        min_idx = 0
        for j, y_pred in enumerate(clf.staged_decision_function(X_test)):
            test_deviance[j] = clf.loss_(y_test, y_pred)
            # log-loss needs probabilities, so squash the raw scores first
            y_pred_s = sigmoid(y_pred)
            losses[j] = log_loss(y_test, y_pred_s)
            if min_loss > losses[j]:
                min_loss = losses[j]
                min_idx = j
        min_losses.append((min_loss, min_idx))
        plt.figure()
        plt.plot(losses, 'r', linewidth=2)
        plt.plot(test_deviance, 'g', linewidth=2)
        # NOTE(review): both curves are computed on the test split (log-loss
        # and deviance), yet the legend reads ['test', 'train'] — confirm the
        # intended labels.
        plt.legend(['test', 'train'])
        plt.savefig('./' + str(i) + '.png')
    # min_losses[3] corresponds to learning_rate 0.2.
    with open('task_2.txt', 'w') as f:
        f.write("{0:.2f} {1:.2f}".format(min_losses[3][0], min_losses[3][1]))
    min_loss, min_idx = min(min_losses)
    # NOTE(review): min_idx is a 0-based stage index; using it directly as
    # n_estimators may be off by one — confirm against the task statement.
    clf = RandomForestClassifier(n_estimators=min_idx, random_state=241)
    clf.fit(X_train, y_train)
    loss = log_loss(y_test, clf.predict_proba(X_test))
    with open('task_3.txt', 'w') as f:
        f.write("{0:.2f}".format(loss))
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
             X_train, y_train, X_test, y_test):
    """Configure the wrapped ensemble `estimator`.

    In the "train" phase: grid-search GradientBoostingClassifier
    hyper-parameters (F1-scored), plot per-stage train/test loss to
    loss_cv.png, and copy the best parameters onto `estimator`.
    Outside that phase `estimator` is stored unchanged.
    """
    # estimator : ensemble learner
    # cv : if train : get best parameter
    if phase == "train":
        clf = GradientBoostingClassifier()
        gscv = GridSearchCV(
            clf,
            parameters,
            verbose=10,
            scoring="f1",  #scoring = "precision" or "recall"
            n_jobs=n_jobs,
            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_
        # Refit a standalone model with the winning parameters.
        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)
        plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
        estimator.set_params(**gscv.best_params_)
    self.estimator = estimator
    self.one_hot_encoding = None
# NOTE(review): fragment — den_f, training_ids, testing_index, true_test and
# the ml helper module are defined elsewhere in the file, and the loop at the
# bottom appears to continue past this chunk.
true_train = den_f[training_ids, -1]            # last column holds the label
training_features = den_f[training_ids, :-1]
testing_features = den_f[testing_index, :-1]
path = "/Users/lls/Desktop/GBT_binary_class/lr_01_maxf_08_subs_06/"
# warm_start=True lets n_estimators grow incrementally in the loop below,
# adding one tree per refit instead of retraining from scratch.
clf = GradientBoostingClassifier(n_estimators=1,
                                 max_depth=10,
                                 learning_rate=0.1,
                                 max_features=0.8,
                                 warm_start=True,
                                 subsample=0.6)
clf.fit(training_features, true_train)
# Baseline (single-tree) importance, predictions, ROC and loss.
imp_0 = clf.feature_importances_
pred_0 = clf.predict_proba(testing_features)
fpr_0, tpr_0, auc_0, threshold_0 = ml.roc(pred_0, true_test, true_class=1)
loss_0 = clf.loss_(true_test, pred_0[:, 1])
# Accumulators for the incremental-boosting sweep below.
l1_norm = np.zeros(100, )
loss = np.zeros(100, )
auc = np.zeros(100, )
m = np.linspace(np.log10(3e10), np.log10(1e15), 50)
width = np.append(np.diff(m), np.diff(m)[-1])
plt.figure()
for i in range(100):
    # Grow the ensemble by one tree and refit (warm start).
    clf.n_estimators += 1
    print(clf.n_estimators)
    clf.fit(training_features, true_train)
    print("Done fit")
        # NOTE(review): fragment — the opening of this settings list (and the
        # reg_settings / colors / original_params bindings) is outside this chunk.
        'max_features': 2
    })]
plt.figure()
for (label, setting), color in zip(reg_settings, colors):
    # Merge the shared defaults with this run's overrides.
    params = dict(original_params)
    params.update(setting)
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)
    # compute test set deviance
    test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[i] = clf.loss_(y_test, y_pred)
    # Plot every 5th stage to keep the curves readable.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
             '-', color=color, label=label)
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')
plt.show()
    # NOTE(review): fragment — the param_grid dict opens outside this chunk.
    'max_depth': 3,
    'learning_rate': 0.1,
    'random_state': 123
}
# Maximum number of trees.
n_estimators = 700
# Define the model.
clf = GradientBoostingClassifier(**param_grid)
clf.fit(X_train, y_train)
# Loss on the test set after each boosting stage.
loss = np.zeros((n_estimators, ))
for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    loss[i] = clf.loss_(y_test, y_pred)
# Find the tree count giving the minimum loss (0-based index + 1).
min_index = np.argmin(loss)
min_value = loss[min_index]
print(min_value)  # 0.4996184773708032
opt_trees = min_index + 1
print(opt_trees)  # 67
# Loss versus number of trees.
plt.figure(figsize=(6, 6))
plt.plot(loss)
plt.xlabel('나무의 개수')
plt.ylabel('손실값')
    # NOTE(review): fragment — the params dict opens outside this chunk.
    'min_samples_split': 4,
    'learning_rate': 0.01,
    'subsample': 0.7,
    # 'max_features':'sqrt'
}
clf = GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)
# NOTE(review): roc_auc_score is fed hard class predictions here; probabilities
# (predict_proba) would usually be preferred — confirm intent.
roc = roc_auc_score(y_test, clf.predict(X_test))
print("ROC: %.4f" % roc)
# Deviance on the test set after each boosting stage.
test_score = np.zeros((params['n_estimators'], ), dtype=np.float64)
for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
# Sweep learning rates and plot the held-out deviance curve for each one.
for label, color, setting in [('learning_rate= 1', 'orange', {'learning_rate': 1.0}),
                              ('learning_rate=0.5', 'turquoise', {'learning_rate': 0.5}),
                              # FIX: this entry varies learning_rate, not
                              # subsample; the original label read 'subsample=0.3'.
                              ('learning_rate=0.3', 'blue', {'learning_rate': 0.3}),
                              ('learning_rate=0.2', 'gray', {'learning_rate': 0.2}),
                              ('learning_rate=0.1', 'magenta', {'learning_rate': 0.1})]:
    params = dict(original_params)
    params.update(setting)
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)
    # compute test set deviance
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[i] = clf.loss_(y_test, y_pred)
    # Plot every 5th stage to keep the curves readable.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
             '-', color=color, label=label)
    # NOTE(review): i here is the last stage index from the inner loop, so the
    # same file name is reused across settings and overwritten each pass.
    plt.savefig(str(i) + 'example.png')
plt.show()
# NOTE(review): this span is one long stacking routine whose physical lines were
# mangled by whitespace collapse — statements run together and several logical
# lines are split across chunk boundaries (e.g. the "# GridSearch has a single
# model. model is dertermined by param" comment, the graph_fname assignment,
# and the "# 2. lower model" section headers).  The code is kept byte-identical
# below; reconstructing its indentation here would be guesswork.
# NOTE(review): within it, `elif X_train == np.ndarray:` compares the object to
# the ndarray *type* — presumably `type(X_train) == np.ndarray` was intended;
# verify before relying on that branch.
# NOTE(review): the Python 2 `print` statements and cPickle/gzip dumps indicate
# this file targets Python 2.
def gbdt_plus_liner_classifier_grid_search(stack_setting_, upper_param_keys=None, upper_param_vals=None, lower_param_keys=None, lower_param_vals=None, num_proc=None): """ upper model is GBDT or Random Forest lower model is Linear Classifier """ if stack_setting_ is None: sys.stderr.write('You have no setting Json file\n') sys.exit() if num_proc is None: num_proc = 6 # 1. upper model if upper_param_keys is None: upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state', 'subsample', 'max_features', 'max_leaf_nodes', 'learning_rate', 'max_depth', 'min_samples_leaf'] if upper_param_vals is None: upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0], [0.1], [5], [20], [0.1], [2], [8]] # grid search for upper model : GBDT or Random Forest # ExperimentL1 has model free. On the other hand, data is fix exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'], train_fname = stack_setting_['0-Level']['train'], test_fname = stack_setting_['0-Level']['test']) # GridSearch has a single model. 
model is dertermined by param #gs = GridSearch(SklearnModel, exp, upper_param_keys, upper_param_vals, # cv_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['folder'], # cv_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_out'], # cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_pred_out'], # refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['refit_pred_out']) #upper_best_param, upper_best_score = gs.search_by_cv() model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'] model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train'] model_train_fname = os.path.join(Config.get_string('data.path'), model_folder, model_train_fname) model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'] model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test'] model_test_fname = os.path.join(Config.get_string('data.path'), model_folder, model_test_fname) upper_param_dict = dict(zip(upper_param_keys, upper_param_vals)) if os.path.isfile(model_train_fname) is False and \ os.path.isfile(model_test_fname) is False: #upper_param_dict['model_type'] == [GradientBoostingClassifier] del upper_param_dict['model_type'] clf = GradientBoostingClassifier() clf_cv = GridSearchCV(clf, upper_param_dict, verbose = 10, scoring = "f1",#scoring = "precision" or "recall" n_jobs = num_proc, cv = 5) X_train, y_train = exp.get_train_data() clf_cv.fit(X_train, y_train) upper_best_params = clf_cv.best_params_ print upper_best_params del clf_cv clf.set_params(**upper_best_params) clf.fit(X_train, y_train) train_loss = clf.train_score_ test_loss = np.empty(len(clf.estimators_)) X_test, y_test = exp.get_test_data() for i, pred in enumerate(clf.staged_predict(X_test)): test_loss[i] = clf.loss_(y_test, pred) graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder'] graph_fname = 
stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name'] graph_fname = os.path.join(Config.get_string('data.path'), graph_folder, graph_fname) gs = GridSpec(2,2) ax1 = plt.subplot(gs[0,1]) ax2 = plt.subplot(gs[1,1]) ax3 = plt.subplot(gs[:,0]) ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test') ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train') ax1.set_xlabel('the number of weak learner:Boosting Iterations') ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE'))) ax1.legend(loc="best") # dump for the transformated feature clf = TreeTransform(GradientBoostingClassifier(), best_params_ = upper_best_params) if type(X_train) == pd.core.frame.DataFrame: clf.fit(X_train.as_matrix().astype(np.float32), y_train) elif X_train == np.ndarray: clf.fit(X_train.astype(np.float32), y_train) # train result train_loss = clf.estimator_.train_score_ test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32) if type(X_train) == pd.core.frame.DataFrame: for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))): test_loss[iter] = clf.estimator_.loss_(y_test, y_pred) elif type(X_train) == np.ndarray: for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))): test_loss[iter] = clf.estimator_.loss_(y_test, y_pred) ax2.plot(train_loss, label="train_loss") ax2.plot(test_loss, label="test_loss") ax2.set_xlabel('Boosting Iterations') ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE'))) ax2.legend(loc="best") # tree ensambles score_threshold=0.8 index2feature = dict(zip(np.arange(len(X_train.columns.values)), X_train.columns.values)) feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]] feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index] fis = pd.DataFrame( {'name':[index2feature.get(int(key),'Null') for key in 
feature_importances_index], 'score':feature_importances_score} ) score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold) # where_str = 'score > %f & score > %f' % (score_threshold, 0.0) where_str = 'score >= %f' % (score_threshold) fis = fis.query(where_str) sns.barplot(x = 'score', y = 'name', data = fis, ax=ax3, color="blue") ax3.set_xlabel("Feature_Importance", fontsize=10) plt.tight_layout() plt.savefig(graph_fname) plt.close() #print clf.toarray().shape # >(26049, 100) #input_features = 26049, weak_learners = 100 #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0] #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:] ## feature transformation : get test data from train trees #print transformated_train_features.shape, X_train.shape #print transformated_test_features.shape, X_test.shape transformated_train_features = clf.one_hot_encoding if type(X_test) == pd.core.frame.DataFrame: transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32), y_test) elif type(X_train) == np.ndarray: transformated_test_features = clf.transform(X_test, y_test) #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'] #model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train'] #model_train_fname = os.path.join(Config.get_string('data.path'), # model_folder, # model_train_fname) with gzip.open(model_train_fname, "wb") as gf: cPickle.dump([transformated_train_features, y_train], gf, cPickle.HIGHEST_PROTOCOL) #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'] #model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test'] #model_test_fname = os.path.join(Config.get_string('data.path'), # model_folder, # model_test_fname) with gzip.open(model_test_fname, "wb") as gf: cPickle.dump([transformated_test_features, y_test], gf, cPickle.HIGHEST_PROTOCOL) """ # 2. 
lower model if lower_param_keys is None: lower_param_keys = ['model_type', 'n_neighbors', 'weights', 'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs'] if lower_param_vals is None: lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'], ['ball_tree'], [30], ['minkowski'], [2], [4]] lower_param_dict = dict(zip(lower_param_keys, lower_param_vals)) if lower_param_dict['model_type'] == [LogisticRegression]: # grid search for lower model : Linear Classifier # ExperimentL1_1 has model free. On the other hand, data is fix model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train'] model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test'] exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'], train_fname = model_train_fname, test_fname = model_test_fname) # GridSearch has a single model. model is dertermined by param gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals, cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'], cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out']) lower_best_param, lower_best_score = gs.search_by_cv() print lower_best_param # get meta_feature exp.write2csv_meta_feature( model = LogisticRegression(), meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'], meta_train_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'], meta_test_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'], meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'], best_param_ = lower_best_param ) """ # 2. 
lower model if lower_param_keys is None: lower_param_keys = ['model_type', 'n_neighbors', 'weights', 'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs'] if lower_param_vals is None: lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'], ['ball_tree'], [30], ['minkowski'], [2], [4]] lower_param_dict = dict(zip(lower_param_keys, lower_param_vals)) clf_lower_model = None clf_lower_mname = None # grid search for lower model : Linear Classifier # ExperimentL1_1 has model free. On the other hand, data is fix if lower_param_dict['model_type'] == [LogisticRegression]: # Logistic Regression clf_lower_model = LogisticRegression() clf_lower_mname = 'LR' elif lower_param_dict['model_type'] == [SVM]: # SVM clf_lower_model = LinearSVC() clf_lower_mname = 'SVM' else: sys.stderr.write("You should input lower liner model\n") sys.exit() model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train'] model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test'] exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'], train_fname = model_train_fname, test_fname = model_test_fname) # GridSearch has a single model. 
model is dertermined by param gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals, cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'], cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out']) lower_best_param, lower_best_score = gs.search_by_cv() print lower_best_param # get meta_feature meta_train_fname_ = "%s_%s.%s" % ( ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]), clf_lower_mname, stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1] ) meta_test_fname_ = "%s_%s.%s" % ( ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]), clf_lower_mname, stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1] ) exp.write2csv_meta_feature( model = clf_lower_model, meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'], meta_train_fname = meta_train_fname_, meta_test_fname = meta_test_fname_, meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'], best_param_ = lower_best_param ) ## best parameter for GBDT and anohter sklearn classifier #return best_param, best_score return upper_best_params, lower_best_param
# acc_test = est_tune.score(X_test_sm, y_test_sm) acc_test = est_tune.score(X_test_sm, y_test_sm) print('Accuracy:') print('R^2 train: %.4f' % acc_train) print('R^2 test: %.4f' % acc_test) # mse = metrics.mean_squared_error(y_test, est_tune.predict(X_test)) # print('MSE: %.4f' % mse) # compute test set deviance test_score = np.zeros((est_tune.n_estimators, ), dtype=np.float64) # for i, y_pred in enumerate(est_tune.staged_predict(X_test_sm)): # test_score[i] = est_tune.loss_(y_test_sm, y_pred) for i, y_pred in enumerate(est_tune.staged_predict(X_test)): test_score[i] = est_tune.loss_(y_test, y_pred) plt.figure(figsize=(10, 6)) plt.subplot(1, 1, 1) plt.title('Deviance') plt.plot(np.arange(est_tune.n_estimators) + 1, est_tune.train_score_, 'b-', label='Train') plt.plot(np.arange(est_tune.n_estimators) + 1, test_score, 'r-', label='Test') plt.legend(loc='right') plt.xlabel('Boosting Iterations') plt.ylabel('MSE') plt.savefig('../paper/figs/deviance.eps', format='eps') # Feature importance
# Two boosting configurations compared on held-out deviance.
params = [('First', {
    'n_estimators': 1000,
    'learning_rate': .1,
    'max_depth': 3,
    'max_features': 'sqrt'
}), ('Second', {
    'n_estimators': 1250,
    'learning_rate': .01,
    'max_depth': 3,
    'max_features': 'sqrt'
})]
plt.figure()
for label, setting in params:
    # Overlay this run's overrides on the shared defaults.  (The iterator was
    # captured above, so rebinding `params` here is safe.)
    params = dict(original_params)
    params.update(setting)
    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    # Held-out deviance after every boosting stage.
    test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)
    for stage, score in enumerate(model.staged_decision_function(X_test)):
        # model.loss_ assumes the labels in y_test are 0/1.
        test_deviance[stage] = model.loss_(y_test, score)

    # Plot every 5th stage to keep the curves readable.
    stages = np.arange(test_deviance.shape[0]) + 1
    plt.plot(stages[::5], test_deviance[::5], '-', label=label)
plt.show()
    # NOTE(review): fragment — this chunk begins inside the
    # GradientBoostingClassifier(...) constructor call for gbt_noRand05;
    # the call's opening is outside this view.
    learning_rate=0.05,
    n_estimators=500,
    subsample=1.0,
    min_samples_split=20,
    min_samples_leaf=10,
    max_depth=4)
# Fit the model.
gbt_noRand05.fit(X_train, y_train)
niter = 500
iter = np.arange(niter) + 1
test_deviance = np.zeros((niter, ), dtype=np.float64)
# staged_decision_function: decision function at each boosting iteration.
for i, y_pred in enumerate(gbt_noRand05.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = gbt_noRand05.loss_(y_test, y_pred)
plt.figure(figsize=(8, 6))
# Test error (deviance evolution).
plt.plot(iter, test_deviance, label='Test', color='darkorange')
# min vers 100
# Training error (deviance evolution).
plt.plot(iter, gbt_noRand05.train_score_, label='Apprentissage', color='navy')
# Per-iteration error decrease relative to the previous model (OOB-based).
#plt.plot(iter,gbt_noRand05.oob_improvement_)
plt.legend(loc="upper right", fontsize=12)
# Predicted probabilities of class 1 (2-d array sliced to its second column).
probas_test = gbt_noRand05.predict_proba(X_test)[:, 1]
probas_train = gbt_noRand05.predict_proba(X_train)[:, 1]
#AUC
roc_auc_score(y_train, probas_train)
# 10-fold cross-validated score for a bagged version of `alg`.
bag = BaggingClassifier(alg, n_estimators=100)
score = cross_val_score(bag, X, y, cv=10, n_jobs=-1)

# Gradient Boosting (with GB learning iterations visualization)
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=250,
                                 learning_rate=0.01,
                                 random_state=241)
gbc.fit(X_train, y_train)

# Per-stage loss curves; any other metric could be substituted here.
test_score = [gbc.loss_(y_test, staged_pred)
              for staged_pred in gbc.staged_decision_function(X_test)]
train_score = [gbc.loss_(y_train, staged_pred)
               for staged_pred in gbc.staged_decision_function(X_train)]

plt.plot(test_score)
plt.plot(train_score)
plt.legend(['test score', 'train score'])
plt.show()

# Word vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
descr_tfidf_train = vect.fit_transform(df_train['FullDescription'])
def training_and_validation(self, X_train, y_train, X_valid, y_valid):
    """Sweep self.n_estimators, pick the model with the best validation F1.

    Fits one GradientBoostingClassifier per candidate tree count, records
    per-candidate ROC-AUC, F1 and loss on both splits, saves the best
    model to <out_path>/Best_model_GradBoost.joblib and returns it along
    with all collected metrics and its train/valid probability predictions.
    """
    roc_score_train = []
    roc_score_valid = []
    f1_score_train = []
    f1_score_valid = []
    loss_train = []
    loss_valid = []
    models = []
    for estimators in tqdm(self.n_estimators,
                           desc='Search the optimal parameter...'):
        model = GradientBoostingClassifier(
            n_estimators=estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            random_state=self.random_state)
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_valid = model.predict(X_valid)
        pred_train_proba = model.predict_proba(X_train)
        pred_valid_proba = model.predict_proba(X_valid)
        ROC_score_train = roc_auc_score(y_train, pred_train)
        ROC_score_valid = roc_auc_score(y_valid, pred_valid)
        models.append(model)
        # NOTE(review): loss_ is fed hard class predictions here, while the
        # boosting loss is normally evaluated on raw decision-function scores —
        # confirm this is intended.
        loss_train.append(model.loss_(y_train, pred_train))
        loss_valid.append(model.loss_(y_valid, pred_valid))
        # Metric transformation (in case classes were assigned inverted
        # relative to the true labels).
        F1_score_train = round(f1_score(y_train, pred_train), 3)
        F1_score_valid = round(f1_score(y_valid, pred_valid), 3)
        roc_score_train.append(round(ROC_score_train, 3))
        roc_score_valid.append(round(ROC_score_valid, 3))
        f1_score_train.append(F1_score_train)
        f1_score_valid.append(F1_score_valid)
        print('\n n_estimators = ', estimators)
        print('ROC_Score_train = ', round(ROC_score_train, 3))
        print('ROC_Score_valid = ', round(ROC_score_valid, 3))
        print('F1_score_train = ', round(F1_score_train, 3))
        print('F1_score_valid = ', round(F1_score_valid, 3), '\n')
    # Best model = highest validation F1 (first occurrence on ties).
    best_model = models[f1_score_valid.index(max(f1_score_valid))]
    pred_value = best_model.predict(X_train)
    # pred_train / pred_valid are rebound to probabilities for the return value.
    pred_train = best_model.predict_proba(X_train)
    pred_valid = best_model.predict_proba(X_valid)
    if Path(self.out_path).exists():
        dump(best_model, Path(self.out_path) / 'Best_model_GradBoost.joblib')
    else:
        path = Path(self.out_path)
        path.mkdir()
        dump(best_model, path / 'Best_model_GradBoost.joblib')
    return best_model, roc_score_train, roc_score_valid, f1_score_train, f1_score_valid, pred_train, pred_valid, loss_train, loss_valid
# cv : if train : get best parameter if phase == "train": clf = GradientBoostingClassifier() gscv = GridSearchCV(clf, parameters, verbose = 10, scoring = "f1",#scoring = "precision" or "recall" n_jobs = n_jobs, cv = cv_k_fold) gscv.fit(X_train, y_train) self.best_params = gscv.best_params_ clf.set_params(**gscv.best_params_) clf.fit(X_train, y_train) train_loss = clf.train_score_ test_loss = np.empty(len(clf.estimators_)) for i, pred in enumerate(clf.staged_predict(X_test)): test_loss[i] = clf.loss_(y_test, pred) plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test') plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train') plt.xlabel('the number of weak learner:Boosting Iterations') plt.ylabel('Loss') plt.legend(loc="best") plt.savefig("loss_cv.png") plt.close() estimator.set_params(**gscv.best_params_) self.estimator = estimator self.one_hot_encoding = None def fit(self, X, y): self.fit_transform(X, y) return self
# NOTE(review): fragment — X_train/X_test, y_train/y_test and sigma_func are
# defined elsewhere; the Python 2 print statement at the bottom pins this file
# to Python 2.
LearningRates = [1, 0.5, 0.3, 0.2, 0.1]
nEstim = 250
nLR = len(LearningRates)
# Per-(learning rate, stage) loss matrices.
test_loss_score = np.empty([nLR, nEstim])
test_logloss_score = np.empty([nLR, nEstim])
train_loss_score = np.empty([nLR, nEstim])
train_logloss_score = np.empty([nLR, nEstim])
common_args = {'n_estimators': nEstim, 'random_state': 241, 'verbose': True}
for i in range(nLR):
    LRcurr = LearningRates[i]
    clf = GradientBoostingClassifier(learning_rate=LRcurr, **common_args)
    clf.fit(X_train, y_train)
    # Staged deviance and log-loss on the test split
    # (sigma_func squashes raw scores into probabilities for log_loss).
    for j, pred in enumerate(clf.staged_decision_function(X_test)):
        test_loss_score[i, j] = clf.loss_(y_test, pred)
        test_logloss_score[i, j] = log_loss(y_test, sigma_func(pred))
    # ... and on the training split.
    for j, pred in enumerate(clf.staged_decision_function(X_train)):
        train_loss_score[i, j] = clf.loss_(y_train, pred)
        train_logloss_score[i, j] = log_loss(y_train, sigma_func(pred))
plt.figure()
# Row 3 corresponds to learning rate 0.2.
plt.plot(test_loss_score[3, :].T, linewidth=2)
idx = 3
minLogLossTest = min(test_logloss_score[idx, :])
nIterMin = np.where(test_logloss_score[idx, :] == minLogLossTest)
print 'At Learning Rate %2.2f minimum logloss %2.2f at iter = %d' % (
    LearningRates[idx], minLogLossTest, nIterMin[0])
#%% Learn Random Forest Classifier
# NOTE(review): fragment — model_gbc, the train/test splits, y_test3 and the
# params dict come from earlier code outside this chunk.
t0 = DT.datetime.now()
model_gbc.fit(X_train, y_train3)
t1 = DT.datetime.now()
print('GBC took ' + str(t1 - t0))
# Probability of the positive class.
z_gbc = model_gbc.predict_proba(X_test)[:, 1]
#ROC
fpr_gbc, tpr_gbc, thresh_gbc = skm.roc_curve(y_test3, z_gbc)
plt.figure(3)
plt.plot(fpr_gbc, tpr_gbc, 'r-')
# AUC
skm.auc(fpr_gbc, tpr_gbc)
# Deviance (see https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regularization.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regularization-py)
# compute test set deviance
test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)
for i, y_pred in enumerate(model_gbc.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = model_gbc.loss_(y_test3, y_pred)
plt.plot((np.arange(test_deviance.shape[0]) + 1)[::1], test_deviance[::1], '-',
         color='red', label=str(params))
#plt.close()