class GradientBoostingClassifierImpl:
    """Thin wrapper that defers building a scikit-learn gradient-boosting
    model (``SKLModel``) until ``fit`` is called.

    ``__init__`` only records the hyperparameters; no estimator exists
    until ``fit`` constructs and trains one.
    """

    def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                 subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                 min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                 random_state=None, max_features=None, verbose=0,
                 max_leaf_nodes=None, warm_start=False, presort='auto',
                 validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
        # Record every hyperparameter under its scikit-learn name so the
        # whole mapping can be splatted into SKLModel(**...) later.
        self._hyperparams = dict(
            loss=loss,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            subsample=subsample,
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            init=init,
            random_state=random_state,
            max_features=max_features,
            verbose=verbose,
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            presort=presort,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
        )

    def fit(self, X, y=None):
        """Build the underlying SKLModel and fit it; returns self."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is None:
            self._sklearn_model.fit(X)
        else:
            self._sklearn_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate class prediction to the fitted model."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Delegate probability prediction to the fitted model."""
        return self._sklearn_model.predict_proba(X)
class MyGradientBoostingClassifier(BaseClassifier):
    """Gradient-boosting classifier with a self-describing name built from
    its key hyperparameters (e.g. ``gb_n5_md6_ms100``)."""

    def __init__(self, verbose=1, n_estimators=5, max_depth=6, min_samples_leaf=100):
        params = {
            'verbose': verbose,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
        }
        self.classifier = GradientBoostingClassifier(**params)
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the descriptive model name."""
        return self.name

    def fit(self, X, y):
        """Fit the wrapped classifier and return the fitted estimator."""
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        """Delegate probability prediction to the wrapped classifier."""
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Map each feature name to its fitted importance score."""
        return dict(zip(feat_names, self.classifier.feature_importances_))
def gbdt_lr_train(libsvmFileName):
    """Compare AUC of GBDT, plain LR, LR on GBDT leaf encodings, and LR on
    combined (encoded + raw) features, all trained from a libsvm file."""
    # Load samples in libsvm format (X_all is a sparse matrix).
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.3, random_state = 42)
    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,max_features=0.5)
    # Train it
    gbdt.fit(X_train, y_train)
    # Predict and evaluate AUC (test matrix densified for predict_proba)
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # LR trained on the raw features as a baseline
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # Predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)
    # Encode the original features as GBDT leaf indices;
    # apply() yields (n_samples, n_estimators, n_classes) — [:,:,0] takes class 0.
    X_train_leaves = gbdt.apply(X_train)[:,:,0]
    X_test_leaves = gbdt.apply(X_test)[:,:,0]
    # One-hot encode all leaf-index features (train+test encoded together
    # so both share the same category space).
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    # LR on the combined (encoded + raw) feature set
    lr = LogisticRegression(n_jobs=-1)
    # Stack encoded leaves with the original sparse features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print(X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
class MyGradientBoostingClassifier(BaseClassifier):
    """Gradient-boosting classifier whose name encodes its hyperparameters
    (e.g. ``gb_n200_md8_ms10000``)."""

    def __init__(self, verbose=1, n_estimators = 200, max_depth=8, min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier(
            verbose=verbose,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
        )
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the descriptive model name."""
        return self.name

    def fit(self, X, y):
        """Fit the wrapped classifier and return the fitted estimator."""
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        """Delegate probability prediction to the wrapped classifier."""
        return self.classifier.predict_proba(X)

    def get_feature_importances(self):
        """Return the raw feature_importances_ array of the fitted model."""
        return self.classifier.feature_importances_
'user_query_day_hour', 'context_page_id', 'hour', 'shop_id', 'shop_review_num_level', 'shop_star_level', 'shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', ] target = ['is_trade'] X_train = train[features] X_test = test[features] Y_train = train[target] # 定义GBDT模型 gbdt = GradientBoostingClassifier(n_estimators=170, min_samples_split=3, min_samples_leaf=8) # 调参之后的GBDT模型 # 训练学习 gbdt.fit(X_train, Y_train) # 预测及AUC评测 Y_predict_gbdt = gbdt.predict_proba(X_test)[:, 1] pd.DataFrame({'instance_id': test['instance_id'], 'predicted_score': Y_predict_gbdt}). \ to_csv('D:\kaggle\\alimm\\baseline_06.csv', index=False, sep=' ')
def gbdt_lr_train():
    """5-fold stratified CV comparing AUC of: GBDT, plain LR, LR on GBDT
    leaf encodings, and LR on combined (encoded + raw) features.

    NOTE(review): reads module-level ``X`` and ``y`` — confirm they are
    defined before this function is called.
    """
    # Per-fold AUC accumulators for each model variant.
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]
        # Define the GBDT model
        gbdt = GradientBoostingClassifier(n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        # Train it
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('基于原有特征的gbdt auc: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)
        # LR trained on the raw features as a baseline
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        # Predict and evaluate AUC
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('基于原有特征的LR AUC: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)
        # Encode the original features as GBDT leaf indices;
        # apply() yields (n_samples, n_estimators, n_classes) — [:, :, 0] takes class 0.
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]
        # One-hot encode the leaf indices (train+valid together so both
        # share the same category space).
        (train_rows, cols) = X_train_leaves.shape
        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_valid_leaves), axis=0))
        # Define the LR model
        lr = LogisticRegression()
        # Train LR on the GBDT-encoded features
        lr.fit(X_trans[:train_rows, :], y_train)
        # Predict and evaluate AUC
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)
        # LR on the combined (encoded + raw) feature set
        lr = LogisticRegression(n_jobs=-1)
        # Stack encoded leaves with the original features
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_valid_ext = hstack([X_trans[train_rows:, :], X_valid])
        print(X_train_ext.shape)
        # Train LR on the combined features
        lr.fit(X_train_ext, y_train)
        # Predict and evaluate AUC
        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)
    # Average the per-fold scores and report.
    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("gbdt原始特征cv_gbdt:", cv_gbdt)
    print("lr原始特征cv_lr:", cv_lr)
    print("lr基于gbdt的特征cv_lr_trans:", cv_lr_trans)
    print("lr基于gbdt特征个原始特征cv_lr_trans_raw:", cv_lr_trans_raw)
def gbdt_lr_train(train,test,gbdt_features,lr_features,target,name,isOnline):
    """Train GBDT, then LR on GBDT leaf encodings and on combined features,
    reporting log-loss offline or writing a submission CSV online.

    NOTE(review): ``train``/``test`` look like pandas DataFrames (``.tail``,
    column indexing) — confirm against callers.
    """
    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    #n_estimators=20, max_depth=3, verbose=0, max_features=0.5
    # Train it
    gbdt.fit(train[gbdt_features], train[target])
    # Predict and evaluate log-loss
    if isOnline == False:
        y_pred_gbdt = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    else:
        # Online mode: score the tail of the training data.
        # NOTE(review): 57562 presumably matches a known validation tail size — confirm.
        y_pred_gbdt = gbdt.predict_proba(train[gbdt_features].tail(57562))[:, 1]
        gbdt_test_log_loss = log_loss(train[target].tail(57562), y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    # Encode the original features as GBDT leaf indices;
    # apply() yields (n_samples, n_estimators, n_classes) — [:,:,0] takes class 0.
    X_train_leaves = gbdt.apply(train[gbdt_features])[:,:,0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:,:,0]
    # One-hot encode the leaf indices (train+test together so both share
    # the same category space).
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], train[target])
    # Predict and evaluate log-loss (offline only)
    if isOnline == False:
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR log_loss: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')
    # Define the LR model for the combined feature set
    lr = LogisticRegression()
    # Stack encoded leaves with the raw lr_features
    X_train_ext = hstack([X_trans[:train_rows, :], train[lr_features]])
    X_test_ext = hstack([X_trans[train_rows:, :], test[lr_features]])
    print("gbdt output",X_trans[:train_rows, :].shape)
    print("input",train[lr_features].shape)
    print(X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, train[target])
    # Offline: report log-loss; online: write the submission file.
    if isOnline == False:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('基于组合特征的LR log_loss: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')
        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        test[['instance_id', 'predicted_score']].to_csv('../baseline_' + name +'.csv', index=False,sep=' ')# save the online submission result
        print('Saved result success!')
import numpy as np
# Fixed: `sklearn.ensemble.gradient_boosting` is a private module path that
# was removed in scikit-learn 0.24; the public path is `sklearn.ensemble`.
from sklearn.ensemble import GradientBoostingClassifier
# Fixed: `numpy.ma.testutils` is numpy's internal test-support module;
# the public equivalent lives in `numpy.testing`.
from numpy.testing import assert_array_almost_equal

# Create some data: features are zeroed below a per-column threshold, and
# the label is a thresholded noisy linear function of the transformed X.
m = 10000
X = np.random.normal(size=(m, 10))
thresh = np.random.normal(size=10)
X_transformed = X * (X > thresh)
beta = np.random.normal(size=10)
y = (np.dot(X_transformed, beta) + np.random.normal(size=m)) > 0

# Train a gradient boosting classifier
model = GradientBoostingClassifier()
model.fit(X, y)
# Fixed: Python-2 print statement is a syntax error on Python 3.
print(model.score(X, y))

# Inspect: reconstruct predict_proba by hand from the staged ensemble —
# init prediction plus learning_rate-scaled sum of per-tree predictions.
# NOTE(review): `loss_._score_to_proba` is private sklearn API tied to old
# versions (removed in modern releases) — confirm the installed version.
pred = model.predict_proba(X)
approx = model.loss_._score_to_proba(
    model.learning_rate
    * sum(est.predict(X) for est in model.estimators_[:, 0])
    + np.ravel(model.init_.predict(X)))
assert_array_almost_equal(pred, approx)
def gbdt_lr_train(libsvmFileName):
    """Compare AUC of GBDT, plain LR, LR on GBDT leaf encodings, and LR on
    combined features from a libsvm file. (Python 2 syntax: print statements.)"""
    # Load samples in libsvm format (sparse X)
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)
    print "train data shape: ", X_train.shape
    # Train the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    # Predict and evaluate AUC (test matrix densified first)
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # LR trained on the raw features as a baseline
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # Predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)
    # Encode the original features as GBDT leaf indices;
    # apply() yields (n_samples, n_estimators, n_classes) — [:, :, 0] takes class 0.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print "gbdt leaves shape: ", X_train_leaves.shape
    # Count distinct leaf values per tree (category cardinality before one-hot)
    for i in range(0, len(X_train_leaves[0])):
        cateMap = {}
        for j in range(0, len(X_train_leaves)):
            cateMap[X_train_leaves[j][i]] = 0
        print "F%d: %d" % (i, len(cateMap))
    # One-hot encode the leaf indices (train+test encoded together so both
    # share the same category space).
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print "gbdt oneHot shape: ", X_trans.shape
    print "oneHot leaves: ", X_trans[0]
    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    # LR on the combined (encoded + raw) feature set
    lr = LogisticRegression(n_jobs=-1)
    # Stack encoded leaves with the original sparse features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print "gbdt leaves cross", X_train_ext.shape
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
n_redundant=2, n_classes=2, n_clusters_per_class=3, random_state=2017) # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) # 不生成新的特征,直接训练 clf = GradientBoostingClassifier(n_estimators=50) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) y_prob = clf.predict_proba(X_test)[:, 1] acc = accuracy_score(y_test, y_pred) auc = roc_auc_score(y_test, y_prob) print("Original featrues") print("GBDT_ACC: {:.6f}".format(acc)) print("GBDT_AUC: {:.6f}".format(auc)) # 生成的新特征, apply方法返回每个样本在每颗树叶节点的索引矩阵 X_train_leaves = clf.apply(X_train)[:, :, 0] X_test_leaves = clf.apply(X_test)[:, :, 0] # 将X_train_leaves, X_test_leaves在axis=0方向上合并,再进行OneHotEncoder操作 All_leaves = np.r_[X_train_leaves, X_test_leaves] # 索引矩阵每列不是0/1二值型离散特征,因此需要OneHotEncoder操作 enc = OneHotEncoder(categories='auto')
X_pick = np.zeros((features.shape[0], 112)) for i, match_id in enumerate(features.index): for p in range(5): X_pick[i, features.ix[match_id, 'r{}_hero'.format(p+1)]-1] = 1 X_pick[i, features.ix[match_id, 'd{}_hero'.format(p+1)]-1] = -1 return np.concatenate([X, X_pick], axis=1) X = inject_bag_of_words(X, features) clf, scaler = train_logistic(X, y, 'With Bag of Words') # final test proba clf.fit(scaler.transform(X), y) test_features = pandas.read_csv('features_test.csv', index_col='match_id') X_test = test_features.drop(category_features, axis=1) X_test = X_test.fillna(0) X_test = inject_bag_of_words(X_test, test_features) X_test = scaler.transform(X_test) proba = clf.predict_proba(X_test)[:, 1] print("Proba min: {}".format(proba.min())) print("Proba max: {}".format(proba.max()))
# Model-comparison script fragment: GBDT vs MLP vs soft-voting ensemble.
# NOTE(review): `model`, `model1`, `ensemble`, `kfold`, `X_train`, `test`,
# etc. are not defined in this snippet — they must come from earlier code.
# `run_model` is also called before its `def` below, so the definition must
# already exist (or this fragment is out of order) — confirm.
model2 = GradientBoostingClassifier(n_estimators= 1550,learning_rate= 0.041, max_depth = 4)
run_model(model2, X_train, y_train, X_test, y_test)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Confusion-matrix breakdown to report the false-negative rate.
tn,fp,fn,tp = confusion_matrix(y_pred= y_pred, y_true= y_test).ravel()
print("False Negative Rate", fn/(fn+tp))
model3 = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(64, 64), random_state=1)
run_model(model1, X_train, y_train, X_test, y_test)
run_model(model3, X_train, y_train, X_test, y_test)
model3.fit(X_train, y_train)
# Class probabilities of model2 on the submission test set.
y_T = pd.DataFrame(model2.predict_proba(test))
#y_submit = pd.concat(y_T[0],axis = 1)
estimators = []
estimators.append(('GB', model2))
estimators.append(('NN', model3))
# create the ensemble model (soft voting over GBDT + MLP)
model4 = voting_classifier.VotingClassifier(estimators, voting = "soft")
run_model(model4, X_train, y_train, X_test, y_test)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

def run_model(model, X_train, y_train, X_test, y_test):
    # Fit the given model and predict on the test split.
    # NOTE(review): this definition appears truncated in the visible source —
    # it likely continues with evaluation/reporting lines not shown here.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
plt.plot(test_loss, color, linewidth=2) plt.plot(train_loss, color+'--', linewidth=2) looses[learning_rate] = test_loss plt.figure() colors = ['r', 'g', 'b', 'c', 'm'] learn_rates = [1, 0.5, 0.3, 0.2, 0.1] for index, learning_rate in enumerate(learn_rates): clf.learning_rate = learning_rate clf.fit(X_train, y_train) test_predictions = clf.staged_predict_proba(X_test) train_predictions = clf.staged_predict_proba(X_train) plot_score(test_predictions, y_test, train_predictions, y_train, color=colors[index], learning_rate=learning_rate) legends = [["Test {}".format(learn_rate), "Train {}".format(learn_rate)] for learn_rate in learn_rates] legends = [item for sublist in legends for item in sublist] plt.legend(legends) plt.savefig("coursera_out/gradient_boosting.png") min_loss_on_iteration = np.argmin(looses[0.2]) min_loss = looses[0.2][min_loss_on_iteration] print("on iteration {} was loose {}".format(min_loss_on_iteration, min_loss)) coursera.output("min_loose_on_0.2.txt", "{:.2f} {}".format(min_loss, min_loss_on_iteration)) clf = RandomForestClassifier(n_estimators=36, random_state=241) clf.fit(X_train, y_train) predicts = clf.predict_proba(X_test) loss = log_loss(y_test, predicts) coursera.output("random_forest_loss_on_36_trees.txt", "{:.2f}".format(loss))
def gbdt_lr_train(libsvmFileName):
    """Compare AUC of GBDT, plain LR, LR on GBDT leaf encodings, and LR on
    combined (encoded + raw) features from a libsvm file, with verbose
    debug printing of intermediate shapes/types."""
    # Load samples in libsvm format (sparse X)
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # X_all_dense = X_all.todense()
    print(type(X_all))
    # print(type(X_all_dense[0]))
    # print(y_all)
    # print("===")
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)
    # print(X_train)
    # print(y_train)
    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    # Train it
    gbdt.fit(X_train, y_train)
    # Predict and evaluate AUC (densify the sparse test matrix first)
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)
    # print(y_pred_gbdt)
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # gbdt auc: 0.96455
    # LR trained on the raw features as a baseline
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # Predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)
    # LR AUC on raw features: 0.93455
    # Encode the original features as GBDT leaf indices;
    # apply() yields (n_samples, n_estimators, n_classes) — [:, :, 0] takes class 0.
    # X_train_leaves = gbdt.apply(X_train)
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22,:])  # print 22 rows, all columns
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # One-hot encode the leaf indices (train+test encoded together so both
    # share the same category space).
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22,:])
    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # Predict and evaluate AUC
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    # LR on the combined (encoded + raw) feature set
    lr = LogisticRegression(n_jobs=-1)
    # Stack encoded leaves with the original sparse features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("组合特征的个数:", X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)