def main(param=""):
    """Train a local gradient-boosting baseline and report accuracy on guest data.

    The guest and host CSV files are stacked row-wise, a
    ``GradientBoostingClassifier`` is fitted on the stacked frame, and the
    model is scored on the guest rows only.

    Args:
        param: Either a path given to ``JobConfig.load_from_file`` or an
            already-loaded mapping providing ``data_guest``, ``data_host``,
            ``idx`` and ``label_name``.  A ``data_test`` entry is tolerated
            but no longer read: the frame loaded from it was never used.

    Returns:
        A ``({}, result)`` tuple where ``result`` is ``{"accuracy": acc}``.
    """
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data: guest and host rows are concatenated into one training set
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)

    # evaluation is done on the guest partition only
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
class GradientBoostingClassifierImpl():
    """Wrapper that stores hyperparameters and builds ``SKLModel`` lazily.

    The constructor only records its keyword arguments; the underlying
    ``SKLModel`` is instantiated from them on every call to :meth:`fit`.
    """

    def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                 subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                 min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                 random_state=None, max_features=None, verbose=0,
                 max_leaf_nodes=None, warm_start=False, presort='auto',
                 validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
        # Kept as a plain dict so it can be splatted into SKLModel(**...).
        self._hyperparams = dict(
            loss=loss,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            subsample=subsample,
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            init=init,
            random_state=random_state,
            max_features=max_features,
            verbose=verbose,
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            presort=presort,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
        )

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` and fit it; returns ``self``.

        When ``y`` is ``None`` the model's ``fit`` is called with ``X`` only.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        fitted = self._sklearn_model
        return fitted.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        fitted = self._sklearn_model
        return fitted.predict_proba(X)
class Boosting():
    '''
    Wrapper around a ``GB`` classifier that predicts in row chunks.
    '''

    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        '''
        Fit the wrapped classifier.

        :param X: training feature matrix, passed unchanged to ``clf.fit``
        :param y: training targets, passed unchanged to ``clf.fit``
        :return: None
        '''
        self.clf.fit(X, y)

    def predict(self, X):
        '''
        Predict labels for ``X`` chunk by chunk.

        The rows are split into ``int(sqrt(n_rows))`` consecutive slices and
        each slice is converted with ``toarray()`` (when available) before it
        is handed to the classifier, so only one slice is densified at a time.
        Dense inputs that have no ``toarray`` method are accepted as well.

        :param X: feature matrix exposing ``shape`` and row slicing
        :return: list with one prediction per row (empty when X has no rows)
        '''
        n_rows = X.shape[0]
        m = int(n_rows ** 0.5)
        pred = []
        for i in range(m):
            chunk = X[i * n_rows // m:(i + 1) * n_rows // m]
            # Sparse matrices must be densified; dense arrays are used as-is.
            if hasattr(chunk, "toarray"):
                chunk = chunk.toarray()
            pred.extend(self.clf.predict(chunk))
        return pred
def training(processed_train_csv_file):
    """Train and persist random-forest, gradient-boosting and SGD classifiers.

    The CSV is loaded, infinities and NaNs are replaced by 0, the rows are
    shuffled, and ``booking_bool`` is used as the target with every other
    column as a feature.  Each fitted model is stored via
    ``data_io.save_model``.

    Fixes relative to the previous version:
      * ``RandomForestClassifier`` no longer receives ``learning_rate`` (it
        has no such parameter); ``n_jobs=-1`` is used instead.
      * The SGD loss name is spelled ``"modified_huber"``.
      * ``.loc`` replaces the removed ``.ix`` indexer (the shuffled list holds
        index labels, so the selection is label-based either way).
      * ``print`` is called as a function with a single argument, which
        behaves the same on Python 2 and Python 3.

    Args:
        processed_train_csv_file: Path of the processed training CSV.
    """
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)

    # Shuffle the row labels, then reorder the frame accordingly.
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]

    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
def main(param=""):
    """Train a local gradient-boosting baseline on joined data and report AUC.

    The guest frame is joined column-wise with the host frame (host columns
    that clash get the suffix ``host``); the model is fitted and scored on
    that same joined frame.

    Args:
        param: Either a path given to ``JobConfig.load_from_file`` or an
            already-loaded mapping providing ``data_guest``, ``data_host``,
            ``idx`` and ``label_name``.

    Returns:
        ``({}, {"auc": auc_score})`` on success, or ``None`` when the AUC
        cannot be computed.
    """
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(
        random_state=0,
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1,
    )
    clf.fit(X, y)

    # AUC is a ranking metric, so score with the probability of the second
    # (greater-label) class rather than with hard 0/1 predictions.
    y_prob = clf.predict_proba(X)[:, 1]
    try:
        auc_score = roc_auc_score(y, y_prob)
    except Exception:
        # Best-effort: e.g. a single-class or multiclass target has no
        # binary AUC.  KeyboardInterrupt/SystemExit are no longer swallowed.
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    return {}, result
def gbdt_lr_train(libsvmFileName):
    """Compare GBDT, plain LR, LR on GBDT leaves and LR on combined features.

    Loads a libsvm file, holds out 30% of it for testing, and prints the test
    AUC of four models in turn.  Nothing is returned.

    Args:
        libsvmFileName: Path of the libsvm-format sample file.
    """
    # Load the samples and split them into train / test parts.
    features_all, labels_all = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        features_all, labels_all, test_size=0.3, random_state=42)

    # 1) GBDT on the raw features.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    gbdt_scores = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, gbdt_scores)
    print('gbdt auc: %.5f' % gbdt_auc)

    # 2) Logistic regression on the raw features.
    raw_lr = LogisticRegression()
    raw_lr.fit(X_train, y_train)
    raw_scores = raw_lr.predict_proba(X_test)[:, 1]
    raw_auc = roc_auc_score(y_test, raw_scores)
    print('基于原有特征的LR AUC: %.5f' % raw_auc)

    # Encode every sample by the leaf it reaches in each tree.
    train_leaves = gbdt.apply(X_train)[:, :, 0]
    test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices of train and test together.
    train_rows = train_leaves.shape[0]
    stacked_leaves = np.concatenate((train_leaves, test_leaves), axis=0)
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(stacked_leaves)
    encoded_train = encoded[:train_rows, :]
    encoded_test = encoded[train_rows:, :]

    # 3) Logistic regression on the encoded leaves only.
    leaf_lr = LogisticRegression()
    leaf_lr.fit(encoded_train, y_train)
    leaf_scores = leaf_lr.predict_proba(encoded_test)[:, 1]
    leaf_auc = roc_auc_score(y_test, leaf_scores)
    print('基于GBDT特征编码后的LR AUC: %.5f' % leaf_auc)

    # 4) Logistic regression on encoded leaves plus the raw features.
    combined_lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([encoded_train, X_train])
    X_test_ext = hstack([encoded_test, X_test])
    print(X_train_ext.shape)
    combined_lr.fit(X_train_ext, y_train)
    combined_scores = combined_lr.predict_proba(X_test_ext)[:, 1]
    combined_auc = roc_auc_score(y_test, combined_scores)
    print('基于组合特征的LR AUC: %.5f' % combined_auc)
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    """Fit a gradient-boosting model and measure its accuracy on the dev split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_dev: Development-set features.
        y_dev: Development-set labels, compared element-wise to predictions.

    Returns:
        ``(model, accuracy)`` where ``accuracy`` is the mean of the
        element-wise equality between ``y_dev`` and the predictions.
    """
    print("\nPerforming Gradient Boosting.")
    model = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.25,
        max_depth=5,
        random_state=0,
    )
    model.fit(X_train, y_train)

    dev_predictions = model.predict(X_dev)
    matches = y_dev == dev_predictions
    accuracy = np.mean(matches)
    print("Accuracy", accuracy)
    return model, accuracy
class Boosting():
    """Wrapper around a ``GB`` classifier that predicts in row chunks."""

    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        """Fit the wrapped classifier on ``X`` and ``y``; returns ``None``."""
        self.clf.fit(X, y)

    def predict(self, X):
        """Predict labels for ``X`` one row slice at a time.

        The rows are cut into ``int(sqrt(n_rows))`` consecutive slices; each
        slice is densified with ``toarray()`` when it offers that method and
        passed to the classifier.  Dense inputs without ``toarray`` are now
        accepted too (previously they raised ``AttributeError``).

        Args:
            X: Feature matrix exposing ``shape`` and row slicing.

        Returns:
            A list with one prediction per row; empty when ``X`` has no rows.
        """
        n_rows = X.shape[0]
        m = int(n_rows ** 0.5)
        pred = []
        for i in range(m):
            chunk = X[i * n_rows // m:(i + 1) * n_rows // m]
            if hasattr(chunk, "toarray"):
                chunk = chunk.toarray()
            pred.extend(self.clf.predict(chunk))
        return pred
def do_training(processed_train_csv_file):
    """Train and persist random-forest, gradient-boosting and SGD classifiers.

    The CSV is loaded, infinities and NaNs are replaced by 0, the rows are
    shuffled, and ``booking_bool`` is used as the target with every other
    column as a feature.  Each fitted model is stored via
    ``data_io.save_model``.

    ``.loc`` replaces the ``.ix`` indexer, which has been removed from pandas;
    the shuffled list holds index labels, so the selection is unchanged.

    Args:
        processed_train_csv_file: Path of the processed training CSV.
    """
    # Read the previously saved processed training samples.
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    # inf -> nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan -> 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # The samples were sorted earlier; shuffling them here gives better results.
    random.shuffle(processed_train_samples_index_lst)

    # Organize the new training samples and targets.
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier
    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
def GN(pth):
    """Train a gradient-boosting model on saved features and dump it to disk.

    Loads ``training_features.npy`` from ``pth``, standardizes the features,
    fits a ``GradientBoostingClassifier`` against the module-level
    ``train_labels``, stores ``(model, img_classes, scaler)`` in
    ``gn-bof.pkl`` under ``pth`` and finally calls ``test(pth, "gn-")``.

    Args:
        pth: Directory prefix (joined with ``/``) for the input and output files.
    """
    feature_file = pth + '/training_features.npy'
    train_desc = np.load(feature_file)

    # Inverse-document-frequency style weights.
    # NOTE(review): `idf` is computed but not used further in this function.
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    sample_count = 1.0 * len(image_paths) + 1
    occurrence_count = 1.0 * nbr_occurences + 1
    idf = np.array(np.log(sample_count / occurrence_count), 'float32')

    # Scaling the words
    stdSlr = StandardScaler()
    stdSlr = stdSlr.fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelGN = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
        random_state=0,
    )
    targets = np.array(train_labels)
    modelGN.fit(train_desc, targets)

    bundle = (modelGN, img_classes, stdSlr)
    joblib.dump(bundle, pth + "/gn-bof.pkl", compress=3)
    test(pth, "gn-")
def classify_gbc(data_sets, label_sets):
    """Fit a gradient-boosting classifier with fixed hyperparameters.

    Args:
        data_sets: Training features.
        label_sets: Training labels.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    clf = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=4,
        min_samples_split=2,
        learning_rate=0.01,
        loss='deviance',
        verbose=0,
    )
    clf.fit(data_sets, label_sets)
    return clf
class MyGradientBoostingClassifier(BaseClassifier):
    """``BaseClassifier`` adapter around ``GradientBoostingClassifier``.

    The instance name encodes the tree count, depth and leaf-size settings.
    """

    def __init__(self, verbose=1, n_estimators=5, max_depth=6, min_samples_leaf=100):
        self.classifier = GradientBoostingClassifier(
            verbose=verbose,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
        )
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the descriptive model name built in the constructor."""
        return self.name

    def fit(self, X, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        fitted = self.classifier.fit(X, y)
        return fitted

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        probabilities = self.classifier.predict_proba(X)
        return probabilities

    def get_feature_importances(self, feat_names):
        """Map each name in ``feat_names`` to its feature importance."""
        importances = self.classifier.feature_importances_
        return {name: weight for name, weight in zip(feat_names, importances)}
def trainGBT(requestsQ, responsesQ):
    """Worker loop that trains gradient-boosting models on request.

    Each request is a sequence whose first item is a command.  ``'KILL'``
    ends the loop; otherwise item 1 is a mapping with ``Xtrain``, ``Ytrain``,
    ``Xtest`` and ``Ytest`` and item 2 holds the hyperparameters.  For every
    request a ``(model, score)`` tuple is put on ``responsesQ``.

    Args:
        requestsQ: Queue-like object the requests are read from with ``get()``.
        responsesQ: Queue-like object the results are written to with ``put``.

    Returns:
        ``0`` once a ``'KILL'`` request has been received.
    """
    while True:
        request = requestsQ.get()
        if request[0] == 'KILL':
            return 0

        data = request[1]
        # expected in the order of learningRate, maxTrees, minSplitSize, maxDepth
        settings = request[2]
        booster = GradientBoostingClassifier(
            learning_rate=settings[0],
            n_estimators=settings[1],
            min_samples_split=settings[2],
            max_depth=settings[3],
        )
        booster.fit(data['Xtrain'], data['Ytrain'])

        test_predictions = booster.predict(data['Xtest'])
        test_accuracy = accuracy_score(data['Ytest'], test_predictions)
        responsesQ.put((booster, test_accuracy), True)
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"):
    """Train a local gradient-boosting baseline and report AUC on guest data.

    Guest and host CSV files (resolved against the configured data base
    directory) are stacked row-wise for training; the AUC is measured on the
    guest rows only.

    Args:
        config: Path given to ``JobConfig.load_from_file`` (then read with
            ``config["data_base_dir"]``) or an object exposing a
            ``data_base_dir`` attribute.
        param: Path given to ``JobConfig.load_from_file`` or a loaded mapping
            providing ``data_guest``, ``data_host``, ``idx`` and
            ``label_name``.

    Returns:
        ``({}, {"auc": auc_score})`` on success, or ``None`` when the AUC
        cannot be computed.
    """
    import time

    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1,
    )
    clf.fit(X, y)

    # AUC is a ranking metric, so score with the probability of the second
    # (greater-label) class rather than with hard 0/1 predictions.
    y_prob = clf.predict_proba(X_guest)[:, 1]
    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except Exception:
        # Best-effort: e.g. a single-class or multiclass target has no
        # binary AUC.  KeyboardInterrupt/SystemExit are no longer swallowed.
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    print(data_guest)
    # NOTE(review): the reason for this pause is not visible here; kept as-is.
    time.sleep(3)
    return {}, result
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    """Train a local gradient-boosting baseline on joined data; report accuracy.

    The guest frame is joined column-wise with the host frame (clashing host
    columns get the suffix ``host``); the model is fitted and scored on that
    same joined frame.

    Args:
        config: Path given to ``JobConfig.load_from_file`` (then read with
            ``config["data_base_dir"]``) or an object exposing a
            ``data_base_dir`` attribute.
        param: Path given to ``JobConfig.load_from_file`` or a loaded mapping
            providing ``data_guest``, ``data_host``, ``idx`` and
            ``label_name``.

    Returns:
        A ``({}, result)`` tuple where ``result`` is ``{"accuracy": acc}``.
    """
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(random_state=0, n_estimators=50, learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # The AUC value itself is not reported; the call only decides whether the
    # "no auc score available" notice is printed.
    try:
        roc_auc_score(y, y_pred)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        print("no auc score available")

    acc = accuracy_score(y, y_pred)
    result = {"accuracy": acc}
    print('multi result', result)
    return {}, result
def classify_gbc(data_sets, label_sets):
    """Run the grid search, then fit a GBC with fixed hyperparameters.

    ``grid_search`` is invoked first (its return value is not used), after
    which a classifier with hard-coded settings is trained and its score on
    the training data is printed.

    Args:
        data_sets: Training features.
        label_sets: Training labels.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    # Grid search for the best GBC hyperparameters.
    grid_search(data_sets, label_sets)

    # Best parameters found by the grid CV search: 100 estimators, 0.52
    # learning rate (the earlier baseline used a learning rate of 0.01).
    clf = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=4,
        min_samples_split=2,
        learning_rate=0.52,
        loss='deviance',
        verbose=0,
    )
    clf.fit(data_sets, label_sets)
    training_score = clf.score(data_sets, label_sets)
    print(training_score)
    return clf
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    """Train a local gradient-boosting baseline and report guest accuracy.

    Guest and host CSV files (resolved against the configured data base
    directory) are stacked row-wise for training; accuracy is measured on the
    guest rows only.

    Args:
        config: Path given to ``JobConfig.load_from_file`` (then read with
            ``config["data_base_dir"]``) or an object exposing a
            ``data_base_dir`` attribute.
        param: Path given to ``JobConfig.load_from_file`` or a loaded mapping
            providing ``data_guest``, ``data_host``, ``idx`` and
            ``label_name``.

    Returns:
        A ``({}, result)`` tuple where ``result`` is ``{"accuracy": acc}``.
    """
    # Resolve the job parameters.
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    guest_file = param["data_guest"]
    host_file = param["data_host"]
    index_column = param["idx"]
    target_column = param["label_name"]

    # Resolve the data base directory.
    print('config is {}'.format(config))
    if not isinstance(config, str):
        base_dir = config.data_base_dir
    else:
        config = JobConfig.load_from_file(config)
        base_dir = config["data_base_dir"]
        print('data base dir is', base_dir)

    # Load both parties' data and stack the rows for training.
    guest_frame = pd.read_csv(os.path.join(base_dir, guest_file), index_col=index_column)
    host_frame = pd.read_csv(os.path.join(base_dir, host_file), index_col=index_column)
    training_frame = pd.concat([guest_frame, host_frame], axis=0)
    train_labels = training_frame[target_column]
    train_features = training_frame.drop(target_column, axis=1)
    guest_features = guest_frame.drop(target_column, axis=1)
    guest_labels = guest_frame[target_column]

    booster = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3)
    booster.fit(train_features, train_labels)

    # Score on the guest partition only.
    guest_predictions = booster.predict(guest_features)
    result = {"accuracy": accuracy_score(guest_labels, guest_predictions)}
    print(result)
    return {}, result
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Train GBDT + LR on ``sample_libsvm_data.txt`` and round-trip the LR model.

    A GBDT is fitted, its leaf indices are one-hot encoded and combined with
    the raw features, a logistic regression is fitted on the combination,
    pickled to ``finalized_model.sav``, reloaded, and the reloaded model's
    positive-class probabilities for the held-out 10% are printed.

    The pickle file is now opened with ``with`` so both handles are closed
    deterministically (they were previously left open).

    Args:
        Train_tab: Not used by this method.
        Train_libsvm: Not used by this method.
    """
    # Load the sample data.
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # Split into train / test parts.
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.1, random_state=42)

    # Define and train the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Encode the original features by the leaf reached in each tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices of train and test together.
    train_rows = X_train_leaves.shape[0]
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Logistic regression on the combined (encoded + raw) features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    lr.fit(X_train_ext, y_train)

    # Persist the model, then load it back from disk before predicting.
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(lr, model_file)
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
def gbm_model_train(self, train, targets, run_gs):
    """Cross-validate a GBM, or grid-search a random forest instead.

    Args:
        train: Training features.
        targets: Training labels.
        run_gs: When it compares equal to ``False`` a default gradient
            boosting model is fitted and its 5-fold cross-validation mean is
            printed; otherwise a 10-fold grid search over random-forest
            settings is run and its best score and parameters are printed.
    """
    # The equality test (rather than `not run_gs`) is kept on purpose so that
    # values such as None still select the grid-search path.
    if run_gs == False:
        booster = GradientBoostingClassifier(random_state=0)
        booster.fit(train, targets)
        fold_scores = cross_val_score(booster, train, targets, cv=5)
        print('gradient boosting cross validation score is ', fold_scores.mean())
        return

    # Grid search CV with a random forest classifier.
    forest = RandomForestClassifier(random_state=0)
    search_space = dict(
        max_depth=[6, 8, 10],
        n_estimators=[50, 100, 200, 400],
        max_features=['sqrt', 'auto', 'log2'],
        min_samples_split=[3, 5, 10],
        min_samples_leaf=[5, 10, 15],
        bootstrap=[True, False],
        criterion=['gini', 'entropy'],
    )
    searcher = GridSearchCV(forest, param_grid=search_space, scoring='accuracy', cv=10)
    fitted_search = searcher.fit(train, targets)
    print(fitted_search.best_score_)
    print(fitted_search.best_params_)
def main(param=""):
    """Train a local gradient-boosting baseline and report AUC on guest data.

    Guest and host CSV files are stacked row-wise for training; the AUC is
    measured on the guest rows only.

    Args:
        param: Either a path given to ``JobConfig.load_from_file`` or an
            already-loaded mapping providing ``data_guest``, ``data_host``,
            ``idx`` and ``label_name``.  A ``data_test`` entry is tolerated
            but no longer read, since its value was never used.

    Returns:
        ``({}, {"auc": auc_score})`` on success, or ``None`` when the AUC
        cannot be computed.
    """
    import time

    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1,
    )
    clf.fit(X, y)

    # AUC is a ranking metric, so score with the probability of the second
    # (greater-label) class rather than with hard 0/1 predictions.
    y_prob = clf.predict_proba(X_guest)[:, 1]
    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except Exception:
        # Best-effort: e.g. a single-class or multiclass target has no
        # binary AUC.  KeyboardInterrupt/SystemExit are no longer swallowed.
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    print(data_guest)
    # NOTE(review): the reason for this pause is not visible here; kept as-is.
    time.sleep(3)
    return {}, result
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train, y_test):
    """Fit a 100-tree gradient-boosting classifier and score it on test data.

    The Python 2 ``print`` statements were replaced by single-argument
    ``print(...)`` calls, which emit the same text on Python 2 and Python 3.

    Args:
        X_train_preprocessed: Training features.
        X_test_preprocessed: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(classifier, acc)``: the fitted model and its test accuracy.
    """
    # TODO: test hyperparameters and add cross-validation
    print('Applying Gradient Boosting')

    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)

    # Testing the classifier on test data
    y_test_pred = classifier.predict(X_test_preprocessed)

    # Compute the accuracy score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)

    # "%s %s" reproduces the space the old print statement put between items.
    print('%s %s' % ('The accuracy achieved by the Gradient Boosting Classifier Model is: ', acc))

    return classifier, acc
class MyGradientBoostingClassifier(BaseClassifier):
    """``BaseClassifier`` adapter around ``GradientBoostingClassifier``.

    The instance name encodes the tree count, depth and leaf-size settings.
    """

    def __init__(self, verbose=1, n_estimators=200, max_depth=8, min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier(
            verbose=verbose,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
        )
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the descriptive model name built in the constructor."""
        return self.name

    def fit(self, X, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        fitted = self.classifier.fit(X, y)
        return fitted

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        probabilities = self.classifier.predict_proba(X)
        return probabilities

    def get_feature_importances(self):
        """Return the wrapped classifier's ``feature_importances_``."""
        importances = self.classifier.feature_importances_
        return importances
n_features=20, n_informative=18, n_redundant=2, n_classes=2, n_clusters_per_class=3, random_state=2017) # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) # 不生成新的特征,直接训练 clf = GradientBoostingClassifier(n_estimators=50) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) y_prob = clf.predict_proba(X_test)[:, 1] acc = accuracy_score(y_test, y_pred) auc = roc_auc_score(y_test, y_prob) print("Original featrues") print("GBDT_ACC: {:.6f}".format(acc)) print("GBDT_AUC: {:.6f}".format(auc)) # 生成的新特征, apply方法返回每个样本在每颗树叶节点的索引矩阵 X_train_leaves = clf.apply(X_train)[:, :, 0] X_test_leaves = clf.apply(X_test)[:, :, 0] # 将X_train_leaves, X_test_leaves在axis=0方向上合并,再进行OneHotEncoder操作 All_leaves = np.r_[X_train_leaves, X_test_leaves]
# Test log-loss curve per learning rate, filled in by plot_score().
looses = {}


def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    """Plot per-stage test and train log-loss and remember the test curve.

    The test curve is drawn solid and the train curve dashed, both in
    ``color``; the test losses are stored in ``looses[learning_rate]``.

    Args:
        test_predictions: Iterable of per-stage predictions for the test set.
        y_test: Test labels.
        train_predictions: Iterable of per-stage predictions for the train set.
        y_train: Train labels.
        color: Matplotlib colour code used for both curves.
        learning_rate: Key under which the test losses are stored.
    """
    test_loss = []
    for stage_prediction in test_predictions:
        test_loss.append(log_loss(y_test, stage_prediction))
    train_loss = []
    for stage_prediction in train_predictions:
        train_loss.append(log_loss(y_train, stage_prediction))

    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color + '--', linewidth=2)
    looses[learning_rate] = test_loss


plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
learn_rates = [1, 0.5, 0.3, 0.2, 0.1]

# Refit the same classifier once per learning rate and plot its staged losses.
for index, learning_rate in enumerate(learn_rates):
    clf.learning_rate = learning_rate
    clf.fit(X_train, y_train)
    test_predictions = clf.staged_predict_proba(X_test)
    train_predictions = clf.staged_predict_proba(X_train)
    plot_score(
        test_predictions,
        y_test,
        train_predictions,
        y_train,
        color=colors[index],
        learning_rate=learning_rate,
    )

# Two legend entries (test, train) per learning rate, in plotting order.
legends = [
    label
    for learn_rate in learn_rates
    for label in ("Test {}".format(learn_rate), "Train {}".format(learn_rate))
]
plt.legend(legends)
plt.savefig("coursera_out/gradient_boosting.png")

# Report the best stage for learning rate 0.2.
min_loss_on_iteration = np.argmin(looses[0.2])
min_loss = looses[0.2][min_loss_on_iteration]
print("on iteration {} was loose {}".format(min_loss_on_iteration, min_loss))
coursera.output(
    "min_loose_on_0.2.txt",
    "{:.2f} {}".format(min_loss, min_loss_on_iteration),
)
import numpy as np
# Import from the public package path; the private
# ``sklearn.ensemble.gradient_boosting`` module no longer exists in current
# scikit-learn releases.
from sklearn.ensemble import GradientBoostingClassifier
from numpy.ma.testutils import assert_array_almost_equal

# Create some data: a thresholded linear model with Gaussian noise.
m = 10000
X = np.random.normal(size=(m, 10))
thresh = np.random.normal(size=10)
X_transformed = X * (X > thresh)
beta = np.random.normal(size=10)
y = (np.dot(X_transformed, beta) + np.random.normal(size=m)) > 0

# Train a gradient boosting classifier
model = GradientBoostingClassifier()
model.fit(X, y)
# print() with one argument behaves identically on Python 2 and Python 3.
print(model.score(X, y))

# Inspect: rebuild the predicted probabilities from the individual stages
# (learning rate times the summed tree outputs, plus the initial estimate).
# NOTE(review): relies on the private `loss_._score_to_proba` helper, which
# depends on the installed scikit-learn version -- confirm before upgrading.
pred = model.predict_proba(X)
stage_sum = sum(est.predict(X) for est in model.estimators_[:, 0])
approx = model.loss_._score_to_proba(
    model.learning_rate * stage_sum + np.ravel(model.init_.predict(X)))
assert_array_almost_equal(pred, approx)
def gbdt_lr_train(libsvmFileName):
    """Compare GBDT, plain LR, LR on GBDT leaves and LR on combined features.

    Loads a libsvm file, holds out 30% of it for testing, and prints shapes,
    the number of distinct leaves per tree, and the test AUC of four models.
    Nothing is returned.

    The Python 2 ``print`` statements were replaced by single-argument
    ``print(...)`` calls that emit the same text on Python 2 and Python 3,
    and the hand-rolled per-tree leaf count now uses a set.

    Args:
        libsvmFileName: Path of the libsvm-format sample file.
    """
    # Load the sample data.
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Split into train / test parts.
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)
    # "%s %s" reproduces the space the old print statement put between items.
    print("%s %s" % ("train data shape: ", X_train.shape))

    # Train the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Predict and evaluate AUC.
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Logistic regression on the original features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode the original features by the leaf reached in each tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("%s %s" % ("gbdt leaves shape: ", X_train_leaves.shape))

    # Report how many distinct leaves each tree actually uses on the train set.
    for i in range(len(X_train_leaves[0])):
        distinct_leaves = set(X_train_leaves[:, i].tolist())
        print("F%d: %d" % (i, len(distinct_leaves)))

    # One-hot encode the leaf indices of train and test together.
    train_rows = X_train_leaves.shape[0]
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("%s %s" % ("gbdt oneHot shape: ", X_trans.shape))
    print("%s %s" % ("oneHot leaves: ", X_trans[0]))

    # Logistic regression on the encoded GBDT features.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # Logistic regression on the combined (encoded + original) features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("%s %s" % ("gbdt leaves cross", X_train_ext.shape))
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
from sklearn.preprocessing import LabelEncoder


def data_process(data):
    """Label-encode the columns ``V2``, ``V4`` and ``V5`` of ``data`` in place.

    One encoder object is reused and re-fitted for each column in turn.
    """
    encoder = LabelEncoder()
    for column in ('V2', 'V4', 'V5'):
        data[column] = encoder.fit_transform(data[column])


data_process(train_agg)
data_process(test_agg)

# Drop intermediate objects that are no longer needed.
del a, gp, gp_day_mean, gp_day_var, gp1, gp2, gp3, gp4, index1, l, m1, m2, m3, merge_log, ss, ss2, t1, t2, t3, train_flg

# Use a GBDT to construct new features.
gbdt = GradientBoostingClassifier(
    loss='exponential',
    learning_rate=0.12,
    n_estimators=60,
    max_depth=3,
    random_state=42,
    max_features=None,
)
X_train = train_agg.drop(['USRID', 'FLAG'], axis=1)
y_train = train_agg['FLAG']

# Fit the model.
gbdt.fit(X_train, y_train)

# Encode the original features by the leaf reached in each tree.
X_train_leaves = gbdt.apply(X_train)[:, :, 0]
X_test_leaves = gbdt.apply(test_agg.drop('USRID', axis=1))[:, :, 0]

# One-hot encode the leaf indices of train and test together.
(train_rows, cols) = X_train_leaves.shape
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(
    np.concatenate((X_train_leaves, X_test_leaves), axis=0))

# Combine the encoded leaves with the original columns.
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())

# NOTE(review): 494 / 495 are presumably the positions where USRID and FLAG
# land after the stacking above -- confirm against the actual column count.
X_train_agg.rename(columns={494: "USRID", 495: "FLAG"}, inplace=True)
X_test_agg.rename(columns={494: "USRID"}, inplace=True)
# training set and test set
def inject_bag_of_words(X, features, n_heroes=112):
    """Append a signed bag-of-words encoding of the picked heroes to ``X``.

    For every match (row of ``features``) a vector of length ``n_heroes`` is
    built in which the entry for each hero in columns ``r1_hero``..``r5_hero``
    is set to 1 and the entry for each hero in ``d1_hero``..``d5_hero`` is
    set to -1 (hero ids are 1-based).

    ``.loc`` replaces the ``.ix`` indexer, which has been removed from
    pandas; the lookup is by index label either way.

    Args:
        X: 2-D array-like with one row per row of ``features``.
        features: DataFrame holding the ``r{n}_hero`` / ``d{n}_hero`` columns.
        n_heroes: Width of the hero encoding; defaults to the previously
            hard-coded 112.

    Returns:
        ``X`` with the hero columns concatenated on the right.
    """
    X_pick = np.zeros((features.shape[0], n_heroes))
    for i, match_id in enumerate(features.index):
        for p in range(1, 6):
            X_pick[i, features.loc[match_id, 'r{}_hero'.format(p)] - 1] = 1
            X_pick[i, features.loc[match_id, 'd{}_hero'.format(p)] - 1] = -1
    return np.concatenate([X, X_pick], axis=1)


X = inject_bag_of_words(X, features)
clf, scaler = train_logistic(X, y, 'With Bag of Words')

# Final test probabilities: refit on all training data, then score the
# test set after giving it the same preprocessing.
clf.fit(scaler.transform(X), y)
test_features = pandas.read_csv('features_test.csv', index_col='match_id')
X_test = test_features.drop(category_features, axis=1)
X_test = X_test.fillna(0)
X_test = inject_bag_of_words(X_test, test_features)
X_test = scaler.transform(X_test)
proba = clf.predict_proba(X_test)[:, 1]
print("Proba min: {}".format(proba.min()))
print("Proba max: {}".format(proba.max()))
temp=groups[f].median() for i in range(0,768): if (dataset.loc[i,f]==0) & (dataset.loc[i,'outcome']==0): dataset.loc[i,f]=temp[0] if (dataset.loc[i,f]==0) & (dataset.loc[i,'outcome']==1): dataset.loc[i,f]=temp[1] dataset = dataset.values X = dataset[:,0:len(dataset[0]) -1] Y = dataset[:, (len(dataset[0])-1)] #this is for decision tree data=[[0,0,0,0,0]] df=pd.DataFrame(data,columns=['feats','depth','split','max_leaf','acc']) for feats in range(2, 7): for dept in range(2, 6): acc = 0 for split in range(5,40,5): for leaf in range(7,10): for i in range(20): X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3) classifier= GradientBoostingClassifier(min_samples_split=split,max_depth=dept,max_features=feats,max_leaf_nodes=leaf) classifier.fit(X_train, Y_train) res = classifier.score(X_test, Y_test) acc = acc + res acc = acc / 20 print('feats:', feats, 'Depth:', dept,'split:',split,'max_leaf',leaf, 'acc:', acc*100) df=df.append({'feats':feats,'depth':dept,'split':split,'max_leaf':leaf,'acc':acc},ignore_index=True) df.to_csv('xgboost.csv', sep=',')
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline):
    """Train GBDT + LR models and either report log-loss or write a submission.

    Offline (``isOnline == False``) the log-loss of each model on ``test`` is
    printed.  Online, the GBDT is scored on the last 57562 training rows and
    the final model's predictions for ``test`` are written to
    ``../baseline_<name>.csv``.

    Args:
        train: Training DataFrame.
        test: Test DataFrame; gains a ``predicted_score`` column when online.
        gbdt_features: Columns fed to the GBDT.
        lr_features: Columns appended to the encoded leaves for the final LR.
        target: Name of the label column.
        name: Suffix used in the submission file name.
        isOnline: ``False`` for offline evaluation, otherwise online mode.
    """
    offline = isOnline == False

    # Define and fit the GBDT model.
    # (an earlier setting was n_estimators=20, max_depth=3, verbose=0, max_features=0.5)
    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    gbdt.fit(train[gbdt_features], train[target])

    # Evaluate the GBDT: on the test set offline, on the training tail online.
    if offline:
        eval_features = test[gbdt_features]
        eval_labels = test[target]
    else:
        eval_features = train[gbdt_features].tail(57562)
        eval_labels = train[target].tail(57562)
    y_pred_gbdt = gbdt.predict_proba(eval_features)[:, 1]
    gbdt_test_log_loss = log_loss(eval_labels, y_pred_gbdt)
    print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # Encode the original features by the leaf reached in each tree.
    train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]

    # One-hot encode the leaf indices of train and test together.
    train_rows = train_leaves.shape[0]
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((train_leaves, test_leaves), axis=0))
    encoded_train = X_trans[:train_rows, :]
    encoded_test = X_trans[train_rows:, :]

    # Logistic regression on the encoded GBDT features only.
    leaf_lr = LogisticRegression()
    leaf_lr.fit(encoded_train, train[target])
    if offline:
        y_pred_gbdtlr1 = leaf_lr.predict_proba(encoded_test)[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR log_loss: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # Logistic regression on the encoded leaves combined with lr_features.
    lr = LogisticRegression()
    X_train_ext = hstack([encoded_train, train[lr_features]])
    X_test_ext = hstack([encoded_test, test[lr_features]])
    print("gbdt output", encoded_train.shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)
    lr.fit(X_train_ext, train[target])

    if offline:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('基于组合特征的LR log_loss: %.5f' % gbdt_lr_test_log_loss2)
        return

    print('Online')
    test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
    print(test['predicted_score'].head(5))
    print(len(test))
    # Save the online submission result.
    submission_path = '../baseline_' + name + '.csv'
    test[['instance_id', 'predicted_score']].to_csv(submission_path, index=False, sep=' ')
    print('Saved result success!')
class Predict():
    """GBDT + logistic-regression scorer trained on a small embedded data set.

    Construction fits both models immediately; ``Predict`` then scores new
    rows with the fitted pipeline (GBDT leaves -> one-hot -> LR).
    """

    def __init__(self):
        self.gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
        self.lr = LogisticRegression(n_jobs=-1)
        # Labels are positional: entry i of Train_tab labels row i of Train_libsvm.
        Train_tab = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        Train_libsvm = [
            [1, 1, 1, 1, 1, 1], [2, 2, 2, 1, 2, 2], [1, 1, 1, 1, 3, 1], [2, 2, 2, 1, 4, 1], [3, 3, 2, 1, 5, 2],
            [2, 2, 2, 1, 6, 1], [4, 4, 3, 1, 6, 2], [5, 5, 3, 1, 7, 2], [2, 2, 2, 1, 8, 1], [2, 2, 2, 1, 6, 1],
            [2, 2, 2, 1, 9, 2], [6, 6, 2, 1, 8, 3], [1, 1, 1, 1, 10, 1], [2, 2, 2, 1, 4, 2], [2, 2, 2, 1, 4, 1],
            [2, 2, 2, 1, 10, 2], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 12, 1], [2, 2, 2, 1, 2, 1],
            [5, 5, 3, 1, 13, 2], [2, 2, 2, 1, 14, 1], [7, 7, 2, 1, 15, 2], [1, 1, 1, 1, 16, 1], [1, 1, 1, 1, 8, 1],
            [1, 1, 1, 1, 17, 1], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 19, 2], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 20, 1],
            [2, 2, 2, 1, 10, 1], [2, 2, 2, 1, 14, 2], [5, 5, 3, 1, 15, 2], [5, 5, 3, 1, 21, 2], [2, 2, 2, 1, 21, 1],
            [1, 1, 1, 1, 22, 1], [6, 6, 2, 1, 5, 2], [2, 2, 2, 1, 1, 2], [8, 8, 2, 1, 15, 3], [4, 4, 3, 1, 23, 2],
            [9, 9, 2, 2, 6, 2], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 10, 2], [5, 5, 3, 1, 24, 2], [2, 2, 2, 1, 20, 1],
            [2, 2, 2, 1, 8, 1], [5, 5, 3, 1, 2, 2], [6, 6, 2, 1, 3, 3], [1, 1, 1, 1, 19, 1], [2, 2, 2, 1, 12, 2],
            [2, 2, 2, 1, 25, 1], [1, 1, 1, 1, 2, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 10, 1], [1, 1, 1, 1, 21, 1],
            [2, 2, 2, 1, 14, 2], [1, 1, 1, 1, 19, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 9, 1], [2, 2, 2, 1, 20, 2],
            [2, 2, 2, 1, 4, 2], [1, 1, 1, 1, 4, 1], [2, 2, 2, 1, 26, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 4, 2],
            [2, 2, 2, 1, 23, 1], [5, 5, 3, 1, 13, 2], [3, 3, 2, 1, 22, 2], [2, 2, 2, 1, 11, 2], [2, 2, 2, 1, 1, 2],
            [2, 2, 2, 1, 9, 1], [1, 1, 1, 1, 9, 1], [2, 2, 2, 1, 12, 2], [2, 2, 2, 1, 20, 1], [2, 2, 2, 1, 1, 2],
            [1, 1, 1, 1, 14, 1], [10, 10, 2, 1, 23, 3], [5, 5, 3, 1, 21, 2], [1, 1, 1, 1, 1, 1], [2, 2, 2, 1, 19, 2],
            [1, 1, 1, 1, 23, 1], [2, 2, 2, 1, 20, 1], [1, 1, 1, 1, 14, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 19, 1],
            [5, 5, 3, 1, 19, 2], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 14, 1], [11, 11, 2, 1, 10, 1], [2, 2, 2, 1, 14, 2],
            [1, 1, 1, 1, 22, 1], [9, 9, 2, 2, 27, 2], [4, 4, 3, 1, 1, 2], [4, 4, 3, 1, 12, 2], [2, 2, 2, 1, 6, 1],
            [4, 4, 3, 1, 8, 2], [1, 1, 1, 1, 16, 1], [1, 1, 1, 1, 28, 1], [2, 2, 2, 1, 15, 2], [1, 1, 1, 1, 3, 1],
            [2, 2, 2, 1, 14, 1], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 24, 2], [2, 2, 2, 1, 23, 1], [2, 2, 2, 1, 8, 1],
            [2, 2, 2, 1, 21, 2], [6, 6, 2, 1, 6, 2], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 12, 1], [5, 5, 3, 1, 23, 2],
            [1, 1, 1, 1, 29, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 2, 2], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 30, 1],
            [2, 2, 2, 1, 8, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 23, 2], [5, 5, 3, 1, 9, 2], [4, 4, 3, 1, 1, 2],
            [9, 9, 2, 2, 19, 2], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 1, 2], [10, 10, 2, 1, 30, 1], [9, 9, 2, 2, 24, 2],
            [5, 5, 3, 1, 14, 2], [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 22, 2], [2, 2, 2, 1, 26, 1], [2, 2, 2, 1, 14, 1],
            [2, 2, 2, 1, 1, 1], [4, 4, 3, 1, 2, 2], [3, 3, 2, 1, 29, 2], [2, 2, 2, 1, 6, 2], [2, 2, 2, 1, 9, 2],
            [2, 2, 2, 1, 16, 2], [5, 5, 3, 1, 13, 2], [13, 13, 2, 1, 3, 2], [2, 2, 2, 1, 27, 1], [2, 2, 2, 1, 1, 2],
            [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 29, 2], [3, 3, 2, 1, 12, 2], [2, 2, 2, 1, 2, 2],
            [2, 2, 2, 1, 5, 1], [5, 5, 3, 1, 28, 2], [6, 6, 2, 1, 22, 3], [1, 1, 1, 1, 5, 1], [1, 1, 1, 1, 2, 1],
            [2, 2, 2, 1, 21, 2], [2, 2, 2, 1, 1, 1], [2, 2, 2, 1, 19, 1], [2, 2, 2, 1, 4, 1], [4, 4, 3, 1, 11, 2],
            [2, 2, 2, 1, 4, 2], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 18, 1], [1, 1, 1, 1, 23, 1], [9, 9, 2, 2, 25, 2],
            [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 5, 1], [10, 10, 2, 1, 2, 3], [2, 2, 2, 1, 9, 2], [2, 2, 2, 1, 14, 2],
            [1, 1, 1, 1, 26, 1], [1, 1, 1, 1, 3, 1], [14, 14, 2, 1, 23, 2], [4, 4, 3, 1, 2, 2], [2, 2, 2, 1, 23, 2],
        ]
        self.gbdt_lr_train(Train_tab, Train_libsvm)

    def gbdt_lr_train(self, Train_tab, Train_libsvm):
        """Fit the GBDT, the leaf one-hot encoder and the LR on a 90% split.

        Args:
            Train_tab: Labels, one per row of ``Train_libsvm``.
            Train_libsvm: Feature rows.
        """
        # Keep the same deterministic 90% training subset as before; the
        # held-out 10% is not needed for fitting.
        X_train, _, y_train, _ = train_test_split(Train_libsvm, Train_tab, test_size=0.1, random_state=42)

        self.gbdt.fit(X_train, y_train)

        # Encode the original features as GBDT leaf indices.
        self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
        self.train_rows = self.X_train_leaves.shape[0]

        # Fit the encoder ONCE and keep it, so prediction uses exactly the
        # column layout the LR was trained on. Leaves never seen during
        # fitting encode as all-zero instead of adding or shifting columns.
        self.gbdtenc = OneHotEncoder(handle_unknown='ignore')
        X_train_onehot = self.gbdtenc.fit_transform(self.X_train_leaves)

        # LR on encoded leaves stacked with the raw features.
        X_train_ext = hstack([X_train_onehot, X_train])
        self.lr.fit(X_train_ext, y_train)

    def Predict(self, X_test):
        """Return the positive-class probability for every row of ``X_test``.

        Args:
            X_test: Feature rows laid out like the training rows.

        Returns:
            list: One probability per input row.
        """
        X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
        # Transform with the encoder fitted during training (never re-fit here).
        # X_trans keeps its previous shape: training rows first, then X_test rows.
        self.X_trans = self.gbdtenc.transform(np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
        X_test_ext = hstack([self.X_trans[self.train_rows:, :], X_test])
        return list(self.lr.predict_proba(X_test_ext)[:, 1])
# Report for the SVM model (SVM is expected to be defined and fitted above this point).
y_pred1 = SVM.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred1))
print(accuracy_score(y_true=y_test, y_pred=y_pred1))

# Random forest: fit on the training split, report on the test split.
print("RandomForrest Classifier Results are as following")
rfc = RandomForestClassifier(n_estimators=200, max_depth=4)
rfc.fit(X_train, y_train)
y_pred2 = rfc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred2))
print(accuracy_score(y_true=y_test, y_pred=y_pred2))

# Multi-layer perceptron.
print("Neural Network Classifier Results are as following")
mlp = MLPClassifier(max_iter=500)
mlp.fit(X_train, y_train)
y_pred3 = mlp.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred3))
print(accuracy_score(y_true=y_test, y_pred=y_pred3))

# Gradient boosting with default hyper-parameters.
print("GradientBoosting Classifier Results are as following")
grd = GradientBoostingClassifier()
grd.fit(X_train, y_train)
y_pred4 = grd.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred4))
print(accuracy_score(y_true=y_test, y_pred=y_pred4))
# <codecell>

# Shuffle features and labels together, then keep the last 10% of rows for testing.
X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, X_test = X[:offset], X[offset:]
y_train, y_test = y[:offset], y[offset:]

# <codecell>

# Hyper-parameters handed to the gradient boosting classifier.
params = dict(
    init=LogOddsEstimator(),
    n_estimators=5,
    max_depth=6,
    learning_rate=0.1,
    loss='bdeviance',
)
clf = GradientBoostingClassifier(**params)

# <codecell>

clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# <codecell>

clf.feature_importances_

# <codecell>

# Mean squared error between the held-out labels and the predicted classes.
print("Mean Squared Error")
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
print("")

# <codecell>
print('Best parameters: {}'.format(grid_search.best_params_))

# In[192]:

# Accuracy of the random forest model on the test split.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(accuracy_rf)

# In[193]:

# ---- Gradient boosting ----

# In[194]:

gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train, y_train)

# In[195]:

# Accuracy of the gradient boosting model on the test split.
y_pred_gbc = gbc.predict(X_test)
accuracy_gbc = accuracy_score(y_true=y_test, y_pred=y_pred_gbc)
print(accuracy_gbc)

# In[196]:

# ---- Model evaluation ----

# In[197]:

confusion_matrix(y_test, y_pred_lr)
def gbdt_lr_train():
    """Compare GBDT, LR, and two GBDT+LR variants with 5-fold stratified CV.

    Reads the module-level ``X`` and ``y``. For every fold it prints the
    validation AUC of four models: the GBDT, an LR on the raw features, an LR
    on the one-hot encoded GBDT leaves, and an LR on encoded leaves stacked
    with the raw features. Finally it prints the mean AUC of each model.
    """
    gbdt_aucs = []
    raw_lr_aucs = []
    leaf_lr_aucs = []
    combined_lr_aucs = []

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for fit_idx, hold_idx in folds.split(X, y):
        X_train, y_train = X[fit_idx], y[fit_idx]
        X_valid, y_valid = X[hold_idx], y[hold_idx]

        # GBDT on the original features.
        gbdt = GradientBoostingClassifier(n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        gbdt.fit(X_train, y_train)
        gbdt_auc = roc_auc_score(y_valid, gbdt.predict_proba(X_valid)[:, 1])
        print('基于原有特征的gbdt auc: %.5f' % gbdt_auc)
        gbdt_aucs.append(gbdt_auc)

        # LR on the original features.
        raw_lr = LogisticRegression()
        raw_lr.fit(X_train, y_train)
        raw_lr_auc = roc_auc_score(y_valid, raw_lr.predict_proba(X_valid)[:, 1])
        print('基于原有特征的LR AUC: %.5f' % raw_lr_auc)
        raw_lr_aucs.append(raw_lr_auc)

        # Leaf index of every sample in every tree, one-hot encoded. The
        # encoder is fitted on train and validation leaves stacked together.
        train_leaves = gbdt.apply(X_train)[:, :, 0]
        valid_leaves = gbdt.apply(X_valid)[:, :, 0]
        n_fit = train_leaves.shape[0]
        encoder = OneHotEncoder()
        encoded = encoder.fit_transform(np.concatenate((train_leaves, valid_leaves), axis=0))
        enc_train = encoded[:n_fit, :]
        enc_valid = encoded[n_fit:, :]

        # LR on the encoded leaves only.
        leaf_lr = LogisticRegression()
        leaf_lr.fit(enc_train, y_train)
        leaf_lr_auc = roc_auc_score(y_valid, leaf_lr.predict_proba(enc_valid)[:, 1])
        print('基于GBDT特征编码后的LR AUC: %.5f' % leaf_lr_auc)
        leaf_lr_aucs.append(leaf_lr_auc)

        # LR on encoded leaves combined with the original features.
        combined_lr = LogisticRegression(n_jobs=-1)
        train_combined = hstack([enc_train, X_train])
        valid_combined = hstack([enc_valid, X_valid])
        print(train_combined.shape)
        combined_lr.fit(train_combined, y_train)
        combined_auc = roc_auc_score(y_valid, combined_lr.predict_proba(valid_combined)[:, 1])
        print('基于组合特征的LR AUC: %.5f' % combined_auc)
        combined_lr_aucs.append(combined_auc)

    # Mean AUC over the folds for each model.
    mean_raw_lr = np.mean(raw_lr_aucs)
    mean_leaf_lr = np.mean(leaf_lr_aucs)
    mean_combined_lr = np.mean(combined_lr_aucs)
    mean_gbdt = np.mean(gbdt_aucs)

    print("==" * 20)
    print("gbdt原始特征cv_gbdt:", mean_gbdt)
    print("lr原始特征cv_lr:", mean_raw_lr)
    print("lr基于gbdt的特征cv_lr_trans:", mean_leaf_lr)
    print("lr基于gbdt特征个原始特征cv_lr_trans_raw:", mean_combined_lr)
def trainModel(param,feat_folder,feat_name):
    """Cross-validate the model selected by ``param['task']`` over two runs.

    For run 1 and run 2 the train/test files under ``<feat_folder>/run<k>/``
    are loaded, the selected model is fitted on the training file and its
    predictions on the test file are scored with ``accuracy_model`` against
    the ``cid`` column of the module-level ``train`` frame.

    Args:
        param: Hyper-parameter dict. ``param['task']`` selects the model:
            'regression' (xgboost), 'clf_skl_lr', 'reg_skl_rf',
            'reg_skl_etr', 'reg_skl_gbm' or 'reg_skl_ridge'. The remaining
            keys are the ones read by the selected branch.
        feat_folder: Folder holding ``run<k>/train.svm.txt`` and
            ``run<k>/test.svm.txt``.
        feat_name: Not used by this function.

    Returns:
        dict: ``{'loss': -mean accuracy, 'attachments': {'std': ...},
        'status': STATUS_OK}``.

    Raises:
        ValueError: If ``param['task']`` is not one of the supported tasks
            (previously this surfaced as a NameError on ``pred``).
    """
    print('now we read data from folder:%s'%(feat_folder))

    # start cv
    print('now we need to generate cross_validation')
    accuracy_cv = []
    task = param['task']

    for i in range(0,2):
        run = i + 1
        print('this is the run:%d cross-validation'%(run))
        testIndex = loadCVIndex("%s/test.run%d.txt"%("../data/feat/combine",run))
        train_path = "%s/run%d/train.svm.txt"%(feat_folder,run)
        test_path = "%s/run%d/test.svm.txt"%(feat_folder,run)

        if task == 'regression':
            # xgboost reads the svmlight files directly.
            train_data = xgb.DMatrix(train_path)
            valid_data = xgb.DMatrix(test_path)
            watchlist = [(train_data,'train'),(valid_data,'valid')]
            bst = xgb.train(param,train_data,int(param['num_round']),watchlist)
            pred = bst.predict(valid_data)

        elif task == 'clf_skl_lr':
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data,train_label)
            pred = clf.predict(test_data)

        elif task == "reg_skl_rf":
            # regression with sklearn random forest regressor
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            # Fit on the TRAINING labels (was mistakenly test_label).
            rf.fit(train_data, train_label)
            pred = rf.predict(test_data)

        elif task == "reg_skl_etr":
            # regression with sklearn extra trees regressor
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            # Fit on the TRAINING labels (was mistakenly test_label).
            etr.fit(train_data,train_label)
            pred = etr.predict(test_data)

        elif task == 'reg_skl_gbm':
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            # The former `feat_names.remove('cid')` referenced an undefined
            # name; this branch does not use any feature-name list.
            gbm.fit(train_data,train_label)
            pred = gbm.predict(test_data)

        elif task == 'reg_skl_ridge':
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data,train_label)
            predraw = ridge.predict(test_data)
            print(predraw)
            # Rank the raw predictions, then map ranks to scores through the
            # CDF built from the training rows of this run.
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt"%("../data/feat/combine",run))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank,cdf)
            print(pred)

        else:
            raise ValueError("unsupported task: %r" % (task,))

        # now we use the accuracy to judge our model
        acc = accuracy_model(pred,train.iloc[testIndex]['cid'])
        print("the model accurary:%s"%(acc))
        accuracy_cv.append(acc)

    # aggregate accuracy over the runs
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the accuracy for %.6f'%(accuracy_cv_mean))
    return {'loss':-accuracy_cv_mean,'attachments':{'std':accuracy_cv_std},'status': STATUS_OK}
'user_query_day_hour', 'context_page_id', 'hour', 'shop_id', 'shop_review_num_level', 'shop_star_level', 'shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', ] target = ['is_trade'] X_train = train[features] X_test = test[features] Y_train = train[target] # 定义GBDT模型 gbdt = GradientBoostingClassifier(n_estimators=170, min_samples_split=3, min_samples_leaf=8) # 调参之后的GBDT模型 # 训练学习 gbdt.fit(X_train, Y_train) # 预测及AUC评测 Y_predict_gbdt = gbdt.predict_proba(X_test)[:, 1] pd.DataFrame({'instance_id': test['instance_id'], 'predicted_score': Y_predict_gbdt}). \ to_csv('D:\kaggle\\alimm\\baseline_06.csv', index=False, sep=' ')
ax = treeplot.randomforest(model_dt)
tree.plot_tree(model_dt)

# %% RandomForest example
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X, y)

# Plot the forest and export it as PNG, then as PDF.
ax = treeplot.randomforest(model_rf, export='png')
ax = treeplot.randomforest(model_rf, export='pdf')

# %% Gradientboosting example
# Import from the public package path; the private module
# `sklearn.ensemble.gradient_boosting` is not available in current scikit-learn.
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
model_gradientboost = gb.fit(X, y)

ax = treeplot.plot(model_gradientboost)

# %% XGBOOST EXAMPLE
import xgboost as xgb
model_xgb = xgb.XGBClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X, y)

ax = treeplot.plot(model_xgb)
ax = treeplot.xgboost(model_xgb, plottype='vertical')

# %% XGBOOST EXAMPLE
from xgboost import XGBClassifier
model_xgb = XGBClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X, y)