from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.metrics import roc_auc_score as auc
from sklearn.preprocessing import OneHotEncoder as OHE


def gbdt_lr(para):
    print("gbdt_lr")
    x_train, x_train_lr, x_test = para[0], para[1], para[2]
    y_train, y_train_lr, y_test = para[3], para[4], para[5]

    max_leaf_nodes = 11
    gbc = GBDT(max_leaf_nodes=max_leaf_nodes - 1, n_estimators=600,
               min_samples_leaf=5, max_depth=3, learning_rate=0.02,
               subsample=0.2, max_features=0.1)
    gbc.fit(x_train, y_train)

    # One-hot encode the leaf index each tree assigns to a sample. For binary
    # classification, apply() returns shape (n_samples, n_estimators, 1), so
    # the trailing axis must be dropped before encoding.
    ohe = OHE()
    ohe.fit(gbc.apply(x_train)[:, :, 0])
    x_train_lr_gbc = ohe.transform(gbc.apply(x_train_lr)[:, :, 0])
    x_test_gbc = ohe.transform(gbc.apply(x_test)[:, :, 0])

    # Fit a linear model on the encoded leaf features and evaluate with AUC.
    lr = sgd(max_iter=50)  # named n_iter in older scikit-learn releases
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return gbc, yp
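# --- Usage sketch (not from the original source; synthetic data) ---
# gbdt_lr expects a 6-tuple. The training pool is split in half so the GBDT
# learns the leaf encoding on one half while the SGD linear model is fit on
# the held-out half, which avoids leaking the encoding into the linear stage.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_pool, x_test, y_pool, y_test = train_test_split(X, y, test_size=0.3,
                                                  random_state=0)
x_train, x_train_lr, y_train, y_train_lr = train_test_split(
    X_pool, y_pool, test_size=0.5, random_state=0)
gbc, yp = gbdt_lr((x_train, x_train_lr, x_test, y_train, y_train_lr, y_test))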
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.model_selection import cross_val_score


def GBDT_train(train_data, test_data):
    # Column 0 of train_data holds the label; the rest are features.
    train_y = train_data[:, 0]
    train_x = train_data[:, 1:]
    # param_test = {'n_estimators': range(50, 1000, 50)}
    gbdt_model = GBDT(learning_rate=0.05, n_estimators=250, max_leaf_nodes=8,
                      min_samples_split=6, max_depth=3)
    # gsearch = GridSearchCV(estimator=gbdt_model, param_grid=param_test,
    #                        scoring='accuracy', cv=5)
    # gsearch.fit(train_x, train_y)
    # print(gsearch.best_params_, gsearch.best_score_)
    # bagging_gbdt = BaggingClassifier(gbdt_model, max_samples=0.8)
    print("GBDT cross score:")
    print(cross_val_score(gbdt_model, train_x, train_y, cv=5,
                          scoring='accuracy'))
    # print(cross_val_score(bagging_gbdt, train_x, train_y, cv=5))
    gbdt_model.fit(train_x, train_y)
    test_y = gbdt_model.predict(test_data)
    return test_y
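# --- Usage sketch (not from the original source; synthetic data) ---
# Illustrates the layout GBDT_train expects: labels in column 0 of
# train_data, feature-only rows in test_data.
import numpy as np

rng = np.random.RandomState(0)
train_data = np.hstack([rng.randint(0, 2, size=(100, 1)),  # column 0: label
                        rng.randn(100, 5)])                # columns 1+: features
test_data = rng.randn(10, 5)
pred = GBDT_train(train_data, test_data)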
def gbdt_test(para):
    x_train, x_train_lr, x_test = para[0], para[1], para[2]
    y_train, y_train_lr, y_test = para[3], para[4], para[5]

    # Stack both training splits and merge their labels so the GBDT is
    # trained on all available training data.
    xt = vstack([x_train, x_train_lr])
    yt = merge_y(y_train, y_train_lr)

    clf = GBDT(subsample=0.1, max_features=0.05, min_samples_leaf=5,
               n_estimators=200, learning_rate=0.03)
    clf.fit(xt, yt)
    yp_gbdt = clf.predict(x_test.toarray())
    print("GBDT: " + str(auc(y_test, yp_gbdt)))
    return clf
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier as etc
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.neural_network import MLPClassifier as mlp

# The first 3200 rows are the training split; the rest form the validation set.
valid_data = data[3200:].reset_index()

clf_gender = mlp(hidden_layer_sizes=(2, 1), verbose=0, activation='tanh')
clf_gender.fit(f_tfidf[:3200], data.gender[:3200])
valid_data.gender = clf_gender.predict(f_tfidf[3200:])

# clf_age_pre = LR()
# clf_age_pre.fit(f_tfidf[:3200], data.age[:3200])
clf_age = GBDT(n_estimators=300, verbose=1)
clf_age.fit(f_tfidf[:3200], data.age[:3200])
valid_data.age = clf_age.predict(f_tfidf[3200:])

clf_location = GBDT(n_estimators=300, verbose=1)
clf_location.fit(f_tfidf[:3200], data.location[:3200])
valid_data.location = clf_location.predict(f_tfidf[3200:])

# # Write the predictions to CSV

# In[7]:

valid_data.loc[:, ['id', 'age', 'gender', 'location']].to_csv(
    'result/gender_mlp_2_1_age_gbdt_n_est_300_loc_gbdt_n_est_300.csv',
    index=False)
from sklearn.ensemble import GradientBoostingClassifier as GBDT
import pickle

import numpy as np

if __name__ == "__main__":
    # Load pre-extracted VGG16 convolutional features and one-hot labels,
    # then convert the labels back to class indices.
    with open("VGG16_feature_dataset.pkl", "rb") as f:
        p = pickle.load(f)
    train_x = p["train_conv_feature"]
    test_x = p["test_conv_feature"]
    train_y = np.asarray(np.argmax(p["train_label"], axis=1),
                         dtype=np.float32)
    test_y = np.asarray(np.argmax(p["test_label"], axis=1), dtype=np.float32)

    clf = GBDT()
    print(train_x.shape, train_y.shape)
    clf.fit(train_x, train_y)
    # Test-set accuracy.
    print(np.mean(test_y == clf.predict(test_x)))
print("精准率:", precision_score(y_test, y_pred)) print("召回率:", recall_score(y_test, y_pred)) print("F1:", f1_score(y_test, y_pred)) print("ROC:", roc_auc_score(y_test, y_pred)) confusion_mat = confusion_matrix(y_test, y_pred) endtime = time.time() totaltime = endtime - starttime print("XGB的时间:", totaltime) print(" ") # ax = model.plot_tree(model, tree_index=1, figsize=(20, 8), show_info=['split_gain']) # plt.show() starttime = time.time() clf = GBDT(n_estimators=100) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print("GBDT准确率:", accuracy_score(y_test, y_pred)) print("精准率:", precision_score(y_test, y_pred)) print("召回率:", recall_score(y_test, y_pred)) print("F1:", f1_score(y_test, y_pred)) print("ROC:", roc_auc_score(y_test, y_pred)) endtime = time.time() totaltime = endtime - starttime print("GBDT的时间:", totaltime) print(" ") # 画ROC # false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred) # roc_auc = auc(false_positive_rate, true_positive_rate)
    # Tail of a mean-average-precision computation: truncate each prediction
    # list to the length of its gold list, then average per-instance AP.
    assert len(y_gold) == len(y_pred)
    ap = []
    for gold, pred in zip(y_gold, y_pred):
        pred = pred[:len(gold)]
        if len(gold) > 0:
            ap.append(average_precision(pred, gold))
    precision = sum(ap) / len(ap)
    return precision


classifiers = {
    'svm': svm.SVC(kernel='linear', probability=True),
    'rf': RF(n_estimators=200, n_jobs=5),
    'gbdt': GBDT()
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('train', help='training data file')
    parser.add_argument('test', help='test data file')
    parser.add_argument(
        'multilabel',
        help='whether this is a multilabel classification problem')
    parser.add_argument('model', help='specify the classifier to use')
    options = parser.parse_args()
    print(options)

    if options.model not in classifiers:
        print('Invalid model:', options.model)
        print('Available models:')
def fit_model(features, sumstats, train_genes, test_genes, model='logit'):
    """
    Fit classifier to train_genes and calculate RMSE on test_genes
    """

    all_genes = train_genes + test_genes

    # Join sumstats with features for logistic regression, subset to
    # genes of interest, and drop genes with NaN BFDPs
    full_df = sumstats.merge(features, how='left', left_index=True,
                             right_index=True)
    full_df = full_df.loc[full_df.index.isin(all_genes), :].dropna()
    train_df = full_df.loc[full_df.index.isin(train_genes), :].\
        drop(labels='chrom', axis=1)
    test_df = full_df.loc[full_df.index.isin(test_genes), :].\
        drop(labels='chrom', axis=1)

    # Instantiate classifier dependent on model
    if model == 'logit':
        grid_params = {
            'C': [10**x for x in range(-2, 3, 1)],
            'l1_ratio': [x / 10 for x in range(0, 11, 1)]
        }
        base_class = logit(solver='saga', penalty='elasticnet')
    elif model == 'svm':
        grid_params = {'C': [10**x for x in range(-2, 2, 1)]}
        base_class = SVC(random_state=0, probability=True, break_ties=True,
                         kernel='rbf')
    elif model == 'randomforest':
        grid_params = {
            'n_estimators': [50, 100, 500],
            'criterion': ['gini', 'entropy']
        }
        base_class = RFC(random_state=0, bootstrap=True, oob_score=True)
    elif model == 'lda':
        grid_params = {
            'shrinkage': [None, 0, 0.5, 1, 'auto'],
            'solver': ['svd', 'lsqr', 'eigen']
        }
        base_class = LDAC()
    elif model == 'naivebayes':
        grid_params = {'var_smoothing': [10**x for x in range(-4, -11, -1)]}
        base_class = GNBC()
    elif model == 'neuralnet':
        grid_params = {
            'hidden_layer_sizes': [(10, 5, 2), (20, 10, 5), (20, 10, 5, 2),
                                   (50, 20, 10), (50, 20, 10, 5),
                                   (50, 20, 10, 5, 2)],
            'alpha': [10**x for x in range(-4, 5, 1)]
        }
        base_class = MLPC(activation='relu', solver='adam',
                          early_stopping=True, random_state=0)
    elif model == 'gbdt':
        grid_params = {'n_estimators': [50, 100], 'subsample': [0.5, 1]}
        base_class = GBDT(random_state=0)
    elif model == 'knn':
        grid_params = {
            'n_neighbors': [10, 50, 100, 500],
            'weights': ['uniform', 'distance'],
            'leaf_size': [5, 10, 25, 50, 100]
        }
        base_class = KNN()

    # Learn best parameters for classifier using cross-validated grid search
    classifier = GridSearchCV(base_class, grid_params, verbose=1, n_jobs=-1)

    # Fit sklearn model & predict on test set
    # (Models parameterized by grid search need to be treated separately)
    if isinstance(classifier, GridSearchCV):
        fitted_model = classifier.fit(train_df.drop(labels='bfdp', axis=1),
                                      np.round(train_df.bfdp)).best_estimator_
    else:
        fitted_model = classifier.fit(train_df.drop(labels='bfdp', axis=1),
                                      np.round(train_df.bfdp))
    test_bfdps = pd.Series(
        fitted_model.predict_proba(test_df.drop(labels='bfdp', axis=1))[:, 1],
        name='pred', index=test_df.index)

    # Compute RMSE of bfdps for test set
    test_vals = test_df.merge(test_bfdps, left_index=True, right_index=True).\
        loc[:, 'bfdp pred'.split()]
    test_rmse = rmse(test_vals.to_records(index=False))

    return fitted_model, test_rmse
def new_gbdt():
    args = {
        "n_estimators": 400,
        "max_depth": 10,
        "max_features": "sqrt",
    }
    return GBDT(**args)
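# --- Usage sketch (not from the original source; synthetic data) ---
# The factory returns an ordinary scikit-learn estimator, so it drops into
# any standard workflow such as cross-validation. GBDT is assumed to be the
# GradientBoostingClassifier alias used throughout these snippets.
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, random_state=0)
print(cross_val_score(new_gbdt(), X, y, cv=5))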
def final_sim_pred(sqlContext, database_name, windowx=3, kfold_use=False,
                   version='v1'):
    """v2.0: adds the unrichness feature and uses a model for prediction."""
    mix_model_data = sqlContext.sql(
        'select * from {0}.synonyms_mix_model_data'.format(
            database_name)).toPandas()
    label_data = sqlContext.sql(
        'select * from {0}.synonyms_label_sample'.format(
            database_name)).toPandas()

    # ## 1) Join features and labels
    df0 = pds.merge(mix_model_data, label_data, how='left',
                    on=['target_word', 'sim_word'])
    df0.set_index(['target_word', 'sim_word'], inplace=True)
    feature_col = df0.columns[0:-1].tolist()
    label_col = df0.columns[-1]

    # ## 2) Rule-based fusion result: rule_pred
    rel_max_n_thr = df0.tf_max.describe(
        percentiles=[0.95])['95%'] / 5 * 0.1  # term-frequency threshold
    df0['rule_pred'] = df0.apply(lambda x: cuple_wd_is_sim(
        x=x, unrelated_check=True, rel_max_n=rel_max_n_thr,
        windowx=windowx), axis=1)

    # ## 3) Model-based fusion prediction: mdl_pred
    if version == 'v2':
        ### Split train & test data
        iloc_index = split_data_random_by_index(
            dfx=df0.loc[df0.is_sim.notna() == True, :].copy(),
            part_num=2, split_type='weight', weights=[0.7, 0.3])
        train_data = df0.loc[df0.is_sim.notna() == True, :].iloc[
            iloc_index[0], :][[label_col] + feature_col].copy()  # label first
        test_data = df0.loc[df0.is_sim.notna() == True, :].iloc[
            iloc_index[1], :][[label_col] + feature_col].copy()  # label first

        ### Single model
        mdl = GBDT(learning_rate=0.1, n_estimators=50, max_depth=3)  # 87%, 88%
        mdl.fit(train_data[feature_col], train_data[label_col])
        df0['mdl_pred'] = mdl.predict(df0[feature_col])

        ### Multi-model k-fold stacking -- not recommended when labeled
        ### samples are scarce
        # kfold_use=False  # default: not run
        if kfold_use:
            k = 4
            model_classes = [(LR, {'penalty': 'l2'}),
                             (GBDT, {'learning_rate': 0.1,
                                     'n_estimators': 100,
                                     'max_depth': 6})]
            model_stacking_weight = [0.7, 0.7]
            confidence = 0.05
            uniform_voting = False
            kfold_train_pred, kfold_test_pred, iloc_index = k_fold_cross_fit(
                train_data=train_data,
                test_data=df0[[label_col] + feature_col],
                model_classes=model_classes,
                model_stacking_weight=model_stacking_weight,
                confidence=confidence, k=k, stacking_type='DT',
                uniform_voting=uniform_voting)
            df0.iloc[iloc_index, 'kfold_pred'] = kfold_test_pred
        else:
            df0['kfold_pred'] = ''
    else:
        df0['mdl_pred'] = ''
        df0['kfold_pred'] = ''

    # ## 4) Storage
    try:
        df0['mdl_pred'] = df0.mdl_pred.astype('int')
    except Exception:
        pass  # leave as string when the column holds empty placeholders
    try:
        df0['kfold_pred'] = df0.kfold_pred.astype('int')
    except Exception:
        pass  # leave as string when the column holds empty placeholders
    sim_recog = sqlContext.createDataFrame(df0.reset_index())
    sqlContext.sql('drop table if exists {0}.word_semantic_similarity'.format(
        database_name))
    sim_recog.write.saveAsTable(
        '{0}.word_semantic_similarity'.format(database_name),
        mode='overwrite')
def _lr(C, intercept_scaling, dual=True):
    # Helper introduced here to collapse the repeated LogisticRegression
    # boilerplate: only C, intercept_scaling and dual vary across branches.
    return LogisticRegression(penalty='l2', dual=dual, fit_intercept=True,
                              C=C, tol=0.0001, class_weight=None,
                              random_state=None,
                              intercept_scaling=intercept_scaling)


def get_model(model_name, feature):
    clf = None
    if model_name == "lr":
        # Hyperparameters tuned per feature combination.
        if feature == "word":
            clf = _lr(C=1, intercept_scaling=0.1)
        elif feature == "length":
            clf = _lr(C=0.09, intercept_scaling=0.1)
        elif feature == "struct":
            clf = _lr(C=2, intercept_scaling=0.1)
        elif feature == "lsa":
            clf = _lr(C=2, intercept_scaling=0.1)
        else:
            sp = feature.split(',')
            if set(sp) == set(["word", "length", "struct"]):
                clf = _lr(C=1, intercept_scaling=0.2)
            elif set(sp) == set(["word", "length", "lsa"]):
                clf = _lr(C=0.8, intercept_scaling=0.2)
            elif set(sp) == set(["struct", "length", "lsa"]):
                clf = _lr(C=2, intercept_scaling=0.3)
            elif set(sp) == set(["struct", "length", "lsa", "word"]):
                clf = _lr(C=3, intercept_scaling=2, dual=False)
            elif "word" in sp and "length" in sp:
                clf = _lr(C=0.2, intercept_scaling=0.2)
            elif "word" in sp and "struct" in sp:
                clf = _lr(C=5, intercept_scaling=0.2)
            elif "word" in sp and "lsa" in sp:
                clf = _lr(C=2, intercept_scaling=0.2)
            elif "length" in sp and "struct" in sp:
                clf = _lr(C=0.08, intercept_scaling=0.2)
            elif "length" in sp and "lsa" in sp:
                clf = _lr(C=0.3, intercept_scaling=0.2)
            elif "struct" in sp and "lsa" in sp:
                clf = _lr(C=2.5, intercept_scaling=0.2)
            else:
                clf = _lr(C=0.09, intercept_scaling=0.1)
    elif model_name == "nb":
        clf = NB()
    elif model_name == "knn":
        clf = KNN(n_neighbors=60) if feature == "lsa" else KNN(n_neighbors=120)
    elif model_name == "rf":
        clf = RF(n_estimators=1000, max_features="auto", max_depth=8,
                 min_samples_split=10, min_samples_leaf=2)
    elif model_name == "gbdt":
        # Note: max_features="auto" was deprecated in newer scikit-learn;
        # "sqrt" is the modern equivalent.
        clf = GBDT(n_estimators=400, max_features="auto", max_depth=8,
                   min_samples_split=10, min_samples_leaf=2)
    elif model_name == "svm":
        if feature == "word" or feature == "length":
            clf = svm.SVC(C=0.8, kernel='rbf', gamma=0.08)
        elif feature == "structure":
            clf = svm.SVC(C=0.1, kernel='rbf', gamma=0.08)
        else:
            sp = feature.split(',')
            if "struct" in sp and "lsa" in sp:
                clf = svm.SVC(C=0.9, kernel='rbf', gamma=0.08)
            else:
                clf = svm.SVC(C=3, kernel='rbf', gamma=0.08)
    else:
        print("Choose one of the supported models: lr, nb, knn, rf, gbdt, svm")
        sys.exit(1)
    return clf
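# --- Usage sketch (not from the original source) ---
# get_model keys the hyperparameters on a model name and a comma-separated
# feature string; it assumes the sklearn imports used inside get_model are
# already in scope.
clf_word = get_model("lr", "word")          # single feature
clf_combo = get_model("lr", "word,length")  # feature combination
clf_gbdt = get_model("gbdt", "word")        # feature string unused for gbdt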
model_tree = DecisionTreeClassifier(random_state=0)  # decision-tree classifier
selector_4 = feature_selection.SelectFromModel(model_tree)
sel_features4 = selector_4.fit_transform(x, y)  # fit and transform the data
print(sel_features4.shape)  # print the shape
print(sel_features4[:3])  # print the first 3 records

# Dimensionality transformation with sklearn's LDA
model_lda = LDA()  # build the LDA model object
model_lda.fit(x, y)  # fit the model on the dataset
convert_features = model_lda.transform(x)  # transform the data
print(convert_features.shape)  # print the shape
print(model_lda.explained_variance_ratio_)  # explained variance per component
print(convert_features[:3])  # print the first 3 records

# Feature combination with sklearn's GBDT
model_gbdt = GBDT()
model_gbdt.fit(x, y)
combine_features = model_gbdt.apply(x)[:, :, 0]  # leaf index per sample, per tree
print(combine_features.shape)  # print the shape
print(combine_features[0])  # print the first record

# Feature combination with sklearn's PolynomialFeatures
model_plf = plf(2)
plf_features = model_plf.fit_transform(x)
print(plf_features.shape)  # print the shape
print(plf_features[0])  # print the first record

# Feature combination with gplearn's genetic method
data = datasets.load_boston()  # load the dataset (removed in scikit-learn 1.2)
x, y = data.data, data.target  # split into x and y
print(x.shape)  # check the shape of x
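# --- Follow-up sketch (not from the original source) ---
# The leaf indices returned by model_gbdt.apply() are categorical, so they
# are typically one-hot encoded before being fed to a linear model (the
# classic GBDT + LR feature-combination scheme, as in gbdt_lr above).
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
leaf_onehot = ohe.fit_transform(combine_features)  # sparse one-hot leaf features
print(leaf_onehot.shape)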
def new_gbdt(k):
    args = {
        "n_estimators": k,
    }
    return GBDT(**args)
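# --- Usage sketch (not from the original source; synthetic data) ---
# The parameterized factory makes it easy to sweep the ensemble size.
# GBDT is assumed to be the GradientBoostingClassifier alias used above.
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, random_state=0)
for k in (50, 100, 200):
    print(k, cross_val_score(new_gbdt(k), X, y, cv=3).mean())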