def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with a LightGBM model via SelectFromModel.

    Args:
        fe_name: iterable of feature names, aligned with the columns of
            matrix_x_temp.
        matrix_x_temp: training feature matrix.
        label_y: training labels.
        th: importance threshold forwarded to SelectFromModel.

    Returns:
        (matrix_x, feature_not_used_name, n_used): the reduced matrix, the
        names of dropped features, and the number of features kept.
    """
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Report how many features have non-zero importance.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    n_zero = sum(1 for s in feature_score_dict.values() if s == 0.0)
    print('number of not-zero features:' + str(len(feature_score_dict) - n_zero))

    # Print features ranked by importance.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for fn, score in feature_score_dict_sorted:
        print(fn, score)
    print('\n')

    # BUG FIX: th may be numeric — file.write() requires a str.
    # Also use `with` so the handle is closed even on error.
    with open('../eda/lgb_feature_importance.txt', 'w') as f:
        f.write(str(th))
        f.write('\nRank\tFeature Name\tFeature Importance\n')
        for i, (fn, score) in enumerate(feature_score_dict_sorted):
            f.write(str(i) + '\t' + str(fn) + '\t' + str(score) + '\n')

    # List the features actually kept: the top `how_long` by importance
    # (matrix_x is the post-selection input matrix).
    how_long = matrix_x.shape[1]
    feature_used_name = [fn for fn, _ in feature_score_dict_sorted[:how_long]]
    print('feature_chooesed:')
    for fn in feature_used_name:
        print(fn)
    print('\n')
    with open('../eda/lgb_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for fn in feature_used_name:
            f.write(str(fn) + '\n')

    # Names that did not survive selection.
    feature_not_used_name = [fn for fn in fe_name if fn not in feature_used_name]

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
def main():
    """Load the credit-card dataset, fit a LightGBM classifier, and report."""
    print('\nloading...')
    # Work out of the data directory.
    os.chdir('/Users/ewenwang/Documents/credit/data')
    dataset = pd.read_csv('creditcard.csv', low_memory=False)

    # Target column; every remaining column is a predictor.
    target = 'Class'
    predictors = [col for col in dataset.columns if col != target]

    # Reproducible train/test split.
    seed = 2017
    dtrain, dtest = train_test_split(dataset, test_size=0.33, random_state=seed)

    # Classifier configuration.
    gbm = LGBMClassifier(
        learning_rate=0.01,
        n_estimators=5000,
        objective='binary',
        metric='auc',
        max_depth=10,
        subsample=0.83,
        colsample_bytree=0.63,
        save_binary=True,
        is_unbalance=True,
        random_state=seed
    )

    print('\nfitting...')
    gbm.fit(dtrain[predictors], dtrain[target])

    report(gbm, dtrain, dtest, predictors, target)
    return None
train_X, valid_X, train_Y, valid_Y = train_test_split(X, Y, test_size=0.2, random_state=2018) # In[ ]: clf = LGBMClassifier(n_estimators=200, learning_rate=0.01) # In[ ]: clf.fit( train_X, train_Y, eval_set=[(train_X, train_Y), (valid_X, valid_Y)], eval_metric='auc', early_stopping_rounds=50, verbose=False ) # In[ ]: plot_importance(clf, figsize=(10,10)) # In[ ]: #print("only showing the distribution for the first few columns, edit the counter to show all distribution")
# NOTE(review): this chunk opens on the tail of a multi-line tuple unpack —
# the first target names are outside this view.
xgbc_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
xgbc_valid_auc, xgbc_valid_accuracy, xgbc_valid_recall, xgbc_valid_precision, \
xgbc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)
# .......................... (11)xgboost END ..........................

# .......................... (12)lightgbm ..........................
from evaluate_metrix import *
from lightgbm import LGBMClassifier
import lightgbm as lgb

# NOTE(review): the validation set is aliased to the training set, so the
# "Validation" metrics below are really training metrics — confirm intent.
x_valid = x_train
y_valid = y_train

lgbc = LGBMClassifier()
lgbc.fit(x_train, y_train)
y_train_preds = lgbc.predict_proba(x_train)[:,1]
y_valid_preds = lgbc.predict_proba(x_valid)[:,1]

# NOTE(review): this label is copy-pasted from the XGBoost section above;
# it should read something like 'Light Gradient Boosting Classifier'.
print('Xtreme Gradient Boosting Classifier')
print('Training:')
lgbc_train_auc, lgbc_train_accuracy, lgbc_train_recall, lgbc_train_precision, \
lgbc_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lgbc_valid_auc, lgbc_valid_accuracy, lgbc_valid_recall, lgbc_valid_precision, \
lgbc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)
# .......................... (12)lightgbm END ..........................
# -------------------------------------------------------------------------------------------------------------------
# ------------------------ build traditional models and evaluate the model END ------------------------------
def AllModelsClass(self, X_train, y_train, X_test, y_test):
    """Train a battery of classifiers on (X_train, y_train), print each
    model's accuracy on (X_test, y_test), and draw a comparison bar chart.

    Returns nothing; results are printed and plotted.
    """
    # LogisticRegression: solvers other than liblinear exist and can give
    # different results depending on the data.
    loj_model1 = LogisticRegression().fit(X_train, y_train)
    #mlp_regres = MLPClassifier().fit(StandardScaler().fit_transform(X_train),y_train)
    cart_model = DecisionTreeClassifier().fit(X_train, y_train)
    rf_model = RandomForestClassifier().fit(X_train, y_train)
    Svc = SVC().fit(X_train, y_train)
    bayes_model = GaussianNB().fit(X_train, y_train)
    lgbm_model = LGBMClassifier().fit(X_train, y_train)
    knn_model = KNeighborsClassifier().fit(X_train, y_train)
    gbm_model = GradientBoostingClassifier().fit(X_train, y_train)
    xgb_model = XGBClassifier().fit(X_train, y_train)
    cat_model = CatBoostClassifier().fit(X_train, y_train)

    modeller = [
        loj_model1,
        cart_model,
        rf_model,
        Svc,
        bayes_model,
        lgbm_model,
        knn_model,
        gbm_model,
        xgb_model,
        cat_model]

    # Predict once per model and reuse the accuracy for both the printout
    # and the summary frame (the original ran predict() twice per model).
    rows = []
    for model in modeller:
        isimler = model.__class__.__name__
        dogruluk = accuracy_score(y_test, model.predict(X_test))
        print("-" * 28)
        print(isimler + ":")
        print("Accuracy: {:.4%}".format(dogruluk))
        rows.append([isimler, dogruluk * 100])

    # BUG FIX: DataFrame.append was removed in pandas 2.x — build in one shot.
    sonuclar = pd.DataFrame(rows, columns=["Modeller", "Accuracy"])

    sns.barplot(x='Accuracy', y='Modeller', data=sonuclar, color="b")
    plt.xlabel('Accuracy %')
    plt.title('Modellerin Doğruluk Oranları');
# + X = df.drop(columns=["y"]) _X = pd.get_dummies(X, "c") y = df.y clf = tree.DecisionTreeClassifier(random_state=117, max_depth=5, min_samples_leaf=10) clf.fit(_X, y) pred = clf.predict(_X) accuracy_score(y, pred) # + # plot_tree(clf, X, y) # - # Que paso aca? # # Acá lo que queremos mostrar es que algunas representaciones o encodings no son siempre las mejores, depende del modelo que estemos usando. # # Por ejemplo, usando otro modelo... # + X = df.drop(columns=["y"]) X.c = X.c.astype("category") y = df.y lgbm_tree = LGBMClassifier(n_estimators=1) lgbm_tree.fit(X, y) pred = lgbm_tree.predict(X) accuracy_score(y, pred)
# Draw ROC curves for all fitted models on one shared axis.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2 — prefer RocCurveDisplay.from_estimator; confirm the pinned version.
svc_rbf_disp = plot_roc_curve(svc_rbf, X_test, y_test, ax=ax, alpha=0.8)
gauss_disp = plot_roc_curve(gauss, X_test, y_test, ax=ax, alpha=0.8)
tree_disp = plot_roc_curve(tree, X_test, y_test, ax=ax, alpha=0.8)
forest_disp = plot_roc_curve(forest, X_test, y_test, ax=ax, alpha=0.8)
histgrad_disp = plot_roc_curve(histgrad, X_test, y_test, ax=ax, alpha=0.8)
gbm_disp = plot_roc_curve(gbm, X_test, y_test, ax=ax, alpha=0.8)
xgboost_disp = plot_roc_curve(xgboost, X_test, y_test, ax=ax, alpha=0.8)
lightgbm_disp = plot_roc_curve(lightgbm, X_test, y_test, ax=ax, alpha=0.8)
plt.legend(loc = 'best', prop={'size': 16})
plt.show()

# AUC per model, keyed by display name.
roc_curve_values=dict()

from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
# NOTE(review): AUC here is computed from hard predict() labels rather than
# predict_proba scores — confirm this is intended.
lgbm=LGBMClassifier(learning_rate= 0.02 , max_depth= 4, subsample= 0.6, n_estimators= 1000, min_child_samples= 5)
lgbm_tuned=lgbm.fit(X_train,y_train)
y_pred=lgbm_tuned.predict(X_test)
roc_curve_values["Light GBM Classifier"]=roc_auc_score(y_test,y_pred)
roc_auc_score(y_test,y_pred)

from xgboost import XGBClassifier
xgb=XGBClassifier()
xgb_tuned=xgb.fit(X_train,y_train)
y_pred=xgb_tuned.predict(X_test)
roc_curve_values["XGBoost Classifier"]=roc_auc_score(y_test,y_pred)
roc_auc_score(y_test,y_pred)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
histgrad=HistGradientBoostingClassifier(max_depth=8,max_leaf_nodes=14,learning_rate=0.1)
# NOTE(review): chunk begins mid-statement — the head of this `for` loop and
# its first unpack target (train_idx, ...) are outside this view.
val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    # Only the first fold is used.
    break
X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
    y_train_all.iloc[train_idx], \
    y_train_all.iloc[val_idx]
# Free the full training frame once the fold slices exist.
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)

# Build the submission: positive-class probability per test row.
sub = test1.copy()
sub_size = len(sub)
sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]
sub.to_csv('result.txt', index=None, header=None, sep='\t')

# Dump the full feature-importance table.
pd.set_option('display.max_rows', None)
print(
    pd.DataFrame({
        'column': feature_cols,
        'importance': model_lgb.feature_importances_
    }).sort_values(by='importance', ascending=False))
from sklearn.metrics import precision_score # Importing the dataset X = np.load('./project/mini/data/X.npy') y = pd.read_csv('./project/mini/data/y_label.csv', header=0).iloc[:, 0] X = X.reshape(X.shape[0], X.shape[1] * X.shape[2]) # Splitting the dataset into the Training set and Test set x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y) # Feature Scaling x_train /= -80 x_test /= -80 model = LGBMClassifier(objective='multiclass') model.fit(x_train, y_train, categorical_feature=[0, 12]) print('feature_importances :', model.feature_importances_) y_pred = model.predict(x_test) print('최종 정답률 :', model.score(x_test, y_test)) # 최종 정답률 : 0.5326016785022595
import matplotlib.pyplot as plt # 1. 데이터 x, y = load_iris(return_X_y=True) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=88) # 2. 모델 구성 model = LGBMClassifier(n_estimators=1000, n_jobs=-1, objective="multiclass") # 3. 훈련 model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], eval_metric=["multi_error", "multi_logloss"], early_stopping_rounds=100) score = model.score(x_test, y_test) # 3-1. 컬럼수 만큼 돌 thresholds 생성 thresholds = np.sort(model.feature_importances_) print(thresholds) # [0.01818451 0.01885792 0.3417337 0.62122387] # 3-2. SelectFromModel 생성 for thresh in thresholds:
# 获取数据 digits = datasets.load_digits() print(digits.data.shape) # 特征空间维度 print(digits.target.shape) # 标签的维度 # 将数据进行分割 x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3, random_state=30) params = { 'objective': 'multiclass', 'num_iterations': 193, 'num_leaves': 31, 'learning_rate': 0.1, } gbm = LGBMClassifier(**params) # 训练 gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='multi_logloss', early_stopping_rounds=15) # predict y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration_) print(f'Best iterations: {gbm.best_iteration_}') print(accuracy_score(y_test, y_pred))
# NOTE(review): chunk starts mid-call — these keyword arguments belong to a
# classifier constructor whose opening line is outside this view.
n_estimators=10000, learning_rate=0.03, num_leaves = 22, colsample_bytree=0.8,
    subsample=0.8, max_depth=6, reg_alpha=0.1, reg_lambda=0.1,
    min_split_gain=0.01, min_child_weight=100, silent=-1, verbose=-1)

X_train, X_test,y_train, y_test = train_test_split(train_F_scaled , train_response,test_size =0.4, random_state=42)
lgbm_model.fit(X_train, y_train)
# Positive-class probability for AUC evaluation.
y_pred_prob = lgbm_model.predict_proba( X_test)[:,1]
roc_auc_score(y_test, y_pred_prob)

# In[ ]:

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# use sklearn.model_selection.cross_val_score instead.
from sklearn.cross_validation import cross_val_score
print( np.mean(cross_val_score(lgbm_model , train_F_scaled,train_response, scoring = 'roc_auc', cv=5)))

# In[ ]:
# NOTE(review): chunk starts mid-call — these are the trailing arguments of a
# train_test_split(...) opened outside this view.
test_size=0.2, shuffle=True, random_state=66)

## Modeling
model = LGBMClassifier(
    n_estimators=1000,     # number of boosting rounds (plays the role of epochs)
    num_leaves=50,
    subsample=0.8,
    min_child_samples=60,
    max_depth=-1)

model.fit(
    x_train, y_train,
    verbose=True,
    eval_metric=['auc', 'error'],  # several metrics can be passed as a list
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=30)      # early stopping
# eval_metric options: rmse, mae, logloss, error (error 0.2 means accuracy 0.8), auc

# One candidate threshold per column: the sorted feature importances.
thresholds = np.sort(model.feature_importances_)
print(thresholds)

import pickle

for thresh in thresholds:  # iterate once per column
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    selection_x_train = selection.transform(x_train)
predicted_lr = model_lr.predict(X_test) ; print("LogisticRegression",metrics.accuracy_score(Y_test, predicted_lr),"\n") #aa = model_lr.coef_ if cond01 == 3: from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes model_nb = GaussianNB(); model_nb.fit(X_train, Y_train) predicted_nb = model_nb.predict(X_test) ; print("Gaussian Naive Bayes",metrics.accuracy_score(Y_test, predicted_nb),"\n") if cond01 == 4: from sklearn.ensemble import GradientBoostingClassifier # GradientBoosting model_gb = GradientBoostingClassifier(); model_gb.fit(X_train, Y_train) predicted_gb = model_gb.predict(X_test) ; print("GradientBoosting",metrics.accuracy_score(Y_test, predicted_gb),"\n") if cond01 == 5: from lightgbm import LGBMClassifier # LightGBM model_lgbm = LGBMClassifier(); model_lgbm.fit(X_train, Y_train) predicted_lgbm = model_lgbm.predict(X_test); print("LightGBM",metrics.accuracy_score(Y_test, predicted_lgbm),"\n") # ##http://myenigma.hatenablog.com/entry/2015/10/09/223629 #import seaborn as sns #iris = sns.load_dataset("iris") #サンプルデータセット ##sns.pairplot(iris); #sns.pairplot(iris,hue="species"); #sns.plt.savefig("iris.png") #sns.plt.show() #
# NOTE(review): chunk starts mid-script; ip_train/train/data/wd/out_put are
# defined outside this view.
ip_test = data_ip[test_index,:]

gbm = LGBMClassifier(
    objective='binary',
    num_leaves=24,
    max_depth=3,
    learning_rate=0.1,
    seed=2018,
    colsample_bytree=0.3,
    subsample=0.8,
    n_jobs=-1,
    n_estimators=2000
)
print('fitting...')
gbm.fit(ip_train, train.loc[train_index, 'is_trade'],
        eval_set=[(ip_test, train.loc[test_index, 'is_trade'])],
        early_stopping_rounds=10)

# Positive-class probability for every row of the full matrix.
property_df = pd.DataFrame(columns=['instance_id', 'item_property_prob'])
property_df['instance_id'] = data['instance_id']
property_df['item_property_prob'] = gbm.predict_proba(data_ip)[:, 1]

def NatureLP(data, columns):
    # NOTE(review): unimplemented stub.
    pass

print('saving...')
property_df.to_csv(wd+out_put[0], index=False, sep=' ')
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting baseline.
modelGB = GradientBoostingClassifier()
modelGB.fit(X_train, Y_train)
Y_predGB = modelGB.predict(X_valid)
print("Training Accuracy: ", modelGB.score(X_train, Y_train))
print('Testing Accuarcy: ', modelGB.score(X_valid, Y_valid))
# NOTE(review): AUROC computed from hard labels, not probabilities — confirm.
print("AUROC Score of Gradient Boosting = ", roc_auc_score(Y_valid, Y_predGB))

from lightgbm import LGBMClassifier

# LightGBM baseline.
modelLGBM = LGBMClassifier()
modelLGBM.fit(X_train, Y_train)
Y_predLGBM = modelLGBM.predict(X_valid)
print("Training Accuracy: ", modelLGBM.score(X_train, Y_train))
print('Testing Accuarcy: ', modelLGBM.score(X_valid, Y_valid))
print("AUROC Score of LGBM = ", roc_auc_score(Y_valid, Y_predLGBM))

# Test-set predictions from every fitted model (the RF/XG/AB models are
# fitted outside this view).
test_Y_RF = modelRF.predict(test_X)
test_Y_XG = modelXG.predict(test_X)
test_Y_AB = modelAB.predict(test_X)
test_Y_LGBM = modelLGBM.predict(test_X)
test_Y_GB = modelGB.predict(test_X)
test_Y_pred = []
def OOFPreds(X, y, test_X, params, n_splits=5, random_state=23, clf='lgb'):
    """
    Out-of-fold training/prediction. Inputs are expected to be DataFrames;
    returns (oof_preds, sub_preds, feature_importance, metrics) where the
    prediction outputs are Series.
    """
    # Collected for later feature-importance analysis.
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # oof_* holds cross-validation predictions; sub_* holds test-set predictions.
    oof_preds, sub_preds = np.zeros(X.shape[0]), np.zeros(test_X.shape[0])
    oof_train = np.zeros(X.shape[0])
    print(X.shape, test_X.shape)
    valid_scores = []
    train_scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
        # # init-score seeding (disabled)
        # trn_init_score = pd.Series([0.95] * len(trn_x), index=trn_x.index)
        # val_init_score = pd.Series([0.95] * len(val_x), index=val_x.index)
        # Model construction and prediction.
        if clf == 'lgb':
            with timer('{} fold 训练时间:'.format(n_fold)) as time:
                gbm = LGBMClassifier(**params)
                # NOTE(review): trn_init_score / val_init_score are only
                # defined in the commented-out lines above — as written this
                # branch raises NameError; either re-enable them or drop the
                # init_score arguments.
                gbm.fit(trn_x, trn_y, init_score=trn_init_score,
                        eval_set=[(trn_x, trn_y), (val_x, val_y)],
                        eval_init_score=[trn_init_score, val_init_score],
                        eval_metric='auc', verbose=30, early_stopping_rounds=100)
                print('best iteration: {}'.format(gbm.best_iteration_))
                # presumably `timer` yields an object supporting arithmetic — verify
                print('100单次训练时间: {:.3f}'.format(time*100/gbm.best_iteration_))

                pred_val = gbm.predict_proba(val_x, num_iteration=gbm.best_iteration_)[:, 1]
                pred_test = gbm.predict_proba(test_X, num_iteration=gbm.best_iteration_)[:, 1]

                # Record prediction scores and results.
                oof_preds[val_idx] = pred_val
                sub_preds += pred_test / folds.n_splits
                print(gbm.best_score_)
                valid_score = gbm.best_score_['valid_1']['auc']
                train_score = gbm.best_score_['training']['auc']
                valid_scores.append(valid_score)
                train_scores.append(train_score)

                # NOTE(review): DataFrame.append was removed in pandas 2.x.
                feature_importance = feature_importance.append(pd.DataFrame({
                    'importance': gbm.feature_importances_,
                    'fold': [n_fold + 1] * X.shape[1],
                    'feature': X.columns.tolist()}))
        else:
            # Your own model goes here:
            # task 1: build the model and predict
            # task 2: record prediction scores and results
            # task 3: record feature importances
            clf = LogisticRegression(**params)
            clf.fit(trn_x, trn_y)
            pred_train = clf.predict_proba(trn_x)[:, 1]
            pred_val = clf.predict_proba(val_x)[:, 1]
            # NOTE(review): the stray line-continuation backslash below joins
            # this statement to the next one — syntax error; delete it.
            pred_test = clf.predict_proba(test_X)[:, 1] \

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits
            valid_score = roc_auc_score(val_y, pred_val)
            train_score = roc_auc_score(trn_y, pred_train)
            valid_scores.append(valid_score)
            train_scores.append(train_score)
            # NOTE(review): DataFrame.append was removed in pandas 2.x.
            feature_importance = feature_importance.append(pd.DataFrame({
                'importance': clf.coef_[0],
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()}))
        print('Fold {:02d} 训练集 AUC: {:.6f} 验证集 AUC: {:.6f} '.format(n_fold + 1, train_score, valid_score))
        del trn_x, trn_y, val_x, val_y; gc.collect()
    feature_importance['importance'] = feature_importance['importance'].astype(float)
    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')
    # Overall out-of-fold AUC plus the mean training AUC.
    valid_auc = roc_auc_score(y, oof_preds)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    # Score-tracking DataFrame.
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    oof_preds = pd.Series(oof_preds.flatten(), index=X.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=test_X.index).rename('TARGET')
    return oof_preds, sub_preds, feature_importance, metrics
# FIT AND PREDICTION OF THE CLASSIFIERS
from lightgbm import LGBMClassifier
import time

# Classifier 04
classifier_lgbm_4 = LGBMClassifier(
    max_depth = 100,
    learning_rate = 0.3,
    num_leaves = 500,
    n_estimators = 500
)

start = time.time()
classifier_lgbm_4.fit(X_train, Y_train)
end = time.time()
# BUG FIX: the raw notebook output "Tempo de Execução: 1.49 min" had been
# pasted into the source as a bare line (syntax error); kept as a comment.
# (previous run: ~1.49 min)
print("Tempo de Execução: {:.2f} min".format((end - start)/60))

Y_pred_lgbm_4 = classifier_lgbm_4.predict(X_test)

# Classifier 07
classifier_lgbm_7 = LGBMClassifier(
    max_depth = 1000,
    learning_rate = 0.15,
    num_leaves = 2000,
    min_data_in_leaf = 200,
    n_estimators = 2000
)

start = time.time()
def kfold_lightgbm(training_file, testing_file, num_folds, stratified=False,
                   submission_file_name='submission.csv'):
    """Run K-fold (optionally stratified) LightGBM CV on `training_file`,
    average fold predictions over `testing_file`, and write a submission CSV.

    Args:
        training_file: CSV path with a 'TARGET' column plus feature columns.
        testing_file: CSV path with the same feature columns.
        num_folds: number of CV folds.
        stratified: use StratifiedKFold instead of KFold.
        submission_file_name: output CSV path. (This was an undefined global
            in the original; it is now a parameter with a default.)

    Returns:
        DataFrame of per-fold feature importances.
    """
    # Divide in training/validation and test data
    train_df = pd.read_csv(training_file)
    # BUG FIX: the test file was re-read on every fold; read it once here.
    test_df = pd.read_csv(testing_file)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Arrays and dataframes used to accumulate results across folds.
    oof_preds = np.zeros(train_df.shape[0])
    # BUG FIX: sub_preds was re-zeroed inside the fold loop, so only the last
    # fold contributed (and was still divided by n_splits). Initialize once.
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()

    # Feature names: train data must not use 'TARGET'; '*_ID_*' columns only
    # identify a sample. Adjust this list if your data uses a different ID.
    feats = [
        f for f in train_df.columns
        if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=100,
            learning_rate=0.01,
            num_leaves=40,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=100)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Average the fold models' predictions over the test set.
        sub_preds += clf.predict_proba(
            test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    #display_importances(feature_importance_df)
    return feature_importance_df
from _00_imports import * # Use LGBM to select the most important 100 features if __name__ == '__main__': train_feat = np.load('../data/feat_train.npy') test_feat = np.load('../data/feat_test.npy') train_input = np.load('../data/feat_train.npy') lgbm = LGBMClassifier(n_estimators=5000) train_x, test_x, train_y, test_y = train_test_split(train_input, truth, test_size=0.2, random_state=1) lgbm.fit(train_x, train_y) pred = lgbm.predict(test_x) print(f1_score(test_y, pred)) indices = np.argsort(lgbm.feature_importances_) importance = lgbm.feature_importances_ top_indices = indices[-100:][::-1] train_feat_df = pd.read_csv('../data/feat_train.csv') test_feat_df = pd.read_csv('../data/feat_test.csv') feat_names = train_feat_df.columns.values selected_feat_names = feat_names[top_indices] train_feat_df[selected_feat_names].to_csv( '../data/simplified_train_feat.csv', index=False) test_feat_df[selected_feat_names].to_csv( '../data/simplified_test_feat.csv', index=False)
# NOTE(review): chunk starts mid-call — these are trailing arguments of a
# model constructor opened outside this view.
num_leaves=452,
    num_iterations=5500,
    learning_rate=0.01,
    min_data_in_leaf=17,
    max_bin=800,
    bagging_fraction=0.74,
    max_depth=50,
    objective='binary')
"""
grid = GridSearchCV(model,param_grid)
grid.fit(res_train, feature)
# summarize the results of the grid search
print(grid.best_params_)
"""
model.fit(res_train, feature)
y_pred = model.predict(res_test)
my_submission = pd.DataFrame({
    'building_id': index_test,
    'damage_grade': y_pred
})
# Map numeric grades back to their label strings.
# NOTE(review): the dict literal is cut off at the end of this chunk.
clean_submission = {
    "damage_grade": {
        1: "Grade 1",
        2: "Grade 2",
        3: "Grade 3",
        4: "Grade 4",
        5: "Grade 5"
    }
# NOTE(review): chunk starts mid-call — trailing arguments of an
# LGBMClassifier constructor opened outside this view.
num_leaves=10,
    colsample_bytree=.8,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    silent=-1,
    verbose=-1,
)
lgbm.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='auc',
    verbose=100,
    early_stopping_rounds=10  #30
)

# Use 5-fold Cross Validation to get the accuracy
# 0.6192
cv_score = model_selection.cross_val_score(lgbm, X_train, y_train, cv=5)
print('Model accuracy of LGBM is:', cv_score.mean())

### Part 4.2: Use Grid Search to Find Optimal Hyperparameters
# Choose the number of trees
parameters = {'n_estimators': [60, 80, 100]}
Grid_RF = GridSearchCV(LGBMClassifier(), parameters, cv=5)
Grid_RF.fit(X_train, y_train)
# best number of tress
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        Initialize data members and the underlying classifier.
        '''
        # Just logging.info some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" %
                     datainfo['time_budget'])
        logging.info(
            "Loaded %d time features, %d numerical Features, %d categorical features and %d multi valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0   # rows seen at fit time
        self.num_feat = 1            # columns after encoding
        self.num_labels = 1          # label dimensionality
        self.is_trained = False
        # `params` is expected to be defined at module scope.
        self.clf = LGBMClassifier(**params)  # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function should train the model parameters.

        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1,
        test2, etc.) you should warm-start your training from the pre-trained
        model. Past data will NOT be available for re-training.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = FrequencyEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = X.shape[0]
        self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]
        self.DataX = X
        self.DataY = y
        logging.info("The whole available data is: ")
        logging.info(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                      self.DataX.shape[1]))
        logging.info(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                      self.num_labels))

        # Hold out 25% for early stopping.
        X_trn, X_val, y_trn, y_val = train_test_split(X, y,
                                                      test_size=.25,
                                                      random_state=SEED)
        self.clf.fit(X_trn, y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        if (self.num_train_samples != num_train_samples):
            logging.info("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the
        scoring metric. For example, binary classification problems often
        expect predictions in the form of a discriminant value (if the area
        under the ROC curve is the metric) rather than predictions of the
        class labels themselves. The function predict can eventually return
        probabilities or continuous values.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        num_test_samples = X.shape[0]
        if X.ndim > 1:
            num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        if (self.num_feat != num_feat):
            logging.info(
                "ARRGH: number of features in X does not match training data!")
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        # Positive-class probability as the discriminant value.
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        # BUG FIX: pickle requires a binary handle; the original opened the
        # file in text mode ("w") and never closed it.
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            # BUG FIX: read in binary mode ("rb") to match pickle.dump.
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
# Make copies for X, Y to b e used within CV X = X_train.drop(["msno", "is_churn"], axis=1).copy() y = X_train["is_churn"].copy() # (stratified) Cross validation for train_index, validation_index in kf.split(X, y): print("Cross-validation, Fold %d" % (len(log_loss_val) + 1)) # Split data into training and testing set X_train = X.iloc[train_index, :].copy() X_validate = X.iloc[validation_index, :].copy() y_train = y[train_index] y_validate = y[validation_index] # Train the model model = model.fit(X_train, y_train) # Test the model log_loss_val.append(log_loss(y_validate, model.predict_proba(X_validate))) print("Log loss: %f" % log_loss_val[-1]) # Make predictions y_pred.append(np.log(model.predict_proba(X_test[X.columns])[:, 1])) # delete temporal dataframes del X_train, X_validate, y_train, y_validate # Evaluate results from CV print("Log loss %f +/- %f" % (np.mean(log_loss_val), 2 * np.std(log_loss_val))) ## =========================== 4. Output results =========================== ##
def cal_subject_mul(train_vec, train_subject, test_id, test_vec, iter, baseline):
    """Per-subject one-vs-rest LightGBM voting over N stratified folds.

    For every test sample, extra subjects (with sentiment value 0) are
    appended when at least 8 of the N fold models vote positive for that
    subject. Returns (test_id, test_res, value_list).
    """
    # param = { 'boosting_type':'gbdt', 'num_leaves':55, 'reg_alpha':0.0, 'reg_lambda':1,
    #           'max_depth':15, 'n_estimators':6000, 'objective':'binary',
    #           'subsample':0.8, 'colsample_bytree':0.8, 'subsample_freq':1,
    #           'learning_rate':0.06, 'min_child_weight':1, 'random_state':20, 'n_jobs':4}
    # clf = LGBMClassifier(param)
    # clf = svm.LinearSVC(max_iter=100000)
    N = 10
    train_vec = np.array(train_vec)
    train_subject = np.array(train_subject)
    # NOTE(review): recent scikit-learn raises if random_state is set while
    # shuffle=False (the default) — add shuffle=True or drop random_state.
    kf = StratifiedKFold(n_splits=N, random_state=2018).split(train_vec, train_subject)
    clf = LGBMClassifier(boosting_type='gbdt', num_leaves=80, reg_alpha=0.1, reg_lambda=1,
                         max_depth=8, n_estimators=iter, objective='binary',
                         subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
                         learning_rate=0.06, min_child_weight=1, random_state=20, n_jobs=4)
    # iter_list = [803,61,69,314,196,223,64,153,55,284,173]

    # One result list per test sample; each starts with the baseline subject.
    test_res = list()
    for l in range(len(test_id)):
        test_res.append(list())
    subject_vocab = {
        '价格': 0,
        '配置': 1,
        '操控': 2,
        '舒适性': 3,
        '油耗': 4,
        '动力': 5,
        '内饰': 6,
        '安全性': 7,
        '空间': 8,
        '外观': 9
    }
    for l in range(len(test_id)):
        test_res[l].append(subject_vocab[baseline['subject'][l]])
    # Sentiment values parallel to test_res, seeded from the baseline.
    value_list = list()
    for l in range(len(test_id)):
        value_list.append(list())
    for l in range(len(test_id)):
        value_list[l].append(baseline['sentiment_value'][l])

    # res_sub[subject][sample][fold] holds each fold model's binary vote.
    res_sub = np.zeros([10, len(test_id), N])
    for k, (train_fold, test_fold) in enumerate(kf):
        for i in range(10):
            train_subject_kf = train_subject[train_fold]
            train_label_onehot = train_subject_kf.copy()
            # Binarize: 1 for subject i, 0 otherwise (one-vs-rest).
            for l in range(len(train_subject_kf)):
                if train_subject_kf[l] != i:
                    train_label_onehot[l] = 0
                else:
                    train_label_onehot[l] = 1
            # print(train_label_onehot)
            # print(train_subject)
            # clf = LGBMClassifier(boosting_type='gbdt', num_leaves=80, reg_alpha=0.1, reg_lambda=1,
            #                      max_depth=8, n_estimators=iter_list[i], objective='binary',
            #                      subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
            #                      learning_rate=0.06, min_child_weight=1, random_state=20,
            #                      n_jobs=4)
            # NOTE(review): the chunk boundary cut through the commented-out
            # constructor above — `n_jobs=4)` is assumed to be its last
            # commented line; verify against the original file.
            clf.fit(train_vec[train_fold], train_label_onehot)
            res_onehot = clf.predict(test_vec)
            for l in range(len(test_id)):
                res_sub[i][l][k] = res_onehot[l]

    # Majority vote: a subject is accepted with at least 8 of N positives.
    res_onehot = np.zeros([10, len(test_id)])
    for i in range(10):
        for j in range(len(test_id)):
            tmp = []
            for k in range(N):
                tmp.append(res_sub[i][j][k])
            if sum(tmp) > 7:
                res_onehot[i][j] = 1
    for i in range(10):
        for l in range(len(test_id)):
            if res_onehot[i][l] == 1 and i not in test_res[l]:
                test_res[l].append(i)
                value_list[l].append(0)
                # value_list[l].append(value_list[l][0])
    # clf = LGBMClassifier(boosting_type='gbdt', num_leaves=80, reg_alpha=0.1, reg_lambda=1,
    #                      max_depth=8, n_estimators=iter_list[10], objective='binary',
    #                      subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
    #                      learning_rate=0.06, min_child_weight=1, random_state=20, n_jobs=4)
    return test_id, test_res, value_list
#Bloco 02: Parametrização do Modelo from lightgbm import LGBMClassifier classifier_lgbm_kpca = LGBMClassifier(max_depth=500, learning_rate=0.01, num_leaves=1000, min_data_in_leaf=200, n_estimators=2000, objective='binary', metric='binary_logloss', random_state=42) #Bloco 03: Fit e Predição classifier_lgbm_kpca.fit(X_train_kpca_new, Y_train) Y_pred_lgbm_kpca = classifier_lgbm_kpca.predict(X_test_kpca_new) #Bloco 04: Análise de Métricas from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score #Accuracy Score mtrc_accuracy_score_lgbm_kpca = accuracy_score(Y_test, Y_pred_lgbm_kpca) print('Accuracy Score : ' + str(mtrc_accuracy_score_lgbm_kpca)) #Precision Score mtrc_precision_score_lgbm_kpca = precision_score(Y_test, Y_pred_lgbm_kpca) print('Precision Score : ' + str(mtrc_precision_score_lgbm_kpca))
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """Train LightGBM with (stratified) k-fold CV and collect OOF/test predictions.

    Parameters
    ----------
    df : pd.DataFrame
        Combined frame; rows with non-null 'TARGET' are train, null-TARGET
        rows are test.
    num_folds : int
        Number of CV folds.
    stratified : bool, default False
        Use StratifiedKFold instead of KFold.
    debug : bool, default False
        When False, write the submission CSV to the module-level
        ``submission_file_name`` path.

    Returns
    -------
    (feature_importance_df, datasets, best_model)
        Per-fold feature importances, a dict with the frames/columns used,
        and the classifier from the fold with the highest validation AUC.
    """
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    # .copy() so the 'TARGET' assignment below does not write into a slice view
    test_df = df[df['TARGET'].isnull()].copy()
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]
    best_model = None
    best_score = 0.0

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=200)

        # Out-of-fold predictions at the early-stopped iteration; test-set
        # probabilities are averaged over the folds.
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        roc_auc_score_kfold = roc_auc_score(valid_y, oof_preds[valid_idx])
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score_kfold))
        if roc_auc_score_kfold > best_score:
            # BUG FIX: best_score was never updated before, so every fold
            # with AUC > 0 overwrote best_model and the LAST fold (not the
            # best one) was returned.
            best_score = roc_auc_score_kfold
            best_model = clf

        # `del` only unbinds the local names; best_model keeps its reference.
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    #display_importances(feature_importance_df)

    datasets = {'feats': feats,
                'train_x': train_df[feats], 'train_y': train_df['TARGET'],
                'valid_x': test_df[feats], 'valid_y': test_df['TARGET']}
    return feature_importance_df, datasets, best_model
# Stack the two datasets (f_/m_ prefixes suggest female/male speaker data —
# TODO confirm against the loading code outside this view).
x = np.concatenate([f_ds, m_ds], 0)
x = x.reshape(x.shape[0], x.shape[1]*x.shape[2])  # flatten each sample to one row
y = np.concatenate([f_lb, m_lb], 0)
print(x.shape)  # (2141, 110336)
print(y.shape)  # (2141,)

# preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.2, random_state=42)
print(x_train.shape)  # (1712, 110336)
print(x_test.shape)  # (429, 110336)
print(y_train.shape)  # (1712,)
print(y_test.shape)  # (429,)

# build the model
model = LGBMClassifier(device='gpu')
model.fit(x_train, y_train)

# model & weight save
# pickle.dump(model, open('E:/nmb/nmb_data/cp/m04_mfcc_LGBMClassifier.data', 'wb'))  # wb : write
# print("== save complete ==")

# model load
# NOTE(review): this rebinding discards the model fitted just above, so the
# fit is wasted work unless the pickle is intentionally preferred.  The
# open() handle is never closed, and pickle.load on untrusted files is
# unsafe — confirm both are acceptable for this notebook workflow.
model = pickle.load(open('E:/nmb/nmb_data/cp/m04_mfcc_LGBMClassifier.data', 'rb'))  # rb : read
# time >>

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])
accuracy = accuracy_score(y_test, y_pred)
def find_markers(
    data: AnnData,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> Dict[str, Dict[str, List[str]]]:
    """Find markers using gradient boosting method.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    label_attr: ``str``
        Cluster labels used for finding markers. Must exist in ``data.obs``.

    de_key: ``str``, optional, default: ``"de_res"``
        Keyword of DE analysis result stored in ``data.varm``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to used. If ``-1``, use all available threads.

    min_gain: ``float``, optional, default: ``1.0``
        Only report genes with a feature importance score (in gain) of at least ``min_gain``.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    remove_ribo: ``bool``, optional, default: ``False``
        If ``True``, remove ribosomal genes with either RPL or RPS as prefixes.

    Returns
    -------
    markers: ``Dict[str, Dict[str, List[str]]]``
        A Python dictionary containing marker information in structure ``dict[cluster_id]['up' or 'down'][dataframe]``.

    Examples
    --------
    >>> marker_dict = pg.find_markers(adata, label_attr = 'leiden_labels')
    """
    n_jobs = effective_n_jobs(n_jobs)

    if remove_ribo:
        data = data[
            :,
            np.vectorize(lambda x: not x.startswith("RPL") and not x.startswith("RPS"))(
                data.var_names
            ),
        ]

    # Hold out 10% of cells, stratified by cluster label, as the LightGBM
    # early-stopping evaluation set.
    X_train, X_test, y_train, y_test = train_test_split(
        data.X,
        data.obs[label_attr],
        test_size=0.1,
        random_state=random_state,
        stratify=data.obs[label_attr],
    )

    try:
        from lightgbm import LGBMClassifier
    except ImportError:
        # BUG FIX: the original only printed and fell through, which then
        # crashed a few lines later with a confusing NameError on
        # LGBMClassifier; re-raise so the cause is explicit.
        print("Need lightgbm! Try 'pip install lightgbm'.")
        raise

    start_lgb = time.time()
    lgb = LGBMClassifier(n_jobs=n_jobs, metric="multi_error", importance_type="gain")
    lgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=1,
    )
    end_lgb = time.time()
    logger.info("LightGBM used {:.2f}s to train.".format(end_lgb - start_lgb))

    # Rank genes by gain importance, keeping only those above min_gain.
    ntot = (lgb.feature_importances_ >= min_gain).sum()
    ords = np.argsort(lgb.feature_importances_)[::-1][:ntot]

    log_exprs = [
        x for x in data.varm[de_key].dtype.names if x.startswith("mean_logExpr:")
    ]
    labels = [x.rpartition(":")[2] for x in log_exprs]

    # Cluster-center rank 0/1/2 (ascending mean expression) maps to titles.
    titles = [("down", "down_gain"), ("weak", "weak_gain"), ("strong", "strong_gain")]
    markers = defaultdict(lambda: defaultdict(list))
    kmeans = KMeans(n_clusters=3, random_state=random_state)
    for gene_id in ords:
        gene_symbol = data.var_names[gene_id]
        # Per-cluster mean log expression of this gene, as a column vector
        # for 1-D k-means.
        mydat = [[x] for x in data.varm[de_key][log_exprs][gene_id]]
        kmeans.fit(mydat)
        # The most frequent k-means label is treated as the background group
        # and skipped below.
        kmeans_label_mode = pd.Series(kmeans.labels_).mode()[0]
        for i, kmeans_label in enumerate(np.argsort(kmeans.cluster_centers_[:, 0])):
            if kmeans_label != kmeans_label_mode:
                for pos in (kmeans.labels_ == kmeans_label).nonzero()[0]:
                    clust_label = labels[pos]
                    markers[clust_label][titles[i][0]].append(gene_symbol)
                    markers[clust_label][titles[i][1]].append(
                        "{:.2f}".format(lgb.feature_importances_[gene_id])
                    )

    return markers
# NOTE(review): this chunk begins mid-expression — the opening of the
# `features = [... TARGET_COL]]` list construction (and TARGET_COL's
# definition) lies outside this view; the first three lines are its tail.
        TARGET_COL
    ]
]
len(features)  # bare expression: displays the feature count in a notebook cell

from lightgbm import LGBMClassifier

# Gradient-boosted classifier; hyper-parameters appear hand-tuned.
clf = LGBMClassifier(n_estimators=550, learning_rate=0.03, min_child_samples=40,
                     random_state=1, colsample_bytree=0.5, reg_alpha=2, reg_lambda=2)
# Early stopping on validation AUC: training halts after 100 stale rounds.
clf.fit(trn[features], trn[TARGET_COL],
        eval_set=[(val[features], val[TARGET_COL])],
        verbose=50, eval_metric='auc', early_stopping_rounds=100)

# Probability of the positive class for the test rows.
preds = clf.predict_proba(test[features])[:, 1]

# Top-20 feature importances, plotted largest at the top.
fi = pd.Series(index=features, data=clf.feature_importances_)
fi.sort_values(ascending=False)[0:20][::-1].plot(kind='barh')

# Submission frame: one row per (patient, health camp) with the prediction.
sub = pd.DataFrame({"Patient_ID": test.Patient_ID.values})
sub["Health_Camp_ID"] = test.Health_Camp_ID.values
sub["Outcome"] = preds
sub.to_csv("lgbmblending.csv", index=False)
data = pd.read_csv('feature.csv')
# Label: tz_students flag, inverted so the original 0 becomes the positive class.
data_y = data['tz_students'].values
data_y = 1 - data_y
data = data.drop(['STUDENTCODE', 'tz_students'], axis=1)

# lightgbm
clf = LGBMClassifier(num_leaves=40, learning_rate=0.05, max_depth=20,
                     n_estimators=300, subsample=0.8, colsample_bytree=1,
                     min_child_weight=1)

# compute feature importances
clf.fit(X=data, y=data_y)
score = clf.feature_importances_
score = [(data.columns[i], score[i]) for i in range(len(score))]
score = sorted(score, key=lambda k: k[1], reverse=True)

# Keep only features with positive importance; the early `break` is valid
# only because `score` is sorted in descending importance order.
name_list = []
for i in range(len(score)):
    if score[i][1] > 0:
        name_list.append(score[i][0])
        print(i, score[i])
    else:
        break
print(name_list)

# build the baseline score on the selected columns
tmp_data = data[name_list]
start = time.time()
############################################################
# Feature construction + baseline LightGBM fit on the
# transaction table; metrics are echoed and saved to disk.
############################################################
# C1..C14 numeric columns plus one-hot encodings of three categoricals.
ColumnSelect = np.asarray(["C" + str(X) for X in range(1, 15)])
TempTrain = TrainTransaction[ColumnSelect]
TempTrain = TempTrain.join([pd.get_dummies(data=TrainTransaction["ProductCD"]),
                            pd.get_dummies(data=TrainTransaction["P_emaildomain"]),
                            pd.get_dummies(data=TrainTransaction["QuantileAmt"])])

# Train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    TempTrain, TrainTransaction['isFraud'], test_size=0.1, random_state=42)

# Fit a default LightGBM classifier
LGBMModel = LGBMClassifier()
LGBMModel.fit(X_train, y_train)

# Predict — BUG FIX: the original predicted on the full TempTrain frame,
# whose length does not match y_test (confusion_matrix would raise) and
# which leaks training rows into the evaluation.  Score the held-out split.
Predictions = LGBMModel.predict(X_test)

# Metrics
print(confusion_matrix(y_test, Predictions))
print(classification_report(y_test, Predictions))

# Save parameters; `with` guarantees the file handle is closed.
with open("Params_V5.txt", "w") as text_file:
    text_file.write("%s\n" % confusion_matrix(y_test, Predictions))
    text_file.write("%s\n" % classification_report(y_test, Predictions))

# Try with test
# Visualize Season vs. Crop_Damage.
sns.catplot(x="Crop_Damage", y="Season", hue="Crop_Damage", kind="bar", data=train);

X = train.drop(labels=['Crop_Damage'], axis=1)
y = train['Crop_Damage']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# BUG FIX: `y_test.value_counts` without parentheses was a no-op attribute
# access; call it (wrapped in print so it also shows when run as a script).
print(y_test.value_counts())

# Keep only the 8 feature columns (drop the leading ID column).
X_train_data = X_train.iloc[:, 1:9]
X_test_data = X_test.iloc[:, 1:9]
X_train_data

lgbm = LGBMClassifier()
lgbm_pred = lgbm.fit(X_train_data, y_train)  # fit() returns the fitted estimator itself
y_pred = lgbm_pred.predict(X_test_data)
# (y_true, y_pred) argument order per sklearn convention; accuracy is symmetric
# so the value is unchanged from the original call.
print(accuracy_score(y_test, y_pred))

# Score the unlabeled test file with the same 8 columns.
# .copy() avoids SettingWithCopy on the column inserts below.
test2 = test.iloc[:, 1:9].copy()
test_pred = lgbm_pred.predict(test2)
test2['Crop_Damage'] = test_pred
test2['ID'] = test['ID']
test2

output = pd.DataFrame(data={"ID": test2["ID"], "Crop_Damage": test2["Crop_Damage"]}).to_csv("Sol.csv", index=False)
from google.colab import files
files.download('Sol.csv')
output  # NOTE(review): to_csv(...) returns None, so this displays nothing
def gen_sub_by_para():
    """Train a 22-class (sex/age) LightGBM model on the stable feature set,
    log the best validation multi_logloss, and save the train/test
    probability matrices for later ensembling.

    Returns nothing; output goes through ``save_result_for_ensemble``.
    """
    args = locals()  # empty here (no parameters); kept only to tag the log/output name
    logger.debug(f'Run train dnn:{args}')
    from code_felix.tiny.util import get_stable_feature
    feature_label = get_stable_feature('1003')
    #feature_label = get_dynamic_feature()
    logger.debug(f'The input feature:{feature_label.shape}')

    # Rows with missing 'sex' are the unlabeled (test) devices.
    # .copy() so the in-place edits below don't write into a pandas slice view.
    test = feature_label[feature_label['sex'].isnull()].copy()
    train = feature_label[feature_label['sex'].notnull()].copy()
    train['sex_age'] = train['sex_age'].astype('category')

    X_train, X_test, y_train, y_test = split_train(train)

    gbm = LGBMClassifier(n_estimators=20000,
                         boosting_type='gbdt',
                         objective='multiclass',
                         num_class=22,
                         random_state=47,
                         metric=['multi_logloss'],
                         verbose=-1,
                         max_depth=3,
                         feature_fraction=0.2,
                         subsample=0.5,
                         min_data_in_leaf=1472,
                         reg_alpha=2,
                         reg_lambda=4,
                         ##########
                         learning_rate=0.05,  # 0.1
                         colsample_bytree=None,  # 1
                         min_child_samples=None,  # 20
                         min_child_weight=None,  # 0.001
                         min_split_gain=None,  # 0
                         num_leaves=None,  # 31
                         subsample_for_bin=None,  # 200000
                         subsample_freq=None,  # 1
                         nthread=-1,
                         #device='gpu'
                         )
    # gbm.set_params(**params)
    logger.debug(gbm)

    res = gbm.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  early_stopping_rounds=100,
                  verbose=True)
    print(f'Fit return type:{type(res)}')
    print('Feature importances:', list(gbm.feature_importances_))
    print_imp_list(train, gbm)

    best = round(gbm.best_score_.get('valid_1').get('multi_logloss'), 5)
    best_score = best
    best_epoch = gbm.best_iteration_
    print(gbm)
    best = "{:.5f}".format(best)

    pre_x = test.drop(['sex', 'age', 'sex_age', 'device'], axis=1)

    # BUG FIX: the original `print(f'Loss={loss}, best={best}')` referenced
    # `loss`, which was only computed in commented-out code above, and would
    # raise NameError.  Report the tracked best score instead.
    print(f'best={best}')

    ###Save result for ensemble
    train_bk = pd.DataFrame(gbm.predict_proba(train.drop(['sex', 'age', 'sex_age', 'device'], axis=1)),
                            index=train.device,
                            columns=train.sex_age.cat.categories)
    test_bk = pd.DataFrame(gbm.predict_proba(pre_x),
                           index=test.device,
                           columns=train.sex_age.cat.categories)

    from code_felix.tiny.util import save_result_for_ensemble
    save_result_for_ensemble(f'{best_score}_{best_epoch}_lgb_{args}',
                             train=train_bk,
                             test=test_bk,
                             label=None,
                             )
"bagging_fraction": 0.70, 'bagging_freq': 4, "max_depth": -1, "verbosity": -1, "reg_alpha": 0.3, "reg_lambda": 0.1, # "min_split_gain":0.2, "min_child_weight": 10, 'zero_as_missing': True, 'num_threads': 8, } model = lgb.train(params, lgb.Dataset(X_values, label=y_train), 600) model = LGBMClassifier(n_estimators=100) model.fit(X_values, y_train) kfold = KFold(n_splits=3, shuffle=True, random_state=0) score = cross_val_score(model, X_values, y_train, cv=kfold, n_jobs=1, scoring='roc_auc', verbose=0) print('score {:.4}'.format(score.mean())) prediction = model.predict_proba(X_test)[:, 1] prediction = model.predict(X_test) result = y_true.copy() result['prediction'] = prediction metric = roc_auc_score(result['target'], result['prediction']) print('roc auc: {:.4}'.format(metric)) # 0.8453 0.8461 # 0.8317
def without_cv_transfer_a_to_b_modeling():
    """Transfer baseline without CV: select features and fit LightGBM on the
    A-domain training data, score the B-domain train set for an offline AUC,
    then write B-test predictions to a CSV under ../result/.

    NOTE(review): Python 2 code — the `print` statements, pandas
    `.as_matrix()` (removed in pandas 1.0) and `time.clock()` (removed in
    Python 3.8) must be ported before reuse on a modern stack.

    :return: None; results are written to disk.
    """
    '''Data input'''
    data_a_train = pd.read_csv('../data/A_train_final.csv', index_col='no')
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    y_of_b_train = data_b_train['flag']
    data_b_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    '''A train特征工程'''
    # A-train feature engineering: drop the label, add one interaction feature.
    data_a_train_without_label = data_a_train.drop('flag', axis=1)
    data_a_train_without_label['UserInfo_222x82'] = data_a_train_without_label['UserInfo_82'] * data_a_train_without_label['UserInfo_222']

    '''缺失值填充'''
    # Fill missing values with the constant 10.
    data_a_train_filled = data_a_train_without_label.fillna(value=10)

    '''特征的名字'''
    # Feature names and B-test row ids (used later for the output CSV).
    feature_name = list(data_a_train_without_label.columns.values)
    data_b_test_user_id = list(data_b_test.index.values)

    '''构造训练集和测试集'''
    x_temp = data_a_train_filled.iloc[:, :].as_matrix()  # independent variables
    y = data_a_train.iloc[:, -1].as_matrix()  # dependent variable (assumes 'flag' is the last column — TODO confirm)

    '''Feature selection 注意如果加特征的话,feature name还是需要改的'''
    # LightGBM-importance-based selection; threshold is the string "0.1*mean".
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, "0.1*mean")

    '''B train特征工程'''
    # Mirror the same feature engineering on B-train.
    data_b_train_without_label = data_b_train.drop('flag', axis=1)
    data_b_train_without_label['UserInfo_222x82'] = data_b_train_without_label['UserInfo_82'] * data_b_train_without_label['UserInfo_222']
    data_b_train_filled = data_b_train_without_label.fillna(value=10)

    '''b test 特征工程'''
    data_b_test['UserInfo_222x82'] = data_b_test['UserInfo_82'] * data_b_test['UserInfo_222']
    data_b_test_filled = data_b_test.fillna(value=10)

    '''特征筛选'''
    # Drop from B the features rejected by the A-side selection.
    data_b_train_filled_after_feature_selection = data_test_feature_drop(data_b_train_filled, dropped_feature_name)
    data_b_test_filled_after_feature_selection = data_test_feature_drop(data_b_test_filled, dropped_feature_name)

    '''用A_train建模预测B_train'''
    # Train on A; predict B-train to estimate transfer performance offline.
    print '起始时间'
    print time.clock()*1.0/60
    parameter_n_estimators = 400
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators)
    a_model = classifier.fit(X, y)
    prob_of_b_train = a_model.predict_proba(data_b_train_filled_after_feature_selection)
    print '训练终止时间'
    print time.clock()*1.0/60

    '''画roc曲线'''
    # Offline AUC of the A-model against the known B-train labels.
    fpr, tpr, thresholds = roc_curve(y_of_b_train, prob_of_b_train[:, 1])
    roc_auc = auc(fpr, tpr)
    print '\nauc='+str(roc_auc)

    '''预测Btest'''
    prob_of_b_test = a_model.predict_proba(data_b_test_filled_after_feature_selection)
    result_file_name = '../result/B_test_predict_using_A_LGBLGB_without_cv_fillna_10' + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '_offline_'+str(roc_auc)+'.csv'
    write_predict_results_to_csv(result_file_name, data_b_test_user_id, prob_of_b_test[:, 1].tolist())