def kfold_lightgbm(df, num_folds=5, stratified=False, debug=False):
    """Train an LGBMClassifier with (optionally stratified) k-fold CV.

    Args:
        df: frame holding both train and test rows; test rows are the ones
            whose 'TARGET' is NaN.
        num_folds: number of CV folds.
        stratified: use StratifiedKFold instead of KFold.
        debug: when True, skip writing submission/OOF files and plots.

    Side effects (debug=False): writes a submission CSV, an OOF CSV and
    three diagnostic plots. Returns None.
    """
    # Divide into train/valid and test data (test rows have TARGET == NaN).
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    ids = train_df['SK_ID_CURR']
    print('Starting Lightgbm. Train shape: {}, test shape: {}'.format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=321)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=123)

    # Create arrays and dataframes to store results.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV',
                     'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        clf = LGBMClassifier(
            n_estimators=10000,
            learning_rate=0.01,
            num_leaves=30,
            colsample_bytree=.9,
            subsample=0.5,
            max_depth=2,
            reg_alpha=.04,
            reg_lambda=.07,
            min_split_gain=.02,
            min_child_weight=39,
            silent=-1,
            verbose=-1,
            n_jobs=-1,
        )
        clf.fit(
            train_x,
            train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=100  # 30
        )

        # Out-of-fold predictions at the early-stopped best iteration.
        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Test predictions are averaged over folds.
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    score = roc_auc_score(train_df['TARGET'], oof_preds)
    print('Full AUC score %.6f' % score)

    df_oof_preds = pd.DataFrame({
        'SK_ID_CURR': ids,
        'TARGET': train_df['TARGET'],
        'PREDICTION': oof_preds
    })
    df_oof_preds = df_oof_preds[['SK_ID_CURR', 'TARGET', 'PREDICTION']]

    if not debug:
        test_df['TARGET'] = sub_preds
        # Save test predictions.
        now = datetime.now()
        created_time = now.strftime('%Y-%m-%d-%H-%M')
        score = str(round(score, 6)).replace('.', '')
        # Submission file.
        sub_file = f'../predictions/{created_time}_{score}_{num_folds}_fold-average-LGBClassifier_submission.csv'
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(sub_file, index=False)
        # OOF prediction file.
        oof_file = f'../predictions/{created_time}_{score}_{num_folds}_fold-average-LGBClassifier_oof.csv'
        df_oof_preds.to_csv(oof_file, index=False)

        # Display a few plots.  Re-splitting with the same splitter and
        # random_state reproduces the fold indices deterministically.
        vis_file = f'../visualization/{score}_{created_time}_'
        folds_idx = [(train_idx, valid_idx)
                     for train_idx, valid_idx in folds.split(
                         train_df[feats], train_df['TARGET'])]
        display_importances(
            feature_importance_df_=feature_importance_df,
            vis_file=vis_file + "_feature_importances_without_ext_source.png")
        display_roc_curve(
            y_=train_df['TARGET'],
            oof_preds_=oof_preds,
            folds_idx_=folds_idx,
            vis_file=vis_file + "_roc_curve_without_ext_source.png")
        # BUG FIX: the original passed y_=df['TARGET'] here, but `df` was
        # deleted with `del df` above, so this line raised UnboundLocalError.
        # train_df['TARGET'] is the intended ground-truth series.
        display_precision_recall(
            y_=train_df['TARGET'],
            oof_preds_=oof_preds,
            folds_idx_=folds_idx,
            vis_file=vis_file + "_precision_recall_without_ext_source.png")
    return None
def main(debug=2000):
    """Experiment pipeline on the 4 AMT_* columns and their pairwise ratios.

    Steps: outlier removal -> ratio-feature construction -> median-impute +
    min-max scale -> LightGBM CV (via OOFPreds) -> Box-Cox transform ->
    LightGBM CV again.  Writes two submission CSVs and several plots.

    Args:
        debug: row limit for the CSV reads; pass False to load everything.
    """
    if debug is not False:
        rows = debug
    else:
        rows = None
    app_train = pd.read_csv('../../input/application_train.csv', nrows=rows)
    app_test = pd.read_csv('../../input/application_test.csv', nrows=rows)
    test_skid = app_test[['SK_ID_CURR']]

    cols_basic = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                  'DAYS_BIRTH']
    cols_amt = [
        'TARGET', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
        'AMT_GOODS_PRICE'
    ]
    cols_test = cols_basic + cols_amt

    # ------------------------------------------------------------------
    # Step 1: drop outliers (|z-score| > 8) on selected columns.
    # ------------------------------------------------------------------
    print('1 清除离群点')
    app_train = app_train.loc[:, cols_test]
    app_test = app_test.loc[:, cols_test]
    train_length = app_train.shape[0]
    need_drop = ['AMT_INCOME_TOTAL']
    for col in need_drop:
        col_mean = app_train[col].mean()
        col_std = app_train[col].std()
        z = (app_train[col] - col_mean) / col_std
        outlier = z[abs(z) > 8].index
        app_train = app_train.drop(outlier)
    print('被删除的离群点个数:{}'.format(train_length - app_train.shape[0]))

    app_all = pd.concat([app_train, app_test], axis=0)
    print('shape: {}'.format(app_all.shape))
    missing_values = missing_values_table(app_train)
    print(missing_values)

    # ------------------------------------------------------------------
    # Step 2: engineered features — pairwise ratios of the 4 AMT_* columns.
    # ------------------------------------------------------------------
    print('2 构造新特征')
    features = [
        'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE'
    ]
    # Median-impute on the combined frame before taking ratios.
    for col in features:
        app_all[col].fillna(app_all[col].dropna().median(), inplace=True)
    new_features = []
    for i in range(4):
        for j in range(i + 1, 4):
            new_col = features[i] + ' / ' + features[j]
            app_all[new_col] = app_all[features[i]] / app_all[features[j]]
            new_features.append(new_col)
    print(new_features)
    features = features + new_features

    train_len = app_train.shape[0]
    print(train_len)
    app_train = app_all.iloc[:train_len, :]
    app_test = app_all.iloc[train_len:, :]

    # Distribution plots before scaling.
    kde_plot(app_train, features, 'amt_4_pic/4_orgin_persent_kde.png')
    skew_plot(app_train, features, 'amt_4_pic/4_origin_persent_skew.png')

    # ------------------------------------------------------------------
    # Step 3: build the model matrices.
    # ------------------------------------------------------------------
    print('3 准备训练数据')
    label = app_train['TARGET']
    print('label len {}'.format(len(label)))
    # BUG FIX: the original used app_train.drop(columns=['TARGET']) here,
    # which keeps the EXT_SOURCE_*/DAYS_BIRTH columns too (14 columns),
    # while the DataFrame rebuilt below uses columns=features (10 names)
    # and raised a shape mismatch.  Select exactly the model features, the
    # same way `test` is built.
    train = app_train[features].copy()
    test = app_test[features].copy()
    print(train.shape, test.shape)

    imputer = Imputer(strategy='median')
    scaler = MinMaxScaler(feature_range=(0, 1))
    # Fit on train only to avoid leaking test statistics.
    imputer.fit(train)
    train = imputer.transform(train)
    test = imputer.transform(test)
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    print(train.shape[0], test.shape[0])

    train_feature = pd.DataFrame(train, columns=features, index=label.index)
    app_train = pd.concat([label, train_feature], axis=1)
    app_test = pd.DataFrame(test, columns=features)

    # Distribution plots after impute + scale.
    kde_plot(app_train, features,
             'amt_4_pic/4_amt_persent_fillna_scaler_kde.png')
    skew_plot(app_train, features,
              'amt_4_pic/4_amt_persent_fillna_scaler_skew.png')

    # ------------------------------------------------------------------
    # Step 4: LightGBM on the scaled ratio features.
    # ------------------------------------------------------------------
    print('4 lgb 训练')
    params_lgb = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
    }
    oof_preds, sub_preds, feature_importance, metrics = OOFPreds(
        train_feature, label, app_test, params_lgb, clf='lgb')
    print(metrics)
    sub_preds = pd.concat([test_skid, sub_preds], axis=1)
    sub_preds.to_csv('lgb_4amt_persent_-fillmedian-minmax-val-180628.csv',
                     index=False)

    # ------------------------------------------------------------------
    # Step 5: Box-Cox transform of every feature (min-max -> boxcox(x+1)
    # -> min-max again), applied on train+test jointly.
    # ------------------------------------------------------------------
    print('5 box_cox')
    train_len = app_train.shape[0]
    app_all = pd.concat([app_train, app_test], axis=0)
    print(train_len)
    for col in features:
        col_trans = scale_minmax(app_all.loc[:, col])
        # +1 keeps the argument strictly positive as boxcox requires.
        app_all.loc[:, col], _ = stats.boxcox(col_trans + 1)
        app_all.loc[:, col] = scale_minmax(app_all.loc[:, col])
    app_train = app_all.iloc[:train_len, :]
    app_test = app_all.iloc[train_len:, :]
    kde_plot(app_train, features, 'amt_4_pic/4_persent_boxcox_kde.png')
    skew_plot(app_train, features, 'amt_4_pic/4_persent_boxcox_skew.png')

    # ------------------------------------------------------------------
    # Step 6: retrain LightGBM on the Box-Cox features.
    # ------------------------------------------------------------------
    print('6 lgb 训练')
    params_lgb = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
    }
    print(app_train.shape, app_test.shape)
    train_feature = app_train[features]
    test = app_test[features]
    oof_preds, sub_preds, feature_importance, metrics = OOFPreds(
        train_feature, label, test, params_lgb, clf='lgb')
    # BUG FIX: removed a stray debug statement `print(FileExistsError)`
    # which merely printed the builtin exception class object.
    display_importances(feature_importance, num_features=4,
                        filename='feature_importance.png')
    print(metrics)

    fpr, tpr, _ = roc_curve(label, oof_preds)
    print(fpr, tpr)
    plt.figure(figsize=(8, 8))
    # NOTE(review): metrics.iloc[5, 2] is assumed to be the overall AUC row
    # produced by OOFPreds — confirm against that helper's layout.
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC (AUC = %0.4f ' % (metrics.iloc[5, 2]))
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck',
             alpha=.8)
    plt.savefig('roc_persent.png')
    sub_preds = pd.concat([test_skid, sub_preds], axis=1)
    sub_preds.to_csv('lgb_4amt_persent-boxcox-val-180628.csv', index=False)
def kfold_lightgbm(train_df, test_df, num_folds, submission_file_name,
                   stratified=False, debug=False):
    """K-fold LightGBM regression; optionally writes the submission file.

    Args:
        train_df: training frame with 'target' and 'outliers' columns.
        test_df: test frame (card_id in the index).
        num_folds: number of CV folds.
        submission_file_name: path of the submission CSV.
        stratified: stratify folds on the 'outliers' column.
        debug: when True, skip writing the submission file.
    """
    logger = logging.getLogger('lgbm_train')
    logger.info("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model — both splitters share the same seed.
    splitter = StratifiedKFold if stratified else KFold
    folds = splitter(n_splits=num_folds, shuffle=True, random_state=326)

    # Accumulators for OOF / averaged test predictions and importances.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [col for col in train_df.columns if col not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (trn_idx, val_idx) in enumerate(
            folds.split(train_df[feats], train_df['outliers'])):
        x_trn = train_df[feats].iloc[trn_idx]
        y_trn = train_df['target'].iloc[trn_idx]
        x_val = train_df[feats].iloc[val_idx]
        y_val = train_df['target'].iloc[val_idx]

        # set data structure
        lgb_train = lgb.Dataset(x_trn, label=y_trn, free_raw_data=False)
        lgb_test = lgb.Dataset(x_val, label=y_val, free_raw_data=False)

        fold_seed = int(2**n_fold)
        # params optimized by optuna
        params = {
            'device': 'gpu',
            'task': 'train',
            'objective': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'learning_rate': 0.01,
            'subsample': 0.718509060213284,
            'max_depth': 8,
            'top_rate': 0.8076614306859368,
            'num_leaves': 45,
            'min_child_weight': 59.174950161115106,
            'other_rate': 0.0721768246018207,
            'reg_alpha': 17.018862389097798,
            'reg_lambda': 24.20636870149939,
            'colsample_bytree': 0.667864732544997,
            'min_split_gain': 8.021790442813048,
            'min_data_in_leaf': 30,
            'verbose': -1,
            'seed': fold_seed,
            'bagging_seed': fold_seed,
            'drop_seed': fold_seed
        }

        reg = lgb.train(params=params,
                        train_set=lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # OOF predictions at the early-stopped best iteration; test
        # predictions are fold-averaged.
        oof_preds[val_idx] = reg.predict(x_val,
                                         num_iteration=reg.best_iteration)
        sub_preds += reg.predict(
            test_df[feats],
            num_iteration=reg.best_iteration) / folds.n_splits

        # log1p compresses the gain-importance range for plotting.
        fold_importance_df = pd.DataFrame({
            "feature": feats,
            "importance": np.log1p(
                reg.feature_importance(importance_type='gain',
                                       iteration=reg.best_iteration)),
            "fold": n_fold + 1,
        })
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        logger.info('Fold %2d RMSE : %.6f' %
                    (n_fold + 1, rmse(y_val, oof_preds[val_idx])))
        del reg, x_trn, y_trn, x_val, y_val
        gc.collect()

    # display importances
    display_importances(feature_importance_df)

    if not debug:
        # save submission file
        test_df.loc[:, 'target'] = sub_preds
        test_df = test_df.reset_index()
        test_df[['card_id', 'target']].to_csv(submission_file_name,
                                              index=False)
def get_model(df, feats):
    """Train the LightGBM model, write the submission CSV, and plot the
    feature importances to the path configured in `config`.
    """
    predictions, importances = lightgbm(df, feats)
    predictions.to_csv(config.SUBMISSION_FILE_NAME, index=False)
    utils.display_importances(importances, config.IMPORTANCE_IMAGE_PATH)
def main(debug=2000):
    """Baseline pipeline on the full application tables.

    Label-encodes binary categoricals, one-hot encodes the rest, aligns
    train/test columns, fixes the DAYS_EMPLOYED sentinel, imputes + scales,
    then trains LightGBM via OOFPreds and writes the submission CSV.

    Args:
        debug: row limit for the CSV reads; pass False to load everything.
    """
    if debug is not False:
        rows = debug
    else:
        rows = None
    app_train = pd.read_csv('../input/application_train.csv', nrows=rows)
    app_test = pd.read_csv('../input/application_test.csv', nrows=rows)
    test_skid = app_test[['SK_ID_CURR']]

    app_all = pd.concat([app_train, app_test], axis=0)
    print('shape: {}'.format(app_all.shape))
    del app_train['SK_ID_CURR']
    missing_values = missing_values_table(app_train)

    # Column-name prefixes hint at the feature type:
    #   categorical:        TYPE / CODE
    #   numeric continuous: DAYS / CNT / AMT / RELATIVE (pre-normalised)
    #   numeric discrete:   FLAG / RATING / NOT
    # (Exploratory per-type column listing and distribution plotting that
    # used to live here was dead commented-out code and has been removed.)

    # ------------------------------------------------------------------
    # Encoding: binary object columns -> label encoding, rest -> one-hot.
    # ------------------------------------------------------------------
    le = LabelEncoder()
    le_count = 0
    # Iterate through the columns
    for col in app_train.columns:
        if app_train[col].dtype == 'object':
            # If 2 or fewer unique categories
            if len(list(app_train[col].unique())) <= 2:
                # NOTE(review): fitted on train only — transform below will
                # raise on a test-only category; confirm both splits share
                # the same binary values.
                le.fit(app_train[col])
                # Transform both training and testing data
                app_train[col] = le.transform(app_train[col])
                app_test[col] = le.transform(app_test[col])
                # Keep track of how many columns were label encoded
                le_count += 1
                print(col)
    print('%d columns were label encoded.' % le_count)

    app_train = pd.get_dummies(app_train)
    app_test = pd.get_dummies(app_test)
    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)

    # Align train/test on their common columns, preserving the target.
    train_labels = app_train['TARGET']
    app_train, app_test = app_train.align(app_test, join='inner', axis=1)
    # Add the target back in
    app_train['TARGET'] = train_labels
    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)

    # ------------------------------------------------------------------
    # Anomalous data: DAYS_EMPLOYED uses 365243 as a sentinel value.
    # ------------------------------------------------------------------
    app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
    # Replace the anomalous values with nan
    app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)
    app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
    app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace=True)
    print('There are %d anomalies in the test data out of %d entries' %
          (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

    # ------------------------------------------------------------------
    # Baseline model matrices.
    # ------------------------------------------------------------------
    # Drop the target from the training data
    if 'TARGET' in app_train:
        train = app_train.drop(columns=['TARGET'])
    else:
        train = app_train.copy()
    # Feature names
    features = list(train.columns)
    # Copy of the testing data
    test = app_test.copy()
    print(type(test))

    imputer = Imputer(strategy='median')
    scaler = MinMaxScaler(feature_range=(0, 1))
    # Fit on train only to avoid leaking test statistics.
    imputer.fit(train)
    train = imputer.transform(train)
    # BUG FIX: the original transformed `app_test` here instead of the
    # `test` copy made above; use the copy consistently (same values, but
    # the inconsistency was a trap for future edits).
    test = imputer.transform(test)
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    print('Training data shape: ', train.shape)
    print('Testing data shape: ', test.shape)

    train = pd.DataFrame(train, columns=features)
    test = pd.DataFrame(test, columns=features)

    print(train.shape)
    print(train_labels.ravel().reshape(len(train_labels), 1).shape)

    # ------------------------------------------------------------------
    # Train the model.
    # ------------------------------------------------------------------
    params_lgb = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
    }
    oof_preds, sub_preds, feature_importance, metrics = OOFPreds(
        train, train_labels, test, params=params_lgb, n_splits=5, clf='lgb')
    print('feature importance: {}'.format(feature_importance.shape))
    print(feature_importance.dtypes)
    display_importances(feature_importance, num_features=20,
                        filename='lgb_20.png')
    print(metrics)
    sub_preds = pd.concat([test_skid, sub_preds], axis=1)
    sub_preds.to_csv('lgb_baseline-fillmedian-minmax-val-180627.csv',
                     index=False)

    # ------------------------------------------------------------------
    # Outlier-prone columns to handle in a later iteration.
    # ------------------------------------------------------------------
    outlier = [
        'YEARS_BEGINEXPLUATATION_AVG', 'DAYS_EMPLOYED', 'BASEMENTAREA_AVG'
    ]
def kfold_lightgbm(df, num_folds, stratified=False, debug=False,
                   submission_file_name='submission.csv'):
    """K-fold LightGBM binary classifier over the combined train+test frame.

    Args:
        df: frame holding both train and test rows; test rows are the ones
            whose 'TARGET' is NaN.
        num_folds: number of CV folds.
        stratified: use StratifiedKFold instead of KFold.
        debug: when True, show importances instead of writing a submission.
        submission_file_name: path for the submission CSV.
            BUG FIX: the original referenced `submission_file_name` without
            defining it anywhere in the function (NameError when
            debug=False, unless a module-level global of that name exists —
            confirm none is relied on).  It is now a trailing keyword
            parameter, so existing positional callers are unaffected.

    Returns:
        The concatenated per-fold feature-importance DataFrame.
    """
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV',
                     'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx],
                             label=train_df['TARGET'].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx],
                             label=train_df['TARGET'].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)

        # LightGBM parameters found by Bayesian optimization
        params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'nthread': 16,
            'learning_rate': 0.02,  # 02,
            'num_leaves': 20,
            'colsample_bytree': 0.9497036,
            'subsample': 0.8715623,
            'subsample_freq': 1,
            'max_depth': 8,
            'reg_alpha': 0.041545473,
            'reg_lambda': 0.0735294,
            'min_split_gain': 0.0222415,
            'min_child_weight': 60,  # 39.3259775,
            'seed': 0,
            'verbose': -1,
            'metric': 'auc',
        }

        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        # OOF predictions; test predictions are fold-averaged.
        oof_preds[valid_idx] = clf.predict(dvalid.data)
        sub_preds += clf.predict(test_df[feats]) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(dvalid.label,
                                         oof_preds[valid_idx])))
        del clf, dtrain, dvalid
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'],
                                                oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        sub_df = test_df[['SK_ID_CURR']].copy()
        sub_df['TARGET'] = sub_preds
        sub_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name,
                                                index=False)
    if debug:
        display_importances(feature_importance_df)
    return feature_importance_df