def bureau_balance_feature(df, Debug=False):
    """Left-merge bureau-balance derived features onto ``df``.

    Loads ``bureau_balance.csv`` and ``bureau.csv``, counts balance rows per
    (``SK_ID_BUREAU``, ``STATUS``) pair, attaches ``SK_ID_CURR`` to those
    counts through ``bureau``, and passes the result through the
    ``bureau_balance_missing``, ``bureau_overdue`` and ``bureau_status``
    stages. The accumulated frame has its NaNs replaced by 0, is passed to
    ``correlation_reduce`` and is merged onto ``df`` by ``SK_ID_CURR``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column.
        Debug: When true, only a random 10000-row sample of the balance
            table is used.

    Returns:
        ``df`` left-merged with the feature frame; the merge is validated
        as one-to-one.
    """
    balance = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau_balance.csv")
    if Debug:
        balance = balance.sample(10000)
    bureau = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")

    id_lookup = bureau[['SK_ID_CURR', 'SK_ID_BUREAU']]
    status_counts = balance.groupby(
        ['SK_ID_BUREAU', 'STATUS'], as_index=False).count()
    bureau_all = pd.merge(
        id_lookup, status_counts, how='left', on=['SK_ID_BUREAU'])

    overdue_codes = ['1', '2', '3', '4', '5']

    def _overdue_flag(status):
        # 1 when the status is one of the listed codes, otherwise 0.
        if status in overdue_codes:
            return 1
        return 0

    bureau_all['overdue'] = bureau_all.STATUS.apply(_overdue_flag)

    features = df[['SK_ID_CURR']]
    stages = (
        ("bureau balance missing count", bureau_balance_missing),
        ("bureau balance overdue analysis", bureau_overdue),
        ("bureau balance status count", bureau_status),
    )
    for label, stage in stages:
        with timer(label):
            features = stage(features, bureau_all)

    features.fillna(0, inplace=True)
    features = correlation_reduce(features)
    return pd.merge(
        df, features, on=['SK_ID_CURR'], how='left', validate='one_to_one')
def card_feature(df, Debug=False):
    """Left-merge credit-card balance features onto ``df``.

    Loads ``credit_card_balance.csv``, runs it through the ``card_*`` stage
    helpers (each receives the accumulated feature frame and the credit
    table), replaces NaNs in the result with 0, passes it to
    ``correlation_reduce`` and merges it onto ``df`` by ``SK_ID_CURR``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column.
        Debug: When true, only a random 10000-row sample of the credit
            table is used.

    Returns:
        ``df`` left-merged with the credit-card feature frame.
    """
    # Raw string: the original non-raw literal contained invalid escape
    # sequences (\K, \M, \D, \c), which newer Python versions warn about.
    # The resulting path value is unchanged.
    credit = import_data(
        r"D:\Kaggle\MyFirstKaggleCompetition\Data\credit_card_balance.csv")
    if Debug:
        credit = credit.sample(10000)

    card_main = df[['SK_ID_CURR']]
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    # Column groups are selected by substring match on the column name.
    amt = [f for f in credit.columns if 'AMT' in f]
    cnt = [f for f in credit.columns if 'CNT' in f]
    sk = ['SK_DPD', 'SK_DPD_DEF']

    with timer("card missing analysis"):
        card_main = card_missing(card_main, credit)
    with timer("card overdue analysis"):
        card_main = card_sk(card_main, credit, key + sk)
    with timer("card using analysis"):
        card_main = card_using(card_main, credit, key + amt)
    with timer("card all behavior analysis"):
        card_main = card_all(card_main, credit, key + amt + cnt)
    with timer("card using behavior analysis from the first payment of AMT_PAYMENT_TOTAL_CURRENT"):
        card_main = card_amt_total_payment(card_main, credit, key + amt + cnt)
    with timer("card last two year behavior analysis"):
        card_main = card_last_two_year(card_main, credit)
    with timer("card last one year behavior analysis"):
        card_main = card_last_one_year(card_main, credit)

    card_main.fillna(0, inplace=True)
    card_main = correlation_reduce(card_main)
    # NOTE(review): unlike several sibling feature builders, this merge is
    # not validated as one-to-one; left as in the original.
    df = pd.merge(df, card_main, on=['SK_ID_CURR'], how='left')
    return df
def pos_cash_feature(df, Debug=False):
    """Left-merge POS/cash balance features onto ``df``.

    Loads ``POS_CASH_balance.csv``, adds three derived columns
    (``SK_DPD_DIFF`` and two 0/1 late-payment flags), runs the table through
    the ``pos_cash_*`` stage helpers, replaces NaNs in the result with 0 and
    merges it onto ``df`` by ``SK_ID_CURR``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column.
        Debug: When true, only a random 10000-row sample of the POS table
            is used.

    Returns:
        ``df`` left-merged with the POS feature frame; the merge is
        validated as one-to-one.
    """
    # Raw string: the original non-raw literal contained invalid escape
    # sequences (\K, \M, \D, \P), which newer Python versions warn about.
    # The resulting path value is unchanged.
    pos = import_data(
        r"D:\Kaggle\MyFirstKaggleCompetition\Data\POS_CASH_balance.csv")
    if Debug:
        pos = pos.sample(10000)

    pos_main = df[['SK_ID_CURR']]

    pos['SK_DPD_DIFF'] = pos.SK_DPD - pos.SK_DPD_DEF
    # Flags are 1 when the corresponding days-past-due value is positive.
    pos['pos_cash_paid_late'] = pos['SK_DPD'].apply(
        lambda x: 1 if x > 0 else 0)
    pos['pos_cash_paid_late_with_tolerance'] = pos['SK_DPD_DEF'].apply(
        lambda x: 1 if x > 0 else 0)

    with timer("pos basic stat analysis"):
        pos_main = pos_cash_basic(pos_main, pos)
    with timer("pos cash last record"):
        pos_main = pos_cash_last(pos_main, pos)
    with timer("pos cash last k installment analysis"):
        pos_main = pos_cash_last_k_installment(pos_main, pos)
    with timer("pos cash last loan analysis"):
        pos_main = pos_cash_last_loan(pos_main, pos)
    with timer("pos cash trend analysis"):
        pos_main = pos_cash_trend_installment(pos_main, pos)

    pos_main.fillna(0, inplace=True)
    df = pd.merge(
        df, pos_main, on=['SK_ID_CURR'], how='left', validate='one_to_one')
    return df
def woe_encoder(df):
    """Append one ``<feature>_woe`` column per categorical feature to ``df``.

    The categorical features are the ``object``-dtype columns of
    ``application_train.csv``. For every such feature the training rows are
    counted per (category, ``TARGET``) pair and each category receives::

        log((count_target_1 / total_target_1) /
            (count_target_0 / total_target_0))

    The per-category values are joined onto the rows of ``df`` by category;
    values that are missing after the join (including categories that lack
    one of the two target counts) are set to 0.

    Args:
        df: Frame containing ``SK_ID_CURR`` and every categorical column of
            the training table.

    Returns:
        ``df`` left-merged (on ``SK_ID_CURR``) with the ``*_woe`` columns.
    """
    from math import log

    print("woe categorical analysis.")
    train = import_data(
        "D:\\Kaggle\\MyFirstKaggleCompetition\\Data\\application_train.csv")
    categorical_feats = [
        f for f in train.columns if train[f].dtype == 'object'
    ]
    temp = train[['SK_ID_CURR', 'TARGET'] + categorical_feats]
    woe_main = df[['SK_ID_CURR'] + categorical_feats]

    for feat in categorical_feats:
        # Rows: categories of `feat`; columns: one count per TARGET value.
        counts = temp[['SK_ID_CURR', feat, 'TARGET']].groupby(
            [feat, 'TARGET']).count().unstack()
        counts.columns = [
            counts.columns.names[1] + "_" + str(col)
            for col in counts.columns.levels[1]
        ]
        # Column totals (NaN cells are skipped by sum()). Computed directly
        # rather than through a temporary 'Row_sum' row, which could collide
        # with a real category of the same name.
        total_1 = counts['TARGET_1'].sum()
        total_0 = counts['TARGET_0'].sum()
        # A list, not map(): on Python 3 map() is a lazy iterator and cannot
        # be used as column values.
        woe_values = [
            log((float(n1) / total_1) / (float(n0) / total_0))
            for n1, n0 in zip(counts['TARGET_1'], counts['TARGET_0'])
        ]
        woe_table = pd.DataFrame({
            feat: counts.index,
            feat + "_woe": woe_values,
        })
        woe_main = pd.merge(woe_main, woe_table, on=[feat], how='left')
        print("feature" + "_" + feat + " is finished!")

    # Drop the raw categorical columns, then zero-fill the remaining values.
    woe_main = woe_main.drop(categorical_feats, axis=1)
    woe_main = woe_main.fillna(0)
    df = pd.merge(df, woe_main, on=['SK_ID_CURR'], how='left')
    return df
def reading_main():
    """Load the application tables and build the combined working frame.

    ``application_train.csv`` is shuffled (NumPy global seed 1222) and split
    90/10 into a training and a hold-out part. ``TARGET`` is removed from
    both parts and kept separately together with ``SK_ID_CURR``. The two
    parts and ``application_test.csv`` are tagged with an ``is_train``
    column (1, 0 and -1 respectively) and concatenated.

    Returns:
        tuple: ``(main, y_train, y_test)`` where ``main`` is the
        concatenated frame and ``y_train`` / ``y_test`` hold
        ``SK_ID_CURR`` and ``TARGET`` for the two parts of the split.
    """
    target_label = 'TARGET'
    train = import_data(
        "D:\\Kaggle\\MyFirstKaggleCompetition\\Data\\application_train.csv")
    df_x_submission = import_data(
        "D:\\Kaggle\\MyFirstKaggleCompetition\\Data\\application_test.csv")

    # sample() without random_state draws from NumPy's global RNG, so the
    # seed below fixes the shuffle order.
    np.random.seed(1222)
    train_set_size = int(round(train.shape[0] * 0.9))
    shuffled = train.sample(frac=1)
    # Positional slicing replaces np.split(DataFrame, ...), which depends on
    # the deprecated DataFrame.swapaxes; the resulting rows are the same.
    df_x_train = shuffled.iloc[:train_set_size]
    df_x_test = shuffled.iloc[train_set_size:]

    y_train = df_x_train[['SK_ID_CURR', target_label]]
    y_test = df_x_test[['SK_ID_CURR', target_label]]

    # drop() returns new frames, so the is_train assignments below do not
    # write into slices of `shuffled`.
    df_x_train = df_x_train.drop(target_label, axis=1)
    df_x_test = df_x_test.drop(target_label, axis=1)

    df_x_train['is_train'] = 1
    df_x_test['is_train'] = 0
    df_x_submission['is_train'] = -1

    # Concatenate everything
    main = pd.concat([df_x_train, df_x_test, df_x_submission])
    return main, y_train, y_test
def bureau_feature(df, Debug=False):
    """Left-merge credit-bureau features onto ``df``.

    Loads ``bureau.csv``, blanks out day values below -40000 in three date
    columns, adds four 0/1 flag columns, runs the table through the
    ``bureau_*`` stage helpers, replaces NaNs in the result with 0 and
    merges it onto ``df`` by ``SK_ID_CURR``. The shape of the accumulated
    feature frame is printed after every stage.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column.
        Debug: When true, only a random 10000-row sample of the bureau
            table is used.

    Returns:
        ``df`` left-merged with the bureau feature frame; the merge is
        validated as one-to-one.
    """
    bureau = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")
    if Debug:
        bureau = bureau.sample(10000)

    # Values below -40000 become NaN. Series.mask + assignment replaces the
    # original chained-indexing assignment (bureau[col][cond] = nan), which
    # raises SettingWithCopy warnings and does not update the frame when
    # pandas Copy-on-Write is active.
    for col in ('DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPDATE',
                'DAYS_ENDDATE_FACT'):
        bureau[col] = bureau[col].mask(bureau[col] < -40000)

    bureau['active_flag'] = bureau.CREDIT_ACTIVE.apply(
        lambda x: 1 if x == 'Active' else 0)
    bureau['enddate_flag'] = bureau.DAYS_CREDIT_ENDDATE.apply(
        lambda x: 1 if x > 0 else 0)
    bureau['overdue_flag'] = bureau.AMT_CREDIT_MAX_OVERDUE.apply(
        lambda x: 1 if x > 0 else 0)
    bureau['using_flag'] = bureau.AMT_CREDIT_SUM_DEBT.apply(
        lambda x: 1 if x > 0 else 0)

    # One row per distinct SK_ID_CURR in df.
    credit_main = pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'].unique()})

    day = ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT',
           'DAYS_CREDIT_UPDATE']
    key = ['SK_ID_CURR', 'SK_ID_BUREAU']
    amt = ['CNT_CREDIT_PROLONG', 'CREDIT_DAY_OVERDUE', 'AMT_CREDIT_SUM',
           'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
           'AMT_CREDIT_SUM_OVERDUE', 'AMT_ANNUITY']

    with timer("bureau missing count"):
        credit_main = bureau_missing(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau flag variable analysis"):
        credit_main = bureau_flag(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau amt debt analysis"):
        credit_main = bureau_all_debt(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau active status analysis"):
        credit_main = bureau_active(credit_main, bureau, key + day + amt)
        print(credit_main.shape)
    with timer("bureau closed status analysis"):
        credit_main = bureau_closed(credit_main, bureau, key + day + amt)
        print(credit_main.shape)

    credit_main.fillna(0, inplace=True)
    df = pd.merge(
        df, credit_main, on=['SK_ID_CURR'], how='left',
        validate='one_to_one')
    return df
def install_feature(df, Debug=False):
    """Left-merge installment-payment features onto ``df``.

    Loads ``installments_payments.csv``, adds four derived columns (payment
    delay in days, a 0/1 late flag, over-payment amount and a 0/1
    over-payment flag), runs the table through the ``install_*`` stage
    helpers, replaces NaNs in the result with 0 and merges it onto ``df``
    by ``SK_ID_CURR``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column.
        Debug: When true, only a random 10000-row sample of the
            installment table is used.

    Returns:
        ``df`` left-merged with the installment feature frame; the merge is
        validated as one-to-one.
    """
    # Raw string: the original non-raw literal contained invalid escape
    # sequences (\K, \M, \D, \i), which newer Python versions warn about.
    # The resulting path value is unchanged.
    installment = import_data(
        r"D:\Kaggle\MyFirstKaggleCompetition\Data\installments_payments.csv")
    if Debug:
        installment = installment.sample(10000)

    ins_main = df[['SK_ID_CURR']]

    # Positive difference: the payment was entered after the instalment day.
    installment['instalment_paid_late_in_days'] = (
        installment['DAYS_ENTRY_PAYMENT'] - installment['DAYS_INSTALMENT'])
    installment['instalment_paid_late'] = installment[
        'instalment_paid_late_in_days'].apply(lambda x: 1 if x > 0 else 0)
    # Positive difference: more was paid than the instalment amount.
    installment['instalment_paid_over_amount'] = (
        installment['AMT_PAYMENT'] - installment['AMT_INSTALMENT'])
    installment['instalment_paid_over'] = installment[
        'instalment_paid_over_amount'].apply(lambda x: 1 if x > 0 else 0)

    with timer("basic stat analysis"):
        ins_main = install_basic(ins_main, installment)
    with timer("advance stat analysis"):
        ins_main = install_advance(ins_main, installment)
    with timer("install prelong analysis"):
        ins_main = install_prelong(ins_main, installment)
    with timer("last k installment analysis"):
        ins_main = install_last_k_feature(ins_main, installment)
    with timer("last k fraction installment analysis"):
        ins_main = install_last_k_fraction_feature(ins_main, installment)
    with timer("last k trend installment analysis"):
        ins_main = install_trend_k_feature(ins_main, installment)

    ins_main.fillna(0, inplace=True)
    df = pd.merge(
        df, ins_main, how='left', on=['SK_ID_CURR'], validate='one_to_one')
    return df
def previous_feature(df, Debug=False):
    """Left-merge previous-application features onto ``df``.

    Loads ``previous_application.csv``, replaces the value 365243 with NaN
    in five ``DAYS_*`` columns, runs the table through the ``previous_*``
    stage helpers, replaces NaNs in the result with 0, passes it to
    ``correlation_reduce``, merges it onto ``df`` by ``SK_ID_CURR`` and
    finally passes the merged frame through ``reduce_mem_usage``.

    Args:
        df: Frame containing an ``SK_ID_CURR`` column.
        Debug: When true, only a random 10000-row sample of the
            previous-application table is used.

    Returns:
        The merged frame as returned by ``reduce_mem_usage``; the merge is
        validated as one-to-one.
    """
    prev = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//previous_application.csv")
    if Debug:
        prev = prev.sample(10000)

    prev_main = df[['SK_ID_CURR']]
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    Behaviour_variable = [
        'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'CNT_PAYMENT'
    ]
    Days_variable = [
        'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
        'DAYS_TERMINATION', 'DAYS_DECISION'
    ]
    Categorical_variable = [
        'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START', 'NAME_CASH_LOAN_PURPOSE',
        'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
        'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
        'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
        'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION'
    ]

    # 365243 is replaced by NaN (presumably a placeholder for a missing
    # date; confirm against the data description). The result is assigned
    # back to the column because an inplace replace on a selected column is
    # chained assignment and does not update `prev` under pandas
    # Copy-on-Write.
    for col in ('DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
                'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
                'DAYS_TERMINATION'):
        prev[col] = prev[col].replace(365243, np.nan)

    with timer("previous application missiong count analysis."):
        prev_main = previous_missing(
            prev_main, prev, Behaviour_variable + Days_variable)
    with timer("previous all record analysis for amt variable."):
        prev_main = previous_all_stat_amt(
            prev_main, prev, key + Behaviour_variable)
    with timer("previous all record analysis for day variable."):
        prev_main = previous_all_stat_day(
            prev_main, prev, key + Days_variable)
    with timer("previous approved analysis for amt variable"):
        prev_main = previous_approved_amt(
            prev_main, prev, key + Behaviour_variable)
    with timer("previous approved analysis for day variable"):
        prev_main = previous_approved_day(
            prev_main, prev, key + Days_variable)
    with timer("previous refused analysis for amt variable"):
        prev_main = previous_refused_amt(
            prev_main, prev, key + Behaviour_variable)
    with timer("previous refused analysis for day variable"):
        prev_main = previous_refused_day(
            prev_main, prev, key + Days_variable)
    with timer("previous category variable analysis."):
        prev_main = previous_category(prev_main, prev, Categorical_variable)
    with timer("previous last k contract analysis."):
        prev_main = previous_last_k_contract(prev_main, prev)

    prev_main.fillna(0, inplace=True)
    prev_main = correlation_reduce(prev_main)
    df = pd.merge(
        df, prev_main, on=['SK_ID_CURR'], how='left', validate='one_to_one')
    df = reduce_mem_usage(df)
    return df