def bureau_balance_feature(df, Debug=False):
    """Append bureau-balance derived features to ``df``.

    Loads ``bureau_balance.csv`` and ``bureau.csv`` through ``import_data``,
    counts balance rows per (SK_ID_BUREAU, STATUS), attaches those counts to
    the SK_ID_CURR / SK_ID_BUREAU pairs of ``bureau`` and runs the timed
    helper steps over the result.

    Args:
        df: frame with an ``SK_ID_CURR`` column; the merge at the end is
            validated as one-to-one on that column.
        Debug: when true, only a random sample of 10000 bureau-balance rows
            is used.

    Returns:
        ``df`` left-merged with the generated feature columns.
    """
    balance = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau_balance.csv")
    if Debug:
        balance = balance.sample(10000)
    bureau = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")

    # Row counts per (SK_ID_BUREAU, STATUS), joined onto the id pairs.
    status_counts = balance.groupby(['SK_ID_BUREAU', 'STATUS'],
                                    as_index=False).count()
    bureau_all = pd.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']],
                          status_counts,
                          how='left',
                          on=['SK_ID_BUREAU'])

    overdue_codes = ['1', '2', '3', '4', '5']

    def _overdue_flag(status):
        # 1 when STATUS is one of the codes '1'..'5', otherwise 0.
        if status in overdue_codes:
            return 1
        return 0

    bureau_all['overdue'] = bureau_all.STATUS.apply(_overdue_flag)

    features = df[['SK_ID_CURR']]
    timed_steps = (
        ("bureau balance missing count", bureau_balance_missing),
        ("bureau balance overdue analysis", bureau_overdue),
        ("bureau balance status count", bureau_status),
    )
    for label, step in timed_steps:
        with timer(label):
            features = step(features, bureau_all)

    features.fillna(0, inplace=True)
    features = correlation_reduce(features)
    return pd.merge(df,
                    features,
                    on=['SK_ID_CURR'],
                    how='left',
                    validate='one_to_one')
def card_feature(df, Debug=False):
    """Append credit-card-balance derived features to ``df``.

    Loads ``credit_card_balance.csv`` through ``import_data`` and passes it
    through the timed helper steps, each of which receives the feature frame
    built so far (starting from the ``SK_ID_CURR`` column of ``df``).

    Args:
        df: frame with an ``SK_ID_CURR`` column.
        Debug: when true, only a random sample of 10000 credit-card rows is
            used.

    Returns:
        ``df`` left-merged on ``SK_ID_CURR`` with the generated features.
    """
    # Raw string: the previous non-raw literal only worked because "\K",
    # "\M", "\D" and "\c" are unrecognised escapes, which Python now warns
    # about. The resulting path is character-for-character the same.
    credit = import_data(
        r"D:\Kaggle\MyFirstKaggleCompetition\Data\credit_card_balance.csv")
    if Debug:
        credit = credit.sample(10000)

    card_main = df[['SK_ID_CURR']]

    # Column groups handed to the helper steps.
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    amt = [f for f in credit.columns if 'AMT' in f]
    cnt = [f for f in credit.columns if 'CNT' in f]
    sk = ['SK_DPD', 'SK_DPD_DEF']

    with timer("card missing analysis"):
        card_main = card_missing(card_main, credit)
    with timer("card overdue analysis"):
        card_main = card_sk(card_main, credit, key + sk)
    with timer("card using analysis"):
        card_main = card_using(card_main, credit, key + amt)
    with timer("card all behavior analysis"):
        card_main = card_all(card_main, credit, key + amt + cnt)
    with timer("card using behavior analysis from the first payment of AMT_PAYMENT_TOTAL_CURRENT"):
        card_main = card_amt_total_payment(card_main, credit, key + amt + cnt)
    with timer("card last two year behavior analysis"):
        card_main = card_last_two_year(card_main, credit)
    with timer("card last one year behavior analysis"):
        card_main = card_last_one_year(card_main, credit)

    # Missing feature values become 0 before the correlation-based reduction.
    card_main = card_main.fillna(0)
    card_main = correlation_reduce(card_main)
    return pd.merge(df, card_main, on=['SK_ID_CURR'], how='left')
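# Example #3 (separator left over from the listing this file was collected from; the stray "0" after it was not code)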
def pos_cash_feature(df, Debug=False):
    """Append POS-cash-balance derived features to ``df``.

    Loads ``POS_CASH_balance.csv`` through ``import_data``, adds three
    helper columns (``SK_DPD_DIFF`` and two 0/1 late-payment flags) and runs
    the timed helper steps over it.

    Args:
        df: frame with an ``SK_ID_CURR`` column; the merge at the end is
            validated as one-to-one on that column.
        Debug: when true, only a random sample of 10000 POS-cash rows is
            used.

    Returns:
        ``df`` left-merged on ``SK_ID_CURR`` with the generated features.
    """
    # Raw string: the previous non-raw literal relied on "\K", "\M", "\D"
    # and "\P" being unrecognised escapes, which Python now warns about.
    # The resulting path is unchanged.
    pos = import_data(
        r"D:\Kaggle\MyFirstKaggleCompetition\Data\POS_CASH_balance.csv")
    if Debug:
        pos = pos.sample(10000)

    pos_main = df[['SK_ID_CURR']]

    pos['SK_DPD_DIFF'] = pos.SK_DPD - pos.SK_DPD_DEF
    # 1 where the value is > 0, else 0 (NaN compares false, so it maps to 0);
    # vectorised instead of a per-row lambda.
    pos['pos_cash_paid_late'] = (pos['SK_DPD'] > 0).astype('int64')
    pos['pos_cash_paid_late_with_tolerance'] = (
        pos['SK_DPD_DEF'] > 0).astype('int64')

    with timer("pos basic stat analysis"):
        pos_main = pos_cash_basic(pos_main, pos)
    with timer("pos cash last record"):
        pos_main = pos_cash_last(pos_main, pos)
    with timer("pos cash last k installment analysis"):
        pos_main = pos_cash_last_k_installment(pos_main, pos)
    with timer("pos cash last loan analysis"):
        pos_main = pos_cash_last_loan(pos_main, pos)
    with timer("pos cash trend analysis"):
        pos_main = pos_cash_trend_installment(pos_main, pos)

    pos_main = pos_main.fillna(0)
    return pd.merge(df,
                    pos_main,
                    on=['SK_ID_CURR'],
                    how='left',
                    validate='one_to_one')
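# Example #4 (separator left over from the listing this file was collected from; the stray "0" after it was not code)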
def woe_encoder(df):
    """Add one ``<feature>_woe`` column per categorical feature to ``df``.

    The categorical features are the object-dtype columns of
    ``application_train.csv`` (loaded through ``import_data``). For each
    category value ``v`` of a feature the encoded value is::

        log((count(v, TARGET=1) / count(TARGET=1)) /
            (count(v, TARGET=0) / count(TARGET=0)))

    where the counts are taken over the training file. A category that has
    no rows for one of the two targets yields NaN, and all NaNs (including
    categories of ``df`` unseen in training) are replaced by 0.

    Args:
        df: frame containing ``SK_ID_CURR`` and every categorical feature
            column found in the training file.

    Returns:
        ``df`` left-merged on ``SK_ID_CURR`` with the ``*_woe`` columns.
    """
    from math import log

    print("woe categorical analysis.")
    train = import_data(
        "D:\\Kaggle\\MyFirstKaggleCompetition\\Data\\application_train.csv")
    categorical_feats = [
        f for f in train.columns if train[f].dtype == 'object'
    ]
    temp = train[['SK_ID_CURR', 'TARGET'] + categorical_feats]
    woe_main = df[['SK_ID_CURR'] + categorical_feats]

    for feat in categorical_feats:
        # Rows: category values; columns: row counts per TARGET value.
        counts = temp[['SK_ID_CURR', feat,
                       'TARGET']].groupby([feat, 'TARGET']).count().unstack()
        counts.columns = [
            counts.columns.names[1] + "_" + str(col)
            for col in counts.columns.levels[1]
        ]

        # Column totals (NaN cells are skipped by sum()).
        total_1 = counts['TARGET_1'].sum()
        total_0 = counts['TARGET_0'].sum()

        # Built as a concrete list: on Python 3 ``map`` returns a lazy
        # iterator, which cannot be assigned as a DataFrame column.
        woe_values = [
            log((float(n_1) / total_1) / (float(n_0) / total_0))
            for n_1, n_0 in zip(counts.TARGET_1, counts.TARGET_0)
        ]
        woe_table = pd.DataFrame({
            feat: counts.index,
            feat + "_woe": woe_values,
        })

        woe_main = pd.merge(woe_main, woe_table, on=[feat], how='left')
        print("feature" + "_" + feat + " is finished!")

    woe_main = woe_main.fillna(0)
    # Only SK_ID_CURR and the *_woe columns are merged back.
    woe_main = woe_main.drop(categorical_feats, axis=1)
    return pd.merge(df, woe_main, on=['SK_ID_CURR'], how='left')
def reading_main():
    """Load the application files and build the combined working frame.

    ``application_train.csv`` is shuffled (NumPy global seed fixed to 1222)
    and split 90% / 10% into a train and a test part; the ``TARGET`` column
    is separated from both. ``application_test.csv`` is appended as the
    submission part. An ``is_train`` column marks the origin of each row:
    1 = train part, 0 = test part, -1 = submission file.

    Returns:
        tuple ``(main, y_train, y_test)`` where ``main`` is the
        concatenation of the three parts (without ``TARGET``) and
        ``y_train`` / ``y_test`` hold ``SK_ID_CURR`` and ``TARGET`` for the
        train and test parts.
    """
    target_label = 'TARGET'
    train = import_data(
        "D:\\Kaggle\\MyFirstKaggleCompetition\\Data\\application_train.csv")
    df_x_submission = import_data(
        "D:\\Kaggle\\MyFirstKaggleCompetition\\Data\\application_test.csv")

    np.random.seed(1222)
    train_set_size = int(round(train.shape[0] * 0.9))

    # Positional slicing instead of np.split: np.split on a DataFrame goes
    # through DataFrame.swapaxes, which pandas has deprecated.
    shuffled = train.sample(frac=1)
    df_x_train = shuffled.iloc[:train_set_size]
    df_x_test = shuffled.iloc[train_set_size:]

    y_train = df_x_train[['SK_ID_CURR', target_label]]
    y_test = df_x_test[['SK_ID_CURR', target_label]]

    # drop() returns new frames, so the column assignments below do not
    # write into ``shuffled``.
    df_x_train = df_x_train.drop(target_label, axis=1)
    df_x_test = df_x_test.drop(target_label, axis=1)
    df_x_train['is_train'] = 1
    df_x_test['is_train'] = 0
    df_x_submission['is_train'] = -1

    # Concatenate everything
    main = pd.concat([df_x_train, df_x_test, df_x_submission])
    return main, y_train, y_test
def bureau_feature(df, Debug=False):
    """Append bureau derived features to ``df``.

    Loads ``bureau.csv`` through ``import_data``, blanks out day values
    below -40000 in three date columns, adds four 0/1 flag columns and runs
    the timed helper steps, printing the shape of the feature frame after
    each one.

    Args:
        df: frame with an ``SK_ID_CURR`` column; the merge at the end is
            validated as one-to-one on that column.
        Debug: when true, only a random sample of 10000 bureau rows is used.

    Returns:
        ``df`` left-merged on ``SK_ID_CURR`` with the generated features.
    """
    bureau = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//bureau.csv")
    if Debug:
        bureau = bureau.sample(10000)

    # Day values below -40000 are replaced by NaN. A single .loc assignment
    # is used because the chained form ``bureau[col][mask] = ...`` may write
    # to a temporary copy and leave ``bureau`` untouched.
    for col in ('DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPDATE',
                'DAYS_ENDDATE_FACT'):
        bureau.loc[bureau[col] < -40000, col] = np.nan

    # 0/1 indicator columns (NaN compares false, so it maps to 0).
    bureau['active_flag'] = (bureau.CREDIT_ACTIVE == 'Active').astype('int64')
    bureau['enddate_flag'] = (bureau.DAYS_CREDIT_ENDDATE > 0).astype('int64')
    bureau['overdue_flag'] = (
        bureau.AMT_CREDIT_MAX_OVERDUE > 0).astype('int64')
    bureau['using_flag'] = (bureau.AMT_CREDIT_SUM_DEBT > 0).astype('int64')

    # One row per distinct SK_ID_CURR of ``df``.
    credit_main = pd.DataFrame({'SK_ID_CURR': df['SK_ID_CURR'].unique()})

    # Column groups handed to the active/closed helper steps.
    day = ['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'DAYS_CREDIT',
           'DAYS_CREDIT_UPDATE']
    key = ['SK_ID_CURR', 'SK_ID_BUREAU']
    amt = ['CNT_CREDIT_PROLONG', 'CREDIT_DAY_OVERDUE', 'AMT_CREDIT_SUM',
           'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
           'AMT_CREDIT_SUM_OVERDUE', 'AMT_ANNUITY']

    with timer("bureau missing count"):
        credit_main = bureau_missing(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau flag variable analysis"):
        credit_main = bureau_flag(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau amt debt analysis"):
        credit_main = bureau_all_debt(credit_main, bureau)
        print(credit_main.shape)
    with timer("bureau active status analysis"):
        credit_main = bureau_active(credit_main, bureau, key + day + amt)
        print(credit_main.shape)
    with timer("bureau closed status analysis"):
        credit_main = bureau_closed(credit_main, bureau, key + day + amt)
        print(credit_main.shape)

    credit_main = credit_main.fillna(0)
    return pd.merge(df, credit_main, on=['SK_ID_CURR'], how='left',
                    validate='one_to_one')
def install_feature(df, Debug=False):
    """Append installment-payment derived features to ``df``.

    Loads ``installments_payments.csv`` through ``import_data``, derives
    lateness and over-payment columns and runs the timed helper steps.

    Derived columns:
        instalment_paid_late_in_days: DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT.
        instalment_paid_late: 1 where that difference is > 0, else 0.
        instalment_paid_over_amount: AMT_PAYMENT - AMT_INSTALMENT.
        instalment_paid_over: 1 where that difference is > 0, else 0.

    Args:
        df: frame with an ``SK_ID_CURR`` column; the merge at the end is
            validated as one-to-one on that column.
        Debug: when true, only a random sample of 10000 installment rows is
            used.

    Returns:
        ``df`` left-merged on ``SK_ID_CURR`` with the generated features.
    """
    # Raw string: the previous non-raw literal relied on "\K", "\M", "\D"
    # and "\i" being unrecognised escapes, which Python now warns about.
    # The resulting path is unchanged.
    installment = import_data(
        r"D:\Kaggle\MyFirstKaggleCompetition\Data\installments_payments.csv")
    if Debug:
        installment = installment.sample(10000)

    ins_main = df[['SK_ID_CURR']]

    late_days = (installment['DAYS_ENTRY_PAYMENT'] -
                 installment['DAYS_INSTALMENT'])
    installment['instalment_paid_late_in_days'] = late_days
    # Vectorised 0/1 flags; NaN differences compare false and map to 0.
    installment['instalment_paid_late'] = (late_days > 0).astype('int64')

    over_amount = installment['AMT_PAYMENT'] - installment['AMT_INSTALMENT']
    installment['instalment_paid_over_amount'] = over_amount
    installment['instalment_paid_over'] = (over_amount > 0).astype('int64')

    with timer("basic stat analysis"):
        ins_main = install_basic(ins_main, installment)
    with timer("advance stat analysis"):
        ins_main = install_advance(ins_main, installment)
    with timer("install prelong analysis"):
        ins_main = install_prelong(ins_main, installment)
    with timer("last k installment analysis"):
        ins_main = install_last_k_feature(ins_main, installment)
    with timer("last k fraction installment analysis"):
        ins_main = install_last_k_fraction_feature(ins_main, installment)
    with timer("last k trend installment analysis"):
        ins_main = install_trend_k_feature(ins_main, installment)

    ins_main = ins_main.fillna(0)
    return pd.merge(df,
                    ins_main,
                    how='left',
                    on=['SK_ID_CURR'],
                    validate='one_to_one')
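# Example #8 (separator left over from the listing this file was collected from; the stray "0" after it was not code)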
def previous_feature(df, Debug=False):
    """Append previous-application derived features to ``df``.

    Loads ``previous_application.csv`` through ``import_data``, replaces the
    value 365243 by NaN in five DAYS_* columns and runs the timed helper
    steps. The feature frame is NaN-filled with 0 and passed through
    ``correlation_reduce`` before being merged; the merged result goes
    through ``reduce_mem_usage``.

    Args:
        df: frame with an ``SK_ID_CURR`` column; the merge at the end is
            validated as one-to-one on that column.
        Debug: when true, only a random sample of 10000 previous-application
            rows is used.

    Returns:
        ``df`` left-merged on ``SK_ID_CURR`` with the generated features.
    """
    prev = import_data(
        "D://Kaggle//MyFirstKaggleCompetition//Data//previous_application.csv"
    )
    if Debug:
        prev = prev.sample(10000)

    prev_main = df[['SK_ID_CURR']]

    # Column groups handed to the helper steps.
    key = ['SK_ID_CURR', 'SK_ID_PREV']
    Behaviour_variable = [
        'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'CNT_PAYMENT'
    ]
    Days_variable = [
        'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
        'DAYS_TERMINATION', 'DAYS_DECISION'
    ]
    Categorical_variable = [
        'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START', 'NAME_CASH_LOAN_PURPOSE',
        'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
        'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
        'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
        'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION'
    ]

    # 365243 -> NaN (presumably a placeholder for "no date" -- confirm
    # against the data description). Assigned back explicitly: calling
    # ``prev[col].replace(..., inplace=True)`` mutates a column selection
    # and may leave ``prev`` itself unchanged.
    placeholder_cols = [
        'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'DAYS_TERMINATION'
    ]
    prev[placeholder_cols] = prev[placeholder_cols].replace(365243, np.nan)

    with timer("previous application missiong count analysis."):
        prev_main = previous_missing(prev_main, prev,
                                     Behaviour_variable + Days_variable)
    with timer("previous all record analysis for amt variable."):
        prev_main = previous_all_stat_amt(prev_main, prev,
                                          key + Behaviour_variable)
    with timer("previous all record analysis for day variable."):
        prev_main = previous_all_stat_day(prev_main, prev, key + Days_variable)
    with timer("previous approved analysis for amt variable"):
        prev_main = previous_approved_amt(prev_main, prev,
                                          key + Behaviour_variable)
    with timer("previous approved analysis for day variable"):
        prev_main = previous_approved_day(prev_main, prev, key + Days_variable)
    with timer("previous refused analysis for amt variable"):
        prev_main = previous_refused_amt(prev_main, prev,
                                         key + Behaviour_variable)
    with timer("previous refused analysis for day variable"):
        prev_main = previous_refused_day(prev_main, prev, key + Days_variable)
    with timer("previous category variable analysis."):
        prev_main = previous_category(prev_main, prev, Categorical_variable)
    with timer("previous last k contract analysis."):
        prev_main = previous_last_k_contract(prev_main, prev)

    prev_main = prev_main.fillna(0)
    prev_main = correlation_reduce(prev_main)
    df = pd.merge(df,
                  prev_main,
                  on=['SK_ID_CURR'],
                  how='left',
                  validate='one_to_one')
    df = reduce_mem_usage(df)
    return df