def preprocess_bureau():
    bure = pd.read_feather('./data/bureau.feather')
    bure['FINISHED'] = (bure['DAYS_ENDDATE_FACT'] <= 0).astype('i')
    indexer = pd.isnull(bure['DAYS_CREDIT_ENDDATE'])
    bure.loc[indexer, 'DAYS_CREDIT_ENDDATE'] = bure.loc[indexer, 'DAYS_ENDDATE_FACT']
    bure['DIFF_ENDDATE'] = bure['DAYS_ENDDATE_FACT'] - bure['DAYS_CREDIT_ENDDATE']
    bure['TERM'] = bure['DAYS_CREDIT_ENDDATE'] - bure['DAYS_CREDIT']
    bure['AMT_CREDIT_SUM_OVERDUE'].fillna(0, inplace=True)
    bure['AMT_CREDIT_SUM'].fillna(0, inplace=True)
    bure['AMT_CREDIT_SUM_DEBT'].fillna(0, inplace=True)
    bure['CNT_CREDIT_PROLONG'].fillna(0, inplace=True)
    bure['AMT_ANNUITY'].fillna(0, inplace=True)
    # discussion/57175
    bure.loc[bure['DAYS_CREDIT_ENDDATE'] < -40000, 'DAYS_CREDIT_ENDDATE'] = np.nan
    bure.loc[bure['DAYS_CREDIT_UPDATE'] < -40000, 'DAYS_CREDIT_UPDATE'] = np.nan
    bure.loc[bure['DAYS_ENDDATE_FACT'] < -40000, 'DAYS_ENDDATE_FACT'] = np.nan
    bure['FLAG_ACTIVE'] = (bure['CREDIT_ACTIVE'] != 'Closed').astype('int8')
    bure['FLAG_ONGOING'] = (bure['DAYS_CREDIT_ENDDATE'] > 0).astype('int8')

    bb = pd.read_feather('./data/bb.agg.feather')
    bure = bure.merge(bb, on='SK_ID_BUREAU', how='left')

    bure.drop(['SK_ID_BUREAU'], axis=1)
    reduce_memory(bure)
    return bure
示例#2
0
def main():
    df = pd.read_feather('./data/credit.preprocessed.feather')
    df = df.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE'])
    df = df.groupby('SK_ID_CURR').last()
    df = df.drop(['SK_ID_PREV'], axis=1)
    df.columns = ['CRED_LAST_{}'.format(c) for c in df.columns]
    df = df.reset_index()
    reduce_memory(df)
    df.to_feather('./data/credit.last.feather')
示例#3
0
def _aggregate(df, by, fs):
    print('agg {}...'.format(by))
    agg = df.groupby(by).agg(fs)
    agg.columns = ['{}_{}'.format(a, b.upper()) for a, b in agg.columns]
    features = []
    df = df.set_index(by)
    df = df.join(agg, on=by, how='left')
    df = df.reset_index(drop=True)
    for c, ms in fs.items():
        for m in ms:
            ac = '{}_{}'.format(c, m.upper())
            adc = 'APP_GRP_{}_ABS_DIFF_{}'.format('_'.join(by), ac)
            df[adc] = (df[c] - df[ac]).abs()
            features.append(adc)
    df = df[['SK_ID_CURR'] + features]
    reduce_memory(df)
    return df.set_index('SK_ID_CURR')
示例#4
0
def preprocess_inst():
    ins = pd.read_feather('./data/installments_payments.feather')
    ins['IS_CREDIT'] = (ins['NUM_INSTALMENT_VERSION'] == 0).astype('i')

    # Percentage and difference paid in each installment (amount paid and installment value) # noqa
    ins['RATIO_PAYMENT'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['DIFF_PAYMENT'] = ins['AMT_PAYMENT'] - ins['AMT_INSTALMENT']
    ins['FLAG_DIFF_PAYMENT'] = (ins['DIFF_PAYMENT'] > 0).astype('int8')
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['FLAG_DPD'] = (ins['DPD'] > 0).astype('int8')
    ins['FLAG_DBD'] = (ins['DBD'] > 0).astype('int8')

    reduce_memory(ins)

    return ins
示例#5
0
def _tail(k):
    print('create pos tail {}...'.format(k))
    df = pd.read_feather('./data/pos.preprocessed.feather')
    df = df.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE'])
    df = df.groupby('SK_ID_CURR').tail(k).reset_index(drop=True)
    grp = df.groupby('SK_ID_CURR')

    g = grp.agg({
        'SK_DPD': ['sum', 'mean', 'std', 'max', 'skew'],
        'SK_DPD_DEF': ['sum', 'mean', 'std', 'max', 'skew'],
        'FLAG_LATE': ['sum', 'mean'],
        'FLAG_LATE_DEF': ['sum', 'mean'],
    })
    g.columns = [a + "_" + b.upper() for a, b in g.columns]
    g.columns = ['POS_TAIL_{}_{}'.format(k, c) for c in g.columns]
    reduce_memory(g)

    return g
def last_k(k):
    print('create inst tail {}...'.format(k))
    ins = pd.read_feather('./data/inst.preprocessed.feather')
    ins = ins.sort_values(['SK_ID_CURR', 'DAYS_INSTALMENT'])
    ins = ins.groupby('SK_ID_CURR').tail(k).reset_index(drop=True)
    grp = ins.groupby('SK_ID_CURR')

    agg = {
        'NUM_INSTALMENT_VERSION':
        ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew'],
        'DPD': ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew'],
        'FLAG_DPD': ['sum', 'mean'],
        'DIFF_PAYMENT': ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew'],
        'FLAG_DIFF_PAYMENT': ['sum', 'mean'],
    }
    g = grp.agg(agg)
    g.columns = [a + "_" + b.upper() for a, b in g.columns]
    g.columns = ['INST_LAST_{}_{}'.format(k, c) for c in g.columns]
    reduce_memory(g)

    return g
示例#7
0
def _create(by, fs):
    print('create {}...'.format(by))
    df = pd.read_feather('./data/prev.preprocessed.feather')
    agg = df.groupby(by).agg(fs)
    agg.columns = ['GRP_{}_{}_{}'.format('_'.join(by), a, b.upper()) for a, b in agg.columns]
    features = []
    df = df.set_index(by)
    df = df.join(agg, on=by, how='left')
    df = df.reset_index(drop=True)
    for c, ms in fs.items():
        for m in ms:
            ac = 'GRP_{}_{}_{}'.format('_'.join(by), c, m.upper())
            dc = 'DIFF_{}'.format(ac)
            adc = 'ABS_DIFF_{}'.format(ac)
            df[dc] = df[c] - df[ac]
            df[adc] = (df[c] - df[ac]).abs()
            features += [dc, adc]
    df = df[['SK_ID_CURR'] + features]
    df = df.groupby('SK_ID_CURR').mean()
    reduce_memory(df)
    return df
示例#8
0
def aggregate_bb():
    bb = pd.read_feather('./data/bureau_balance.preprocessed.feather')

    # aggregate
    grp = bb.groupby('SK_ID_BUREAU')
    g = grp.agg({
        'DPD': ['mean'],
        'STATUS': ['nunique'],
    })
    g.columns = [a + "_" + b.upper() for a, b in g.columns]

    # edge
    edge = pd.read_feather('./data/bb.edge.feather')
    edge = edge.set_index('SK_ID_BUREAU')
    agg = g.join(edge, on='SK_ID_BUREAU')

    agg.columns = ["BB_" + c for c in agg.columns]

    reduce_memory(agg)

    return agg.reset_index()
示例#9
0
def main():
    df = pd.read_feather('./data/inst.preprocessed.feather')
    df, _ = one_hot_encoder(df)
    df = df.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_INSTALMENT'])

    df = df.drop(['IS_CREDIT'], axis=1)

    a = {
        'AMT_PAYMENT': ['mean', 'min', 'max', 'std'],
        'AMT_INSTALMENT': ['mean', 'min', 'max', 'std'],
        'DAYS_ENTRY_PAYMENT': ['mean', 'min', 'max', 'std'],
        'NUM_INSTALMENT_VERSION': ['mean'],
        'NUM_INSTALMENT_NUMBER': ['mean', 'min', 'max', 'std'],
        'DBD': ['mean', 'min', 'max', 'std'],
        'DPD': ['mean', 'min', 'max', 'std'],
        'FLAG_DBD': ['mean'],
        'FLAG_DPD': ['mean'],
        'DAYS_INSTALMENT': ['mean'],
        'RATIO_PAYMENT': ['mean'],
        'DIFF_PAYMENT': ['mean'],
    }

    grp = df.groupby(['SK_ID_CURR', 'SK_ID_PREV'])
    prev = grp.shift(-1)
    for c in prev.columns:
        if df[c].dtype == 'object':
            continue
        df[c] -= prev[c]

    grp = df.groupby('SK_ID_CURR')
    agg = grp.agg(a)
    agg.columns = ["{}_{}".format(a, b.upper()) for a, b in agg.columns]
    agg.columns = ["INST_DIFF_{}".format(c) for c in agg.columns]
    agg = agg.reset_index()

    reduce_memory(agg)

    agg.to_feather('./data/inst.diff.feather')
def _create():
    df = pd.read_feather('./data/bureau_balance.feather')
    df = df[df['STATUS'] != 'C']
    df = df.sort_values(['SK_ID_BUREAU', 'MONTHS_BALANCE'])
    df = df.groupby('SK_ID_BUREAU').last()

    bure = pd.read_feather('./data/bureau.feather')
    df = df.merge(bure[['SK_ID_BUREAU', 'SK_ID_CURR']],
                  on='SK_ID_BUREAU',
                  how='left')
    df = df.reset_index(drop=True)
    df['DPD'] = df['STATUS'].apply(status_to_dpd)

    g = df.groupby('SK_ID_CURR').agg({
        'MONTHS_BALANCE': ['min', 'max', 'mean', 'std'],
        'DPD': ['min', 'max', 'mean', 'std'],
    })
    g.columns = ['{}_{}'.format(x, y.upper()) for x, y in g.columns]
    g.columns = ['BB_OPEN_{}'.format(c) for c in g.columns]
    reduce_memory(g)

    g = g.reset_index()

    return g
def _tail(k):
    print('create credit tail {}...'.format(k))
    df = pd.read_feather('./data/credit.preprocessed.feather')
    df = df.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE'])
    df = df.groupby('SK_ID_CURR').tail(k).reset_index(drop=True)
    grp = df.groupby('SK_ID_CURR')

    agg = {
        'AMT_DRAWINGS_CURRENT': ['sum', 'mean'],
        'CNT_DRAWINGS_ATM_CURRENT': ['sum', 'mean'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['sum', 'mean'],
        'AMT_INST_MIN_REGULARITY': ['sum', 'mean'],
        'SK_DPD': ['mean'],
        # added
        'DIFF_RECEIVABLE_TOTAL_AND_PRINCIPAL': ['sum', 'mean'],
        'DIFF_PAYMENT_TOTAL_AND_PLAIN': ['sum', 'mean'],
    }
    g = grp.agg(agg)
    g.columns = ['{}_{}'.format(a, b.upper()) for a, b in g.columns]
    g['COUNT'] = grp.size()
    g.columns = ['CRED_TAIL_{}_{}'.format(k, c) for c in g.columns]
    reduce_memory(g)

    return g
示例#12
0
def main():
    agg = aggregate_credit_last()
    reduce_memory(agg)
    agg.to_feather('./data/credit.prev.last.feather')
def preprocess_application():
    train_df = pd.read_feather('./data/application_train.feather')
    test_df = pd.read_feather('./data/application_test.feather')
    n_train = len(train_df)
    df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

    df = df.drop([
        'FLAG_DOCUMENT_11',
        'FLAG_DOCUMENT_21',
        'FLAG_DOCUMENT_20',
        'FLAG_DOCUMENT_19',
        'FLAG_DOCUMENT_18',
        'FLAG_DOCUMENT_17',
        'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_15',
        'FLAG_DOCUMENT_14',
        'FLAG_DOCUMENT_13',
        'FLAG_DOCUMENT_12',
        'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_9',
        'FLAG_DOCUMENT_7',
        'FLAG_DOCUMENT_6',
        'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_4',
        'FLAG_DOCUMENT_2',
    ],
                 axis=1)

    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['LOAN_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_LENGTH'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['WORKING_LIFE_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_PER_FAM'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['CHILDREN_RATIO'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']

    # from https://www.kaggle.com/shep312/lightgbm-harder-better-slower
    df['CONSUMER_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['ANN_LENGTH_EMPLOYED_RATIO'] =\
        df['ANNUITY_LENGTH'] / (df['DAYS_EMPLOYED']-1)
    df['TOTAL_DOCS_SUBMITTED'] =\
        df.loc[:, df.columns.str.contains('FLAG_DOCUMENT')].sum(axis=1)

    # from ogrellier/fork-lightgbm-with-simple-features
    inc_by_org = df[[
        'AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE'
    ]].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [
        _f for _f in df.columns
        if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)
    ]
    df['DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['LIVE_IND_SUM'] = df[live].sum(axis=1)
    df['INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    df['ANNUITY_TO_INCOME_RATIO'] =\
        df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    df['SOURCES_PROD'] =\
        df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_MEAN'] =\
        df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['SCORES_STD'] =\
        df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['SCORES_STD'] = df['SCORES_STD'].fillna(df['SCORES_STD'].mean())
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / (df['DAYS_EMPLOYED'] - 1)
    df['PHONE_TO_BIRTH_RATIO'] =\
        df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['PHONE_TO_EMPLOY_RATIO'] =\
        df['DAYS_LAST_PHONE_CHANGE'] / (df['DAYS_EMPLOYED']-1)

    df.columns = [c.replace(' ', '_') for c in df.columns]
    reduce_memory(df)

    train_df = df[:n_train].reset_index(drop=True)
    test_df = df[n_train:].reset_index(drop=True)

    return train_df, test_df
def main():
    df = _create()
    reduce_memory(df)
    df.to_feather('./data/pos.diff.feather')
示例#15
0
def main():
    agg = _aggregate()
    reduce_memory(agg)
    agg.to_feather('./data/pos.agg.feather')
def main():
    agg = preprocess_prev()
    reduce_memory(agg)
    agg.to_feather('./data/prev.agg.feather')
def main():
    df = preprocess_pos()
    reduce_memory(df)
    df.to_feather('./data/pos.preprocessed.feather')
def main():
    agg = _create()
    reduce_memory(agg)
    agg.to_feather('./data/prev.refused.feather')