def preprocess_bureau(): bure = pd.read_feather('./data/bureau.feather') bure['FINISHED'] = (bure['DAYS_ENDDATE_FACT'] <= 0).astype('i') indexer = pd.isnull(bure['DAYS_CREDIT_ENDDATE']) bure.loc[indexer, 'DAYS_CREDIT_ENDDATE'] = bure.loc[indexer, 'DAYS_ENDDATE_FACT'] bure['DIFF_ENDDATE'] = bure['DAYS_ENDDATE_FACT'] - bure['DAYS_CREDIT_ENDDATE'] bure['TERM'] = bure['DAYS_CREDIT_ENDDATE'] - bure['DAYS_CREDIT'] bure['AMT_CREDIT_SUM_OVERDUE'].fillna(0, inplace=True) bure['AMT_CREDIT_SUM'].fillna(0, inplace=True) bure['AMT_CREDIT_SUM_DEBT'].fillna(0, inplace=True) bure['CNT_CREDIT_PROLONG'].fillna(0, inplace=True) bure['AMT_ANNUITY'].fillna(0, inplace=True) # discussion/57175 bure.loc[bure['DAYS_CREDIT_ENDDATE'] < -40000, 'DAYS_CREDIT_ENDDATE'] = np.nan bure.loc[bure['DAYS_CREDIT_UPDATE'] < -40000, 'DAYS_CREDIT_UPDATE'] = np.nan bure.loc[bure['DAYS_ENDDATE_FACT'] < -40000, 'DAYS_ENDDATE_FACT'] = np.nan bure['FLAG_ACTIVE'] = (bure['CREDIT_ACTIVE'] != 'Closed').astype('int8') bure['FLAG_ONGOING'] = (bure['DAYS_CREDIT_ENDDATE'] > 0).astype('int8') bb = pd.read_feather('./data/bb.agg.feather') bure = bure.merge(bb, on='SK_ID_BUREAU', how='left') bure.drop(['SK_ID_BUREAU'], axis=1) reduce_memory(bure) return bure
def main(): df = pd.read_feather('./data/credit.preprocessed.feather') df = df.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']) df = df.groupby('SK_ID_CURR').last() df = df.drop(['SK_ID_PREV'], axis=1) df.columns = ['CRED_LAST_{}'.format(c) for c in df.columns] df = df.reset_index() reduce_memory(df) df.to_feather('./data/credit.last.feather')
def _aggregate(df, by, fs): print('agg {}...'.format(by)) agg = df.groupby(by).agg(fs) agg.columns = ['{}_{}'.format(a, b.upper()) for a, b in agg.columns] features = [] df = df.set_index(by) df = df.join(agg, on=by, how='left') df = df.reset_index(drop=True) for c, ms in fs.items(): for m in ms: ac = '{}_{}'.format(c, m.upper()) adc = 'APP_GRP_{}_ABS_DIFF_{}'.format('_'.join(by), ac) df[adc] = (df[c] - df[ac]).abs() features.append(adc) df = df[['SK_ID_CURR'] + features] reduce_memory(df) return df.set_index('SK_ID_CURR')
def preprocess_inst(): ins = pd.read_feather('./data/installments_payments.feather') ins['IS_CREDIT'] = (ins['NUM_INSTALMENT_VERSION'] == 0).astype('i') # Percentage and difference paid in each installment (amount paid and installment value) # noqa ins['RATIO_PAYMENT'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT'] ins['DIFF_PAYMENT'] = ins['AMT_PAYMENT'] - ins['AMT_INSTALMENT'] ins['FLAG_DIFF_PAYMENT'] = (ins['DIFF_PAYMENT'] > 0).astype('int8') # Days past due and days before due (no negative values) ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT'] ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT'] ins['FLAG_DPD'] = (ins['DPD'] > 0).astype('int8') ins['FLAG_DBD'] = (ins['DBD'] > 0).astype('int8') reduce_memory(ins) return ins
def _tail(k): print('create pos tail {}...'.format(k)) df = pd.read_feather('./data/pos.preprocessed.feather') df = df.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']) df = df.groupby('SK_ID_CURR').tail(k).reset_index(drop=True) grp = df.groupby('SK_ID_CURR') g = grp.agg({ 'SK_DPD': ['sum', 'mean', 'std', 'max', 'skew'], 'SK_DPD_DEF': ['sum', 'mean', 'std', 'max', 'skew'], 'FLAG_LATE': ['sum', 'mean'], 'FLAG_LATE_DEF': ['sum', 'mean'], }) g.columns = [a + "_" + b.upper() for a, b in g.columns] g.columns = ['POS_TAIL_{}_{}'.format(k, c) for c in g.columns] reduce_memory(g) return g
def last_k(k): print('create inst tail {}...'.format(k)) ins = pd.read_feather('./data/inst.preprocessed.feather') ins = ins.sort_values(['SK_ID_CURR', 'DAYS_INSTALMENT']) ins = ins.groupby('SK_ID_CURR').tail(k).reset_index(drop=True) grp = ins.groupby('SK_ID_CURR') agg = { 'NUM_INSTALMENT_VERSION': ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew'], 'DPD': ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew'], 'FLAG_DPD': ['sum', 'mean'], 'DIFF_PAYMENT': ['sum', 'mean', 'max', 'min', 'std', 'median', 'skew'], 'FLAG_DIFF_PAYMENT': ['sum', 'mean'], } g = grp.agg(agg) g.columns = [a + "_" + b.upper() for a, b in g.columns] g.columns = ['INST_LAST_{}_{}'.format(k, c) for c in g.columns] reduce_memory(g) return g
def _create(by, fs): print('create {}...'.format(by)) df = pd.read_feather('./data/prev.preprocessed.feather') agg = df.groupby(by).agg(fs) agg.columns = ['GRP_{}_{}_{}'.format('_'.join(by), a, b.upper()) for a, b in agg.columns] features = [] df = df.set_index(by) df = df.join(agg, on=by, how='left') df = df.reset_index(drop=True) for c, ms in fs.items(): for m in ms: ac = 'GRP_{}_{}_{}'.format('_'.join(by), c, m.upper()) dc = 'DIFF_{}'.format(ac) adc = 'ABS_DIFF_{}'.format(ac) df[dc] = df[c] - df[ac] df[adc] = (df[c] - df[ac]).abs() features += [dc, adc] df = df[['SK_ID_CURR'] + features] df = df.groupby('SK_ID_CURR').mean() reduce_memory(df) return df
def aggregate_bb(): bb = pd.read_feather('./data/bureau_balance.preprocessed.feather') # aggregate grp = bb.groupby('SK_ID_BUREAU') g = grp.agg({ 'DPD': ['mean'], 'STATUS': ['nunique'], }) g.columns = [a + "_" + b.upper() for a, b in g.columns] # edge edge = pd.read_feather('./data/bb.edge.feather') edge = edge.set_index('SK_ID_BUREAU') agg = g.join(edge, on='SK_ID_BUREAU') agg.columns = ["BB_" + c for c in agg.columns] reduce_memory(agg) return agg.reset_index()
def main(): df = pd.read_feather('./data/inst.preprocessed.feather') df, _ = one_hot_encoder(df) df = df.sort_values(['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_INSTALMENT']) df = df.drop(['IS_CREDIT'], axis=1) a = { 'AMT_PAYMENT': ['mean', 'min', 'max', 'std'], 'AMT_INSTALMENT': ['mean', 'min', 'max', 'std'], 'DAYS_ENTRY_PAYMENT': ['mean', 'min', 'max', 'std'], 'NUM_INSTALMENT_VERSION': ['mean'], 'NUM_INSTALMENT_NUMBER': ['mean', 'min', 'max', 'std'], 'DBD': ['mean', 'min', 'max', 'std'], 'DPD': ['mean', 'min', 'max', 'std'], 'FLAG_DBD': ['mean'], 'FLAG_DPD': ['mean'], 'DAYS_INSTALMENT': ['mean'], 'RATIO_PAYMENT': ['mean'], 'DIFF_PAYMENT': ['mean'], } grp = df.groupby(['SK_ID_CURR', 'SK_ID_PREV']) prev = grp.shift(-1) for c in prev.columns: if df[c].dtype == 'object': continue df[c] -= prev[c] grp = df.groupby('SK_ID_CURR') agg = grp.agg(a) agg.columns = ["{}_{}".format(a, b.upper()) for a, b in agg.columns] agg.columns = ["INST_DIFF_{}".format(c) for c in agg.columns] agg = agg.reset_index() reduce_memory(agg) agg.to_feather('./data/inst.diff.feather')
def _create(): df = pd.read_feather('./data/bureau_balance.feather') df = df[df['STATUS'] != 'C'] df = df.sort_values(['SK_ID_BUREAU', 'MONTHS_BALANCE']) df = df.groupby('SK_ID_BUREAU').last() bure = pd.read_feather('./data/bureau.feather') df = df.merge(bure[['SK_ID_BUREAU', 'SK_ID_CURR']], on='SK_ID_BUREAU', how='left') df = df.reset_index(drop=True) df['DPD'] = df['STATUS'].apply(status_to_dpd) g = df.groupby('SK_ID_CURR').agg({ 'MONTHS_BALANCE': ['min', 'max', 'mean', 'std'], 'DPD': ['min', 'max', 'mean', 'std'], }) g.columns = ['{}_{}'.format(x, y.upper()) for x, y in g.columns] g.columns = ['BB_OPEN_{}'.format(c) for c in g.columns] reduce_memory(g) g = g.reset_index() return g
def _tail(k): print('create credit tail {}...'.format(k)) df = pd.read_feather('./data/credit.preprocessed.feather') df = df.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']) df = df.groupby('SK_ID_CURR').tail(k).reset_index(drop=True) grp = df.groupby('SK_ID_CURR') agg = { 'AMT_DRAWINGS_CURRENT': ['sum', 'mean'], 'CNT_DRAWINGS_ATM_CURRENT': ['sum', 'mean'], 'AMT_PAYMENT_TOTAL_CURRENT': ['sum', 'mean'], 'AMT_INST_MIN_REGULARITY': ['sum', 'mean'], 'SK_DPD': ['mean'], # added 'DIFF_RECEIVABLE_TOTAL_AND_PRINCIPAL': ['sum', 'mean'], 'DIFF_PAYMENT_TOTAL_AND_PLAIN': ['sum', 'mean'], } g = grp.agg(agg) g.columns = ['{}_{}'.format(a, b.upper()) for a, b in g.columns] g['COUNT'] = grp.size() g.columns = ['CRED_TAIL_{}_{}'.format(k, c) for c in g.columns] reduce_memory(g) return g
def main(): agg = aggregate_credit_last() reduce_memory(agg) agg.to_feather('./data/credit.prev.last.feather')
def preprocess_application(): train_df = pd.read_feather('./data/application_train.feather') test_df = pd.read_feather('./data/application_test.feather') n_train = len(train_df) df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True) df = df.drop([ 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_2', ], axis=1) df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH'] df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT'] df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS'] df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL'] df['LOAN_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL'] df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL'] df['ANNUITY_LENGTH'] = df['AMT_CREDIT'] / df['AMT_ANNUITY'] df['WORKING_LIFE_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH'] df['INCOME_PER_FAM'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS'] df['CHILDREN_RATIO'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS'] # from https://www.kaggle.com/shep312/lightgbm-harder-better-slower df['CONSUMER_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE'] df['ANN_LENGTH_EMPLOYED_RATIO'] =\ df['ANNUITY_LENGTH'] / (df['DAYS_EMPLOYED']-1) df['TOTAL_DOCS_SUBMITTED'] =\ df.loc[:, df.columns.str.contains('FLAG_DOCUMENT')].sum(axis=1) # from ogrellier/fork-lightgbm-with-simple-features inc_by_org = df[[ 'AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE' ]].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL'] df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org) docs = [_f for _f in df.columns if 'FLAG_DOC' in _f] live = [ _f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f) ] df['DOC_IND_KURT'] = df[docs].kurtosis(axis=1) df['LIVE_IND_SUM'] = df[live].sum(axis=1) df['INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN']) df['ANNUITY_TO_INCOME_RATIO'] =\ df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL']) df['SOURCES_PROD'] =\ df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3'] df['EXT_SOURCES_MEAN'] =\ df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1) df['SCORES_STD'] =\ df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1) df['SCORES_STD'] = df['SCORES_STD'].fillna(df['SCORES_STD'].mean()) df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH'] df['CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / (df['DAYS_EMPLOYED'] - 1) df['PHONE_TO_BIRTH_RATIO'] =\ df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH'] df['PHONE_TO_EMPLOY_RATIO'] =\ df['DAYS_LAST_PHONE_CHANGE'] / (df['DAYS_EMPLOYED']-1) df.columns = [c.replace(' ', '_') for c in df.columns] reduce_memory(df) train_df = df[:n_train].reset_index(drop=True) test_df = df[n_train:].reset_index(drop=True) return train_df, test_df
def main(): df = _create() reduce_memory(df) df.to_feather('./data/pos.diff.feather')
def main(): agg = _aggregate() reduce_memory(agg) agg.to_feather('./data/pos.agg.feather')
def main(): agg = preprocess_prev() reduce_memory(agg) agg.to_feather('./data/prev.agg.feather')
def main(): df = preprocess_pos() reduce_memory(df) df.to_feather('./data/pos.preprocessed.feather')
def main(): agg = _create() reduce_memory(agg) agg.to_feather('./data/prev.refused.feather')