예제 #1
0
            'main': all_df.AMT_CREDIT,
            'prev': prev_df.AMT_CREDIT.sum(),
            'buro1': buro_df.AMT_CREDIT_SUM.sum(),
            'buro2': buro_df.AMT_CREDIT_SUM_DEBT.sum(),
            'buro3': buro_df.AMT_CREDIT_SUM_LIMIT.sum(),
            'income': all_df.AMT_INCOME_TOTAL
        }).fillna(0)

        for i in range(1, 4):
            df[f'sub_credit{i}'] = df['prev'] + df[f'buro{i}']
            df[f'all_credit{i}'] = df[f'sub_credit{i}'] + df['main']
            df[f'sub_credit{i}_to_income'] = df[f'sub_credit{i}'] / df['income']
            df[f'all_credit{i}_to_income'] = df[f'all_credit{i}'] / df['income']

        self.df = df.filter(regex='(^sub_|^all_)')


if __name__ == '__main__':
    # Script entry point: load the feather datasets, filter `prev`, then
    # run feature generation.
    args = get_arguments('main')
    with timer('load dataset'):
        # These module-level names are handed to generate_features() via
        # globals() below, so they must not be renamed here.
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        prev = pd.read_feather(PREV)
        buro = pd.read_feather(BURO)

    with timer('preprocessing'):
        # Keep only rows whose NAME_CONTRACT_TYPE is not the literal 'XNA'.
        prev = prev.query("NAME_CONTRACT_TYPE != 'XNA'")

    with timer('create dataset'):
        # generate_features receives the whole module namespace plus the
        # `force` flag from the parsed arguments.
        # NOTE(review): presumably it discovers feature classes and the
        # loaded frames through that namespace — confirm against its definition.
        generate_features(globals(), args.force)
# NOTE(review): the block below is disabled experimental code (pairwise
# difference/ratio features between main and prev aggregates). It references
# `self`, `prev_df`, `main_cols` and `prev_cols`, none of which exist at this
# scope, and `itertoolsd` looks like a typo for `itertools`.
#         trn = train.merge(prev_df, left_on='SK_ID_CURR', right_index=True, how='left')
#         tst = test.merge(prev_df, left_on='SK_ID_CURR', right_index=True, how='left')
#         for m, p in itertoolsd.product(main_cols, prev_cols):
#             self.train[f'{m}_sub_{p}_max'] = trn[m] - trn[p + '_max']
#             self.train[f'{m}_sub_{p}_mean'] = trn[m] - trn[p + '_mean']
#             self.test[f'{m}_sub_{p}_max'] = tst[m] - tst[p + '_max']
#             self.test[f'{m}_sub_{p}_mean'] = tst[m] - tst[p + '_mean']
#
#             self.train[f'{m}_div_{p}_max'] = trn[m] / trn[p + '_max']
#             self.train[f'{m}_div_{p}_mean'] = trn[m] / trn[p + '_mean']
#             self.test[f'{m}_div_{p}_max'] = tst[m] / tst[p + '_max']
#             self.test[f'{m}_div_{p}_mean'] = tst[m] / tst[p + '_mean']


if __name__ == '__main__':
    # Script entry point. Arguments are requested under this file's stem
    # (the file name without its extension).
    args = get_arguments(Path(__file__).stem)
    with timer('load dataset'):
        # Read the feather files located at TRAIN, TEST and PREV.
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        prev = pd.read_feather(PREV)
    
    with timer('preprocessing'):
        # Clean `prev` in place before feature generation.
        # Drop the row identifier and the two interest-rate columns.
        prev.drop(['SK_ID_PREV', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'], axis=1, inplace=True)
        # Order each SK_ID_CURR's rows by DAYS_DECISION and renumber the index.
        prev = prev.sort_values(['SK_ID_CURR', 'DAYS_DECISION']).reset_index(drop=True)
        # In every DAYS_* column, turn the value 365243 into NaN.
        # NOTE(review): 365243 looks like a missing-value sentinel — confirm.
        prev.loc[:, prev.columns.str.startswith('DAYS_')] = prev.filter(regex='^DAYS_').replace({365243: np.nan})
        # Series.fillna returns a new Series; the original code discarded the
        # result, so the down-payment columns were never actually filled.
        # Assign the result back so missing down payments become 0.
        prev['AMT_DOWN_PAYMENT'] = prev['AMT_DOWN_PAYMENT'].fillna(0)
        prev['RATE_DOWN_PAYMENT'] = prev['RATE_DOWN_PAYMENT'].fillna(0)
        # Replace missing values in object-dtype columns with the literal
        # string 'NaN' so they form their own category.
        cat_cols = prev.select_dtypes(['object']).columns
        prev[cat_cols] = prev[cat_cols].fillna('NaN')
    
    with timer('create dataset'):