# NOTE(review): this span starts inside a dict literal whose opening call
# (and the enclosing method header) is outside the visible source; the
# indentation below assumes a method body inside a class -- confirm.
'main': all_df.AMT_CREDIT,
            'prev': prev_df.AMT_CREDIT.sum(),
            'buro1': buro_df.AMT_CREDIT_SUM.sum(),
            'buro2': buro_df.AMT_CREDIT_SUM_DEBT.sum(),
            'buro3': buro_df.AMT_CREDIT_SUM_LIMIT.sum(),
            'income': all_df.AMT_INCOME_TOTAL
        }).fillna(0)  # missing values in every column above become 0
        # For each of the three buro totals (buro1..buro3) build:
        #   sub_credit{i}           = prev + buro{i}
        #   all_credit{i}           = sub_credit{i} + main
        #   *_to_income             = the two totals divided by income
        for i in range(1, 4):
            df[f'sub_credit{i}'] = df['prev'] + df[f'buro{i}']
            df[f'all_credit{i}'] = df[f'sub_credit{i}'] + df['main']
            # NOTE(review): if `df` is the zero-filled frame built above, a
            # missing income is 0 here, so these ratios divide by zero for
            # such rows -- confirm that is acceptable downstream.
            df[f'sub_credit{i}_to_income'] = df[f'sub_credit{i}'] / df['income']
            df[f'all_credit{i}_to_income'] = df[f'all_credit{i}'] / df['income']
        # Keep only the derived columns (names starting with sub_ or all_).
        self.df = df.filter(regex='(^sub_|^all_)')


# Script entry point: load the feather datasets into module-level names,
# filter `prev`, then hand the module globals to generate_features.
if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        prev = pd.read_feather(PREV)
        buro = pd.read_feather(BURO)
    with timer('preprocessing'):
        # Drop rows whose NAME_CONTRACT_TYPE equals the literal 'XNA'.
        prev = prev.query("NAME_CONTRACT_TYPE != 'XNA'")
    with timer('create dataset'):
        # globals() is passed through, so the variable names assigned above
        # are part of the contract -- presumably generate_features/the
        # feature classes look them up by name; verify before renaming.
        generate_features(globals(), args.force)
# trn = train.merge(prev_df, left_on='SK_ID_CURR', right_index=True, how='left') # tst = test.merge(prev_df, left_on='SK_ID_CURR', right_index=True, how='left') # for m, p in itertoolsd.product(main_cols, prev_cols): # self.train[f'{m}_sub_{p}_max'] = trn[m] - trn[p + '_max'] # self.train[f'{m}_sub_{p}_mean'] = trn[m] - trn[p + '_mean'] # self.test[f'{m}_sub_{p}_max'] = tst[m] - tst[p + '_max'] # self.test[f'{m}_sub_{p}_mean'] = tst[m] - tst[p + '_mean'] # # self.train[f'{m}_div_{p}_max'] = trn[m] / trn[p + '_max'] # self.train[f'{m}_div_{p}_mean'] = trn[m] / trn[p + '_mean'] # self.test[f'{m}_div_{p}_max'] = tst[m] / tst[p + '_max'] # self.test[f'{m}_div_{p}_mean'] = tst[m] / tst[p + '_mean'] if __name__ == '__main__': args = get_arguments(Path(__file__).stem) with timer('load dataset'): train = pd.read_feather(TRAIN) test = pd.read_feather(TEST) prev = pd.read_feather(PREV) with timer('preprocessing'): prev.drop(['SK_ID_PREV', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'], axis=1, inplace=True) prev = prev.sort_values(['SK_ID_CURR', 'DAYS_DECISION']).reset_index(drop=True) prev.loc[:, prev.columns.str.startswith('DAYS_')] = prev.filter(regex='^DAYS_').replace({365243: np.nan}) prev.AMT_DOWN_PAYMENT.fillna(0) prev.RATE_DOWN_PAYMENT.fillna(0) cat_cols = prev.select_dtypes(['object']).columns prev[cat_cols] = prev[cat_cols].fillna('NaN') with timer('create dataset'):