'DBD': ['max', 'mean', 'sum'], 'AMT_INSTALMENT_sub_AMT_PAYMENT': ['max', 'mean', 'sum', 'var'], 'AMT_INSTALMENT_div_AMT_PAYMENT': ['max', 'mean', 'sum', 'var'], 'AMT_INSTALMENT': ['max', 'mean', 'sum'], 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'], 'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'] } via = df.groupby('SK_ID_PREV').agg(aggs).reset_index(drop=True) via.columns = [ '_'.join(f) if f[0] != 'SK_ID_CURR' else f[0] for f in via.columns ] self.df = via.groupby('SK_ID_CURR').mean() if __name__ == '__main__': args = get_arguments('POS CASH') with timer('load dataset'): train = pd.read_feather(TRAIN)[['SK_ID_CURR']] test = pd.read_feather(TEST)[['SK_ID_CURR']] inst = pd.read_feather(INST) with timer('preprocessing'): # inst.drop(['SK_ID_PREV'], axis=1, inplace=True) inst = inst.sort_values(['SK_ID_CURR', 'DAYS_INSTALMENT']).reset_index(drop=True) inst.loc[:, inst.columns.str.startswith('AMT_')] = np.log1p( inst.filter(regex='^(AMT_)')) inst.loc[:, inst.columns.str.startswith('DAYS_')] = inst.filter( regex='^DAYS_').replace({365243: np.nan}) with timer('create dataset'):
# Runner script: executes every script listed in FILES, one after another,
# each in its own child Python process.
import os
import subprocess
import sys

# Make the parent of this file's directory importable so `feat` resolves.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from feat import get_arguments

# Scripts to run, in this order. The names are resolved relative to the
# current working directory, exactly as before.
FILES = [
    'main.py',
    'bureau.py',
    'credit_card_balance.py',
    'installments_payment.py',
    'pos_cash_balance.py',
    'previous_application.py',
]


if __name__ == '__main__':
    args = get_arguments('all')

    # Launch the children with the interpreter that is running this script,
    # so an active virtualenv/conda environment is honoured; fall back to a
    # PATH lookup of `python` only if the interpreter path is unavailable.
    interpreter = sys.executable or 'python'

    for script in FILES:
        # An argument list (no shell) avoids quoting/injection issues that a
        # joined command string handed to the shell would have.
        cmd = [interpreter, '-u', script]
        if args.force:
            # Forward the force flag to the child script.
            cmd.append('-f')

        completed = subprocess.run(cmd)

        # Keep the best-effort behaviour (continue with the next script),
        # but do not let a failure pass silently.
        if completed.returncode != 0:
            print(
                '{} exited with status {}'.format(script, completed.returncode),
                file=sys.stderr,
            )