def _bureau_and_balance(self, configs):
    current_index = self.data_index['bureau']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    # Read data and merge
    df = self.data_raw['bureau']
    bb = self.data_raw['bureau_balance']
    logger.info("Bureau: {}, Bureau Balance: {}".format(df.shape, bb.shape))

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        bb, cat_cols_bb, new_cols_bb = process_one_hot_encode(
            bb, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'bureau': new_cols + new_cols_bb})
    else:
        # Fallback mirrors the sibling methods: detect categorical columns
        # when one-hot encoding is disabled, so cat_cols/cat_cols_bb are
        # always defined below.
        cat_cols = IdentifyCategoricalColumn(df)
        cat_cols_bb = IdentifyCategoricalColumn(bb)

    # Aggregate bureau_balance to the bureau level, then join onto bureau
    agg_configs = self._split_configs(configs.copy(), 'bureau_balance')
    bb_agg = self._aggregate_pipeline(bb, cat_cols_bb, agg_configs)[current_index]
    df = df.set_index(current_index).join(bb_agg, how='left')
    bureau_cat_cols = cat_cols + [
        c for c in bb_agg if any(cc in c for cc in cat_cols_bb)
    ]

    # Conditional aggregation:
    # Bureau: Active credits - using only numerical aggregations
    # Bureau: Closed credits - using only numerical aggregations
    agg_configs = self._split_configs(configs.copy(), 'bureau')
    bureau_agg = self._aggregate_pipeline(df, bureau_cat_cols, agg_configs)[major_index]
    return Cast64To32(bureau_agg)
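# A minimal sketch (an assumption, not the project's actual config) of the
# shape of `configs` consumed by _bureau_and_balance above. Only keys the
# method actually reads are shown; the column names and the nested
# per-table specs handed to _split_configs are hypothetical placeholders.
EXAMPLE_BUREAU_CONFIGS = {
    'nan_as_category': False,
    'onehot_encoding': True,
    'onehot_columns': ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'STATUS'],  # hypothetical
    'bureau_balance': {},  # placeholder sub-config selected by _split_configs
    'bureau': {},          # placeholder sub-config selected by _split_configs
}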
def _application_train_test(self, configs):
    nan_as_category = configs.get('nan_as_category', False)

    # Read data and merge train/test so all preprocessing is applied jointly
    major_index = self.data_index['application_train']
    df = self.data_raw['application_train']
    test_df = self.data_raw['application_test']
    logger.info("Train samples: {}, test samples: {}".format(
        df.shape, test_df.shape))
    df = pd.concat([df, test_df], sort=False, ignore_index=True)

    df = process_drop_rows(df, process_configs=configs['filter_rows'])
    df = process_factorize(df, process_configs=configs['factorize_columns'])

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'application': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    df = process_replace(df, process_configs=configs['replace_rows'])
    df, interact_cols = process_interaction(
        df, process_configs=configs['interaction_columns'])
    for c in configs.get('deep_interactions', []):
        df = process_deep_interactions(df, c)

    logger.info('prepare decomposition, application={}'.format(df.shape))
    df_ext = [process_decomposition(df, c) for c in configs['decomposition']]
    df = pd.concat([df] + df_ext, axis=1, join='inner')
    logger.info('finished decompositions, application={}'.format(df.shape))
    df = Cast64To32(df)

    # Separate into training/validation and test data by target availability
    train_df = df.loc[df[self.target_column].notnull()].reset_index().set_index(major_index)
    test_df = df.loc[df[self.target_column].isnull()].reset_index().set_index(major_index)
    logger.info("Split into train samples: {}, test samples: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    return train_df, test_df
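# A minimal sketch (assumption) of the `configs` keys read by
# _application_train_test above. Every key appears in the method body;
# the values here are empty placeholders, since the schemas expected by
# the process_* helpers are not shown in this file.
EXAMPLE_APPLICATION_CONFIGS = {
    'nan_as_category': False,
    'filter_rows': {},          # passed to process_drop_rows
    'factorize_columns': [],    # passed to process_factorize
    'onehot_encoding': True,
    'onehot_columns': [],       # passed to process_one_hot_encode
    'replace_rows': {},         # passed to process_replace
    'interaction_columns': [],  # passed to process_interaction
    'deep_interactions': [],    # each entry passed to process_deep_interactions
    'decomposition': [],        # each entry passed to process_decomposition
}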
def ReadRawHDF(self, configs, filename, limited_by_configs=False):
    """Read raw tables from an HDF store and record each table's index.

    Example configs:
        {'application_train': {'name': 'application_train.csv',
                               'index': 'SK_ID_CURR'}}
    """
    data_dict = {k: None for k in configs}
    self.data_raw = self.data_io_manager.loadHDF(
        filename, data_dict, limited_by_configs=limited_by_configs)
    self.data_raw = {k: Cast64To32(v) for k, v in self.data_raw.items()}
    self.data_index = {
        k: data.get('index', None) for k, data in configs.items()
    }
    return self.data_raw, self.data_index
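# Usage sketch for ReadRawHDF, following the config shape shown in its
# docstring. `manager` and the HDF filename are hypothetical, and the
# 'bureau' index is an assumption based on how _bureau_and_balance joins
# bureau_balance aggregates.
#
# configs = {
#     'application_train': {'name': 'application_train.csv', 'index': 'SK_ID_CURR'},
#     'bureau': {'name': 'bureau.csv', 'index': 'SK_ID_BUREAU'},
# }
# data_raw, data_index = manager.ReadRawHDF(configs, 'raw_data.h5',
#                                           limited_by_configs=True)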
def _pos_cash_balance(self, configs):
    current_index = self.data_index['pos_cash_balance']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    df = self.data_raw['pos_cash_balance']
    logger.info("pos_cash: {}".format(df.shape))

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'pos_cash': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    pos_cash_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]
    return Cast64To32(pos_cash_agg)
def _installments_payments(self, configs):
    current_index = self.data_index['installments_payments']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    df = self.data_raw['installments_payments']
    logger.info("installments_payments: {}".format(df.shape))

    cat_cols = []
    if configs.get('onehot_encoding', False):
        # installments_payments carries no raw categorical columns, so an
        # empty column list is passed through to the encoder here.
        df, cat_cols, new_cols = process_one_hot_encode(
            df, cat_cols, nan_as_category)
        self.cols_one_hot.update({'installments_payments': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    df, interact_cols = process_interaction(
        df, process_configs=configs['interaction_columns'])

    installments_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]
    return Cast64To32(installments_agg)
def _previous_application(self, configs):
    current_index = self.data_index['previous_application']
    major_index = self.data_index['application_train']
    nan_as_category = configs.get('nan_as_category', False)

    df = self.data_raw['previous_application']
    logger.info("Previous application: {}".format(df.shape))

    if configs.get('onehot_encoding', False):
        df, cat_cols, new_cols = process_one_hot_encode(
            df, configs['onehot_columns'], nan_as_category)
        self.cols_one_hot.update({'previous_application': new_cols})
    else:
        cat_cols = IdentifyCategoricalColumn(df)

    df = process_replace(df, process_configs=configs['replace_rows'])
    df, interact_cols = process_interaction(
        df, process_configs=configs['interaction_columns'])

    # Previous applications categorical features
    # Previous Applications: Approved Applications - only numerical features
    # Previous Applications: Refused Applications - only numerical features
    prev_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]
    return Cast64To32(prev_agg)