Exemplo n.º 1
0
    def _bureau_and_balance(self, configs):
        """Build application-level aggregate features from the bureau and
        bureau_balance tables.

        Monthly bureau_balance rows are first aggregated up to the bureau
        credit level, joined onto the bureau table, and the combined frame is
        then aggregated up to the application (major) index.

        Args:
            configs (dict): feature configs; reads 'nan_as_category',
                'onehot_encoding', 'onehot_columns', plus the per-table
                aggregation sections consumed by _split_configs /
                _aggregate_pipeline.

        Returns:
            DataFrame of aggregated bureau features indexed by the
            application_train index, downcast to 32-bit dtypes.
        """
        current_index = self.data_index['bureau']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        # Read data and merge
        df = self.data_raw['bureau']
        bb = self.data_raw['bureau_balance']
        logger.info("Bureau: {}, Bureau Balance: {}".format(
            df.shape, bb.shape))

        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            bb, cat_cols_bb, new_cols_bb = process_one_hot_encode(
                bb, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'bureau': new_cols + new_cols_bb})
        else:
            # Fix: previously cat_cols / cat_cols_bb were never bound on this
            # path, raising NameError below. Use the same categorical-column
            # fallback every other table handler uses.
            cat_cols = IdentifyCategoricalColumn(df)
            cat_cols_bb = IdentifyCategoricalColumn(bb)

        # Aggregate monthly balance rows to one row per bureau credit, then
        # attach them to the bureau table.
        agg_configs = self._split_configs(configs.copy(), 'bureau_balance')
        bb_agg = self._aggregate_pipeline(bb, cat_cols_bb,
                                          agg_configs)[current_index]
        df = df.set_index(current_index).join(bb_agg, how='left')
        # A joined bb_agg column counts as categorical if it was derived from
        # any bureau_balance categorical column.
        bureau_cat_cols = cat_cols + [
            c for c in bb_agg if any(cc in c for cc in cat_cols_bb)
        ]
        # conditional aggregation
        # Bureau: Active credits - using only numerical aggregations
        # Bureau: Closed credits - using only numerical aggregations
        agg_configs = self._split_configs(configs.copy(), 'bureau')
        bureau_agg = self._aggregate_pipeline(df, bureau_cat_cols,
                                              agg_configs)[major_index]
        return Cast64To32(bureau_agg)
Exemplo n.º 2
0
    def _application_train_test(self, configs):
        """Preprocess application_train + application_test into model-ready
        train and test frames.

        Stacks train and test so that encoding/factorization sees identical
        category spaces, applies row filters, factorize/one-hot encoding,
        replacements, interaction features, optional deep interactions and
        decompositions, then splits back on the null-ness of the target.

        Args:
            configs (dict): reads 'nan_as_category', 'filter_rows',
                'factorize_columns', 'onehot_encoding', 'onehot_columns',
                'replace_rows', 'interaction_columns', 'deep_interactions',
                'decomposition'.

        Returns:
            tuple(DataFrame, DataFrame): (train_df, test_df), both indexed by
            the application_train index and downcast to 32-bit dtypes.
        """
        nan_as_category = configs.get('nan_as_category', False)

        # Read data and merge
        major_index = self.data_index['application_train']
        df = self.data_raw['application_train']
        test_df = self.data_raw['application_test']
        logger.info("Train samples: {}, test samples: {}".format(
            df.shape, test_df.shape))
        # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in
        # 2.0; pd.concat is the supported equivalent.
        df = pd.concat([df, test_df], sort=False, ignore_index=True)

        df = process_drop_rows(df, process_configs=configs['filter_rows'])
        df = process_factorize(df,
                               process_configs=configs['factorize_columns'])

        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'application': new_cols})
        else:
            cat_cols = IdentifyCategoricalColumn(df)

        df = process_replace(df, process_configs=configs['replace_rows'])
        df, interact_cols = process_interaction(
            df, process_configs=configs['interaction_columns'])

        if configs.get('deep_interactions', []):
            deep_interactions = configs.get('deep_interactions', [])
            for c in deep_interactions:
                df = process_deep_interactions(df, c)

        logger.info('prepare decompostion, application={}'.format(df.shape))
        df_ext = [
            process_decomposition(df, c) for c in configs['decomposition']
        ]
        df = pd.concat([df] + df_ext, axis=1, join='inner')
        logger.info('finished decompositions, application={}'.format(df.shape))
        df = Cast64To32(df)

        # separate train/test: test rows are the ones with a null target.
        train_df = df.loc[df[
            self.target_column].notnull()].reset_index().set_index(major_index)
        test_df = df.loc[df[
            self.target_column].isnull()].reset_index().set_index(major_index)
        logger.info("Split into train samples: {}, test samples: {}".format(
            train_df.shape, test_df.shape))
        # Free the stacked frame before returning the two splits.
        del df
        gc.collect()

        return train_df, test_df
Exemplo n.º 3
0
    def ReadRawHDF(self, configs, filename, limited_by_configs=False):
        """Load the raw tables from an HDF store and record each table's index.

        configs={'application_train'     : {'name' : 'application_train.csv', 'index': 'SK_ID_CURR',},

        Args:
            configs: mapping of table name -> per-table spec dict; only the
                keys and each spec's optional 'index' entry are used here.
            filename: path of the HDF store handed to the IO manager.
            limited_by_configs: forwarded to loadHDF.

        Returns:
            tuple: (self.data_raw, self.data_index).
        """
        # Placeholder dict — loadHDF fills in one frame per requested table.
        requested = dict.fromkeys(configs)
        self.data_raw = self.data_io_manager.loadHDF(
            filename, requested, limited_by_configs=limited_by_configs)

        # Downcast every loaded table to 32-bit dtypes to save memory.
        self.data_raw = {
            name: Cast64To32(table)
            for name, table in self.data_raw.items()
        }
        self.data_index = {
            name: spec.get('index', None)
            for name, spec in configs.items()
        }
        return self.data_raw, self.data_index
Exemplo n.º 4
0
    def _pos_cash_balance(self, configs):
        """Aggregate the pos_cash_balance table up to the application level.

        Args:
            configs (dict): reads 'nan_as_category', 'onehot_encoding',
                'onehot_columns', plus whatever _aggregate_pipeline consumes.

        Returns:
            DataFrame of aggregated POS/cash features indexed by the
            application_train index, downcast to 32-bit dtypes.
        """
        current_index = self.data_index['pos_cash_balance']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        df = self.data_raw['pos_cash_balance']
        logger.info("pos_cash: {}".format(df.shape))

        # Either one-hot encode the configured columns, or just detect the
        # categorical columns already present.
        if not configs.get('onehot_encoding', False):
            cat_cols = IdentifyCategoricalColumn(df)
        else:
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'pos_cash': new_cols})

        aggregated = self._aggregate_pipeline(df, cat_cols, configs)
        return Cast64To32(aggregated[major_index])
Exemplo n.º 5
0
    def _installments_payments(self, configs):
        """Aggregate the installments_payments table up to the application
        level, after adding configured interaction columns.

        Args:
            configs (dict): reads 'nan_as_category', 'onehot_encoding',
                'interaction_columns', plus whatever _aggregate_pipeline
                consumes.

        Returns:
            DataFrame of aggregated installment features indexed by the
            application_train index, downcast to 32-bit dtypes.
        """
        current_index = self.data_index['installments_payments']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        df = self.data_raw['installments_payments']
        logger.info("installments_payments: {}".format(df.shape))

        cat_cols = []
        if configs.get('onehot_encoding', False):
            # NOTE(review): unlike the sibling handlers, this passes the empty
            # cat_cols list (not configs['onehot_columns']) to the encoder —
            # presumably this table has no categorical columns to encode;
            # confirm before changing.
            df, cat_cols, new_cols = process_one_hot_encode(
                df, cat_cols, nan_as_category)
            self.cols_one_hot.update({'installments_payments': new_cols})
        else:
            cat_cols = IdentifyCategoricalColumn(df)

        df, interact_cols = process_interaction(
            df, process_configs=configs['interaction_columns'])
        aggregated = self._aggregate_pipeline(df, cat_cols, configs)
        return Cast64To32(aggregated[major_index])
Exemplo n.º 6
0
    def _previous_application(self, configs):
        """Aggregate the previous_application table up to the application
        level, after value replacement and interaction features.

        Args:
            configs (dict): reads 'nan_as_category', 'onehot_encoding',
                'onehot_columns', 'replace_rows', 'interaction_columns', plus
                whatever _aggregate_pipeline consumes.

        Returns:
            DataFrame of aggregated previous-application features indexed by
            the application_train index, downcast to 32-bit dtypes.
        """
        current_index = self.data_index['previous_application']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        df = self.data_raw['previous_application']
        logger.info("Previous application: {}".format(df.shape))

        # Either one-hot encode the configured columns, or just detect the
        # categorical columns already present.
        if not configs.get('onehot_encoding', False):
            cat_cols = IdentifyCategoricalColumn(df)
        else:
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'previous_application': new_cols})

        df = process_replace(df, process_configs=configs['replace_rows'])
        df, interact_cols = process_interaction(
            df, process_configs=configs['interaction_columns'])
        # Previous applications categorical features
        # Previous Applications: Approved Applications - only numerical features
        # Previous Applications: Refused Applications - only numerical features
        aggregated = self._aggregate_pipeline(df, cat_cols, configs)

        return Cast64To32(aggregated[major_index])