示例#1
0
文件: ga.py 项目: zadiq/competitions
    def __init__(self, tc=TrainConfig(), sample_df=None):
        """Prepare checkpoint storage, build the gene pool and seed the population.

        Args:
            tc: Training configuration; ``ga_model``, ``version``, ``use_gpu``
                and ``gpu_devices`` are read from it and ``ckpt_dir`` is
                written to it.
                NOTE(review): the default ``TrainConfig()`` is evaluated once
                at definition time, so every call relying on the default
                shares (and mutates) the same object.
            sample_df: Optional DataFrame used instead of reading the
                'all_data' CSV.
        """
        self.tc = tc
        self.gam = self.tc.ga_model

        # Each run gets its own checkpoint folder under the versioned root.
        ckpt_root = get_file('ga_ckpt', v=self.tc.version)
        os.makedirs(ckpt_root, exist_ok=True)
        self.tc.ckpt_dir = self.ckpt_dir = os.path.join(
            ckpt_root, folder_date())
        # No exist_ok here: this raises if the run folder already exists.
        os.makedirs(self.ckpt_dir)
        print('Logging to {}'.format(self.ckpt_dir))
        # Template for checkpoint file names; filled in with gen/score later.
        self.ckpt_temp = os.path.join(self.ckpt_dir, '{gen}_{score}.pk')

        self.best_score = 0
        self.best_gen = 0
        self.workers = {}
        # self.lazy_genes = LazyGenes(self.gam['lazy_size'])

        # Rows with a TARGET value are the train rows, rows without are test.
        self.odf = sample_df if sample_df is not None else pd.read_csv(
            get_file('all_data', self.tc.version))
        self.train_ix = self.odf[self.odf['TARGET'].notnull()].index
        self.test_ix = self.odf[self.odf['TARGET'].isnull()].index

        print('Generating genes')
        # One gene per raw feature column (identifier and target excluded).
        original_genes = self.odf.drop(['SK_ID_CURR', 'TARGET'],
                                       axis=1).columns.tolist()
        ogs = [Gene(i, df=self.odf) for i in original_genes]
        # NOTE(review): self.genes and ogs are the SAME list object, so the
        # extend() below also grows ogs.
        self.genes = ogs
        new_genes = []
        # Pair every gene with each gene after it; the slices are copies, so
        # the iteration is unaffected by the later extend().
        for ix, g in enumerate(self.genes[:-1]):
            for og in self.genes[ix + 1:]:
                new_genes.extend(g.apply_ops(og))
        self.genes.extend(new_genes)
        print('Generated {} genes!'.format(len(self.genes)))

        # Generate initial generation
        self.population = Population(genes=self.genes, tc=self.tc)
        # Because ogs aliases self.genes, this slice is taken from the full
        # pool (original genes first, then the derived ones).
        org_chromo = ogs[:self.gam['chromosome_size']]
        # Top up with randomly chosen derived genes if the slice came up short.
        diff = self.gam['chromosome_size'] - len(org_chromo)
        random.shuffle(new_genes)
        org_chromo += new_genes[:diff]
        assert len(org_chromo) == self.gam[
            'chromosome_size'], 'Error in original chromo size'
        # Add the hand-built chromosome as one more member.
        # Presumably Population creates size - 1 members itself; confirm
        # against Population.
        self.population.members.append(
            Chromosome(org_chromo, self.gam['mate_method'], self.genes,
                       self.gam['mutate_scale']))
        assert len(
            self.population.members
        ) == self.population.size, 'Discrepancies in population size'
        print('Generated a population of size {}'.format(self.population.size))

        if self.tc.use_gpu:
            # Repeat the device list (alloc + rem) times to build the
            # per-worker GPU assignment list.
            alloc = self.gam['workers'] // len(self.tc.gpu_devices)
            rem = self.gam['workers'] % len(self.tc.gpu_devices)
            self.gpu_alloc = self.tc.gpu_devices * (alloc + rem)
示例#2
0
    def evaluate(self, x=None, y=None, update_rank=True):
        """Evaluate the Keras model and optionally record the metrics on disk.

        Args:
            x: Feature array. When ``None`` the non-``TARGET`` columns of
                ``self.data`` are used.
            y: Label array. When ``None`` the ``TARGET`` column of
                ``self.data`` is used.
            update_rank: When true, merge the metrics into the
                'board_models_ranks' JSON file under this model's folder key.

        Returns:
            dict mapping each entry of ``self.k_model.metrics_names`` to the
            corresponding evaluation value.
        """
        # Load the training frame whenever either input must come from it.
        if (x is None or y is None) and self.data is None:
            self.load_train_data()

        # Explicit ``is None`` tests: ``x or ...`` raises ValueError for
        # multi-element arrays and silently replaces empty ones.
        if x is None:
            x = self.data.drop('TARGET', axis=1).values
        if y is None:
            y = self.data['TARGET'].values

        bm = self.tc.board_model
        self.evaluation = self.k_model.evaluate(
            x=x,
            y=y,
            batch_size=bm['batch_size'],
        )
        rank = dict(zip(self.k_model.metrics_names, self.evaluation))
        if update_rank:
            path = get_file('board_models_ranks', self.tc.version)
            # A missing or corrupt rank file simply starts a fresh record.
            try:
                with open(path) as fp:
                    current_rank = json.load(fp)
            except (json.JSONDecodeError, FileNotFoundError):
                current_rank = {}

            current_rank[self.tc.board_model['model_folder']] = {
                'rank': rank,
                'model_ckpt': self.ckpt_model
            }
            with open(path, 'w') as fp:
                json.dump(current_rank, fp)
        return rank
示例#3
0
 def installments_payments(self, num_rows=None):
     """Build per-customer aggregates from the installments payments table.

     Args:
         num_rows: Optional cap on the number of CSV rows read.

     Returns:
         DataFrame indexed by SK_ID_CURR whose columns are prefixed with
         ``INSTAL_``, plus an ``INSTAL_COUNT`` column holding the row count
         of each customer.
     """
     def _floor_at_zero(value):
         # Anything that is not strictly positive becomes 0.
         return value if value > 0 else 0

     ins = pd.read_csv(get_file('ins_pay'), nrows=num_rows)
     ins, cat_cols = self.one_hot_encoder(ins, nan_as_category=True)

     # Share and shortfall of each installment that was actually paid.
     paid = ins['AMT_PAYMENT']
     due = ins['AMT_INSTALMENT']
     ins['PAYMENT_PERC'] = paid / due
     ins['PAYMENT_DIFF'] = due - paid

     # Days past due / days before due, floored at zero.
     paid_on = ins['DAYS_ENTRY_PAYMENT']
     due_on = ins['DAYS_INSTALMENT']
     ins['DPD'] = (paid_on - due_on).apply(_floor_at_zero)
     ins['DBD'] = (due_on - paid_on).apply(_floor_at_zero)

     # Numeric aggregations first, then the mean of every one-hot column.
     aggregations = {
         'NUM_INSTALMENT_VERSION': ['nunique'],
         'DPD': ['max', 'mean', 'sum'],
         'DBD': ['max', 'mean', 'sum'],
         'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
         'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
         'AMT_INSTALMENT': ['max', 'mean', 'sum'],
         'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
         'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
     }
     aggregations.update({cat: ['mean'] for cat in cat_cols})

     ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
     # Flatten the (column, function) pairs into single labels.
     flat_names = [
         'INSTAL_' + col + "_" + fn.upper()
         for col, fn in ins_agg.columns.tolist()
     ]
     ins_agg.columns = pd.Index(flat_names)

     # Number of installment rows per customer.
     ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
     del ins
     gc.collect()
     return ins_agg
示例#4
0
    def gen_data(self, debug=False, version='v0'):
        """Assemble the full feature table and write it to disk.

        Args:
            debug: When true, every source table is limited to 10000 rows.
            version: Data version forwarded to ``get_file`` for output paths.

        Returns:
            The merged DataFrame after the unimportant features were dropped.
        """
        os.makedirs(get_file('curated', version), exist_ok=True)
        num_rows = 10000 if debug else None
        df = self.application_train_test(num_rows)

        # (timer label, table builder, message printed before the shape)
        stages = (
            ("Process bureau and bureau_balance",
             self.bureau_and_balance, "Bureau df shape:"),
            ("Process previous_applications",
             self.previous_applications, "Previous applications df shape:"),
            ("Process POS-CASH balance",
             self.pos_cash, "Pos-cash balance df shape:"),
            ("Process installments payments",
             self.installments_payments, "Installments payments df shape:"),
            ("Process credit card balance",
             self.credit_card_balance, "Credit card balance df shape:"),
        )
        for label, build, shape_msg in stages:
            with timer(label):
                part = build(num_rows)
                print(shape_msg, part.shape)
                # Left join keeps every application row.
                df = df.join(part, how='left', on='SK_ID_CURR')
                del part
                gc.collect()

        with timer("Saving data"):
            print(df.shape)
            print('Dropping unimportant features')
            df.drop(features_with_no_imp_at_least_twice, axis=1, inplace=True)
            gc.collect()
            print(df.shape)
            df.to_csv(get_file('all_data', version), index=False)
            # Rows with a TARGET go to the train file, the rest to the test file.
            labelled = df['TARGET'].notnull()
            df[labelled].to_csv(get_file('org_train', version), index=False)
            df[~labelled].to_csv(get_file('org_test', version), index=False)

        return df
示例#5
0
    def load_test_data(self):
        """Read the board test probabilities and split them into features/labels.

        Sets ``self.test_data`` (indexed by SK_ID_CURR), ``self.test_x`` (all
        columns except TARGET, as an array) and ``self.test_y`` (TARGET
        reshaped into a single column).
        """
        print('Loading test data')
        csv_path = get_file('board_test_prob', self.tc.version)
        frame = pd.read_csv(csv_path, index_col='SK_ID_CURR')
        self.test_data = frame
        print(frame.head())
        print('Test Data with shape {}'.format(frame.shape))

        features = frame.drop('TARGET', axis=1)
        self.test_x = features.values
        self.test_y = frame['TARGET'].values.reshape((-1, 1))
示例#6
0
文件: ga.py 项目: zadiq/competitions
 def __init__(self, odf, max_size=5, version='v1', is_sample=False):
     """Store the source frame and work out where gene bin files live.

     Args:
         odf: Source DataFrame kept on the instance.
         max_size: Stored as ``self.max_size``.
         version: Data version used to locate the genes bin directory.
         is_sample: Selects the sample file-name template when true.
     """
     # The ``{}`` placeholder stays unformatted in both file and bin_path.
     if is_sample:
         template = 'bin_sample_{}.csv'
     else:
         template = 'bin_{}.csv'

     self.bins = {}
     self.odf = odf
     self.file = template
     bin_dir = get_file('genes_bin_dir', version)
     self.bin_path = os.path.join(bin_dir, self.file)
     self.max_size = max_size
     self.version = version
示例#7
0
    def __init__(self, version='v0'):
        """Score train/test data with every board model and save the results.

        Loads the 'org_train' and 'org_test' CSVs, runs each LightGBM model
        listed in the board file over both, adds row-wise statistics across
        the model columns, and writes the probability tables to the
        'board_train_prob' / 'board_test_prob' CSVs.

        Args:
            version: Data version forwarded to ``get_file``.

        Raises:
            ValueError: If the train probability table contains inf or NaN.
        """
        train = pd.read_csv(get_file('org_train', v=version),
                            index_col='index')
        test = pd.read_csv(get_file('org_test', v=version),
                           index_col='index')
        # Turn the 'index' index back into an ordinary column.
        train.reset_index(inplace=True)
        test.reset_index(inplace=True)
        self.train_data = train
        self.test_data = test

        with open(get_file('board')) as fp:
            self.board = json.load(fp)

        # Iterating the board yields the model file paths.
        self.models = {}
        for ix, model_file in enumerate(self.board):
            self.models['Model_{}'.format(ix)] = lgb.Booster(
                model_file=model_file)

        self.train_prob = pd.DataFrame()
        self.test_prob = pd.DataFrame()

        for label, booster in self.models.items():
            print('Model {} is predicting'.format(label))
            self.train_prob[label] = booster.predict(self.train_data)
            self.test_prob[label] = booster.predict(self.test_data)

        # Row-wise summary statistics over the per-model columns.
        model_cols = list(self.models)
        for stat in ('mean', 'median', 'max', 'min', 'std', 'var', 'mad',
                     'sum'):
            name = 'MODELS_' + stat.upper()
            self.train_prob[name] = self.train_prob[model_cols].agg(stat,
                                                                    axis=1)
            self.test_prob[name] = self.test_prob[model_cols].agg(stat,
                                                                  axis=1)

        for key in ('TARGET', 'SK_ID_CURR'):
            self.train_prob[key] = self.train_data[key]
        for key in ('TARGET', 'SK_ID_CURR'):
            self.test_prob[key] = self.test_data[key]

        # Only the train table is checked for non-finite or missing values.
        has_inf = self.train_prob.isin([np.inf, -np.inf]).any().any()
        if has_inf or self.train_prob.isna().any().any():
            raise ValueError('Problem in data integrity!')

        self.train_prob.to_csv(get_file('board_train_prob', version),
                               index=False)
        self.test_prob.to_csv(get_file('board_test_prob', version),
                              index=False)
示例#8
0
    def save_predicted(self, x=None, data=None, path=None, notes='', include_rank=True):
        """Predict, write a submission CSV and record it in the submission metadata.

        Args:
            x: Input forwarded to ``self.predict``.
            data: Frame whose index supplies the SK_ID_CURR column; defaults
                to ``self.test_data``.
            path: Destination CSV path; defaults to a generated file name
                inside the 'submission' directory.
            notes: Free text stored in the metadata entry.
            include_rank: When true, ``self.evaluate()`` is run and its
                'auc_roc' value becomes part of the file name.
        """
        os.makedirs(get_file('submission'), exist_ok=True)

        submission = pd.DataFrame()
        self.predict(x)
        rank = self.evaluate() if include_rank else None
        # ``data or self.test_data`` raises ValueError for any DataFrame
        # argument, so test for None explicitly.
        if data is None:
            data = self.test_data
        submission['SK_ID_CURR'] = data.index
        submission['TARGET'] = self.predicted

        stamp = datetime.now().strftime('%m-%d_%H-%M-%S')
        if rank is not None:
            file = 'submission_{:.4f}_{}.csv'.format(rank['auc_roc'], stamp)
        else:
            # Without a rank there is no score for the name; previously this
            # path crashed with a TypeError on ``None['auc_roc']``.
            file = 'submission_norank_{}.csv'.format(stamp)
        path = path or os.path.join(get_file('submission'), file)

        # The metadata is keyed by the generated file name even when an
        # explicit path is given.
        current_meta = safe_load_json(get_file('sub_meta'))
        current_meta[file] = {
            'path': path,
            'notes': notes,
            'model_path': self.tc.board_model['model_folder'],
            'model_ckpt': self.ckpt_model,
            'rank': rank
        }
        dump_json(current_meta, get_file('sub_meta'))
        submission.to_csv(path, index=False)
示例#9
0
    def gen_genetic_data(self, df=None, version='v0'):
        """Write pairwise-operator feature bins and a gene -> bin index map.

        For every feature column (except the last) one CSV is written with
        the results of combining it with each later column; the JSON map
        records which bin index holds each generated column name.

        Args:
            df: Optional sample frame. When given, the "sample" file names
                are used; otherwise the 'all_data' CSV is read.
            version: Data version forwarded to ``get_file``.
        """
        bin_dir = get_file('genes_bin_dir', version)
        bin_file = 'bin_sample_{ix}.csv' if df is not None else 'bin_{ix}.csv'
        gene_file = 'genes' if df is None else 'genes_sample'

        os.makedirs(bin_dir, exist_ok=True)
        operators = ['+', '-', '/', '*']
        if df is None:
            df = pd.read_csv(get_file('all_data', version))
        original_genes = df.drop(['SK_ID_CURR', 'TARGET'],
                                 axis=1).columns.tolist()
        genes_map = {}
        og_len = len(original_genes)

        print('Starting')
        for ix, og in enumerate(original_genes[:-1]):
            # Report progress every tenth column; the former ``ix // 10``
            # test was true for every ix >= 10.
            if ix % 10 == 0:
                print('{} of {}'.format(ix, og_len))

            # Partner columns are the ones after ``og``.
            cols = original_genes[ix + 1:]
            gen_cols = [og + op + c for c in cols for op in operators]

            # Assumes apply_operator yields, per partner column, results in
            # the same order as ``operators`` - TODO confirm against it.
            new_genes = self.apply_operator(df[og], df[cols])
            new_genes = [g for i in new_genes for g in i]
            gene_bin = pd.DataFrame(dict(zip(gen_cols, new_genes)))
            genes_map.update(dict.fromkeys(gen_cols, ix))
            gene_bin['SK_ID_CURR'] = df['SK_ID_CURR']
            gene_bin.to_csv(os.path.join(bin_dir, bin_file.format(ix=ix)),
                            index=False)

            del gene_bin

        dump_json(genes_map, get_file(gene_file, version))
示例#10
0
 def credit_card_balance(self, num_rows=None):
     """Build per-customer aggregates from the credit card balance table.

     Args:
         num_rows: Optional cap on the number of CSV rows read.

     Returns:
         DataFrame indexed by SK_ID_CURR with min/max/mean/sum/var of every
         remaining column (prefixed ``CC_``) and a ``CC_COUNT`` row count.
     """
     cc = pd.read_csv(get_file('cc_bal'), nrows=num_rows)
     cc, cat_cols = self.one_hot_encoder(cc, nan_as_category=True)

     # The previous-application id is not aggregated.
     cc = cc.drop(columns=['SK_ID_PREV'])

     stats = ['min', 'max', 'mean', 'sum', 'var']
     cc_agg = cc.groupby('SK_ID_CURR').agg(stats)
     # Flatten the (column, function) pairs into single labels.
     flat_names = [
         'CC_' + col + "_" + fn.upper() for col, fn in cc_agg.columns.tolist()
     ]
     cc_agg.columns = pd.Index(flat_names)

     # Number of credit card rows per customer.
     cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
     del cc
     gc.collect()
     return cc_agg
示例#11
0
文件: ga.py 项目: zadiq/competitions
    def evolve(self):
        """Run the evolution loop until interrupted (one pass when SAMPLE is set).

        Each generation: run the workers, sort the population, update the
        best score/generation, record the leading member in the rank JSON,
        pickle the members to a checkpoint, then mate and mutate.
        """
        try:
            while True:
                self.assign_workers()
                self.start_workers()
                self.await_workers()
                self.population.sort()

                # After sorting, the first member is treated as the leader.
                leader = self.population.members[0]
                cs = leader.score
                gen = self.population.gen
                if cs >= self.best_score:
                    self.best_score, self.best_gen = cs, gen

                rank_path = get_file(
                    'sample_ga_rank' if SAMPLE else 'ga_rank',
                    self.tc.version)
                current_rank = safe_load_json(rank_path)
                current_rank[leader.model_path] = {'gen': gen, 'score': cs}
                dump_json(current_rank, rank_path)

                ckpt_path = self.ckpt_temp.format(gen=gen, score=cs)
                with open(ckpt_path, 'wb') as fp:
                    pickle.dump(self.population.members, fp)

                print(
                    '\n-----------------------------------------------------------------------------'
                )
                summary = 'Gen: {} | Best score: {} by {} | Current Best Score: {}'
                print(summary.format(gen, self.best_score, self.best_gen, cs))
                print(
                    '-----------------------------------------------------------------------------\n'
                )

                self.population.mate_members()
                self.population.mutate_members()
                self.population.gen += 1
                gc.collect()

                # Sample runs stop after a single generation.
                if SAMPLE:
                    break
        except KeyboardInterrupt:
            print('Exited Gracefully with score:{}'.format(self.best_score))

        print('Finished!')
示例#12
0
    def get_callbacks(self):
        """Create the model output folders and return the training callbacks.

        Returns:
            A list with a checkpoint callback for the model folder, a
            best-only checkpoint callback for its ``best`` subfolder, and a
            TensorBoard callback writing to its ``logs`` subfolder.
        """
        os.makedirs(get_file('board_models', self.tc.version), exist_ok=True)

        model_folder = self.get_model_folder()
        # Remember the folder on the config so other steps can find it.
        self.tc.board_model['model_folder'] = model_folder
        print('Model dir: {}'.format(model_folder))
        os.makedirs(model_folder, exist_ok=True)

        log_dir = os.path.join(model_folder, 'logs')
        print('Tensorboard log dir: {}'.format(log_dir))
        best_models_dir = os.path.join(model_folder, 'best')
        print('Best Model dir: {}'.format(best_models_dir))
        for folder in (log_dir, best_models_dir):
            os.makedirs(folder, exist_ok=True)

        save_format = 'weights.{epoch:02d}-{val_loss:.2f}-{auc_roc:.2f}.hdf5'
        every_epoch = ModelCheckpoint(os.path.join(model_folder, save_format))
        best_only = ModelCheckpoint(
            os.path.join(best_models_dir, save_format), save_best_only=True)
        board = TensorBoard(log_dir=log_dir)
        return [every_epoch, best_only, board]
示例#13
0
    def pos_cash(self, num_rows=None):
        """Build per-customer aggregates from the POS-CASH balance table.

        Args:
            num_rows: Optional cap on the number of CSV rows read.

        Returns:
            DataFrame indexed by SK_ID_CURR with ``POS_``-prefixed aggregate
            columns and a ``POS_COUNT`` row count.
        """
        pos = pd.read_csv(get_file('pc_bal'), nrows=num_rows)
        pos, cat_cols = self.one_hot_encoder(pos, nan_as_category=True)

        # Numeric aggregations, then the mean of every one-hot column.
        aggregations = {
            'MONTHS_BALANCE': ['max', 'mean', 'size'],
            'SK_DPD': ['max', 'mean'],
            'SK_DPD_DEF': ['max', 'mean'],
        }
        aggregations.update({cat: ['mean'] for cat in cat_cols})

        pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
        # Flatten the (column, function) pairs into single labels.
        flat_names = [
            'POS_' + col + "_" + fn.upper()
            for col, fn in pos_agg.columns.tolist()
        ]
        pos_agg.columns = pd.Index(flat_names)

        # Number of POS-CASH rows per customer.
        pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
        del pos
        gc.collect()
        return pos_agg
示例#14
0
def kfold_lightgbm(tc=TrainConfig(), manual=False, test_df=None,
                   train_df=None):
    """Train LightGBM with K-fold cross validation and write the run artefacts.

    Args:
        tc: Training configuration, passed through ``load_train_config``.
            NOTE(review): the default ``TrainConfig()`` is evaluated once at
            definition time and shared between calls.
        manual: When false, ``train_df`` and ``test_df`` are read from the
            'org_train' / 'org_test' CSVs and any passed frames are ignored.
        test_df: Test frame, used only when ``manual`` is true.
        train_df: Train frame, used only when ``manual`` is true.

    Returns:
        Tuple ``(feature_importance_df, score, model_path)`` where ``score``
        is the ROC AUC of the out-of-fold predictions.
    """

    # Entries collected here are merged into the shared board file below.
    board = {}

    tc = load_train_config(tc)
    if not manual:
        train_df = pd.read_csv(get_file('org_train', v=tc.version),
                               index_col='index')
        test_df = pd.read_csv(get_file('org_test', v=tc.version),
                              index_col='index')
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Every run writes into its own timestamped folder under 'models'.
    os.makedirs(get_file('models', v=tc.version), exist_ok=True)
    model_dir = datetime.now().strftime('%m-%d_%H-%M-%S')
    model_path = os.path.join(get_file('models', v=tc.version), model_dir)
    os.makedirs(model_path, exist_ok=True)
    sub_file = os.path.join(model_path, 'submission.csv')
    feat_imp_file = os.path.join(model_path, 'feature_importance.csv')
    # The score placeholder is filled in once the overall AUC is known.
    model_txt = os.path.join(model_path, 'model-{score:.2f}.txt')
    tc_path = os.path.join(model_path, 'train_config.json')

    # Keep a copy of the configuration next to the model outputs.
    with open(tc_path, 'w') as tcp:
        json.dump(json.loads(tc.to_json()), tcp, indent=4)

    print('Model path: {}'.format(model_path))

    # Cross validation model
    if tc.stratified:
        folds = StratifiedKFold(n_splits=tc.num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=tc.num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    # Identifier and target columns are never used as features.
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]
    clf = None

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        # free_raw_data=False: presumably so that d_valid.data and
        # d_valid.label stay readable below - confirm against LightGBM docs.
        d_train = lgb.Dataset(data=train_df[feats].iloc[train_idx],
                              label=train_df['TARGET'].iloc[train_idx],
                              free_raw_data=False,
                              silent=True)
        d_valid = lgb.Dataset(data=train_df[feats].iloc[valid_idx],
                              label=train_df['TARGET'].iloc[valid_idx],
                              free_raw_data=False,
                              silent=True)

        clf = lgb.train(params=tc.params,
                        train_set=d_train,
                        num_boost_round=10000,
                        valid_sets=[d_train, d_valid],
                        early_stopping_rounds=200,
                        verbose_eval=False)

        # Out-of-fold predictions for this fold's validation rows; test
        # predictions are averaged over the folds.
        oof_preds[valid_idx] = clf.predict(d_valid.data)
        sub_preds += clf.predict(test_df[feats]) / folds.n_splits

        # Record the gain-based importance of every feature for this fold.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(d_valid.label, oof_preds[valid_idx])))

    # print('Saving model')
    # NOTE(review): only the booster from the final fold is saved, yet the
    # file name carries the AUC over all out-of-fold predictions.
    score = roc_auc_score(train_df['TARGET'], oof_preds)
    mt = model_txt.format(score=score)
    board[mt] = score
    clf.save_model(mt)

    print('Full AUC score %.6f' % score)

    # Merge this run into the board file; a missing or corrupt file starts
    # a fresh board.
    try:
        with open(get_file('board')) as fp:
            current_board = json.load(fp)
    except (FileNotFoundError, json.JSONDecodeError):
        current_board = {}

    with open(get_file('board'), 'w') as fp:
        current_board.update(board)
        json.dump(current_board, fp)

    # Write the submission file and the per-fold feature importances
    sub_df = test_df[['SK_ID_CURR']].copy()
    sub_df['TARGET'] = sub_preds
    sub_df[['SK_ID_CURR', 'TARGET']].to_csv(sub_file, index=False)
    feature_importance_df.to_csv(feat_imp_file, index=False)
    return feature_importance_df, score, model_path
示例#15
0
 def get_model_folder(self, model_type=None, time=None):
     """Return the folder path for a board model.

     Args:
         model_type: Folder-name prefix; falls back to ``self.model_type``
             when falsy.
         time: Folder-name suffix; falls back to the current time formatted
             as ``%m-%d_%H-%M-%S`` when falsy.

     Returns:
         Path ``<board_models dir>/<model_type>_<time>``.
     """
     if not time:
         time = datetime.now().strftime('%m-%d_%H-%M-%S')
     if not model_type:
         model_type = self.model_type
     folder_name = '{type}_{time_format}'.format(type=model_type,
                                                 time_format=time)
     models_root = get_file('board_models', self.tc.version)
     return os.path.join(models_root, folder_name)
示例#16
0
    def previous_applications(self, num_rows=None):
        """Build per-customer aggregates from the previous applications table.

        Args:
            num_rows: Optional cap on the number of CSV rows read.

        Returns:
            DataFrame indexed by SK_ID_CURR with ``PREV_`` aggregates over all
            applications, ``APPROVED_`` / ``REFUSED_`` numeric aggregates
            over the respective subsets, and ``NEW_RATIO_PREV_`` columns
            holding the approved / refused ratio of each numeric aggregate.
        """
        prev = pd.read_csv(get_file('pre_app'), nrows=num_rows)
        prev, cat_cols = self.one_hot_encoder(prev, nan_as_category=True)
        # Days 365.243 values -> nan
        # Assign the result back: a chained ``prev[col].replace(...,
        # inplace=True)`` acts on a temporary and never updates ``prev``
        # under pandas Copy-on-Write.
        sentinel_cols = [
            'DAYS_FIRST_DRAWING',
            'DAYS_FIRST_DUE',
            'DAYS_LAST_DUE_1ST_VERSION',
            'DAYS_LAST_DUE',
            'DAYS_TERMINATION',
        ]
        prev[sentinel_cols] = prev[sentinel_cols].replace(365243, np.nan)
        # Add feature: value ask / value received percentage
        prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
        # Previous applications numeric features
        num_aggregations = {
            'AMT_ANNUITY': ['min', 'max', 'mean'],
            'AMT_APPLICATION': ['min', 'max', 'mean'],
            'AMT_CREDIT': ['min', 'max', 'mean'],
            'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
            'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
            'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
            'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
            'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
            'DAYS_DECISION': ['min', 'max', 'mean'],
            'CNT_PAYMENT': ['mean', 'sum'],
        }
        # Previous applications categorical features
        cat_aggregations = {cat: ['mean'] for cat in cat_cols}

        prev_agg = prev.groupby('SK_ID_CURR').agg({
            **num_aggregations,
            **cat_aggregations
        })
        prev_agg.columns = pd.Index([
            'PREV_' + e[0] + "_" + e[1].upper()
            for e in prev_agg.columns.tolist()
        ])
        # Previous Applications: Approved Applications - only numerical features
        approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
        approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
        # Keep the (column, function) pairs before renaming; they drive the
        # ratio features below.
        cols = approved_agg.columns.tolist()
        approved_agg.columns = pd.Index([
            'APPROVED_' + e[0] + "_" + e[1].upper()
            for e in approved_agg.columns.tolist()
        ])
        prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
        # Previous Applications: Refused Applications - only numerical features
        refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
        refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
        refused_agg.columns = pd.Index([
            'REFUSED_' + e[0] + "_" + e[1].upper()
            for e in refused_agg.columns.tolist()
        ])
        prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
        del refused, refused_agg, approved, approved_agg, prev

        # Approved / refused ratio for every numeric aggregate.
        for e in cols:
            suffix = e[0] + "_" + e[1].upper()
            prev_agg['NEW_RATIO_PREV_' + suffix] = (
                prev_agg['APPROVED_' + suffix] / prev_agg['REFUSED_' + suffix])

        gc.collect()
        return prev_agg
示例#17
0
    def bureau_and_balance(self, num_rows=None, nan_as_category=True):
        """Build per-customer aggregates from bureau and bureau balance data.

        Args:
            num_rows: Optional cap on the number of rows read from each CSV.
            nan_as_category: Forwarded to ``self.one_hot_encoder``.

        Returns:
            DataFrame indexed by SK_ID_CURR with ``BURO_`` aggregates over all
            credits, ``ACTIVE_`` / ``CLOSED_`` numeric aggregates over the
            respective subsets, and ``NEW_RATIO_BURO_`` columns holding the
            active / closed ratio of each numeric aggregate.
        """
        def _flat_index(frame, prefix=''):
            # Collapse (column, function) pairs into PREFIX + COLUMN_FUNCTION.
            return pd.Index([
                prefix + col + "_" + fn.upper()
                for col, fn in frame.columns.tolist()
            ])

        bureau = pd.read_csv(get_file('bureau'), nrows=num_rows)
        bb = pd.read_csv(get_file('bureau_bal'), nrows=num_rows)
        bb, bb_cat = self.one_hot_encoder(bb, nan_as_category)
        bureau, bureau_cat = self.one_hot_encoder(bureau, nan_as_category)

        # Summarise the monthly balances per bureau credit and attach them
        # to the bureau rows.
        bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
        bb_aggregations.update({col: ['mean'] for col in bb_cat})
        bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
        bb_agg.columns = _flat_index(bb_agg)
        bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
        bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
        del bb, bb_agg
        gc.collect()

        # Numeric features of bureau and the joined balance summary.
        num_aggregations = {
            'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
            'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
            'DAYS_CREDIT_UPDATE': ['mean'],
            'CREDIT_DAY_OVERDUE': ['max', 'mean'],
            'AMT_CREDIT_MAX_OVERDUE': ['mean'],
            'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
            'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
            'AMT_CREDIT_SUM_OVERDUE': ['mean'],
            'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
            'AMT_ANNUITY': ['max', 'mean'],
            'CNT_CREDIT_PROLONG': ['sum'],
            'MONTHS_BALANCE_MIN': ['min'],
            'MONTHS_BALANCE_MAX': ['max'],
            'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
        }
        # Categorical features: bureau one-hot columns plus the per-credit
        # means of the balance one-hot columns.
        cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
        cat_aggregations.update(
            {cat + "_MEAN": ['mean'] for cat in bb_cat})

        all_aggregations = {**num_aggregations, **cat_aggregations}
        bureau_agg = bureau.groupby('SK_ID_CURR').agg(all_aggregations)
        bureau_agg.columns = _flat_index(bureau_agg, 'BURO_')

        # Active credits: numeric aggregations only.
        active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
        active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
        # Remember the raw (column, function) pairs for the ratio features.
        ratio_pairs = active_agg.columns.tolist()
        active_agg.columns = _flat_index(active_agg, 'ACTIVE_')
        bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
        del active, active_agg
        gc.collect()

        # Closed credits: numeric aggregations only.
        closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
        closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
        closed_agg.columns = _flat_index(closed_agg, 'CLOSED_')
        bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')

        # Active / closed ratio for every numeric aggregate.
        for col, fn in ratio_pairs:
            suffix = col + "_" + fn.upper()
            numerator = bureau_agg['ACTIVE_' + suffix]
            denominator = bureau_agg['CLOSED_' + suffix]
            bureau_agg['NEW_RATIO_BURO_' + suffix] = numerator / denominator

        del closed, closed_agg, bureau
        gc.collect()
        return bureau_agg
示例#18
0
    def application_train_test(self, num_rows=None, nan_as_category=False):
        """Load the application train/test tables and derive extra features.

        Args:
            num_rows: Optional cap on the number of rows read from each CSV.
            nan_as_category: Forwarded to ``self.one_hot_encoder``.

        Returns:
            One DataFrame holding train and test rows, with the ``NEW_*``
            features added and the categorical columns encoded.
        """
        # Read data and merge
        df = pd.read_csv(get_file('app_train'), nrows=num_rows)
        test_df = pd.read_csv(get_file('app_test'), nrows=num_rows)
        print("Train samples: {}, test samples: {}".format(
            len(df), len(test_df)))
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        df = pd.concat([df, test_df]).reset_index()
        # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
        df = df[df['CODE_GENDER'] != 'XNA']

        docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
        # FLAG_ columns that are neither document flags nor contain '_FLAG_'.
        live = [
            _f for _f in df.columns
            if 'FLAG_' in _f and 'FLAG_DOC' not in _f and '_FLAG_' not in _f
        ]

        # NaN values for DAYS_EMPLOYED: 365.243 -> nan
        # Assigned back rather than replaced in place on a column selection,
        # which is a no-op under pandas Copy-on-Write.
        df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

        # Median income per organisation type, mapped onto each row below.
        inc_by_org = df[[
            'AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE'
        ]].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

        df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
        df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df[
            'AMT_GOODS_PRICE']
        df['NEW_DOC_IND_AVG'] = df[docs].mean(axis=1)
        df['NEW_DOC_IND_STD'] = df[docs].std(axis=1)
        df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
        df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
        df['NEW_LIVE_IND_STD'] = df[live].std(axis=1)
        df['NEW_LIVE_IND_KURT'] = df[live].kurtosis(axis=1)
        df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 +
                                                           df['CNT_CHILDREN'])
        df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
        df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
        df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (
            1 + df['AMT_INCOME_TOTAL'])
        df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df[
            'EXT_SOURCE_3']
        df['NEW_EXT_SOURCES_MEAN'] = df[[
            'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'
        ]].mean(axis=1)
        df['NEW_SCORES_STD'] = df[[
            'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'
        ]].std(axis=1)
        # Rows without a computable std get the column mean instead.
        df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(
            df['NEW_SCORES_STD'].mean())
        df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
        df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
        df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df[
            'DAYS_BIRTH']
        df['NEW_PHONE_TO_EMPLOY_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df[
            'DAYS_EMPLOYED']
        df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df[
            'AMT_INCOME_TOTAL']

        # Categorical features with Binary encode (0 or 1; two categories)
        for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
            df[bin_feature], _ = pd.factorize(df[bin_feature])
        # Categorical features with One-Hot encode
        df, cat_cols = self.one_hot_encoder(df, nan_as_category)

        del test_df
        gc.collect()
        return df
示例#19
0
 def load_train_data(self):
     print('Loading train data')
     self.data = pd.read_csv(get_file('board_train_prob', self.tc.version), index_col='SK_ID_CURR')