Exemplo n.º 1
0
    def gen_data(self, debug=False, version='v0'):
        """Build the curated feature table and write it to disk as CSV.

        Starts from ``self.application_train_test`` and joins onto it, with
        ``how='left'`` on ``SK_ID_CURR``, the frames returned by
        ``bureau_and_balance``, ``previous_applications``, ``pos_cash``,
        ``installments_payments`` and ``credit_card_balance`` (in that
        order). Columns named in ``features_with_no_imp_at_least_twice`` are
        then dropped and three files are written via ``get_file``:
        'all_data' (every row), 'org_train' (rows whose ``TARGET`` is not
        null) and 'org_test' (rows whose ``TARGET`` is null).

        Args:
            debug: When true, every loader is called with ``num_rows=10000``;
                otherwise with ``None``.
            version: Forwarded to ``get_file`` to resolve the output
                locations.

        Returns:
            The joined DataFrame after the listed columns were dropped.
        """
        os.makedirs(get_file('curated', version), exist_ok=True)
        num_rows = 10000 if debug else None
        df = self.application_train_test(num_rows)

        # (timer label, shape-report label, loader) for each auxiliary table,
        # listed in the order they are joined onto the application frame.
        stages = (
            ("Process bureau and bureau_balance",
             "Bureau df shape:",
             self.bureau_and_balance),
            ("Process previous_applications",
             "Previous applications df shape:",
             self.previous_applications),
            ("Process POS-CASH balance",
             "Pos-cash balance df shape:",
             self.pos_cash),
            ("Process installments payments",
             "Installments payments df shape:",
             self.installments_payments),
            ("Process credit card balance",
             "Credit card balance df shape:",
             self.credit_card_balance),
        )
        for timer_label, shape_label, load in stages:
            with timer(timer_label):
                extra = load(num_rows)
                print(shape_label, extra.shape)
                df = df.join(extra, how='left', on='SK_ID_CURR')
                # Release the auxiliary frame before the next one is built.
                del extra
                gc.collect()

        with timer("Saving data"):
            print(df.shape)
            print('Dropping unimportant features')
            # Drop only the listed columns that are actually present, so a
            # missing column no longer aborts the run with a KeyError.
            # NOTE(review): presumably a reduced debug sample can lack some
            # derived columns - confirm against the loaders.
            present = [col for col in features_with_no_imp_at_least_twice
                       if col in df.columns]
            df.drop(present, axis=1, inplace=True)
            gc.collect()
            print(df.shape)
            df.to_csv(get_file('all_data', version), index=False)
            # Compute the TARGET mask once and reuse it for both splits.
            has_target = df['TARGET'].notnull()
            df[has_target].to_csv(get_file('org_train', version),
                                  index=False)
            df[~has_target].to_csv(get_file('org_test', version),
                                   index=False)

        return df
Exemplo n.º 2
0
import argparse
import sys
sys.path.append('/home/zoguntim/dev/home_credit_ml')
from credit.utils import timer
from credit.models import kfold_lightgbm, TrainConfig

if __name__ == '__main__':
    # Command-line entry point: parse the optional config argument and run
    # kfold_lightgbm inside the timer context.
    cli = argparse.ArgumentParser('Home Credit')
    cli.add_argument(
        "-c", "--config",
        help="path to configuration file",
        default=TrainConfig(),
    )
    # With no explicit argument list, argparse reads sys.argv[1:].
    options = cli.parse_args()

    # NOTE(review): when -c is supplied, options.config is the raw string from
    # the command line, while the default is a TrainConfig instance - confirm
    # kfold_lightgbm accepts both.
    with timer("Train model"):
        result = kfold_lightgbm(tc=options.config)
    print('Finished!')

# python train.py -c /home/zoguntim/dev/home_credit_ml/runs/configs/cfg-2.json
Exemplo n.º 3
0
import sys
sys.path.append('/home/zoguntim/dev/home_credit_ml')
from credit.data import CurateData
from credit.utils import timer

if __name__ == "__main__":
    # Build a CurateData instance and run gen_data with its default
    # arguments, all inside the timer context.
    with timer("Generate curated data"):
        curator = CurateData()
        curator.gen_data()
    print('Finished!')
Exemplo n.º 4
0
import argparse
import sys
sys.path.append('/home/zoguntim/dev/home_credit_ml')
from credit.utils import timer
from credit.models import BoardModel, TrainConfig


if __name__ == '__main__':
    # Command-line entry point: configure and fit a BoardModel.
    cli = argparse.ArgumentParser('Home Credit')
    cli.add_argument(
        "-c", "--config",
        help="path to configuration file",
        default=TrainConfig(),
    )
    # With no explicit argument list, argparse reads sys.argv[1:].
    options = cli.parse_args()
    # NOTE(review): options.config is parsed but never used below; the model
    # is always built from a fresh TrainConfig with the overrides that follow.

    config = TrainConfig()
    board_overrides = (
        ('layers', [50]),
        ('dropouts', [.4]),
        ('use_multi_gpu', False),
    )
    for key, value in board_overrides:
        config.board_model[key] = value

    board = BoardModel(tc=config)
    with timer("Train Board model"):
        board.fit()
    print('Finished!')


# python train_board_model.py -c /home/zoguntim/dev/home_credit_ml/runs/configs/cfg-2.json
Exemplo n.º 5
0
import sys
sys.path.append('/home/zoguntim/dev/home_credit_ml')
from credit.data import BoardProbabilities
from credit.utils import timer


if __name__ == "__main__":
    # Constructing BoardProbabilities is the only work done here; no method
    # is called on the resulting instance.
    with timer("Generate board probabilities"):
        board_probs = BoardProbabilities()
    print('Finished!')