Example #1
def main():
    all_data = read_all(directory='../data/03_powertransform')
    dims = get_dims(all_data)
    dump(dims, '../data/07_dims/dims03.joblib')
    all_data = read_all(directory='../data/05_onehot')
    dims = get_dims(all_data)
    dump(dims, '../data/07_dims/dims05.joblib')
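
get_dims and dump come from the project's helper module and are not shown in these excerpts. A minimal sketch of what a get_dims-style helper might return, assuming it only records each table's feature count (the real helper presumably also tracks categorical cardinalities for the embedding models used in the later examples):

def get_dims_sketch(all_data):
    # Hypothetical stand-in for get_dims: one dims entry per table.
    return {name: {'n_features': df.shape[1]} for name, df in all_data.items()}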
Example #2
def main():
    args = parse_args()
    seed_everything(args.seed)

    if args.onehot:
        all_data = read_all(directory='../data/05_onehot')
        sequences = read_sequences(directory='../data/06_onehot_seq')
    else:
        all_data = read_all(directory='../data/03_powertransform')
        sequences = read_sequences(directory='../data/04_sequence')
    dims = get_dims(all_data)
    loader_maker = LoaderMaker(all_data, sequences, args, onehot=args.onehot)

    # CV
    name = '15_cnn-onehot' if args.onehot else '15_cnn-label'
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(all_data['application_train']['SK_ID_CURR'],
                      all_data['application_train']['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        model = LightningModel(R2NCNN(dims, args.n_hidden, args.n_main),
                               nn.BCEWithLogitsLoss(), train_dataloader,
                               val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)

    # Predict
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    filename = '../submission/15_r2n-cnn-onehot.csv' if args.onehot else '../submission/15_r2n-cnn-label.csv'
    df_submission.to_csv(filename, index=False)
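
The predict helper is also not shown. A minimal sketch, assuming each fold's best model emits one logit per row and the five fold predictions are averaged; the (x, sk_id) batch layout is an assumption, not the project's actual dataloader format:

import numpy as np
import pandas as pd
import torch

def predict_sketch(models, dataloader):
    # Average sigmoid outputs of the per-fold best models (5-fold ensemble).
    fold_preds = []
    for model in models:
        model.eval()
        with torch.no_grad():
            pred = torch.cat([torch.sigmoid(model(x)) for x, sk_id in dataloader])
        fold_preds.append(pred.cpu().numpy().ravel())
    sk_id_curr = torch.cat([sk_id for x, sk_id in dataloader]).numpy()
    return pd.DataFrame({'SK_ID_CURR': sk_id_curr,
                         'TARGET': np.mean(fold_preds, axis=0)})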
Example #3
def main():
    args = parse_args()
    feature_matrix = joblib.load('../data/02_featuretools/feature_matrix.joblib')
    app_train = feature_matrix.dropna(subset=['TARGET'])
    app_test = feature_matrix[feature_matrix['TARGET'].isnull()].drop('TARGET', axis=1)
    features = read_all('../data/21_dimlstm')
    for feature in features.values():
        app_train = app_train.merge(feature)
        app_test = app_test.merge(feature)
    features = read_all('../data/22_vaelstm')
    for feature in features.values():
        app_train = app_train.merge(feature)
        app_test = app_test.merge(feature)
    
    run_lgb(args, app_train, app_test, '36_ftdimvae')
Example #4
def main():
    args = parse_args()
    app_train = joblib.load(
        '../data/01_labelencoding/application_train.joblib')
    app_test = joblib.load('../data/01_labelencoding/application_test.joblib')
    features = read_all('../data/21_dimlstm')
    for feature in features.values():
        app_train = app_train.merge(feature)
        app_test = app_test.merge(feature)
    features = read_all('../data/22_vaelstm')
    for feature in features.values():
        app_train = app_train.merge(feature)
        app_test = app_test.merge(feature)

    run_lgb(args, app_train, app_test, '35_dimvae')
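
Examples #3 and #4 delegate to run_lgb, which is not included in these excerpts. A minimal sketch consistent with the LightGBM fold loop in Example #9 (the signature and the vars(args) params handling are assumptions):

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def run_lgb_sketch(args, app_train, app_test, name):
    # 5-fold LightGBM; averages fold predictions into one submission file.
    x = app_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    y = app_train['TARGET']
    x_test = app_test.drop('SK_ID_CURR', axis=1)
    df_submission = app_test[['SK_ID_CURR']].copy()
    for i, (tr, va) in enumerate(StratifiedKFold(n_splits=5).split(x, y)):
        model = lgb.train(vars(args), lgb.Dataset(x.iloc[tr], y.iloc[tr]),
                          valid_sets=[lgb.Dataset(x.iloc[va], y.iloc[va])])
        print(name, 'fold', i, 'auc:',
              roc_auc_score(y.iloc[va], model.predict(x.iloc[va])))
        df_submission[f'pred_{i}'] = model.predict(x_test)
    preds = df_submission.set_index('SK_ID_CURR').mean(axis=1).reset_index()
    preds.columns = ['SK_ID_CURR', 'TARGET']
    preds.to_csv(f'../submission/{name}.csv', index=False)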
Example #5
def main():
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq/')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')

    app_data = {'application_train': app_train, 'application_test': app_test}
    loader_maker = LoaderMaker(app_data, sequences, args, onehot=True)

    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    name = '82_vaelstm_fine'
    best_models = []
    for train_index, val_index in folds:
        encoders = pretrain(app_train, app_test, sequences, dims, train_index,
                            val_index, args)
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        model = LightningModel(
            PretrainedR2N(app_dims, args.n_hidden, args.n_main, encoders),
            nn.BCEWithLogitsLoss(), train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)

    # Predict
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    df_submission.to_csv(f'../submission/{name}.csv', index=False)
Example #6
def main():
    args = parse_args()
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')

    encoders = {}
    for name, diminfo in dims.items():
        model = VAELSTMModule(diminfo, args.n_hidden, None, None, args)
        model = load_model(model, name, logdir='../logs/22_vaelstm')
        encoder = model.model.encoder
        encoders[name] = encoder

    run_fine_tuning(args,
                    app_dims,
                    app_train,
                    app_test,
                    sequences,
                    encoders,
                    '42_vaelstm',
                    onehot=True)
Example #7
def main():
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(filename='../logs/02_featuretools.log',
                        level=logging.INFO,
                        format=log_format)

    datas = read_all()
    app_train = datas['application_train']
    app_test = datas['application_test']
    bureau = datas['bureau']
    bureau_balance = datas['bureau_balance']
    cash = datas['POS_CASH_balance']
    previous = datas['previous_application']
    installments = datas['installments_payments']
    credit = datas['credit_card_balance']

    app_test["TARGET"] = np.nan
    app = app_train.append(app_test, ignore_index=True, sort=False)

    # Entity set with id applications
    entity_set = ft.EntitySet(id='HomeCredit')

    # Entities with a unique index
    entity_set = entity_set.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR')
    entity_set = entity_set.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
    entity_set = entity_set.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV')

    # Entities that do not have a unique index
    entity_set = entity_set.entity_from_dataframe(
        entity_id='bureau_balance', dataframe=bureau_balance, make_index=True, index='bureaubalance_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='cash', dataframe=cash, make_index=True, index='cash_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='installments', dataframe=installments, make_index=True, index='installments_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='credit', dataframe=credit, make_index=True, index='credit_index'
    )

    # Add in the defined relationships
    entity_set = entity_set.add_relationships([
        ft.Relationship(entity_set['app']['SK_ID_CURR'],      entity_set['bureau']['SK_ID_CURR']),
        ft.Relationship(entity_set['bureau']['SK_ID_BUREAU'], entity_set['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(entity_set['app']['SK_ID_CURR'],      entity_set['previous']['SK_ID_CURR']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'], entity_set['cash']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'], entity_set['installments']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'], entity_set['credit']['SK_ID_PREV'])
    ])

    agg_primitives = ['sum', 'count', 'min', 'max', 'mean', 'mode']
    feature_matrix, _ = ft.dfs(
        entityset=entity_set, target_entity='app', agg_primitives=agg_primitives, max_depth=2, features_only=False, verbose=True
    )

    feature_matrix = feature_matrix.reset_index()
    dump(feature_matrix, '../data/02_featuretools/feature_matrix.joblib')
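
This snippet targets the pre-1.0 featuretools API (entity_from_dataframe, target_entity). On featuretools >= 1.0 the same entity set would be built roughly as below, reusing the dataframes loaded above; only the app/bureau/bureau_balance portion is shown as a sketch:

import featuretools as ft

es = ft.EntitySet(id='HomeCredit')
es = es.add_dataframe(dataframe_name='app', dataframe=app, index='SK_ID_CURR')
es = es.add_dataframe(dataframe_name='bureau', dataframe=bureau, index='SK_ID_BUREAU')
es = es.add_dataframe(dataframe_name='bureau_balance', dataframe=bureau_balance,
                      make_index=True, index='bureaubalance_index')
es = es.add_relationship('app', 'SK_ID_CURR', 'bureau', 'SK_ID_CURR')
es = es.add_relationship('bureau', 'SK_ID_BUREAU', 'bureau_balance', 'SK_ID_BUREAU')
feature_matrix, _ = ft.dfs(entityset=es, target_dataframe_name='app',
                           agg_primitives=['sum', 'count', 'min', 'max', 'mean', 'mode'],
                           max_depth=2, verbose=True)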
Example #8
def main():
    all_data = read_all(directory='../data/03_powertransform')
    app_train = all_data.pop('application_train')
    app_test = all_data.pop('application_test')
    data = pd.concat([app_train, app_test], sort=False)
    data = pd.get_dummies(data)
    app_train = data.dropna(subset=['TARGET'])
    app_test = data[data['TARGET'].isnull()].drop('TARGET', axis=1)
    dump(app_train, '../data/05_onehot/application_train.joblib')
    dump(app_test, '../data/05_onehot/application_test.joblib')
    with Pool(6) as pool:
        pool.map(process, list(all_data.items()))
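
Train and test are concatenated before pd.get_dummies so both sides end up with identical dummy columns; encoding them separately can diverge when a category appears on only one side. A tiny self-contained illustration:

import pandas as pd

train = pd.DataFrame({'code': ['A', 'B']})
test = pd.DataFrame({'code': ['A', 'C']})
both = pd.concat([train, test], ignore_index=True)
print(pd.get_dummies(both).columns.tolist())  # ['code_A', 'code_B', 'code_C']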
Example #9
def main():
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/03_powertransform/application_train.joblib')
    app_test = joblib.load('../data/03_powertransform/application_test.joblib')
    sequences = read_all('../data/04_sequence/')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')

    mlflow.set_tracking_uri('../logs/mlruns')
    mlflow.set_experiment('HomeCredit')
    run_name = '91_dimlstm'
    params = vars(args)
    df_submission = app_test[['SK_ID_CURR']].copy()

    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    for i, (train_index, val_index) in enumerate(folds):
        # Train Encoder
        encoders = pretrain(app_train, sequences, dims, train_index, val_index, args)

        # Train LightGBM Model
        app_encoding_train = predict(app_train, encoders, sequences, args)
        x = app_encoding_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
        y = app_encoding_train['TARGET']
        x_train, y_train = x.iloc[train_index], y.iloc[train_index]
        x_valid, y_valid = x.iloc[val_index], y.iloc[val_index]
        train_set = lgb.Dataset(x_train, y_train)
        valid_set = lgb.Dataset(x_valid, y_valid)
        model = lgb.train(params, train_set, valid_sets=[valid_set])
        y_pred = model.predict(x_valid)
        auc = roc_auc_score(y_valid, y_pred)
        with mlflow.start_run(run_name=run_name):
            mlflow.log_params(params)
            mlflow.log_metric('auc', auc)
        
        # Predict
        app_encoding_test = predict(app_test, encoders, sequences, args)
        x_test = app_encoding_test.drop('SK_ID_CURR', axis=1)
        y_pred = model.predict(x_test)
        df_submission[f'pred_{i}'] = y_pred
    # Average the five per-fold prediction columns into the final TARGET.
    df_submission = df_submission.set_index('SK_ID_CURR').mean(axis=1).reset_index()
    df_submission.columns = ['SK_ID_CURR', 'TARGET']
    df_submission.to_csv(f'../submission/{run_name}.csv', index=False)
Example #10
def main():
    args = parse_args()
    app_train = joblib.load('../data/03_powertransform/application_train.joblib')
    app_test = joblib.load('../data/03_powertransform/application_test.joblib')
    sequences = read_all('../data/04_sequence')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')

    encoders = {}
    for name, diminfo in dims.items():
        model = DIMLSTMModule(diminfo, args.n_hidden, None, None, args)
        model = load_model(model, name, logdir='../logs/21_dimlstm')
        encoder = model.encoder
        encoders[name] = encoder
    
    run_fine_tuning(args, app_dims, app_train, app_test, sequences, encoders, '41_dimlstm')
Example #11
def main():
    all_data = read_all('../data/05_onehot')

    app_train = all_data.pop('application_train')
    app_test = all_data.pop('application_test')
    df_sk_id_curr = pd.concat(
        [app_train[['SK_ID_CURR']], app_test[['SK_ID_CURR']]])
    # df_sk_id_curr = df_sk_id_curr.head(100)

    bureau = all_data['bureau']
    bureau_balance = all_data['bureau_balance']
    all_data['bureau_balance'] = pd.merge(
        bureau[['SK_ID_CURR', 'SK_ID_BUREAU']],
        bureau_balance,
        on='SK_ID_BUREAU')

    id_list = [df_sk_id_curr] * len(all_data)
    with Pool(6) as pool:
        pool.starmap(process, zip(id_list, list(all_data.items())))
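
bureau_balance is the only child table keyed by SK_ID_BUREAU rather than SK_ID_CURR, so it is joined through bureau before per-applicant processing. A tiny illustration of that join:

import pandas as pd

bureau = pd.DataFrame({'SK_ID_CURR': [1], 'SK_ID_BUREAU': [10]})
bureau_balance = pd.DataFrame({'SK_ID_BUREAU': [10, 10], 'MONTHS_BALANCE': [-2, -1]})
print(pd.merge(bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], bureau_balance,
               on='SK_ID_BUREAU'))  # every balance row now carries SK_ID_CURR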
Example #12
def drop_same_columns(df):
    # Plausible reconstruction of the truncated head: drop every column whose
    # values duplicate an earlier column (TypeError = incomparable dtypes).
    drop_cols = set()
    for i, c1 in enumerate(df.columns):
        for c2 in df.columns[i + 1:]:
            try:
                if (df[c1] == df[c2]).all():
                    drop_cols.add(c2)
            except TypeError:
                continue
    df.drop(drop_cols, axis=1, inplace=True)
    return df


def process(item):
    name, df = item
    df = power_transform(df)
    df = fillna(df)
    df = drop_same_columns(df)
    dump(df, f'../data/03_powertransform/{name}.joblib')


if __name__ == "__main__":
    datas = read_all()
    app_train = datas.pop('application_train')
    app_test = datas.pop('application_test')
    app = pd.concat([app_train, app_test], sort=False)
    app = power_transform(app)
    app = fillna(app)
    app = drop_same_columns(app)
    app_train = app.dropna(subset=['TARGET'])
    app_test = app[app['TARGET'].isnull()].drop('TARGET', axis=1)
    dump(app_train, '../data/03_powertransform/application_train.joblib')
    dump(app_test, '../data/03_powertransform/application_test.joblib')

    with Pool(6) as pool:
        pool.map(process, list(datas.items()))
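
power_transform and fillna are not shown in this excerpt. A minimal sketch of a power_transform-style helper, assuming a Yeo-Johnson transform over the non-key numeric columns; scikit-learn's PowerTransformer disregards NaNs when fitting and preserves them in transform, so fillna can still run afterwards as in process():

from sklearn.preprocessing import PowerTransformer

KEY_COLS = {'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_BUREAU', 'TARGET'}

def power_transform_sketch(df):
    # Hypothetical stand-in for the project's power_transform helper.
    cols = [c for c in df.select_dtypes('number').columns if c not in KEY_COLS]
    df = df.copy()
    df[cols] = PowerTransformer().fit_transform(df[cols])
    return df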
Example #13
def main():
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq/')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    dims.pop('application_train')
    dims.pop('application_test')

    for name, diminfo in dims.items():
        sequence = sequences[name]
        train_loader = torch.utils.data.DataLoader(
            OneHotSequenceDataset(app_train, sequence),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        test_loader = torch.utils.data.DataLoader(
            OneHotSequenceDataset(app_test, sequence),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        model = VAELSTMModule(diminfo, args.n_hidden, train_loader,
                              test_loader, args)
        logdir = '../logs/22_vaelstm'
        path = pathlib.Path(logdir) / name
        if not path.exists():
            path.mkdir(parents=True)
        logger = TensorBoardLogger(logdir, name=name)
        early_stopping = EarlyStopping(patience=args.patience,
                                       monitor='val_loss',
                                       mode='min')
        filepath = pathlib.Path(
            logdir) / name / f'version_{logger.version}' / 'checkpoints'
        model_checkpoint = ModelCheckpoint(str(filepath),
                                           monitor='val_loss',
                                           mode='min')
        trainer = pl.Trainer(default_save_path=logdir,
                             gpus=-1,
                             max_epochs=args.n_epochs,
                             early_stop_callback=early_stopping,
                             logger=logger,
                             row_log_interval=100,
                             checkpoint_callback=model_checkpoint)
        trainer.fit(model)

        best_model = load_model(model,
                                name,
                                trainer.logger.version,
                                logdir=logdir)
        # Re-encode the training set without shuffling so rows stay aligned
        # with app_train when the encodings are saved.
        train_loader_no_shuffle = torch.utils.data.DataLoader(
            OneHotSequenceDataset(app_train, sequence),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        df_train = predict(name, best_model, train_loader_no_shuffle)
        df_test = predict(name, best_model, test_loader)
        df_encoding = pd.concat([df_train, df_test])
        dump(df_encoding, f'../data/22_vaelstm/{name}.joblib')
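
predict here differs from the CV-ensemble predict in Example #2: it turns each applicant's sequence into a fixed-length encoding that Examples #3 and #4 later merge in as features. A sketch under the assumptions that batches arrive as (SK_ID_CURR, sequence) pairs and that the encoder returns one latent vector per row; both are assumptions, not the project's actual layout:

import pandas as pd
import torch

def encode_sketch(name, model, loader):
    model.eval()
    frames = []
    with torch.no_grad():
        for sk_id, seq in loader:
            z = model.model.encoder(seq)  # hypothetical: one vector per row
            cols = [f'{name}_{j}' for j in range(z.shape[1])]
            frames.append(pd.DataFrame(z.numpy(), columns=cols)
                          .assign(SK_ID_CURR=sk_id.numpy()))
    return pd.concat(frames, ignore_index=True)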