def main():
    """Fine-tune a VAE-LSTM-pretrained R2N with 5-fold CV and write a submission.

    Loads one-hot application tables and sequence data, pretrains per-table
    encoders for each fold via ``pretrain``, fine-tunes the combined model,
    then ensembles the per-fold best models over the test set.
    """
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq/')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Split the application-table dims out; the remaining entries are the
    # per-sequence dims consumed by pretrain().
    app_dims = {
        'application_train': dims.pop('application_train'),
        'application_test': dims.pop('application_test'),
    }
    app_data = {'application_train': app_train, 'application_test': app_test}
    loader_maker = LoaderMaker(app_data, sequences, args, onehot=True)
    # FIX: `name` was assigned inside the fold loop but used after it, which
    # would raise NameError if the loop never ran; it is loop-invariant, so
    # hoist it above the loop.
    name = '82_vaelstm_fine'
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        # Pretrain sequence encoders on this fold's split, then fine-tune
        # the full model on top of them.
        encoders = pretrain(app_train, app_test, sequences, dims,
                            train_index, val_index, args)
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        model = LightningModel(
            PretrainedR2N(app_dims, args.n_hidden, args.n_main, encoders),
            nn.BCEWithLogitsLoss(),
            train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)
    # Predict on the test set with the per-fold best models.
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    df_submission.to_csv(f'../submission/{name}.csv', index=False)
def main():
    """Train an R2N-CNN with 5-fold CV (one-hot or label data) and write a submission."""
    args = parse_args()
    seed_everything(args.seed)
    # Pick the data directories matching the requested encoding.
    if args.onehot:
        data_dir, seq_dir = '../data/05_onehot', '../data/06_onehot_seq'
    else:
        data_dir, seq_dir = '../data/03_powertransform', '../data/04_sequence'
    all_data = read_all(directory=data_dir)
    sequences = read_sequences(directory=seq_dir)
    dims = get_dims(all_data)
    loader_maker = LoaderMaker(all_data, sequences, args, onehot=args.onehot)
    # CV
    name = '15_cnn-onehot' if args.onehot else '15_cnn-label'
    app = all_data['application_train']
    best_models = []
    splitter = StratifiedKFold(n_splits=5)
    for train_index, val_index in splitter.split(app['SK_ID_CURR'],
                                                app['TARGET']):
        model = LightningModel(R2NCNN(dims, args.n_hidden, args.n_main),
                               nn.BCEWithLogitsLoss(),
                               loader_maker.make(train_index),
                               loader_maker.make(val_index),
                               args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_models.append(load_model(model, name, trainer.logger.version))
    # Predict
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    if args.onehot:
        filename = '../submission/15_r2n-cnn-onehot.csv'
    else:
        filename = '../submission/15_r2n-cnn-label.csv'
    df_submission.to_csv(filename, index=False)
def main():
    """Fine-tune an R2N on one-hot data using pretrained VAE-LSTM encoders."""
    args = parse_args()
    # FIX: seed before any data loading / training for reproducibility,
    # consistent with the other entry points in this file.
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Split the application-table dims out; the remainder describes the
    # sequence tables whose encoders are loaded below.
    app_dims = {
        'application_train': dims.pop('application_train'),
        'application_test': dims.pop('application_test'),
    }
    # Load the pretrained encoder for every sequence table from the
    # 22_vaelstm logs.
    encoders = {}
    for name, diminfo in dims.items():
        model = VAELSTMModule(diminfo, args.n_hidden, None, None, args)
        model = load_model(model, name, logdir='../logs/22_vaelstm')
        encoders[name] = model.model.encoder
    run_fine_tuning(args, app_dims, app_train, app_test, sequences,
                    encoders, '42_vaelstm', onehot=True)
def main():
    """Train an MLP baseline with 5-fold CV and write a submission.

    Supports two encodings chosen by ``args.onehot``: one-hot continuous
    input (MLPOneHot) or label-encoded categoricals with embeddings (MLP).
    """
    args = parse_args()
    seed_everything(args.seed)
    if args.onehot:
        app_train = joblib.load('../data/05_onehot/application_train.joblib')
        app_test = joblib.load('../data/05_onehot/application_test.joblib')
        dims = get_dims({'application_train': app_train})
        _, _, cont_dim = dims['application_train']
        n_input = cont_dim
    else:
        app_train = joblib.load(
            '../data/03_powertransform/application_train.joblib')
        app_test = joblib.load(
            '../data/03_powertransform/application_test.joblib')
        dims = get_dims({'application_train': app_train})
        cat_dims, emb_dims, cont_dim = dims['application_train']
        # Embeddings are concatenated with the continuous features.
        n_input = emb_dims.sum() + cont_dim
    n_hidden = args.n_hidden
    # FIX: `name` is loop-invariant; hoisted out of the fold loop.
    name = '13_mlp-onehot' if args.onehot else '13_mlp-label'
    # CV
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        train_dataloader = make_dataloader(app_train, train_index,
                                           args.batch_size,
                                           onehot=args.onehot)
        val_dataloader = make_dataloader(app_train, val_index,
                                         args.batch_size, onehot=args.onehot)
        if args.onehot:
            network = MLPOneHot(n_input, n_hidden)
        else:
            network = MLP(cat_dims, emb_dims, n_input, n_hidden)
        model = LightningModel(network, nn.BCEWithLogitsLoss(),
                               train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)
    # Predict on the test set with the per-fold best models.
    test_dataloader = make_dataloader(app_test, None, args.batch_size,
                                      train=False, onehot=args.onehot)
    df_submission = predict(best_models, test_dataloader)
    filename = ('../submission/13_mlp-onehot.csv' if args.onehot
                else '../submission/13_mlp-label.csv')
    df_submission.to_csv(filename, index=False)
def pretrain(app_train, app_test, sequences, dims, train_index, val_index,
             args):
    """Pretrain one DIM-LSTM encoder per sequence table on a fold split.

    Parameters
    ----------
    app_train, app_test : application tables (loaded upstream via joblib).
    sequences : dict holding ``'<name>_cat'`` / ``'<name>_cont'`` entries
        for each sequence table.
    dims : dict mapping sequence-table name -> dim info for DIMLSTMModule.
    train_index, val_index : fold row indices into ``app_train``.
    args : parsed CLI arguments (batch_size, n_hidden, n_epochs, patience).

    Returns
    -------
    dict mapping sequence-table name -> the best checkpoint's encoder.
    """
    encoders = {}
    logdir = '../logs/81_dimlstm'  # loop-invariant, hoisted out of the loop
    for name, diminfo in dims.items():
        cat = sequences[f'{name}_cat']
        cont = sequences[f'{name}_cont']
        train_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_train, cat, cont, index=train_index,
                            app_test=app_test),
            batch_size=args.batch_size, shuffle=True, num_workers=6,
            worker_init_fn=worker_init_fn)
        val_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_train, cat, cont, index=val_index),
            batch_size=args.batch_size, shuffle=False, num_workers=6,
            worker_init_fn=worker_init_fn)
        model = DIMLSTMModule(diminfo, args.n_hidden, train_loader,
                              val_loader, args)
        # FIX: exist_ok avoids the check-then-create race of the original
        # `if not path.exists(): path.mkdir(parents=True)`.
        (pathlib.Path(logdir) / name).mkdir(parents=True, exist_ok=True)
        logger = TensorBoardLogger(logdir, name=name)
        early_stopping = EarlyStopping(patience=args.patience,
                                       monitor='val_loss_main', mode='min')
        filepath = (pathlib.Path(logdir) / name /
                    f'version_{logger.version}' / 'checkpoints')
        model_checkpoint = ModelCheckpoint(str(filepath),
                                           monitor='val_loss_main',
                                           mode='min')
        trainer = pl.Trainer(default_save_path=logdir, gpus=-1,
                             max_epochs=args.n_epochs,
                             early_stop_callback=early_stopping,
                             logger=logger, row_log_interval=100,
                             checkpoint_callback=model_checkpoint)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version,
                                logdir=logdir)
        encoders[name] = best_model.encoder
    return encoders
def main():
    """Fine-tune an R2N using encoders from pretrained DIM-LSTM modules."""
    args = parse_args()
    # FIX: seed for reproducibility, consistent with the other entry points
    # in this file.
    seed_everything(args.seed)
    app_train = joblib.load(
        '../data/03_powertransform/application_train.joblib')
    app_test = joblib.load(
        '../data/03_powertransform/application_test.joblib')
    sequences = read_all('../data/04_sequence')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    # Split the application-table dims out; the remainder describes the
    # sequence tables whose encoders are loaded below.
    app_dims = {
        'application_train': dims.pop('application_train'),
        'application_test': dims.pop('application_test'),
    }
    # Load the pretrained encoder for every sequence table from the
    # 21_dimlstm logs.
    encoders = {}
    for name, diminfo in dims.items():
        model = DIMLSTMModule(diminfo, args.n_hidden, None, None, args)
        model = load_model(model, name, logdir='../logs/21_dimlstm')
        encoders[name] = model.encoder
    run_fine_tuning(args, app_dims, app_train, app_test, sequences,
                    encoders, '41_dimlstm')
def main():
    """Pretrain a DIM-LSTM per sequence table and dump encoded features.

    For each sequence table: trains a DIMLSTMModule, reloads the best
    checkpoint, runs it over train and test rows, and dumps the
    concatenated encodings to ``../data/21_dimlstm/<name>.joblib``.
    """
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load(
        '../data/03_powertransform/application_train.joblib')
    app_test = joblib.load(
        '../data/03_powertransform/application_test.joblib')
    sequences = read_sequences('../data/04_sequence/')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    # The application tables are not sequences; drop them from the dim map.
    dims.pop('application_train')
    dims.pop('application_test')
    logdir = '../logs/21_dimlstm'  # loop-invariant, hoisted out of the loop
    for name, diminfo in dims.items():
        cat = sequences[f'{name}_cat']
        cont = sequences[f'{name}_cont']
        train_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_train, cat, cont),
            batch_size=args.batch_size, shuffle=True, num_workers=6,
            worker_init_fn=worker_init_fn)
        test_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_test, cat, cont),
            batch_size=args.batch_size, shuffle=False, num_workers=6,
            worker_init_fn=worker_init_fn)
        model = DIMLSTMModule(diminfo, args.n_hidden, train_loader,
                              test_loader, args)
        # FIX: exist_ok avoids the check-then-create race of the original
        # `if not path.exists(): path.mkdir(parents=True)`.
        (pathlib.Path(logdir) / name).mkdir(parents=True, exist_ok=True)
        logger = TensorBoardLogger(logdir, name=name)
        early_stopping = EarlyStopping(patience=args.patience,
                                       monitor='val_loss_main', mode='min')
        filepath = (pathlib.Path(logdir) / name /
                    f'version_{logger.version}' / 'checkpoints')
        model_checkpoint = ModelCheckpoint(str(filepath),
                                           monitor='val_loss_main',
                                           mode='min')
        trainer = pl.Trainer(default_save_path=logdir, gpus=-1,
                             max_epochs=args.n_epochs,
                             early_stop_callback=early_stopping,
                             logger=logger, row_log_interval=100,
                             checkpoint_callback=model_checkpoint)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version,
                                logdir=logdir)
        # Re-create the train loader without shuffling so the dumped
        # encodings keep the original row order.
        train_loader_no_shuffle = torch.utils.data.DataLoader(
            SequenceDataset(app_train, cat, cont),
            batch_size=args.batch_size, shuffle=False, num_workers=6,
            worker_init_fn=worker_init_fn)
        df_train = predict(name, best_model, train_loader_no_shuffle)
        df_test = predict(name, best_model, test_loader)
        df_encoding = pd.concat([df_train, df_test])
        dump(df_encoding, f'../data/21_dimlstm/{name}.joblib')