Example #1
def train_cross_val(p):
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)

    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)

    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
            'models/', "fold_{}".format(i)),
                                                           monitor="val_auc",
                                                           mode='max',
                                                           save_top_k=1,
                                                           period=10)
        model = Classifier(input_size=input_size,
                           output_size=output_size,
                           params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
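        # Remap the fold's row indices to positions within the concatenated subset
        # (assumes the splitter returns contiguous train indices starting at 0).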
        train_idx = [i for i in range(0, max(train_idx) + 1)]
        val_idx = [i for i in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx],
                                         './cache',
                                         train=True,
                                         mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx],
                                       './cache',
                                       train=False,
                                       mode='mean')
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset,
                                         indexes={
                                             'train': train_idx,
                                             'val': val_idx
                                         },
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc',
                           patience=10,
                           min_delta=0.0005,
                           mode='max')
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features
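A minimal sketch of how the purged splitter above behaves on toy data (the arrays and split settings below are illustrative assumptions; PurgedGroupTimeSeriesSplit itself comes from the project's utilities):

import numpy as np

X = np.random.rand(100, 3)            # 100 rows, 3 features
dates = np.repeat(np.arange(20), 5)   # 20 consecutive "days", 5 rows each

gts = PurgedGroupTimeSeriesSplit(n_splits=4, group_gap=2)
for fold, (tr_idx, val_idx) in enumerate(gts.split(X, groups=dates)):
    # Validation days follow the training days, with `group_gap` days purged
    # in between to limit look-ahead leakage.
    print(fold, dates[tr_idx].max(), dates[val_idx].min())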
Example #2
def loptimize(trial, data_dict: dict):
    p = {
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
        'max_leaves': trial.suggest_int('max_leaves', 5, 100),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3,
                                                  0.99),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3,
                                                  0.99),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 1000),
        'lambda_l1': trial.suggest_uniform('lambda_l1', 0.005, 0.05),
        'lambda_l2': trial.suggest_uniform('lambda_l2', 0.005, 0.05),
        'boosting': trial.suggest_categorical('boosting',
                                              ['gbdt', 'goss', 'rf']),
        'objective': 'binary',
        'verbose': 1,
        'n_jobs': 10,
        'metric': 'auc'
    }
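    # GOSS does not support bagging in LightGBM, so disable it explicitly.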
    if p['boosting'] == 'goss':
        p['bagging_freq'] = 0
        p['bagging_fraction'] = 1.0
    scores = []
    sizes = []
    data = data_dict['data']
    target = data_dict['target']
    date = data_dict['date']
    # gts = GroupTimeSeriesSplit()
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    for i, (tr_idx, val_idx) in enumerate(gts.split(data, groups=date)):
        sizes.append(len(tr_idx))
        x_tr, x_val = data.iloc[tr_idx], data.iloc[val_idx]
        y_tr, y_val = target[tr_idx], target[val_idx]
        x_tr, x_val = calc_data_mean(x_tr, cache_dir='cache/', fold=i, train=True), \
            calc_data_mean(x_val, cache_dir='cache/', fold=i, train=False)
        train = lgb.Dataset(x_tr, label=y_tr)
        val = lgb.Dataset(x_val, label=y_val)
        clf = lgb.train(p,
                        train,
                        500,
                        valid_sets=[val],
                        early_stopping_rounds=50,
                        verbose_eval=True)
        preds = clf.predict(x_val)
        score = roc_auc_score(y_val, preds)
        print(f'Fold {i} ROC AUC:\t', score)
        scores.append(score)
        del clf, preds, train, val, x_tr, x_val, y_tr, y_val, score
        rubbish = gc.collect()
    print(scores)
    avg_score = weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score
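The weighted_mean helper is not shown in these examples; a plausible implementation, assuming it simply weights each fold's score by its training-set size, would be:

def weighted_mean(scores, sizes):
    # Average fold scores in proportion to the number of training rows per fold.
    return sum(s * n for s, n in zip(scores, sizes)) / sum(sizes)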
Example #3
def optimize(trial: optuna.trial.Trial, data_dict: dict):
    p = {
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'max_leaves': trial.suggest_int('max_leaves', 5, 50),
        'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,
                                                  1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'lambda': trial.suggest_uniform('lambda', 0.05, 0.2),
        'alpha': trial.suggest_uniform('alpha', 0.05, 0.2),
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'verbosity': 1,
        'n_jobs': 10,
        'eval_metric': 'auc'
    }
    print('Choosing parameters:', p)
    scores = []
    sizes = []
    # gts = GroupTimeSeriesSplit()
    data = data_dict['data']
    target = data_dict['target']
    date = data_dict['date']

    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    for i, (tr_idx, val_idx) in enumerate(gts.split(data, groups=date)):
        sizes.append(len(tr_idx))
        x_tr, x_val = copy.deepcopy(data.iloc[tr_idx]), copy.deepcopy(
            data.iloc[val_idx])
        y_tr, y_val = copy.deepcopy(target[tr_idx]), copy.deepcopy(
            target[val_idx])
        x_tr, x_val = calc_data_mean(x_tr, cache_dir='cache/', fold=i, train=True), \
            calc_data_mean(x_val, cache_dir='cache/', fold=i, train=False)
        d_tr = xgb.DMatrix(x_tr, label=y_tr)
        d_val = xgb.DMatrix(x_val, label=y_val)
        clf = xgb.train(p,
                        d_tr,
                        500, [(d_val, 'eval')],
                        early_stopping_rounds=50,
                        verbose_eval=True)
        val_pred = clf.predict(d_val)
        score = roc_auc_score(y_val, val_pred)
        print(f'Fold {i} ROC AUC:\t', score)
        scores.append(score)
        del clf, val_pred, d_tr, d_val, x_tr, x_val, y_tr, y_val, score
        rubbish = gc.collect()
    print(scores)
    avg_score = weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score
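Either objective above can be handed to an Optuna study; a minimal driver, assuming data_dict has already been built, might look like this:

import optuna
from functools import partial

study = optuna.create_study(direction='maximize')
study.optimize(partial(optimize, data_dict=data_dict), n_trials=100)
print('Best AUC:', study.best_value, 'with params:', study.best_params)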
Example #4
def main(train=True):
    p = {
        'batch_size': 4986,
        'dim_1': 248,
        'dim_2': 487,
        'dim_3': 269,
        'dim_4': 218,
        'dim_5': 113,
        'activation': nn.ReLU,
        'dropout': 0.01563457578202565,
        'lr': 0.00026372556533974916,
        'label_smoothing': 0.06834918091900156,
        'weight_decay': 0.005270589494631074,
        'amsgrad': False
    }
    if train:
        models, features = train_cross_val(p)
        # models, features = final_train(p, load=False)
    else:
        data_ = load_data(root_dir='./data/', mode='train')
        data_, target_, features, date = preprocess_data(data_, nn=True)
        model_path = '/kaggle/input/model-files'
        f_mean = calc_data_mean(data_, 'cache')
        models = load_model(model_path, data_.shape[-1], 1, p, False)
    # model, checkpoint = final_train(p)
    # best_model_path = checkpoint.best_model_path
    # model, features = final_train(load=best_model_path)
    test_model(models, features)
    return models
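A typical entry point for running the pipeline (this guard is an assumption and not part of the original example):

if __name__ == '__main__':
    main(train=True)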
Example #5
def final_train(p, load=False):
    data_ = load_data(root_dir='./data/', mode='train')
    data, target, features, date = preprocess_data(data_, nn=True)
    input_size = data.shape[-1]
    output_size = 1
    train_idx, val_idx = date[date <= 450].index.values.tolist(), date[
        date > 450].index.values.tolist()
    data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True)
    data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath='models/full_train',
        monitor="val_auc",
        mode='max',
        save_top_k=1,
        period=10)
    model = Classifier(input_size=input_size,
                       output_size=output_size,
                       params=p)
    if p['activation'] == nn.ReLU:
        model.apply(lambda m: init_weights(m, 'relu'))
    elif p['activation'] == nn.LeakyReLU:
        model.apply(lambda m: init_weights(m, 'leaky_relu'))
    dataset = FinData(data, target, date)
    dataloaders = create_dataloaders(dataset,
                                     indexes={
                                         'train': train_idx,
                                         'val': val_idx
                                     },
                                     batch_size=p['batch_size'])
    es = EarlyStopping(monitor='val_auc',
                       patience=10,
                       min_delta=0.0005,
                       mode='max')
    trainer = pl.Trainer(max_epochs=500,
                         gpus=1,
                         callbacks=[checkpoint_callback, es],
                         precision=16)
    trainer.fit(model,
                train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), 'models/final_train.pth')
    return model, features
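A short sketch of reloading the saved weights for inference, assuming the same Classifier configuration used during training:

model = Classifier(input_size=input_size, output_size=1, params=p)
model.load_state_dict(torch.load('models/final_train.pth'))
model.eval()  # disable dropout before prediction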
Example #6
def optimize(trial: optuna.Trial, data_dict):
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    input_size = data_dict['data'].shape[-1]
    output_size = 5
    checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
        'models/', "trial_resnet_{}".format(trial.number)),
                                                       monitor="val_auc",
                                                       mode='max')
    logger = MetricsCallback()
    metrics = []
    sizes = []
    # trial_file = 'HPO/nn_hpo_2021-01-05.pkl'
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 8000, 15000)
    for i, (train_idx, val_idx) in enumerate(
            gts.split(data_dict['data'], groups=data_dict['date'])):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_dict['data'][idx])
        target = copy.deepcopy(data_dict['target'][idx])
        date = copy.deepcopy(data_dict['date'][idx])
        train_idx = [i for i in range(0, max(train_idx) + 1)]
        val_idx = [i for i in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx],
                                         './cache',
                                         train=True,
                                         mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx],
                                       './cache',
                                       train=False,
                                       mode='mean')
        model = Classifier(input_size, output_size, params=p)
        # model.apply(init_weights)
        dataset = FinData(data=data, target=target, date=date, multi=True)
        dataloaders = create_dataloaders(dataset,
                                         indexes={
                                             'train': train_idx,
                                             'val': val_idx
                                         },
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_loss',
                           patience=10,
                           min_delta=0.0005,
                           mode='min')
        trainer = pl.Trainer(logger=False,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[
                                 checkpoint_callback, logger,
                                 PyTorchLightningPruningCallback(
                                     trial, monitor='val_loss'), es
                             ],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        val_loss = logger.metrics[-1]['val_loss'].item()
        metrics.append(val_loss)
        sizes.append(len(train_idx))
    metrics_mean = weighted_mean(metrics, sizes)
    return metrics_mean
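Because this objective returns the weighted mean validation loss, the study should minimise it; a hypothetical study setup with pruning (the pruner choice and trial count are assumptions) could be:

import optuna
from functools import partial

study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
study.optimize(partial(optimize, data_dict=data_dict), n_trials=50)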