Example #1
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    model_config = transformers.RobertaConfig.from_pretrained(
        '/home/mikhail/workspace/roberta-base/')
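    # output_hidden_states=True makes the backbone return every layer's hidden states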
    model_config.output_hidden_states = True
    model = TweetModel(model_config)
    optimizer = AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
    criterion = loss_fn
    dataloaders_dict = get_train_val_loaders(df_train, df_valid,
                                             config.TRAIN_BATCH_SIZE)

    engine.train_model(model, dataloaders_dict, criterion, optimizer,
                       config.EPOCHS, f'roberta_fold{fold}.pth')
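
None of the examples on this page define loss_fn, the criterion handed to the training loop. A minimal sketch of the usual span-extraction objective, assuming the model returns separate start and end logits and the batch carries gold start/end token positions:

import torch.nn as nn

def loss_fn(start_logits, end_logits, start_positions, end_positions):
    # Cross-entropy over the start-token and end-token distributions, summed:
    # the standard objective for extractive QA-style heads.
    ce = nn.CrossEntropyLoss()
    return ce(start_logits, start_positions) + ce(end_logits, end_positions)
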
Example #2
    # param_optimizer / no_decay: assumed standard definitions for applying
    # weight decay to everything except biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    # optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
    optimizer = optim.AdamW(optimizer_parameters, lr=opt.lr, betas=(0.9, 0.999))
    # num_train_steps: assumed; total optimisation steps for the warmup schedule
    num_train_steps = int(len(train_idx) / batch_size) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=opt.num_warmup_steps,
        num_training_steps=num_train_steps)
    criterion = loss_fn
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx,
                                             batch_size, MODEL_PATH)

    train_model(model, dataloaders_dict, criterion, optimizer, num_epochs,
                f'roberta_fold{fold}.pth', scheduler)

test_it(MODEL_PATH)
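
get_train_val_loaders is called above with two different signatures and is not shown on this page. A sketch of the index-based variant from Example #3, assuming a hypothetical TweetDataset that tokenizes a dataframe slice into the ids, masks and offsets the loops consume:

from torch.utils.data import DataLoader

def get_train_val_loaders(df, train_idx, val_idx, batch_size=32):
    # Split the dataframe by fold indices and wrap each part in a DataLoader.
    # TweetDataset is a hypothetical Dataset, not shown in these examples.
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)
    return {
        'train': DataLoader(TweetDataset(train_df), batch_size=batch_size,
                            shuffle=True, drop_last=True),
        'val': DataLoader(TweetDataset(val_df), batch_size=batch_size,
                          shuffle=False),
    }
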
Example #3
def main():
    seed = 42
    seed_everything(seed)

    num_epochs = 3
    batch_size = 32
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    train_df = pd.read_csv('data/train.csv')
    train_df['text'] = train_df['text'].astype(str)
    train_df['selected_text'] = train_df['selected_text'].astype(str)

    for fold, (train_idx,
               val_idx) in enumerate(skf.split(train_df, train_df.sentiment),
                                     start=1):
        print(f'Fold: {fold}')

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(),
                                lr=3e-5,
                                betas=(0.9, 0.999))
        criterion = loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx,
                                                 batch_size)

        train_model(model, dataloaders_dict, criterion, optimizer, num_epochs,
                    f'roberta_fold{fold}.pth')

    # inference

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df)
    predictions = []
    models = []

    for fold in range(skf.n_splits):
        model = TweetModel()
        model.cuda()
        model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth'))
        model.eval()
        models.append(model)

    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()

        start_logits = []
        end_logits = []
        for model in models:
            with torch.no_grad():
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())

        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)
        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
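            # an inverted span is invalid; fall back to the whole tweet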
            if start_pred > end_pred:
                pred = tweet[i]
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred,
                                         offsets[i])
            predictions.append(pred)

    # submission

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
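    # post-process single-word predictions: collapse repeated punctuation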
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('!!!!', '!') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('..', '.') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('...', '.') if len(x.split()) == 1 else x)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
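
get_selected_text, used in the inference loop of Example #3, is also not shown. A minimal sketch, assuming offsets[i] holds the (char_start, char_end) span of token i in the original tweet:

def get_selected_text(text, start_idx, end_idx, offsets):
    # Concatenate the characters covered by the predicted token span,
    # re-inserting a space wherever consecutive tokens are not adjacent.
    selected = ''
    for i in range(start_idx, end_idx + 1):
        selected += text[offsets[i][0]:offsets[i][1]]
        if i + 1 <= end_idx and offsets[i][1] < offsets[i + 1][0]:
            selected += ' '
    return selected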