Example #1
import pandas as pd
import torch
import transformers

import config   # project-local modules assumed by this snippet
import dataset
import engine
import models


def run(fold):
    # Evaluate a trained checkpoint on the held-out split for one CV fold.
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    model = models.TweetModel(conf=model_config)
    model.to(device)
    model.load_state_dict(torch.load(
        f'{config.TRAINED_MODEL_PATH}/model_{fold}.bin',
        map_location=device))  # map the checkpoint onto the active device
    model.eval()

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    jaccard = engine.eval_fn(valid_data_loader, model, device)

    return jaccard
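
A minimal driver for this evaluation might look like the sketch below. It assumes config.N_FOLDS (which appears in Example #4) matches the number of saved model_{fold}.bin checkpoints; the loop itself is not part of the original code.

# Hypothetical driver, not from the original snippet: evaluate every fold
# and report the mean Jaccard score across the cross-validation splits.
if __name__ == '__main__':
    scores = [run(fold) for fold in range(config.N_FOLDS)]
    print(f'Mean Jaccard across {config.N_FOLDS} folds: '
          f'{sum(scores) / len(scores):.4f}')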
Example #2
def store_tweets_to_db(keyword, tweets):
    # Persist each tweet and associate it with the search keyword.
    print_msg("Storing tweets for keyword '%s' to the database..." % keyword)
    sk = models.SearchKeyword.get_keyword(keyword)
    if not sk:
        # First time we see this keyword: create a new record for it.
        sk = models.SearchKeyword(keyword)

    for tweet in tweets:
        s_tweet = models.TweetModel.get_tweet(tweet.get('id_str'))
        if not s_tweet:
            # New tweet: save it and link it to the search keyword.
            tweetm = models.TweetModel(tweet)
            tweetm.keywords.append(sk)
            tweetm.save()
        else:
            # Already stored: just associate it with this keyword.
            s_tweet.update(sk)
    print_msg("Done!")
Example #3
import pandas as pd
import torch
import torchcontrib
import transformers

import config   # project-local modules assumed by this snippet
import dataset
import engine
import models


def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model = model.to(device)

    # Total number of optimizer steps across all epochs.
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    # Skip weight decay for biases and LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': config.WEIGHT_DECAY,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    base_opt = transformers.AdamW(optimizer_parameters,
                                  lr=config.LEARNING_RATE)
    # Wrap the base optimizer in Stochastic Weight Averaging: after swa_start
    # steps, a running average of the weights is updated every swa_freq steps.
    optimizer = torchcontrib.optim.SWA(
        base_opt,
        swa_start=int(num_train_steps * config.SWA_RATIO),
        swa_freq=config.SWA_FREQ,
        swa_lr=None)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(num_train_steps * config.WARMUP_RATIO),
        num_training_steps=num_train_steps)

    print(f'Starting training for fold={fold}')

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

    if config.USE_SWA:
        # Swap in the accumulated SWA weight average before saving the model.
        optimizer.swap_swa_sgd()

    torch.save(model.state_dict(),
               f'{config.MODEL_SAVE_PATH}/model_{fold}.bin')

    return jaccard
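
The no_decay grouping above works by substring matching on parameter names. A standalone toy illustration (not project code) of which parameters land in which group:

# Toy check of the no_decay split, independent of the project code.
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(4, 4)
        self.LayerNorm = nn.LayerNorm(4)

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
for name, _ in Toy().named_parameters():
    group = 'no decay' if any(nd in name for nd in no_decay) else 'decay'
    print(f'{name}: {group}')
# dense.weight: decay; dense.bias, LayerNorm.weight, LayerNorm.bias: no decay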
Example #4
import pickle

import pandas as pd
import torch
import tqdm
import transformers

import config   # project-local modules assumed by this snippet
import dataset
import models
import utils


def run():
    df_test = pd.read_csv(config.TEST_FILE)
    # Placeholder labels so TweetDataset can be constructed for test data.
    df_test.loc[:, 'selected_text'] = df_test.text.values

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    # Load every fold's checkpoint so its predictions can be ensembled.
    fold_models = []
    for i in range(config.N_FOLDS):
        model = models.TweetModel(conf=model_config)
        model.to(device)
        model.load_state_dict(
            torch.load(f'{config.TRAINED_MODEL_PATH}/model_{i}.bin',
                       map_location=device))
        model.eval()
        fold_models.append(model)

    test_dataset = dataset.TweetDataset(
        tweets=df_test.text.values,
        sentiments=df_test.sentiment.values,
        selected_texts=df_test.selected_text.values)

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4)

    char_pred_test_start = []
    char_pred_test_end = []

    with torch.no_grad():
        tk0 = tqdm.tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d['ids']
            token_type_ids = d['token_type_ids']
            mask = d['mask']
            orig_tweet = d['orig_tweet']
            offsets = d['offsets']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)

            outputs_start_folds = []
            outputs_end_folds = []
            for i in range(config.N_FOLDS):
                outputs_start, outputs_end = \
                    fold_models[i](ids=ids,
                                   mask=mask,
                                   token_type_ids=token_type_ids)
                outputs_start_folds.append(outputs_start)
                outputs_end_folds.append(outputs_end)

            # Ensemble: average the raw logits across folds, then softmax.
            outputs_start = sum(outputs_start_folds) / config.N_FOLDS
            outputs_end = sum(outputs_end_folds) / config.N_FOLDS

            outputs_start = torch.softmax(outputs_start,
                                          dim=-1).detach().cpu().numpy()
            outputs_end = torch.softmax(outputs_end,
                                        dim=-1).detach().cpu().numpy()

            # Project token-level probabilities onto character positions.
            for px, tweet in enumerate(orig_tweet):
                char_pred_test_start.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_start[px]))
                char_pred_test_end.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_end[px]))

    # Persist the character-level predictions for downstream use (e.g. blending).
    with open('roberta-char_pred_test_start.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_start, handle)
    with open('roberta-char_pred_test_end.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_end, handle)
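
utils.token_level_to_char_level is not shown in the source. A plausible sketch of its contract, assuming it spreads each token's probability over the characters that token covers, is:

# Hedged sketch only: the real utils.token_level_to_char_level is not shown
# above, so this is a guess at its behavior, not the project's implementation.
import numpy as np

def token_level_to_char_level(text, offsets, probs):
    # Give every character in a token's offset span that token's probability;
    # special tokens carry (0, 0) offsets and are skipped.
    char_probs = np.zeros(len(text))
    for (start, end), p in zip(offsets, probs):
        if end > start:
            char_probs[start:end] = p
    return char_probs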