Example #1
def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/', filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
    print(f'training on {device}')
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Jaccard Score = {jaccard}")
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
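
Note: `utils.EarlyStopping` is used throughout these examples but never shown. A minimal sketch of what such a helper could look like, inferred only from how it is called above (the class layout and checkpoint-on-improvement behaviour are assumptions, not the original utility):

import torch

class EarlyStopping:
    """Stop training when the monitored score stops improving (sketch)."""
    def __init__(self, patience=5, mode="max", delta=0.0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model, model_path):
        # Normalize so that "higher is better" regardless of mode.
        score = score if self.mode == "max" else -score
        if self.best_score is None or score > self.best_score + self.delta:
            self.best_score = score
            torch.save(model.state_dict(), model_path)  # checkpoint on improvement
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True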
Example #2
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = pd.read_csv(config.VALID_FILE)

    model_config = transformers.RobertaConfig.from_pretrained(
        '/home/mikhail/workspace/roberta-base/')
    model_config.output_hidden_states = True
    model = TweetModel(model_config)
    optimizer = AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
    criterion = loss_fn
    dataloaders_dict = get_train_val_loaders(df_train, df_valid,
                                             config.TRAIN_BATCH_SIZE)

    engine.train_model(model, dataloaders_dict, criterion, optimizer,
                       config.EPOCHS, f'roberta_fold{fold}.pth')
Example #3
def run():
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = model_selection.train_test_split(dfx,
                                                          test_size=0.1,
                                                          random_state=42)
    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)
    device = torch.device('cuda')
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    results = []
    for epoch in range(3):
        train_fn(train_data_loader,
                 model,
                 optimizer,
                 device,
                 scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        results.append(jaccard)
        print(f"Jaccard Score = {jaccard}")
        torch.save(model.state_dict(), config.MODEL_PATH)
Example #4
def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(
        config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")

    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
Example #5
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.load_state_dict(torch.load("model.bin"))
    model.to(device)

    test_dataset = TweetDataset(tweet=df_test.text.values,
                                sentiment=df_test.sentiment.values,
                                selected_text=df_test.selected_text.values)

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)

    return eval_fn(data_loader, model, device)
Example #6
def main():
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train, df_valid = train_test_split(dfx, test_size=0.2, random_state=42)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for _ in range(config.EPOCHS):
        train_fn(train_data_loader,
                 model,
                 optimizer,
                 device,
                 scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")

    torch.save(model, "model.pth")
Example #7
def test_it(MODEL_PATH='roberta-base'):
    models = []
    for t in os.listdir('type'):
        for model_file in os.listdir(os.path.join('type', t)):
            model = TweetModel(MODEL_PATH=t)
            # model.cuda()
            model.cpu()
            model.load_state_dict(
                torch.load(os.path.join(os.path.join('type', t), model_file)))
            model.eval()
            models.append(model)

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df, MODEL_PATH=MODEL_PATH)
    predictions = []

    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()
        sentiment = data['sentiment']

        start_logits = []
        end_logits = []
        # len_logits = []
        for model in models:
            with torch.no_grad():
                model.cuda()
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())
                # len_logits.append(torch.softmax(output[2], dim=1).cpu().detach().numpy())
                model.cpu()
        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)
        # len_logits = np.mean(len_logits, axis=0)
        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            # length = np.argmax(len_logits[i])
            # end_pred = start_pred + int(length)
            sentiment_val = sentiment[i]
            original_tweet = tweet[i]
            if start_pred > end_pred:
                pred = original_tweet
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred,
                                         offsets[i])
            if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
                pred = original_tweet
            predictions.append(pred)

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('!!!!', '!') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('..', '.') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('...', '.') if len(x.split()) == 1 else x)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
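
`get_selected_text` is called above but not shown. A minimal sketch, assuming `offsets[i]` holds the `(char_start, char_end)` pair of token `i` as produced by a fast tokenizer's offset mapping (the exact helper used by these examples may differ):

def get_selected_text(text, start_idx, end_idx, offsets):
    # Rebuild the predicted substring from token character offsets.
    selected_text = ""
    for ix in range(start_idx, end_idx + 1):
        selected_text += text[offsets[ix][0]:offsets[ix][1]]
        # Re-insert the gap (usually a space) between consecutive tokens.
        if ix + 1 < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            selected_text += " "
    return selected_text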
Example #8
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model1 = TweetModel(conf=model_config)
    model1.to(device)
    model1.load_state_dict(torch.load("model_0.bin"))
    model1.eval()

    model2 = TweetModel(conf=model_config)
    model2.to(device)
    model2.load_state_dict(torch.load("model_1.bin"))
    model2.eval()

    model3 = TweetModel(conf=model_config)
    model3.to(device)
    model3.load_state_dict(torch.load("model_2.bin"))
    model3.eval()

    model4 = TweetModel(conf=model_config)
    model4.to(device)
    model4.load_state_dict(torch.load("model_3.bin"))
    model4.eval()

    model5 = TweetModel(conf=model_config)
    model5.to(device)
    model5.load_state_dict(torch.load("model_4.bin"))
    model5.eval()

    final_output = []

    test_dataset = TweetDataset(
            tweet=df_test.text.values,
            sentiment=df_test.sentiment.values,
            selected_text=df_test.selected_text.values
    )

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )
    jaccards = utils.AverageMeter()
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            outputs_start1, outputs_end1 = model1(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start2, outputs_end2 = model2(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start3, outputs_end3 = model3(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start4, outputs_end4 = model4(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start5, outputs_end5 = model5(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            outputs_start = (
                outputs_start1 
                + outputs_start2 
                + outputs_start3 
                + outputs_start4 
                + outputs_start5
            ) / 5
            outputs_end = (
                outputs_end1 
                + outputs_end2 
                + outputs_end3 
                + outputs_end4 
                + outputs_end5
            ) / 5
            
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)
                final_output.append(output_sentence)
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
    return final_output, jaccards.avg
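
The `eval_fn` / `calculate_jaccard_score` helpers are not included in these listings. The reported "Jaccard Score" is the word-level Jaccard similarity between the predicted and target spans; a minimal sketch of that metric (the helper actually used by these examples may differ slightly):

def jaccard(str1, str2):
    # Word-level Jaccard similarity between two strings.
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 0.0  # guard against two empty strings
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))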
Example #9
from model import TweetModel
from dataset import TweetDataset
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import transformers
import config
df_test = pd.read_csv(config.TEST_FILE)
df_test.loc[:, "selected_text"] = df_test.text.values
#data = [["Its coming out the socket I feel like my phones hole is not a virgin. That`s how loose it is... :`(","loose it is...", "negative"]]
# Create the pandas DataFrame
# df_test = pd.DataFrame(data, columns = ["text","selected_text","sentiment"])
# df_test.loc[:, "selected_text"] = df_test.text.values
device = torch.device("cuda")
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

# Load each of the five trained models and move to GPU
model1 = TweetModel(conf=model_config)
model1.to(device)
model1.load_state_dict(torch.load("../models/nmodel_0.bin"))  #strict=False
# print(model1.eval())

model2 = TweetModel(conf=model_config)
model2.to(device)
model2.load_state_dict(torch.load("../models/nmodel_1.bin"))  #strict=False
# print(model2.eval())

model3 = TweetModel(conf=model_config)
model3.to(device)
model3.load_state_dict(torch.load("../models/nmodel_2.bin"))
# print(model3.eval())

model4 = TweetModel(conf=model_config)
Example #10

num_epochs = 5
batch_size = 16
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

train_df = pd.read_csv('data/train.csv')
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df,
                                                      train_df.sentiment),
                                            start=1):
    print(f'Fold: {fold}')

    model = TweetModel(MODEL_PATH)
    num_train_steps = int(len(train_idx) / batch_size * num_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
Example #11
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    conf = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model = TweetModel(conf)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
Example #12
def main():
    seed = 42
    seed_everything(seed)

    num_epochs = 3
    batch_size = 32
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    train_df = pd.read_csv('data/train.csv')
    train_df['text'] = train_df['text'].astype(str)
    train_df['selected_text'] = train_df['selected_text'].astype(str)

    for fold, (train_idx,
               val_idx) in enumerate(skf.split(train_df, train_df.sentiment),
                                     start=1):
        print(f'Fold: {fold}')

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(),
                                lr=3e-5,
                                betas=(0.9, 0.999))
        criterion = loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx,
                                                 batch_size)

        train_model(model, dataloaders_dict, criterion, optimizer, num_epochs,
                    f'roberta_fold{fold}.pth')

    # inference

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df)
    predictions = []
    models = []

    for fold in range(skf.n_splits):
        model = TweetModel()
        model.cuda()
        model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth'))
        model.eval()
        models.append(model)

    for data in test_loader:
        ids = data['ids'].cuda()
        masks = data['masks'].cuda()
        tweet = data['tweet']
        offsets = data['offsets'].numpy()

        start_logits = []
        end_logits = []
        for model in models:
            with torch.no_grad():
                output = model(ids, masks)
                start_logits.append(
                    torch.softmax(output[0], dim=1).cpu().detach().numpy())
                end_logits.append(
                    torch.softmax(output[1], dim=1).cpu().detach().numpy())

        start_logits = np.mean(start_logits, axis=0)
        end_logits = np.mean(end_logits, axis=0)
        for i in range(len(ids)):
            start_pred = np.argmax(start_logits[i])
            end_pred = np.argmax(end_logits[i])
            if start_pred > end_pred:
                pred = tweet[i]
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred,
                                         offsets[i])
            predictions.append(pred)

    #submission

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('!!!!', '!') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('..', '.') if len(x.split()) == 1 else x)
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        lambda x: x.replace('...', '.') if len(x.split()) == 1 else x)
    sub_df.to_csv('submission.csv', index=False)
    sub_df.head()
Example #13
def main(args, mode):
    config = Config(
        train_dir='/mfs/renxiangyuan/tweets/data/train_folds.csv',  # original data
        # train_dir='/mfs/renxiangyuan/tweets/data/train_folds_extra.csv',  # adds more sentiment classification data
        model_save_dir=
        f'/mfs/renxiangyuan/tweets/output/{args.model_type}-5-fold-ak',
        # model_save_dir=f'/mfs/renxiangyuan/tweets/output/shuffle/{args.model_type}-5-fold-ak',
        model_type=args.model_type,
        batch_size=args.bs,
        seed=args.seed,
        lr=args.lr * 1e-5,
        max_seq_length=args.max_seq_length,
        num_hidden_layers=args.num_hidden_layers,
        cat_n_layers=args.cat_n_layers,
        froze_n_layers=args.froze_n_layers,

        # conv_head=True,
        # eps=args.eps,
        shuffle_seed=args.shuffle_seed,
        init_seed=args.init_seed,
        epochs=args.epochs,  # defaults to epochs=3
        warmup_samples=args.warmup_samples,
        # frozen_warmup=False,
        warmup_scheduler=args.scheduler,
        mask_pad_loss=args.mask_pad_loss,
        smooth=args.smooth,
        # fp16=False,
        io_loss_ratio=args.io_loss_ratio,
        io_loss_type=args.io_loss_type,
        # multi_sent_loss_ratio=0,
        # clean_data=True,  # the model uses clean_data=False
    )

    config.print_info()

    set_seed(config.seed)

    # training
    if "train" in mode:
        os.makedirs(config.MODEL_SAVE_DIR, exist_ok=True)
        jaccard_scores = []
        for i in args.train_folds:
            scores_i = train(fold=i, config=config)
            jaccard_scores.append(scores_i)
            # if i == 0 and max(scores_i) < 0.705:
            #     print("Fold 0 Too Weak, Early Stop")
            #     break
        for i, res_i in enumerate(jaccard_scores):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in jaccard_scores]))
        for i in range(1, config.EPOCHS):
            print(f"\tEpoch{i+1}: ",
                  np.mean([scores[i] for scores in jaccard_scores]))
        config.print_info()

    # testing
    if "test" in mode:
        model_paths = [
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_11shufflesd/model_0_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_3shufflesd/model_1_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_18shufflesd/model_2_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_13shufflesd/model_3_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_19shufflesd/model_4_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_0_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_1_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_2_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_3_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_4_epoch_3.pth",
        ]
        ensemble_infer(model_paths, config)
        # ensemble_infer(model_paths=None, config=config)

    # evaluation
    if "evaluate" in mode:
        device = torch.device("cuda")
        model = TweetModel(conf=config.model_config, config=config)
        model.to(device)
        res = [[] for _ in range(5)]
        for fold in range(5):
            dfx = pd.read_csv(config.TRAINING_FILE)
            df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

            valid_dataset = TweetDataset(
                tweet=df_valid.text.values,
                sentiment=df_valid.sentiment.values,
                selected_text=df_valid.selected_text.values,
                config=config,
            )

            valid_data_loader = DataLoader(valid_dataset,
                                           batch_size=config.VALID_BATCH_SIZE,
                                           num_workers=8)

            for ep in range(1, config.EPOCHS):
                state_dict_dir = os.path.join(
                    config.MODEL_SAVE_DIR, f"model_{fold}_epoch_{ep+1}.pth")
                print(state_dict_dir)
                model.load_state_dict(torch.load(state_dict_dir))
                model.eval()

                jaccards = eval_fn(valid_data_loader, model, device, config)
                print(jaccards)
                res[fold].append(jaccards)

        for i, res_i in enumerate(res):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in res]))

        for i in range(2):
            print(f"\tEpoch{i + 1}: ", np.mean([scores[i] for scores in res]))
Example #14
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        #print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
Example #15
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    # Set train validation set split
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    '''
    Create a scheduler to set the learning rate at each training step
    "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    '''
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    logger.info("{} - {}".format("Training is Starting for fold", fold))
    #model=nn.DataParallel(model)

    for epoch in range(3):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard=engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        logger.info("EPOCHS {} - Jaccard Score - {}".format(epoch, jaccard))
        es(jaccard, model, model_path=f"../models/nmodel_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
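
The scheduler comment in Example #15 can be made concrete: with `num_warmup_steps=0`, the helper scales the base learning rate by a factor that falls linearly from 1 to 0 over `num_training_steps`. A rough sketch of that factor, for illustration only (the actual schedule should come from `get_linear_schedule_with_warmup` as above):

def linear_decay_factor(step, num_training_steps):
    # LR multiplier at a given optimizer step when there is no warmup.
    return max(0.0, float(num_training_steps - step) / float(max(1, num_training_steps)))

# e.g. with num_training_steps=1000: factor is 1.0 at step 0, 0.5 at step 500, 0.0 at step 1000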
Example #16
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)

    df_train = dfx[dfx.kfold != fold].reset_index(drop = True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop = True)

    # training set
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )
    # validation set
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )

    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = train_batch_size,
        num_workers = 4,
        sampler=train_sampler
    )


    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = valid_batch_size,
        num_workers = 2,
        sampler=valid_sampler
    )

    device = torch.device("cuda")

    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path = roberta_path, conf = model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr = lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps
    )

    if args.fp16:
        # try:
        #     from apex import amp
        # except ImportError:
        #     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)


    # multi-gpu training (should be after apex fp16 initialization)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    es = utils.EarlyStopping(patience = patience, mode = "max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler = scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        es(jaccard, model, model_path = f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
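
Example #16 only calls `amp.initialize`; the matching backward pass inside `train_fn` (not shown) has to route gradients through apex's loss scaler. A minimal sketch of that standard apex pattern (the helper name and surrounding loop are assumptions, not the original `train_fn`):

from apex import amp

def backward_step(loss, optimizer, use_fp16):
    # With apex fp16, gradients must be computed on the scaled loss.
    optimizer.zero_grad()
    if use_fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()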
Example #17
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)

    df_train = dfx[dfx.kfold != fold].reset_index(drop = True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop = True)

    train_sampler = None
    val_sampler = None


    # training set  # 3) use a DistributedSampler
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )
    # validation set
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )

    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = train_batch_size,
        shuffle=(train_sampler is None),
        num_workers = 4,
        sampler=train_sampler
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = valid_batch_size,
        shuffle=False,
        num_workers = 2,
        sampler=val_sampler
    )

    device = torch.device("cuda")

    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path = roberta_path, conf = model_config)
    model.to(device)

    num_device = torch.cuda.device_count()
    if num_device > 1:
        print("Let's use", num_device, "GPUs!")
        # 5) wrap the model for distributed training
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[local_rank],
                                                          output_device=local_rank,
                                                          find_unused_parameters=True)


    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr = lr * num_device)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps
    )

    es = utils.EarlyStopping(patience = patience, mode = "max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        if distributed:
            train_sampler.set_epoch(epoch)
        train_fn(train_data_loader, model, optimizer, device, scheduler = scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)

        # if distributed:
        #     jaccard_reduce = reduce_tensor(jaccard)
        # print("jaccard_reduce:", jaccard_reduce)
        if not distributed or (distributed and torch.distributed.get_rank() == 0):
            print("Jaccard Score = ", jaccard)
            es(jaccard, model, model_path = f"./bin/model_{fold}.bin")
            if es.early_stop:
                print("Early stopping")
                break

    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
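
Example #17 relies on module-level `distributed` and `local_rank` variables and an already-initialized process group, none of which appear in the listing. A minimal sketch of that setup, assuming the script is launched with `torchrun` (or `torch.distributed.launch --use_env`) so the usual LOCAL_RANK / WORLD_SIZE environment variables are present:

import os
import torch

local_rank = int(os.environ.get("LOCAL_RANK", 0))
distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1

if distributed:
    # Bind this process to its GPU and join the NCCL process group.
    torch.cuda.set_device(local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")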