def load_tweets_dataset(lang, text_transform, batch_size, val_batch_size):
    """
    Import tweets data set
    :param lang:
    :param text_transform:
    :param batch_size:
    :param val_batch_size:
    :return:
    """
    # Tweet data set 2017 training
    tweet_dataset_train_17 = dataset.TweetDataset(root='./data/', download=True, lang=lang,
                                                  text_transform=text_transform, year=2017, train=True)
    pan17loader_training = torch.utils.data.DataLoader(tweet_dataset_train_17, batch_size=batch_size, shuffle=True)

    # Tweet data set 2017 validation
    tweet_dataset_val_17 = dataset.TweetDataset(root='./data/', download=True, lang=lang,
                                                text_transform=text_transform, year=2017, train=False)
    pan17loader_validation = torch.utils.data.DataLoader(tweet_dataset_val_17, batch_size=batch_size, shuffle=True)

    # Tweet data set 2018 training
    tweet_dataset_train_18 = dataset.TweetDataset(root='./data/', download=True,
                                                  lang=lang, text_transform=text_transform, year=2018,
                                                  train=True)
    pan18loader_training = torch.utils.data.DataLoader(tweet_dataset_train_18, batch_size=batch_size, shuffle=True)

    # Tweet data set 2018 validation
    tweet_dataset_val_18 = dataset.TweetDataset(root='./data/', download=True,
                                                lang=lang, text_transform=text_transform, year=2018,
                                                train=False)
    pan18loader_validation = torch.utils.data.DataLoader(tweet_dataset_val_18, batch_size=val_batch_size,
                                                         shuffle=True)

    return pan17loader_training, pan17loader_validation, pan18loader_training, pan18loader_validation
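
A minimal usage sketch for the loader above; the language code and batch sizes below are illustrative assumptions, not values from the original project:

# Hypothetical call: English tweets, with some text_transform defined elsewhere
pan17_train, pan17_val, pan18_train, pan18_val = load_tweets_dataset(
    lang='en', text_transform=text_transform, batch_size=64, val_batch_size=32)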
Example #2
def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/', filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
    print(f'training on {device}')
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Jaccard Score = {jaccard}")
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
Example #3
def run(df):
    y = df["target"].values
    X = df.drop(["target", "kfold"], axis=1).values

    train_dataset = dataset.TweetDataset(
        tweets=X,
        targets=y
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    model = neural_net.NeuralNetwork().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.BCELoss()

    print("Training Model...")

    for epoch in range(config.EPOCHS):
        print(f"Epoch {epoch+1}\n--------------------")
        engine.train(
            train_dataloader,
            model,
            optimizer,
            loss_fn,
            device
        )

    torch.save(model.state_dict(), f"{config.MODEL_PATH}/{config.MODEL_NAME}.pth")
Example #4
def eval_fn(model, data):
    """Evaluate the model on the given data and return the Jaccard metric."""

    data = tf.data.Dataset.from_generator(
        dataset.TweetDataset(data, config.TOKENIZER, config.MAX_LEN).gen,
        output_types=dataset.gen_str).batch(config.VALID_BATCH_SIZE)

    def get_text(text, pred):

        pred_texts = []
        orig_texts = []
        text = text.numpy()
        pred = tf.argmax(pred, axis=1).numpy()

        for t, p in zip(text, pred):
            orig_texts.append(t.decode("utf-8"))
            t = config.TOKENIZER.encode(orig_texts[-1]).offsets
            i, j = p[0], p[1]
            pred_texts.append(orig_texts[-1][t[i][0]:t[j][1]])

        return orig_texts, pred_texts

    score = []
    for i, (data, _) in tqdm(enumerate(data)):
        orig_text = data["orig"]
        ext_text = data["ext"]
        preds = model.predict(data)
        targets, pred_texts = get_text(orig_text, preds)
        score = score + utils.jaccard(pred_texts, targets)
    score = sum(score) / len(score)

    print("Total jaccard score : ", score)
Example #5
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    model = models.TweetModel(conf=model_config)
    model.to(device)
    model.load_state_dict(torch.load(
        f'{config.TRAINED_MODEL_PATH}/model_{fold}.bin'))
    model.eval()

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    jaccard = eval_fn(valid_data_loader, model, device)

    return jaccard
Example #6
def get_dataloader(
    df_train,
    df_valid,
    max_seq_len=128,
    model_type='roberta',
    dataloader_shuffle=False,
    ):

    tokenizer = m.init_tokenizer(model_type) # [!]

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        model_type=model_type,
    )

    train_data_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=dataloader_shuffle,
        num_workers=4,
        drop_last=True
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        model_type=model_type,
    )

    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=args.batch_size,
        num_workers=2,
        drop_last=True
    )

    return train_data_loader, valid_data_loader
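
A hypothetical call for the function above, assuming df_train and df_valid come from a k-fold split as in the neighbouring examples and that args.batch_size is configured elsewhere:

train_loader, valid_loader = get_dataloader(
    df_train, df_valid, max_seq_len=128, model_type='roberta')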
Example #7
def train_fn(model, data):
    """ Train the model with the given data and returns it.
    """
    # checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    # es=EarlyStopping(monitor='val_acc', baseline=0.85, patience=10 ,verbose=1,mode="max")
    # callbacks_list = [checkpoint]

    data = tf.data.Dataset.from_generator(
        dataset.TweetDataset(data, config.TOKENIZER, config.MAX_LEN).gen,
        output_types=dataset.gen_str).batch(config.TRAIN_BATCH_SIZE)

    model.fit(data, epochs=config.EPOCHS)
    return model
Example #8
import pandas as pd
from model import BertBaseUncased
import torch
import numpy as np
import string
from tqdm import tqdm
import dataset
import config

TEST_FILE = 'Data/test.csv'
df_test = pd.read_csv(TEST_FILE)
sample = pd.read_csv('Data/sample_submission.csv')

test_dataset = dataset.TweetDataset(
    tweet=df_test.text.values,
    sentiment=df_test.sentiment.values,
    selected_text=df_test.text.values  # not needed for inference; only so the data loader works
)

test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False)

device = torch.device('cuda')
print('Running on ', device)
model = BertBaseUncased().to(device)

model.load_state_dict(torch.load('model.bin'))


def test_fn(data_loader, model, device):
    model.eval()
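    # The original snippet ends here. What follows is a sketch of the rest of the
    # inference loop, assuming the dataset yields "ids", "mask" and
    # "token_type_ids" tensors and the model returns start/end logits, as in the
    # other examples on this page.
    starts, ends = [], []
    with torch.no_grad():
        for d in tqdm(data_loader, total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            outputs_start, outputs_end = model(
                ids=ids, mask=mask, token_type_ids=token_type_ids)
            starts.append(torch.softmax(outputs_start, dim=-1).cpu().numpy())
            ends.append(torch.softmax(outputs_end, dim=-1).cpu().numpy())
    return starts, ends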
Example #9
def run_cv(df, fold):
    train_df = df[df["kfold"] != fold].reset_index(drop=True)
    valid_df = df[df["kfold"] == fold].reset_index(drop=True)

    #y_train = pd.get_dummies(train_df["target"], dtype="int64").values
    y_train = train_df["target"].values
    X_train = train_df.drop(["target", "kfold"], axis=1).values
    
    #y_valid = pd.get_dummies(valid_df["target"], dtype="int64").values
    y_valid = valid_df["target"].values
    X_valid = valid_df.drop(["target", "kfold"], axis=1).values

    train_dataset = dataset.TweetDataset(
        tweets=X_train,
        targets=y_train
    )

    valid_dataset = dataset.TweetDataset(
        tweets=X_valid,
        targets=y_valid
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=1
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    model = neural_net.NeuralNetwork().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.BCELoss()

    print("Training Model...")
    
    #early_stopping_counter = 0

    for epoch in range(config.EPOCHS):
        print(f"Epoch {epoch+1}\n--------------------")
        engine.train(
            train_dataloader,
            model,
            optimizer,
            loss_fn,
            device
        )

        outputs, targets = engine.evaluate(
            valid_dataloader,
            model,
            loss_fn,
            device
        )
        outputs = np.array(outputs).reshape(-1,)
        outputs = list(map(lambda pred: 1 if pred>0.5 else 0, outputs))
        valid_score = metrics.f1_score(targets, outputs)
        print(f" F1 Score: {valid_score}\n")
Example #10
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model = model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        config.WEIGHT_DECAY
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    base_opt = transformers.AdamW(optimizer_parameters,
                                  lr=config.LEARNING_RATE)
    optimizer = torchcontrib.optim.SWA(base_opt,
                                       swa_start=int(num_train_steps *
                                                     config.SWA_RATIO),
                                       swa_freq=config.SWA_FREQ,
                                       swa_lr=None)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(num_train_steps * config.WARMUP_RATIO),
        num_training_steps=num_train_steps)

    print(f'Training is starting for fold={fold}')

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

    if config.USE_SWA:
        optimizer.swap_swa_sgd()

    torch.save(model.state_dict(),
               f'{config.MODEL_SAVE_PATH}/model_{fold}.bin')

    return jaccard
Example #11
def run():
    print('Loading Files...')
    
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop = True)
    
    #dfx = dfx.sample(100)
    df_train,df_valid = model_selection.train_test_split(
            dfx,
            test_size = 0.1,
            random_state = 42,            
            )
    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)
    
    print('Files loaded')
    
    train_dataset = dataset.TweetDataset( 
            tweet=df_train.text.values,
            selected_text = df_train.selected_text.values
            )
    
    train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=config.TRAIN_BATCH_SIZE,
            shuffle = False
            )
    
    
    valid_dataset = dataset.TweetDataset( 
            tweet=df_valid.text.values,
            selected_text = df_valid.selected_text.values
            )
    valid_dataloader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size=config.VALID_BATCH_SIZE,
            shuffle = False
            )
    
    
    device = torch.device('cuda')
    print('Running on ',device)
    model = BertBaseUncased().to(device)
    
    
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    
    optimizer_params = [
            {'params':[p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay':0.003},
            {'params':[p for n,p in param_optimizer if  any(nd in n for nd in no_decay)],'weight_decay':0.00}
            ]
    
    
    num_training_steps = int(len(df_train)/config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_params, lr = 2e-5)
    scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps = 0,
            num_training_steps = num_training_steps)
    
    
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader,model,optimizer,device,scheduler)
        jaccard = engine.eval_fn(valid_dataloader,model,device)
        
        print(f'Epochs {epoch+1}...',
              f'Jaccard {jaccard}')
        
        if jaccard > best_jaccard:
            torch.save(model.state_dict(),config.MODEL_PATH)
            best_jaccard = jaccard
        
        print('Memory Used: ',torch.cuda.memory_allocated()/1000000000,'GB')         
        torch.cuda.empty_cache()
Example #12
File: train.py  Project: naveengampala/AI
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=100).dropna().reset_index(drop=True)
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
Example #13
def run():
    df_test = pd.read_csv(config.TEST_FILE)
    df_test.loc[:, 'selected_text'] = df_test.text.values

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    fold_models = []
    for i in range(config.N_FOLDS):
        model = models.TweetModel(conf=model_config)
        model.to(device)
        model.load_state_dict(
            torch.load(f'{config.TRAINED_MODEL_PATH}/model_{i}.bin'))
        model.eval()
        fold_models.append(model)

    test_dataset = dataset.TweetDataset(
        tweets=df_test.text.values,
        sentiments=df_test.sentiment.values,
        selected_texts=df_test.selected_text.values)

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4)

    char_pred_test_start = []
    char_pred_test_end = []

    with torch.no_grad():
        tk0 = tqdm.tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d['ids']
            token_type_ids = d['token_type_ids']
            mask = d['mask']
            orig_tweet = d['orig_tweet']
            offsets = d['offsets']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)

            outputs_start_folds = []
            outputs_end_folds = []
            for i in range(config.N_FOLDS):
                outputs_start, outputs_end = \
                    fold_models[i](ids=ids,
                                   mask=mask,
                                   token_type_ids=token_type_ids)
                outputs_start_folds.append(outputs_start)
                outputs_end_folds.append(outputs_end)

            outputs_start = sum(outputs_start_folds) / config.N_FOLDS
            outputs_end = sum(outputs_end_folds) / config.N_FOLDS

            outputs_start = torch.softmax(outputs_start,
                                          dim=-1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end,
                                        dim=-1).cpu().detach().numpy()

            for px, tweet in enumerate(orig_tweet):
                char_pred_test_start.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_start[px]))
                char_pred_test_end.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_end[px]))

    with open('roberta-char_pred_test_start.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_start, handle)
    with open('roberta-char_pred_test_end.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_end, handle)
Example #14
def run():
    dfx = pd.read_csv(config.TRAINING_FILE, nrows=30).dropna().reset_index(drop=True)


    df_train, df_valid = model_selection.train_test_split(
        dfx, 
        test_size = 0.1,
        random_state = 42,
        stratify = dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=1
    )

    valid_dataset = dataset.TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1 
    )

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        print("here")
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        mean_jac = engine.eval_fn(valid_data_loader, model, device)
        print("jaccard_score = {mean_jac}".format(mean_jac=mean_jac))
        if mean_jac > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = mean_jac
Example #15
        for i in range(cfg.K_FOLD):
            _model = m.SentimentExtractor(model_type=cfg.MODEL_TYPE,
                                          dropout_rate=cfg.DROPOUT_RATE,
                                          last_n_layers=cfg.LAST_N_LAYERS,
                                          device=device)
            _model.to(device)
            _model.load_state_dict(torch.load(f'{model_path}/model_{i}.pt'))
            _model.eval()
            models.append(_model)

        m.init_tokenizer()

        test_dataset = dataset.TweetDataset(
            tweet=test_data.text.values,
            sentiment=test_data.sentiment.values,
            selected_text=test_data.selected_text.values,
            tokenizer=m.tokenizer,
            max_seq_len=max_seq_len,
            model_type=cfg.MODEL_TYPE,
        )

        id_list = []
        answer = []
        sentiments = ['positive', 'negative', 'neutral']

        scores = []
        selected = []
        # [START] with torch.no_grad():
        with torch.no_grad():
            for idx, d in enumerate(tqdm(test_dataset, desc="test", ncols=80)):

                uniq_id = test_data.textID.iloc[idx]
Example #16
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=config.NROWS).dropna().reset_index(drop=True)
    # dfx.sentiment = dfx.sentiment.apply(
    #     lambda x: 1 if x =='positive' else 0
    # )
    print('Data Loaded')
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.5, random_state=42, stratify=dfx.sentiment.values)
    print('Data split into train data and validation data')
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)

    print('Train data preprocessed and made into Tweet Dataset Object')

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4)

    print('Train dataloader created')
    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    print('Valid data preprocessed and made into Tweet Dataset Object')
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    print('Valid dataloader created')
    device = config.DEVICE
    conf = transformers.RobertaConfig.from_pretrained(
        f'{config.PATH}roberta-base-config.json')
    conf.output_hidden_states = False

    model = Roberta(conf)
    model.to(device)
    print('Model Object created')

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = utils.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_jaccard = 0
    print('Starting Training....')
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

        print(f'Jaccard Score : {jaccard}')
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
Example #17
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    # Stratified split so that the train and validation sets keep the same
    # sentiment class proportions as the full data set
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        target=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        target=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    # Collect the named parameters that the optimizer will update
    param_optimizer = list(model.named_parameters())

    # No weight decay for bias and LayerNorm parameters
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            # Apply weight decay to every parameter not in the no_decay list
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    #experiment with lr
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    #scheduler can be of your choice
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Wrap the model for multi-GPU training (skip this if only one GPU is available)
    model = nn.DataParallel(model)

    # Evaluation metric is the Jaccard score (a minimal word-level version is sketched after this function)
    best_jaccard = 0

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
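
Several of these examples score predictions with a Jaccard metric (engine.eval_fn, utils.jaccard); the project's own implementation is not shown on this page. A minimal word-level sketch, for reference only, is:

def word_jaccard(str1, str2):
    # Word-level Jaccard similarity between two strings (illustrative helper,
    # not the project's utils.jaccard)
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 0.0
    c = a & b
    return len(c) / (len(a) + len(b) - len(c))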
Example #18
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=100).dropna().reset_index(drop=True)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx["sentiment"].values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train["text"].values,
        sentiment=df_train["sentiment"].values,
        selected_text=df_train["selected_text"].values,
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        # num_workers=4,
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid["text"].values,
        sentiment=df_valid["sentiment"].values,
        selected_text=df_valid["selected_text"].values,
    )

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALIDATION_BATCH_SIZE,
        # num_workers=1,
    )
    device = torch.device("cpu")
    model = BERTBasedUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_dataloader, model, device)

        print(f"Jaccard score :  {jaccard}")

        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
Example #19
def get_test_loader(df):
    loader = torch.utils.data.DataLoader(dataset.TweetDataset(df))
    return loader
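
A hypothetical call, assuming df already carries whatever columns this project's TweetDataset reads:

df_test = pd.read_csv('test.csv')  # hypothetical path
test_loader = get_test_loader(df_test)
for batch in test_loader:
    pass  # run the trained model on each batch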