Example #1
    def __init__(self, batch_size, \
        csv_file="/scratch/si699w20_cbudak_class_root/si699w20_cbudak_class/shared_data/JI_team/data/dataset/OneMonthData/OneMonthFilter846.csv", \
        root_dir="/scratch/si699w20_cbudak_class_root/si699w20_cbudak_class/shared_data/JI_team/data/dataset/OneMonthData/Image/10033", \
        data_transform= transforms.Compose([
                #transforms.ToPILImage(mode="RGB"),
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]), \
        split_ratio=(0.7, 0.1, 0.2), file_size=None):

        assert abs(sum(split_ratio) - 1) < 1e-6  # float sums like 0.7 + 0.1 + 0.2 are not exactly 1.0
        torch.manual_seed(0)
        self.label_generator = LabelGenerator(csv_file, file_size=file_size)
        dataset = TweetDataset(csv_file=csv_file, root_dir=root_dir, tag2label=self.label_generator.tag2label, \
                            text_vocab=self.label_generator.text_vocab, transform=data_transform, file_size=file_size)
        train_size = int(len(dataset) * split_ratio[0])
        val_size = int(len(dataset) * split_ratio[1])
        test_size = len(dataset) - train_size - val_size
        
        train_set, val_set, test_set = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

        self.datasets = {"train": train_set, "val": val_set, "test": test_set}
        self.dataset_sizes = {x: len(self.datasets[x]) for x in ["train", "val", "test"]}

        self.dataloaders = {x: torch.utils.data.DataLoader(self.datasets[x], batch_size=batch_size,
                                                shuffle=True, num_workers=4, drop_last=True, collate_fn=dataset.collate_fn)
                    for x in ["train", "val", "test"]}
Example #2
def get_train_val_loaders(df, train_idx, val_idx, batch_size=BATCH_SIZE):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    train_loader = torch.utils.data.DataLoader(TweetDataset(train_df),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=NUM_WORKERS,
                                               drop_last=True)

    val_loader = torch.utils.data.DataLoader(TweetDataset(val_df),
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=NUM_WORKERS)

    dataloaders_dict = {"train": train_loader, "val": val_loader}

    return dataloaders_dict
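
A hedged usage sketch for get_train_val_loaders above: train_idx and val_idx are typically produced by a K-fold split of the same DataFrame. The KFold wiring below is an illustration only; BATCH_SIZE, NUM_WORKERS and a DataFrame-taking TweetDataset are assumed from the snippet.

from sklearn.model_selection import KFold

# Illustration only: build loaders for each fold of the DataFrame `df`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    dataloaders_dict = get_train_val_loaders(df, train_idx, val_idx)
    # ... train on dataloaders_dict["train"], validate on dataloaders_dict["val"]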
Example #3
def export_RNN_regressor(checkpoint_path):
    """
    :param checkpoint_path: relative path to a PyTorch .pth checkpoint
    :return: None, dumps a prediction text file in the model's training folder
    """
    checkpoint = torch.load(checkpoint_path)
    model = RNN(checkpoint['net_config'])
    model.load_state_dict(checkpoint['model'])
    model = model.eval().cuda()

    test_dataset = TweetDataset(dataset_type='test')
    test_loader = DataLoader(test_dataset,
                             batch_size=TRAIN_CONFIG['batch_size'],
                             num_workers=TRAIN_CONFIG['workers'],
                             collate_fn=collate_function,
                             shuffle=False,
                             pin_memory=True)

    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        test_data = list(csv.reader(csvfile))[1:]

    ids = [datum[0] for datum in test_data]
    n = len(test_loader)

    with open(
            "checkpoints/{}/predictions.txt".format(
                checkpoint['train_config']['experiment_name']), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        current_idx = 0
        for batch_index, batch in enumerate(test_loader):
            printProgressBar(batch_index, n)
            batch_size = batch['numeric'].shape[0]

            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()
            prediction = model(text, numeric)
            if EXPORT_CONFIG['log']:
                prediction = torch.exp(prediction) - 1

            if EXPORT_CONFIG['threshold']:
                prediction[prediction > EXPORT_CONFIG['threshold']] = EXPORT_CONFIG['threshold']

            for idx_in_batch in range(batch_size):
                writer.writerow([
                    str(ids[current_idx + idx_in_batch]),
                    str(int(prediction[idx_in_batch].item()))
                ])

            current_idx += batch_size

    print("Exportation done! :)")
Example #4
def create_loader(tweet: str, sentiment: str):

    df = pd.DataFrame({"text": tweet, "sentiment": sentiment}, index=[1])

    test_dataset = TweetDataset(
        tweet=df.text.values,
        sentiment=df.sentiment.values,
        selected_text=df.text.values,
    )

    test_data_loader = DataLoader(test_dataset, batch_size=1)
    return test_data_loader
Example #5
def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(
        config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")

    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
Example #6
    def __create_data(self):

        im_dataset = {
            i: TweetDataset(input_data=self.data_dict[i])
            for i in self.set_names
        }

        im_loader = {
            i: DataLoader(im_dataset[i],
                          batch_size=self.batch_size,
                          shuffle=self.shuffle,
                          num_workers=self.num_works,
                          drop_last=True)
            for i in self.set_names
        }
        return im_dataset, im_loader
Example #7
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.load_state_dict(torch.load("model.bin"))
    model.to(device)

    test_dataset = TweetDataset(tweet=df_test.text.values,
                                sentiment=df_test.sentiment.values,
                                selected_text=df_test.selected_text.values)

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)

    return eval_fn(data_loader, model, device)
Example #8
    def run():

        torch.manual_seed(seed)

        device = xm.xla_device()
        model = MX.to(device)

        # DataLoaders
        train_dataset = TweetDataset(args=args,
                                     df=train_df,
                                     mode="train",
                                     fold=args.fold_index,
                                     tokenizer=tokenizer)
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=False,
                                                   num_workers=2)

        valid_dataset = TweetDataset(args=args,
                                     df=train_df,
                                     mode="valid",
                                     fold=args.fold_index,
                                     tokenizer=tokenizer)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=args.batch_size,
                                  sampler=valid_sampler,
                                  num_workers=1,
                                  drop_last=False)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.001
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]

        num_train_steps = int(num_train_dpoints / args.batch_size /
                              xm.xrt_world_size() * args.epochs)

        optimizer = AdamW(optimizer_parameters,
                          lr=args.learning_rate * xm.xrt_world_size())

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        xm.master_print("Training is Starting ...... ")
        best_jac = 0
        #early_stopping = utils.EarlyStopping(patience=2, mode="max", verbose=True)

        for epoch in range(args.epochs):
            para_loader = pl.ParallelLoader(train_loader, [device])
            train_loss = train(args, para_loader.per_device_loader(device),
                               model, device, optimizer, scheduler, epoch, f)

            para_loader = pl.ParallelLoader(valid_loader, [device])
            valid_jac = valid(args, para_loader.per_device_loader(device),
                              model, device, tokenizer, epoch, f)

            jac = xm.mesh_reduce("jac_reduce", valid_jac, reduce_fn)
            xm.master_print(f"**** Epoch {epoch+1} **==>** Jaccard = {jac}")

            log_ = f"**** Epoch {epoch+1} **==>** Jaccard = {jac}"

            f.write(log_ + "\n\n")

            if jac > best_jac:
                xm.master_print("**** Model Improved !!!! Saving Model")
                xm.save(
                    model.state_dict(),
                    os.path.join(args.save_path, f"fold_{args.fold_index}"))
                best_jac = jac
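
The run() above assumes it is executed once per TPU core. A hedged launch sketch using torch_xla's multiprocessing helper; the wrapper name _mp_fn and nprocs=8 are assumptions, not part of the original snippet.

import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(rank, flags=None):
    # Each spawned process picks up its own XLA device inside run().
    run()

if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=(), nprocs=8, start_method="fork")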
Example #9
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        #print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
Example #10
def main():
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train, df_valid = train_test_split(dfx, test_size=0.2, random_state=42)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for _ in range(config.EPOCHS):
        train_fn(train_data_loader,
                 model,
                 optimizer,
                 device,
                 scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")

    torch.save(model, "model.pth")
Example #11
File: train.py  Project: eloitanguy/jej
def train(model, infer_train, infer_val, load_checkpoint=None):
    """
    Train the RNN model using the parameters defined in the config file \n
    :param model: a pytorch NN
    :param infer_train: the inference function used for training (see above)
    :param infer_val: the inference function used for validating (see above)
    :param load_checkpoint: if None, does nothing, otherwise starts training from the given path to a .pth checkpoint
    :return:
    """

    global checkpoint_name
    print('Initialising {}'.format(cfg['experiment_name']))
    checkpoint_folder = 'checkpoints/{}/'.format(cfg['experiment_name'])

    if not os.path.exists(checkpoint_folder):
        os.makedirs(checkpoint_folder)

    tb_folder = 'tb/{}/'.format(cfg['experiment_name'])
    if not os.path.exists(tb_folder):
        os.makedirs(tb_folder)

    writer = SummaryWriter(logdir=tb_folder, flush_secs=30)
    optimiser = Adam(model.parameters(),
                     lr=cfg['learning_rate'],
                     weight_decay=cfg['weight_decay'])

    train_dataset = TweetDataset(dataset_type='train')
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg['batch_size'],
                              num_workers=cfg['workers'],
                              collate_fn=collate_function,
                              shuffle=True,
                              pin_memory=True)

    val_dataset = TweetDataset(dataset_type='val')
    val_loader = DataLoader(val_dataset,
                            batch_size=cfg['batch_size'],
                            num_workers=cfg['workers'],
                            collate_fn=collate_function,
                            shuffle=False,
                            pin_memory=True)

    if load_checkpoint:
        checkpoint = torch.load(load_checkpoint)
        assert model.config == checkpoint['net_config'], \
            "The provided checkpoint has a different configuration, loading is impossible"
        start_epoch = checkpoint['epoch'] + 1
        epochs = cfg['epochs'] + start_epoch
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model'])
        optimiser.load_state_dict(checkpoint['optimiser'])
        print("Loaded the checkpoint at {}".format(load_checkpoint))
    else:
        start_epoch, step = 0, 0
        epochs = cfg['epochs']

    init_loss = 0.
    avg_loss = AverageMeter()
    best_mae = 1e10

    print('Sanity val')
    val(model, val_loader, writer, 0, infer_val)
    model.train()

    print('Starting training')
    for epoch in range(start_epoch, epochs):
        loader_length = len(train_loader)
        epoch_start = time.time()

        for batch_idx, batch in enumerate(train_loader):
            optimiser.zero_grad()

            loss = infer_train(model, batch)
            loss.backward()

            if epoch == 0 and batch_idx == 0:
                init_loss = loss

            # logging
            elapsed = time.time() - epoch_start
            progress = batch_idx / loader_length
            est = datetime.timedelta(
                seconds=int(elapsed / progress)) if progress > 0.001 else '-'
            avg_loss.update(loss)
            suffix = '\tloss {:.4f}/{:.4f}\tETA [{}/{}]'.format(
                avg_loss.avg, init_loss,
                datetime.timedelta(seconds=int(elapsed)), est)
            printProgressBar(batch_idx,
                             loader_length,
                             suffix=suffix,
                             prefix='Epoch [{}/{}]\tStep [{}/{}]'.format(
                                 epoch, epochs - 1, batch_idx, loader_length))

            writer.add_scalar('Steps/train_loss', loss, step)

            # saving the model
            if step % cfg['checkpoint_every'] == 0:
                checkpoint_name = '{}/epoch_{}.pth'.format(
                    checkpoint_folder, epoch)
                torch.save(
                    {
                        'model': model.state_dict(),
                        'epoch': epoch,
                        'batch_idx': batch_idx,
                        'step': step,
                        'optimiser': optimiser.state_dict(),
                        'train_config': cfg,
                        'net_config': model.config,
                        'dataset_config': DATASET_CONFIG
                    }, checkpoint_name)
            step += 1
            optimiser.step()

            # validating
            if step % cfg['val_every'] == 0:
                mae = val(model, val_loader, writer, step, infer_val)
                if mae < best_mae:
                    best_mae = mae
                    print('Best model with V{:.2f}'.format(best_mae))
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'epoch': epoch,
                            'batch_idx': batch_idx,
                            'step': step,
                            'optimiser': optimiser.state_dict(),
                            'train_config': cfg,
                            'net_config': model.config,
                            'dataset_config': DATASET_CONFIG
                        }, '{}/best.pth'.format(checkpoint_folder))
                model.train()

        # end of epoch
        print('')
        writer.add_scalar('Epochs/train_loss', avg_loss.avg, epoch)
        avg_loss.reset()
        checkpoint_name = '{}/epoch_{}.pth'.format(checkpoint_folder, epoch)
        torch.save(
            {
                'model': model.state_dict(),
                'epoch': epoch,
                'batch_idx': loader_length,
                'step': step,
                'optimiser': optimiser.state_dict(),
                'train_config': cfg,
                'net_config': model.config,
                'dataset_config': DATASET_CONFIG
            }, checkpoint_name)

    # finished training
    writer.close()
    print('Training finished :)')
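
A hedged usage sketch for the trainer above: the model is built from a network config and training can resume from an earlier .pth file via load_checkpoint. The config name NET_CONFIG and the inference-function names are assumptions, not part of the original code.

# Illustration only: names other than train/RNN are hypothetical.
model = RNN(NET_CONFIG).cuda()
train(model, infer_train=infer_rnn_train, infer_val=infer_rnn_val,
      load_checkpoint='checkpoints/my_experiment/epoch_3.pth')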
Example #12
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    conf = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model = TweetModel(conf)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
Example #13
model4 = TweetModel(conf=model_config)
model4.to(device)
model4.load_state_dict(torch.load("../models/nmodel_3.bin"))
# print(model4.eval())

model5 = TweetModel(conf=model_config)
model5.to(device)
model5.load_state_dict(torch.load("../models/nmodel_4.bin"))
# print(model5.eval())

final_output = []

# Instantiate TweetDataset with the test data
test_dataset = TweetDataset(tweet=df_test.text.values,
                            sentiment=df_test.sentiment.values,
                            selected_text=df_test.selected_text.values)

# Instantiate DataLoader with `test_dataset`
data_loader = torch.utils.data.DataLoader(test_dataset,
                                          shuffle=False,
                                          batch_size=config.VALID_BATCH_SIZE,
                                          num_workers=1)

# Turn off gradient calculations
with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    # Predict the span containing the sentiment for each batch
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        mask = d["mask"]
Example #14
def prepare_datasets():
    """
    Prepares the training, validation and test (Kaggle) datasets used by the XGBoost model \n
    This function looks into the XGBOOST_CONFIG dictionary in config.py for the following information: \n
    * which embedding NN to use (looks for the .pth checkpoint in XGBOOST_CONFIG['embedder'])
    * how to extract the embedding: XGBOOST_CONFIG['embedding_use_hidden', 'embedding_use_output', 'embedding_size']
    * how many numeric variables to add as input
    * where to dump the prepared .npy files: XGBOOST_CONFIG['train_file', 'val_file', 'test_file']
    """
    checkpoint = torch.load(XGBOOST_CONFIG['embedder'])
    embed = RNN(config=checkpoint['net_config']).eval()
    embed.load_state_dict(checkpoint['model'])
    embed = embed.cuda()

    annotated_dataset = TweetDataset(dataset_type='all')
    test_dataset = TweetDataset(dataset_type='test')

    def get_data(dataset, message):
        N = len(dataset)
        data = np.zeros((N, XGBOOST_CONFIG['numeric_data_size'] +
                         XGBOOST_CONFIG['embedding_size'] + 1))  # 1 for answer
        loader = DataLoader(dataset,
                            batch_size=TRAIN_CONFIG['batch_size'],
                            num_workers=TRAIN_CONFIG['workers'],
                            collate_fn=collate_function,
                            shuffle=False)
        current_idx = 0
        n = len(loader)
        print('')
        for batch_index, batch in enumerate(loader):
            printProgressBar(batch_index, n, prefix=message)
            batch_size = batch['numeric'].shape[0]

            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()

            if XGBOOST_CONFIG['embedding_use_hidden']:
                embedding = embed(
                    text,
                    numeric[:, :checkpoint['net_config']['numeric_data_size']]
                )[1]
            elif XGBOOST_CONFIG['embedding_use_output']:
                embedding = torch.exp(
                    embed(
                        text, numeric[:, :checkpoint['net_config']
                                      ['numeric_data_size']])[0]) - 1
            else:  # expecting a built-in embedding layer -> taking the mean of the embeddings
                embedding = embed.emb(text).mean(axis=1)

            data[current_idx:current_idx+batch_size, XGBOOST_CONFIG['numeric_data_size']:-1] = \
                embedding.detach().cpu().numpy()
            data[current_idx:current_idx+batch_size, :XGBOOST_CONFIG['numeric_data_size']] = \
                numeric.detach().cpu().numpy()
            data[current_idx:current_idx + batch_size,
                 -1] = batch['target'].numpy()

            current_idx += batch_size

        return data

    annotated_data = get_data(annotated_dataset, "Preparing train.csv ...")
    split = int(len(annotated_dataset) * DATASET_CONFIG['train_percent'])
    np.save(XGBOOST_CONFIG['train_file'], annotated_data[1:split])
    np.save(XGBOOST_CONFIG['val_file'], annotated_data[split:])

    test_data = get_data(test_dataset, "Preparing evaluation.csv ...")
    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        ids = [line[0] for line in list(csv.reader(csvfile))[1:]]

    ids = np.array(ids).reshape(np.shape(ids)[0], 1)
    prepared_test_data = np.concatenate((test_data, ids), axis=1)
    np.save(XGBOOST_CONFIG['test_file'], prepared_test_data)
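
A hedged sketch of the XGBOOST_CONFIG dictionary that the docstring above refers to; the keys mirror those read by prepare_datasets(), while every value is an illustrative assumption.

# Illustration only: all paths and sizes here are made up.
XGBOOST_CONFIG = {
    'embedder': 'checkpoints/my_experiment/best.pth',  # .pth checkpoint of the embedding NN
    'embedding_use_hidden': True,    # use the RNN hidden state as the embedding ...
    'embedding_use_output': False,   # ... or the exp'd network output instead
    'embedding_size': 128,           # width of the embedding vector
    'numeric_data_size': 10,         # number of numeric input features
    'train_file': 'data/xgb_train.npy',
    'val_file': 'data/xgb_val.npy',
    'test_file': 'data/xgb_test.npy',
}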
Example #15
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    # Set train validation set split
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    '''
    Create a scheduler to set the learning rate at each training step
    "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    '''
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    logger.info("{} - {}".format("Training is Starting for fold", fold))
    #model=nn.DataParallel(model)

    for epoch in range(3):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard=engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        logger.info("EPOCHS {} - Jaccard Score - {}".format(epoch, jaccard))
        es(jaccard, model, model_path=f"../models/nmodel_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
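
As a rough illustration of the scheduler comment above: with num_warmup_steps=0, get_linear_schedule_with_warmup scales the base learning rate by a factor that decays linearly from 1 to 0 over num_training_steps. A minimal sketch of that factor (not the library implementation itself):

def linear_schedule_factor(current_step, num_warmup_steps, num_training_steps):
    # Ramp up during warmup, then decay linearly to zero.
    if current_step < num_warmup_steps:
        return current_step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - current_step) /
               max(1, num_training_steps - num_warmup_steps))

# With num_warmup_steps=0 and lr=3e-5, the effective LR at step t is 3e-5 * linear_schedule_factor(t, 0, num_train_steps).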
Example #16
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model1 = TweetModel(conf=model_config)
    model1.to(device)
    model1.load_state_dict(torch.load("model_0.bin"))
    model1.eval()

    model2 = TweetModel(conf=model_config)
    model2.to(device)
    model2.load_state_dict(torch.load("model_1.bin"))
    model2.eval()

    model3 = TweetModel(conf=model_config)
    model3.to(device)
    model3.load_state_dict(torch.load("model_2.bin"))
    model3.eval()

    model4 = TweetModel(conf=model_config)
    model4.to(device)
    model4.load_state_dict(torch.load("model_3.bin"))
    model4.eval()

    model5 = TweetModel(conf=model_config)
    model5.to(device)
    model5.load_state_dict(torch.load("model_4.bin"))
    model5.eval()

    final_output = []

    test_dataset = TweetDataset(
            tweet=df_test.text.values,
            sentiment=df_test.sentiment.values,
            selected_text=df_test.selected_text.values
    )

    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )
    jaccards = utils.AverageMeter()
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            outputs_start1, outputs_end1 = model1(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start2, outputs_end2 = model2(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start3, outputs_end3 = model3(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start4, outputs_end4 = model4(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            
            outputs_start5, outputs_end5 = model5(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            outputs_start = (
                outputs_start1 
                + outputs_start2 
                + outputs_start3 
                + outputs_start4 
                + outputs_start5
            ) / 5
            outputs_end = (
                outputs_end1 
                + outputs_end2 
                + outputs_end3 
                + outputs_end4 
                + outputs_end5
            ) / 5
            
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
            
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)
                final_output.append(output_sentence)
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
    return final_output, jaccards.avg
Example #17
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)

    df_train = dfx[dfx.kfold != fold].reset_index(drop = True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop = True)

    # training set
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )
    # validation set
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )

    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = train_batch_size,
        num_workers = 4,
        sampler=train_sampler
    )


    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = valid_batch_size,
        num_workers = 2,
        sampler=valid_sampler
    )

    device = torch.device("cuda")

    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path = roberta_path, conf = model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr = lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps
    )

    if args.fp16:
        # try:
        #     from apex import amp
        # except ImportError:
        #     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)


    # multi-gpu training (should be after apex fp16 initialization)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    es = utils.EarlyStopping(patience = patience, mode = "max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler = scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        es(jaccard, model, model_path = f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
Example #18
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)

    df_train = dfx[dfx.kfold != fold].reset_index(drop = True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop = True)

    train_sampler = None
    val_sampler = None


    # training set  # 3) use DistributedSampler
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )
    # validation set
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )

    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = train_batch_size,
        shuffle=(train_sampler is None),
        num_workers = 4,
        sampler=train_sampler
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = valid_batch_size,
        shuffle=False,
        num_workers = 2,
        sampler=val_sampler
    )

    device = torch.device("cuda")

    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path = roberta_path, conf = model_config)
    model.to(device)

    num_device = max(1, torch.cuda.device_count())  # defined unconditionally so the lr scaling below never fails
    if num_device > 1:
        print("Let's use", num_device, "GPUs!")
        # 5) wrap the model with DistributedDataParallel
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[local_rank],
                                                          output_device=local_rank,
                                                          find_unused_parameters=True)


    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr = lr * num_device)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps
    )

    es = utils.EarlyStopping(patience = patience, mode = "max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        if distributed:
            train_sampler.set_epoch(epoch)
        train_fn(train_data_loader, model, optimizer, device, scheduler = scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)

        # if distributed:
        #     jaccard_reduce = reduce_tensor(jaccard)
        # print("jaccard_reduce:", jaccard_reduce)
        if not distributed or (distributed and torch.distributed.get_rank() == 0):
            print("Jaccard Score = ", jaccard)
            es(jaccard, model, model_path = f"./bin/model_{fold}.bin")
            if es.early_stop:
                print("Early stopping")
                break

    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
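
The snippet above relies on the globals distributed and local_rank being set up by the launcher. A hedged sketch of the usual process-group setup when the script is started via torch.distributed.launch; the argument parsing and variable names are assumptions.

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)  # filled in by torch.distributed.launch
ddp_args = parser.parse_args()

distributed = ddp_args.local_rank != -1
local_rank = max(ddp_args.local_rank, 0)
if distributed:
    torch.cuda.set_device(local_rank)                     # bind this process to one GPU
    torch.distributed.init_process_group(backend="nccl")  # one process group per training job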