Example #1
def build_loader(record_file, data_dir, dataset, batch_size, num_workers, shuffle, use_squad_v2=True, collate_fn=None):
    # Resolve the preprocessed record file and wrap it in a SQuAD dataset
    # (local name chosen so it does not shadow the `dataset` name argument).
    record_file_path = util.preprocessed_path(record_file, data_dir, dataset)
    squad_dataset = SQuAD(record_file_path, use_squad_v2)
    num_samples = len(squad_dataset)
    loader = data.DataLoader(
        squad_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn
    )
    return loader, num_samples
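A minimal usage sketch for build_loader, assuming the module-level names the snippet relies on (util.preprocessed_path, SQuAD, and torch.utils.data as data) are already imported; the file and directory names below are hypothetical placeholders:

# Hypothetical usage: build a shuffled training loader from a preprocessed record file.
train_loader, train_size = build_loader(
    record_file='train.npz',   # assumed record file name
    data_dir='./data',         # assumed data directory
    dataset='squad',           # assumed dataset name
    batch_size=64,
    num_workers=4,
    shuffle=True,
    use_squad_v2=True,
    collate_fn=None)
print(f"Training loader covers {train_size} samples")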
Example #2
    def run_experiment(tbx, train_loader, train_size, eval_loader, eval_size,
                       gold_dict, config):
        # Relies on names from the enclosing create_training_function scope
        # (args, device, word_vectors, char_vectors, process_sample, evaluate,
        # min_nll_decrease); see Example #3 for the full context.
        from models import init_training
        max_grad_norm = args.max_grad_norm
        model, optimizer, scheduler, ema, step = init_training(
            args, word_vectors, char_vectors, device, config)

        prev_epoch_avg_nll = None
        for epoch in range(step, args.num_epochs):
            model.train()
            epoch_avg_nll = util.AverageMeter()
            with torch.enable_grad(), tqdm(total=train_size) as progress_bar:
                for sample in train_loader:
                    loss, batch_size, _ = process_sample(sample, model, None)
                    nll = loss.item()
                    epoch_avg_nll.update(nll)
                    tbx.add_scalar('train/NLL', loss.item(), step)
                    current_lr = optimizer.param_groups[0]['lr']
                    tbx.add_scalar('train/LR', current_lr, step)
                    # Clear stale gradients before backprop (otherwise they accumulate).
                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                    scheduler.step()

                    ema(model, step // batch_size)
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             STEP=util.millify(step),
                                             LR=current_lr,
                                             NLL=nll)
                    step += batch_size

            model.eval()
            ema.assign(model)
            results, pred_dict = evaluate(model, eval_loader, eval_size,
                                          gold_dict)
            ema.resume(model)

            tbx.add_scalar('eval/NLL', results['NLL'], step)
            if 'AvNA' in results:
                tbx.add_scalar('eval/AvNA', results['AvNA'], step)
            tbx.add_scalar('eval/F1', results['F1'], step)
            tbx.add_scalar('eval/EM', results['EM'], step)

            dev_eval_file = util.preprocessed_path(args.dev_eval_file,
                                                   args.data_dir, args.dataset)
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_dict=gold_dict,
                           step=step,
                           split='eval',
                           num_visuals=args.num_visuals)

            # Early stopping: break if the epoch-average NLL failed to drop by
            # at least min_nll_decrease relative to the previous epoch.
            if (min_nll_decrease is not None
                    and prev_epoch_avg_nll is not None
                    and epoch_avg_nll.avg > prev_epoch_avg_nll - min_nll_decrease):
                print(f"Avg NLL {epoch_avg_nll.avg:.2f} did not improve on "
                      f"{prev_epoch_avg_nll:.2f} by at least {min_nll_decrease:.2f}; "
                      f"stopping early.")
                break
            prev_epoch_avg_nll = epoch_avg_nll.avg

        return model, step
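The loop above tracks losses with util.AverageMeter, which is not part of the snippet; the sketch below is only an assumption of the minimal interface (update(val, num_samples=1) and an avg attribute) that would satisfy the calls made here and in the evaluate function of Example #3.

class AverageMeter:
    """Running-average tracker; a minimal sketch of the interface assumed above."""
    def __init__(self):
        self.count = 0
        self.sum = 0.0
        self.avg = 0.0

    def update(self, val, num_samples=1):
        # Accumulate a (possibly batch-weighted) value and refresh the running average.
        self.count += num_samples
        self.sum += val * num_samples
        self.avg = self.sum / self.count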
Example #3
def create_training_function(args, experiment_save_dir, k_fold_splits=None):
    device, args.gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(args.gpu_ids))
    word_vectors, char_vectors = train.load_embeddings(args)
    training_dataset = util.SQuAD(
        util.preprocessed_path(args.train_record_file, args.data_dir,
                               args.dataset), args.use_squad_v2)
    eval_dataset = util.SQuAD(
        util.preprocessed_path(args.dev_record_file, args.data_dir,
                               args.dataset), args.use_squad_v2)
    train_gold_dict = util.load_eval_file(args, args.train_eval_file)
    eval_gold_dict = util.load_eval_file(args, args.dev_eval_file)

    k_fold_splits = args.k_fold
    min_nll_decrease = args.min_nll_decrease

    def process_sample(sample, model, gold_dict=None):
        cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = sample
        batch_size = cw_idxs.size(0)
        log_p1, log_p2 = model(cw_idxs.to(device), cc_idxs.to(device),
                               qw_idxs.to(device), qc_idxs.to(device))
        y1, y2 = y1.to(device), y2.to(device)
        nll_loss_1 = F.nll_loss(log_p1, y1)
        nll_loss_2 = F.nll_loss(log_p2, y2)
        loss = nll_loss_1 + nll_loss_2
        preds = None
        if gold_dict:
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)
            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           args.use_squad_v2)

        return loss, batch_size, preds

    def run_experiment(tbx, train_loader, train_size, eval_loader, eval_size,
                       gold_dict, config):
        from models import init_training
        max_grad_norm = args.max_grad_norm
        model, optimizer, scheduler, ema, step = init_training(
            args, word_vectors, char_vectors, device, config)

        prev_epoch_avg_nll = None
        for epoch in range(step, args.num_epochs):
            model.train()
            epoch_avg_nll = util.AverageMeter()
            with torch.enable_grad(), tqdm(total=train_size) as progress_bar:
                for sample in train_loader:
                    loss, batch_size, _ = process_sample(sample, model, None)
                    nll = loss.item()
                    epoch_avg_nll.update(nll)
                    tbx.add_scalar('train/NLL', loss.item(), step)
                    current_lr = optimizer.param_groups[0]['lr']
                    tbx.add_scalar('train/LR', current_lr, step)
                    # Clear stale gradients before backprop (otherwise they accumulate).
                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                    scheduler.step()

                    ema(model, step // batch_size)
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             STEP=util.millify(step),
                                             LR=current_lr,
                                             NLL=nll)
                    step += batch_size

            model.eval()
            ema.assign(model)
            results, pred_dict = evaluate(model, eval_loader, eval_size,
                                          gold_dict)
            ema.resume(model)

            tbx.add_scalar('eval/NLL', results['NLL'], step)
            if 'AvNA' in results:
                tbx.add_scalar('eval/AvNA', results['AvNA'], step)
            tbx.add_scalar('eval/F1', results['F1'], step)
            tbx.add_scalar('eval/EM', results['EM'], step)

            dev_eval_file = util.preprocessed_path(args.dev_eval_file,
                                                   args.data_dir, args.dataset)
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_dict=gold_dict,
                           step=step,
                           split='eval',
                           num_visuals=args.num_visuals)

            # Early stopping: break if the epoch-average NLL failed to drop by
            # at least min_nll_decrease relative to the previous epoch.
            if (min_nll_decrease is not None
                    and prev_epoch_avg_nll is not None
                    and epoch_avg_nll.avg > prev_epoch_avg_nll - min_nll_decrease):
                print(f"Avg NLL {epoch_avg_nll.avg:.2f} did not improve on "
                      f"{prev_epoch_avg_nll:.2f} by at least {min_nll_decrease:.2f}; "
                      f"stopping early.")
                break
            prev_epoch_avg_nll = epoch_avg_nll.avg

        return model, step

    def evaluate(model, eval_loader, eval_size, gold_dict):
        pred_dict = {}
        with torch.no_grad(), tqdm(total=eval_size) as progress_bar:
            nll_meter = util.AverageMeter()
            for sample in eval_loader:
                loss, batch_size, preds = process_sample(
                    sample, model, gold_dict)
                nll_meter.update(loss.item(), batch_size)
                pred_dict.update(preds)
                progress_bar.update(batch_size)
                progress_bar.set_postfix(NLL=nll_meter.avg)

            results = {
                **util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2),
                **{
                    'NLL': nll_meter.avg
                }
            }
        return results, pred_dict

    def kfold_training_function(experiment, config):
        avg_meter = util.MultiAverageMeter(['F1', 'EM', 'AvNA', 'NLL'])
        gold_dict = train_gold_dict
        for fold_index, train_loader, train_size, test_loader, test_size in kfold_generator(
                args, k_fold_splits, training_dataset):
            save_dir = os.path.join(experiment_save_dir,
                                    *GridSearch.experiment_path(experiment),
                                    f"fold={fold_index + 1}")
            tbx = SummaryWriter(save_dir)

            model, steps = run_experiment(tbx, train_loader, train_size,
                                          test_loader, test_size, gold_dict,
                                          config)
            results, _ = evaluate(model, test_loader, test_size, gold_dict)
            avg_meter.update(results, steps)

        return {**experiment, **avg_meter.avg}

    def training_function(experiment, config):
        import torch.utils.data as data
        train_loader = data.DataLoader(training_dataset,
                                       shuffle=True,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       collate_fn=None)
        eval_loader = data.DataLoader(eval_dataset,
                                      shuffle=False,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      collate_fn=None)
        save_dir = os.path.join(experiment_save_dir,
                                *GridSearch.experiment_path(experiment))
        tbx = SummaryWriter(save_dir)

        train_size = len(training_dataset)
        eval_size = len(eval_dataset)
        model, steps = run_experiment(tbx, train_loader, train_size,
                                      eval_loader, eval_size, eval_gold_dict,
                                      config)
        results, _ = evaluate(model, eval_loader, eval_size, eval_gold_dict)

        return {**experiment, **results}

    return kfold_training_function if k_fold_splits is not None else training_function
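A hedged sketch of how the factory above might be driven; the experiment and config dictionaries and the save directory are hypothetical placeholders, since the grid-search loop that supplies them is outside this snippet.

# Hypothetical usage: build the training function and run a single configuration.
train_fn = create_training_function(args, experiment_save_dir='./experiments')
experiment = {'lr': 0.5, 'drop_prob': 0.2}   # assumed grid-search point
config = {'hidden_size': 100}                # assumed model config
results = train_fn(experiment, config)       # k-fold variant is returned when args.k_fold is set
print(results)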
Example #4
def load_training_data(args):
    return util.SQuAD(
        util.preprocessed_path(args.train_record_file, args.data_dir,
                               args.dataset), args.use_squad_v2)
Example #5
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, args.dataset, mode="train")
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get data loader
    log.info('Building dataset...')
    collate_fn = None if args.name in ['qanet'] else util.collate_fn
    train_loader, train_size, dev_loader, dev_size = build_datasets(args, collate_fn)
    dev_eval_dict = util.load_eval_file(args, args.dev_eval_file)

    # Get model
    log.info(f'Building {args.name} model...')
    config = None
    if args.config_file:
        with open(args.config_file, 'r') as pf:
            config = json_load(pf)
        log.info(f"Model config: {dumps(config, indent=4, sort_keys=True)}")
    model, optimizer, scheduler, ema, step = init_training(args, *(load_embeddings(args)), device, config=config)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // train_size
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), tqdm(total=train_size) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                # if use_char_vectors:
                #     log_p1, log_p2 = model(cw_idxs.to(device), qw_idxs.to(device))
                # else:
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)

                y1, y2 = y1.to(device), y2.to(device)
                nll_loss_1 = F.nll_loss(log_p1, y1)
                nll_loss_2 = F.nll_loss(log_p2, y2)
                loss = nll_loss_1 + nll_loss_2
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                current_lr = optimizer.param_groups[0]['lr']
                progress_bar.set_postfix(epoch=epoch, STEP=util.millify(step), LR=current_lr, NLL=loss_val)

                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               current_lr,
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    dev_eval_file = util.preprocessed_path(args.dev_eval_file, args.data_dir, args.dataset)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)

                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_dict=dev_eval_dict,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
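The training loop above calls ema(model, num_updates), ema.assign(model), and ema.resume(model), but the EMA class itself is not shown; the sketch below is an assumed compatible implementation that keeps an exponential moving average of the parameters and swaps it in for evaluation.

class EMA:
    """Exponential moving average of model parameters (sketch of the assumed interface)."""
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {name: p.data.clone() for name, p in model.named_parameters()
                       if p.requires_grad}
        self.original = {}

    def __call__(self, model, num_updates):
        # Bias-corrected decay early in training, then update the shadow copies.
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[name] = (1.0 - decay) * p.data + decay * self.shadow[name]

    def assign(self, model):
        # Swap the averaged weights in for evaluation.
        for name, p in model.named_parameters():
            if p.requires_grad:
                self.original[name] = p.data.clone()
                p.data.copy_(self.shadow[name])

    def resume(self, model):
        # Restore the original training weights.
        for name, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.original[name])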
Example #6
def load_embeddings(args):
    word_emb_file = util.preprocessed_path(args.word_emb_file, args.data_dir, args.dataset)
    char_emb_file = util.preprocessed_path(args.char_emb_file, args.data_dir, args.dataset)
    word_vectors = util.torch_from_json(word_emb_file)
    char_vectors = util.torch_from_json(char_emb_file)
    return word_vectors, char_vectors
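A brief usage sketch, assuming an args object with the fields used above: the returned tensors can seed PyTorch embedding layers via nn.Embedding.from_pretrained.

import torch.nn as nn

word_vectors, char_vectors = load_embeddings(args)
# Freeze pre-trained word vectors; keep character vectors trainable (a common choice).
word_emb = nn.Embedding.from_pretrained(word_vectors, freeze=True)
char_emb = nn.Embedding.from_pretrained(char_vectors, freeze=False)
print(word_emb.weight.shape, char_emb.weight.shape)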
Example #7
def pre_process(args):
    train_file = args.train_file
    dev_file = args.dev_file
    test_file = args.test_file

    word_emb_file = preprocessed_path(args.word_emb_file, args.data_dir, args.dataset)
    char_emb_file = preprocessed_path(args.char_emb_file, args.data_dir, args.dataset)
    word2idx_file = preprocessed_path(args.word2idx_file, args.data_dir, args.dataset)
    char2idx_file = preprocessed_path(args.char2idx_file, args.data_dir, args.dataset)

    train_eval_file = preprocessed_path(args.train_eval_file, args.data_dir, args.dataset)
    dev_eval_file = preprocessed_path(args.dev_eval_file, args.data_dir, args.dataset)

    dev_meta_file = preprocessed_path(args.dev_meta_file, args.data_dir, args.dataset)
    test_meta_file = preprocessed_path(args.test_meta_file, args.data_dir, args.dataset)

    train_record_file = preprocessed_path(args.train_record_file, args.data_dir, args.dataset)
    dev_record_file = preprocessed_path(args.dev_record_file, args.data_dir, args.dataset)
    test_record_file = preprocessed_path(args.test_record_file, args.data_dir, args.dataset)

    preprocess_targets = [
        word_emb_file, char_emb_file,
        word2idx_file, char2idx_file,
        train_eval_file, dev_eval_file,
        dev_meta_file, test_meta_file,
        train_record_file, dev_record_file, test_record_file
    ]

    if all(os.path.exists(p) for p in preprocess_targets):
        print("Preprocess skipped: all target files exist")
        return

    # Process training set and use it to decide on the word/character vocabularies
    word_counter, char_counter = Counter(), Counter()
    train_examples, train_eval = process_file(train_file, "train", word_counter, char_counter)
    word_emb_mat, word2idx_dict = get_embedding(
        word_counter, 'word', emb_file=args.glove_file, vec_size=args.glove_dim, num_vectors=args.glove_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(
        char_counter, 'char', emb_file=None, vec_size=args.char_dim)

    # Process dev and test sets
    dev_examples, dev_eval = process_file(dev_file, "dev", word_counter, char_counter)
    build_features(args, train_examples, "train", train_record_file, word2idx_dict, char2idx_dict)
    dev_meta = build_features(args, dev_examples, "dev", dev_record_file, word2idx_dict, char2idx_dict)
    if args.include_test_examples:
        test_examples, test_eval = process_file(test_file, "test", word_counter, char_counter)
        save(args.test_eval_file, test_eval, message="test eval")
        test_meta = build_features(args, test_examples, "test",
                                   test_record_file, word2idx_dict, char2idx_dict, is_test=True)
        save(test_meta_file, test_meta, message="test meta")

    save(word_emb_file, word_emb_mat, message="word embedding")
    save(char_emb_file, char_emb_mat, message="char embedding")
    save(train_eval_file, train_eval, message="train eval")
    save(dev_eval_file, dev_eval, message="dev eval")
    save(word2idx_file, word2idx_dict, message="word dictionary")
    save(char2idx_file, char2idx_dict, message="char dictionary")
    save(dev_meta_file, dev_meta, message="dev meta")
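The save helper used throughout pre_process is not shown; a minimal sketch consistent with the save(path, obj, message=...) calls above would log the message and dump the object as JSON, though the project's actual helper may differ.

import json

def save(filename, obj, message=None):
    # Sketch of the assumed helper: optionally log, then write the object as JSON.
    if message is not None:
        print(f"Saving {message}...")
    with open(filename, 'w') as fh:
        json.dump(obj, fh)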