Example #1
def evaluate(model, data_loader, device, eval_file, args):
    nll_meter = stats.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad():
        for x, y, c_padding_mask, c_starts, ids in data_loader:
            batch_size = x.size(0)
            # Forward through the shared helper (returns output, NLL loss value, span scores)
            _, loss_val, scores = forward(x, y, c_padding_mask, args, device,
                                          model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            preds, _ = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
                c_starts.tolist(),
            )
            pred_dict.update(preds)

    model.train()

    results = {"NLL": nll_meter.avg}
    results.update(eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2))
    return results, pred_dict
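
# Every example in this file tracks the evaluation loss with `stats.AverageMeter`.
# The `stats` module itself is not shown; the class below is a minimal sketch of the
# interface assumed here, inferred from the `.update(val, n)` / `.avg` usage in the
# examples, not the project's actual implementation.
class AverageMeter:
    """Keeps a running average of a scalar quantity (e.g. per-batch NLL)."""

    def __init__(self):
        self.count = 0
        self.sum = 0.0
        self.avg = 0.0

    def reset(self):
        """Forget all previously accumulated values."""
        self.__init__()

    def update(self, val, num_samples=1):
        """Record `val`, the mean over `num_samples` items (e.g. a per-batch mean loss)."""
        self.count += num_samples
        self.sum += val * num_samples
        self.avg = self.sum / self.count
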
Example #2
def evaluate(
    model,
    data_loader,
    device,
    eval_file,
    max_len,
    use_squad_v2,
    args,
    padding_idx,
):
    nll_meter = stats.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            batch_size = cw_idxs.size(0)
            # Forward
            _, loss_val, scores = forward(cw_idxs, qw_idxs, y1, y2,
                                          padding_idx, args, device, model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = eval.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [
        ("NLL", nll_meter.avg),
        ("F1", results["F1"]),
        ("EM", results["EM"]),
    ]
    if use_squad_v2:
        results_list.append(("AvNA", results["AvNA"]))
    results = OrderedDict(results_list)

    return results, pred_dict
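
# Examples #2 and #5 delegate the forward pass to a shared `forward(cw_idxs, qw_idxs,
# y1, y2, padding_idx, args, device, model)` helper (Examples #1 and #6 call a BPE
# variant with a different signature). That helper is not part of these examples; the
# sketch below is a hypothetical reconstruction based only on the call sites: it must
# return (output, scalar loss value, span scores), where the scores are later turned
# into start/end probabilities by `model.module.get_prob`. The model call signature,
# the masking, and the loss formulation are all assumptions.
import torch
import torch.nn.functional as F


def forward_sketch(cw_idxs, qw_idxs, y1, y2, padding_idx, args, device, model):
    # Move inputs and targets onto the evaluation device.
    cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
    y1, y2 = y1.to(device), y2.to(device)
    # Assumed: padded context positions are flagged so the model can mask them out.
    c_padding_mask = cw_idxs == padding_idx
    # Assumed model output: per-position start/end scores of shape (batch, c_len, 2).
    scores = model(cw_idxs, qw_idxs, c_padding_mask)
    log_p1, log_p2 = F.log_softmax(scores, dim=1).split(1, dim=-1)
    log_p1, log_p2 = log_p1.squeeze(-1), log_p2.squeeze(-1)
    # Negative log-likelihood of the gold start/end positions (matches the nll_meter usage).
    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
    # `args` is unused in this simplified sketch.
    return (log_p1, log_p2), loss.item(), scores
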
Example #3
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = stats.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = {"NLL": nll_meter.avg}
    results.update(eval.eval_dicts(gold_dict, pred_dict, use_squad_v2))
    return results, pred_dict
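
# Every example converts start/end probabilities into a single answer span with
# `util.discretize(p_start, p_end, max_len, use_squad_v2)`. The project's implementation
# is not shown; the function below is a self-contained sketch of the selection rule the
# call sites imply: maximize p_start[i] * p_end[j] over pairs with i <= j < i + max_len,
# and, when use_squad_v2 is set, treat position 0 as the no-answer token so that (0, 0)
# is a legal prediction.
import torch


def discretize_sketch(p_start, p_end, max_len=15, no_answer=False):
    # p_start, p_end: (batch_size, c_len) probabilities over context positions.
    batch_size, c_len = p_start.size()
    # Joint probability of every (start, end) pair: (batch_size, c_len, c_len).
    joint = p_start.unsqueeze(2) * p_end.unsqueeze(1)
    # Keep only pairs with start <= end and (end - start) < max_len.
    band = torch.triu(torch.ones(c_len, c_len, device=joint.device))
    band = band - torch.triu(band, diagonal=max_len)
    if no_answer:
        # Remember the no-answer cell, then exclude index 0 from ordinary spans.
        no_ans_prob = joint[:, 0, 0].clone()
        band[0, :] = 0.0
        band[:, 0] = 0.0
    joint = joint * band
    # Best (start, end) pair per batch element.
    flat = joint.view(batch_size, -1)
    best_prob, flat_idx = flat.max(dim=-1)
    starts = torch.div(flat_idx, c_len, rounding_mode="floor")
    ends = flat_idx % c_len
    if no_answer:
        # Predict (0, 0) when "no answer" beats the best span.
        is_no_ans = no_ans_prob > best_prob
        starts = torch.where(is_no_ans, torch.zeros_like(starts), starts)
        ends = torch.where(is_no_ans, torch.zeros_like(ends), ends)
    return starts, ends


# Example usage with random probabilities:
# p1 = torch.softmax(torch.randn(2, 20), dim=-1)
# p2 = torch.softmax(torch.randn(2, 20), dim=-1)
# starts, ends = discretize_sketch(p1, p2, max_len=15, no_answer=True)
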
Example #4
def test(args):
    # Set up logging
    log = util.get_logger(args.save_dir, args.name)
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(
        word_vectors=word_vectors,
        hidden_size=args.hidden_size,
        use_glove=args.use_glove,
    )
    model = nn.DataParallel(model, gpu_ids)
    log.info(f"Loading checkpoint from {args.load_path}...")
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = {"NLL": nll_meter.avg}
        results.update(eval.eval_dicts(gold_dict, pred_dict,
                                       args.use_squad_v2))

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
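
# The test() functions here (Examples #4 through #6) are driven entirely by an `args`
# namespace. The sketch below shows one way such a namespace could be produced with
# argparse; the flag names mirror attributes the examples actually read, but the real
# argument parser is not shown, so every default (and the entry-point name itself) is
# an assumption.
import argparse


def get_test_args_sketch():
    parser = argparse.ArgumentParser("Evaluate a trained SQuAD model")
    parser.add_argument("--name", required=True, help="Run name used for logging")
    parser.add_argument("--save_dir", default="./save/test")
    parser.add_argument("--load_path", required=True, help="Path to a model checkpoint")
    parser.add_argument("--split", choices=("train", "dev", "test"), default="dev")
    parser.add_argument("--sub_file", default="submission.csv")
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--max_ans_len", type=int, default=15)
    parser.add_argument("--num_visuals", type=int, default=10)
    parser.add_argument("--use_squad_v2", type=lambda s: s.lower() in ("true", "1"),
                        default=True)
    # The real parser would also need the per-split record/eval file paths
    # (e.g. --dev_record_file, --dev_eval_file), the embedding file, and the model
    # hyperparameters read by each test() variant; they are omitted from this sketch.
    return parser.parse_args()


if __name__ == "__main__":
    test(get_test_args_sketch())
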
Example #5
def test(args):
    # Set up logging
    log = util.get_logger(args.save_dir, args.name)
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # TODO: padding_idx is hardcoded to 0 for now
    padding_idx = 0

    # Get model
    log.info("Building model...")
    model = WordTransformerQA(
        dim=args.dim,
        n_heads=args.n_heads,
        ff_dim=args.ff_dim,
        activation=args.activation,
        dropout=args.dropout,
        attn_dropout=args.attn_dropout,
        act_dropout=args.act_dropout,
        n_layers=args.n_layers,
        max_positions=args.max_positions,
        word_vectors=word_vectors,
    )
    model = nn.DataParallel(model, gpu_ids)
    log.info(f"Loading checkpoint from {args.load_path}...")
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            batch_size = cw_idxs.size(0)
            # Forward
            _, loss_val, scores = forward(cw_idxs, qw_idxs, y1, y2,
                                          padding_idx, args, device, model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [
            ("NLL", nll_meter.avg),
            ("F1", results["F1"]),
            ("EM", results["EM"]),
        ]
        if args.use_squad_v2:
            results_list.append(("AvNA", results["AvNA"]))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
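
# Each example scores pred_dict against gold_dict with `eval.eval_dicts(...)`, which is
# assumed to report the usual SQuAD metrics (EM, F1, and AvNA for SQuAD 2.0). The
# project's implementation is not shown; below is a self-contained sketch of the
# standard per-answer Exact Match and token-level F1 that such metrics are built on.
import re
import string
from collections import Counter


def normalize_answer(s):
    """Lowercase, drop punctuation and articles, collapse whitespace (SQuAD convention)."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())


def exact_match(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))


def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    if not pred_tokens or not gold_tokens:
        # Empty string means "no answer" under SQuAD 2.0; full credit only if both agree.
        return float(pred_tokens == gold_tokens)
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
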
Example #6
def test(args):
    trainer = base_trainer.Trainer(is_train=False)
    args, device = get_args(args)
    args, log, tbx = trainer.setup(args)

    # Get BPE
    log.info("Loading BPE...")
    bpe = get_bpe(args)
    log.info("Loaded {} BPE tokens".format(len(bpe)))

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset, data_loader = get_dataset(args,
                                       record_file,
                                       shuffle=False,
                                       randomize=False)

    # Get model
    log.info("Building model...")
    model = get_model(args, bpe)
    model = trainer.setup_model(model, device)
    model.eval()

    trainer.setup_close()

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for x, y, c_padding_mask, c_starts, ids in data_loader:
            batch_size = x.size(0)
            # Forward
            _, loss_val, scores = forward(x, y, c_padding_mask, args, device,
                                          model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
                c_starts.tolist(),
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = {"NLL": nll_meter.avg}
        results.update(eval.eval_dicts(gold_dict, pred_dict,
                                       args.use_squad_v2))

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
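
# All of the examples map predicted token positions back to answer text with
# `util.convert_tokens`; the BPE-based ones (Examples #1 and #6) pass an extra
# `c_starts` argument, presumably the offset of the context inside the packed input.
# The word-level version is not shown either; the function below is a sketch of what
# the call sites imply, assuming gold_dict maps str(id) to a record with "context",
# "spans" (per-token character spans), and "uuid" fields.
def convert_tokens_sketch(gold_dict, ids, starts, ends, no_answer):
    pred_dict, sub_dict = {}, {}
    for qid, start, end in zip(ids, starts, ends):
        context = gold_dict[str(qid)]["context"]
        spans = gold_dict[str(qid)]["spans"]
        uuid = gold_dict[str(qid)]["uuid"]
        if no_answer and (start == 0 or end == 0):
            # Position 0 is the no-answer token under SQuAD 2.0.
            pred_dict[str(qid)] = ""
            sub_dict[uuid] = ""
        else:
            if no_answer:
                # Shift past the prepended no-answer token.
                start, end = start - 1, end - 1
            start_char = spans[start][0]
            end_char = spans[end][1]
            pred_dict[str(qid)] = context[start_char:end_char]
            sub_dict[uuid] = context[start_char:end_char]
    return pred_dict, sub_dict
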