Example #1
def evaluate_src_model(_log, _run, device="cpu"):
    """Evaluate the source model."""
    class Wrapper(torch.nn.Module):
        def __init__(self, model):
            super().__init__()
            self.model = model

        def forward(self, *args, **kwargs):
            # the first element of the model output holds the per-token emission scores (logits)
            emissions = self.model(*args, **kwargs)[0]
            bsz, slen, nl = emissions.shape

            scores = emissions[:, :-1].unsqueeze(2)
            assert scores.shape == (bsz, slen - 1, 1, nl)
            scores = scores.expand(bsz, slen - 1, nl, nl)

            scores = scores.clone()
            scores[:, -1] += emissions[:, -1].unsqueeze(2)
            assert scores.shape == (bsz, slen - 1, nl, nl)

            return scores

    model_name = "clulab/roberta-timex-semeval"
    _log.info("Loading %s", model_name)
    config = AutoConfig.from_pretrained(model_name)
    model = Wrapper(
        AutoModelForTokenClassification.from_pretrained(model_name,
                                                        config=config))
    model.to(device)

    _log.info("Evaluating")
    eval_score, _ = run_eval(model, config.id2label, read_samples_())
    print_accs(eval_score)

    return eval_score["f1"]
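
A minimal illustrative sketch (hypothetical sizes, no model involved) of the shape manipulation done by Wrapper.forward above: per-token emissions of shape (batch, seq_len, num_labels) become pairwise scores of shape (batch, seq_len - 1, num_labels, num_labels), with the last position's emissions folded into the final slice.

import torch

bsz, slen, nl = 2, 5, 4  # hypothetical batch size, sequence length, label count
emissions = torch.randn(bsz, slen, nl)

scores = emissions[:, :-1].unsqueeze(2)           # (bsz, slen - 1, 1, nl)
scores = scores.expand(bsz, slen - 1, nl, nl).clone()
scores[:, -1] += emissions[:, -1].unsqueeze(2)    # add the last position's emissions
assert scores.shape == (bsz, slen - 1, nl, nl)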
Example #2
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return

        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"])
        print_accs(eval_state["counts"].accs,
                   on="test",
                   run=_run,
                   step=state["n_iters"])
Example #3
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])

        ppt_loss = eval_state["mean_ppt_loss"]
        _log.info("dev_ppt_loss: %.4f", ppt_loss)
        _run.log_scalar("dev_ppt_loss", ppt_loss, step=state["n_iters"])

        state["dev_accs"] = accs
Example #4
    def maybe_eval_on_test(state):
        if not state["better"]:
            return

        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"])
        state["test_accs"] = eval_state["counts"].accs
        print_accs(state["test_accs"],
                   on="test",
                   run=_run,
                   step=state["n_iters"])
Example #5
def evaluate(
    _log,
    _run,
    temperature=1.0,
    artifacts_dir="artifacts",
    load_params="model.pth",
    device="cpu",
    save_confusion_matrix=False,
):
    """Evaluate a trained target model."""
    model_name = "clulab/roberta-timex-semeval"
    _log.info("Loading %s", model_name)
    config = AutoConfig.from_pretrained(model_name)
    token_clf = AutoModelForTokenClassification.from_pretrained(model_name,
                                                                config=config)
    model = RoBERTagger(token_clf, config.num_labels, temperature)

    artifacts_dir = Path(artifacts_dir)
    _log.info("Loading model parameters from %s", artifacts_dir / load_params)
    model.load_state_dict(torch.load(artifacts_dir / load_params, "cpu"))
    model.to(device)

    _log.info("Evaluating")
    eval_score, _ = run_eval(model,
                             config.id2label,
                             read_samples_(),
                             confusion=save_confusion_matrix)
    c = eval_score.pop("confusion", None)
    print_accs(eval_score, on="test", run=_run)
    if c is not None:
        labels = set()
        for k in c.keys():
            labels.update(k)
        if "O" in labels:
            labels.remove("O")
        labels = sorted(labels)
        labels.insert(0, "O")

        label2id = {l: i for i, l in enumerate(labels)}
        m = np.zeros((len(labels), len(labels)))
        for k, cnt in c.items():
            m[label2id[k[0]], label2id[k[1]]] = cnt

        _log.info("Saving labels list in %s", artifacts_dir / "labels.pkl")
        with open(artifacts_dir / "labels.pkl", "wb") as f:
            pickle.dump(labels, f)
        _log.info("Saving confusion matrix in %s",
                  artifacts_dir / "confusion.npy")
        np.save(artifacts_dir / "confusion.npy", m)

    return eval_score["f1"]
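
A small self-contained sketch of how the confusion matrix above is assembled from pairwise counts. The dict below is hypothetical and the key order (gold label, predicted label) is an assumption; "O" is always placed first in the label list.

import numpy as np

c = {("O", "O"): 10, ("TIME", "O"): 2, ("O", "TIME"): 1, ("TIME", "TIME"): 7}  # hypothetical counts

labels = set()
for k in c.keys():
    labels.update(k)
labels.discard("O")
labels = sorted(labels)
labels.insert(0, "O")          # ['O', 'TIME']

label2id = {l: i for i, l in enumerate(labels)}
m = np.zeros((len(labels), len(labels)))
for k, cnt in c.items():
    m[label2id[k[0]], label2id[k[1]]] = cnt  # rows indexed by k[0], columns by k[1]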
Example #6
    def evaluate(state):
        _log.info("Evaluating on train")
        eval_score, loss = run_eval(model,
                                    config.id2label,
                                    samples,
                                    compute_loss=True)
        if eval_score is not None:
            print_accs(eval_score, on="train", run=_run, step=state["n_iters"])
        _log.info("train_ptst_loss: %.4f", loss)
        _run.log_scalar("train_ptst_loss", loss, step=state["n_iters"])

        _log.info("Evaluating on eval")
        eval_score, _ = run_eval(model, config.id2label, eval_samples)
        if eval_score is not None:
            print_accs(eval_score, on="eval", run=_run, step=state["n_iters"])

        state["eval_f1"] = None if eval_score is None else eval_score["f1"]
Example #7
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])

        scheduler.step(accs["las_nopunct"])

        if eval_state["counts"].larcs_nopunct > state["dev_larcs_nopunct"]:
            state["better"] = True
        elif eval_state["counts"].larcs_nopunct < state["dev_larcs_nopunct"]:
            state["better"] = False
        elif eval_state["counts"].uarcs_nopunct > state["dev_uarcs_nopunct"]:
            state["better"] = True
        else:
            state["better"] = False

        if state["better"]:
            _log.info("Found new best result on dev!")
            state["dev_larcs_nopunct"] = eval_state["counts"].larcs_nopunct
            state["dev_uarcs_nopunct"] = eval_state["counts"].uarcs_nopunct
            state["dev_accs"] = accs
            state["dev_epoch"] = state["epoch"]
        else:
            _log.info("Not better, the best so far is epoch %d:",
                      state["dev_epoch"])
            print_accs(state["dev_accs"])
            print_accs(state["test_accs"], on="test")
Example #8
def validate(val_loader, model, use_val_th=False):
    batch_time = AverageMeter()
    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (inputs, targets, infos) in enumerate(val_loader):
            if default.generate_features_all:
                logger.info('generating features, batch %d', i)
            filenames = [info[0] for info in infos]
            lesion_idxs = [info[1] for info in infos]
            inputs = [input.cuda() for input in inputs]
            unc_targets = targets[1]
            targets = targets[0]

            # compute output
            out = model(inputs)
            if config.SCORE_PROPAGATION:
                prob_np = out['class_prob2'].detach().cpu().numpy()
                scores_np = out['class_score2'].detach().cpu().numpy()
            else:
                prob_np = out['class_prob1'].detach().cpu().numpy()
                scores_np = out['class_score1'].detach().cpu().numpy()

            target1 = targets.numpy() > 0
            pred_wt = unc_targets.numpy() == 0
            if i == 0:
                target_all = target1
                prob_all = prob_np
                score_all = scores_np
                lesion_idx_all = lesion_idxs
                pred_wt_all = pred_wt
                if default.generate_features_all:
                    ft_all = out['emb']
            else:
                target_all = np.vstack((target_all, target1))
                prob_all = np.vstack((prob_all, prob_np))
                score_all = np.vstack((score_all, scores_np))
                pred_wt_all = np.vstack((pred_wt_all, pred_wt))
                lesion_idx_all.extend(lesion_idxs)
                if default.generate_features_all:
                    ft_all = np.vstack((ft_all, out['emb']))

        if default.generate_features_all:
            save_ft_to_file(ft_all)
            assert 0, 'all features have been generated and saved.'

        if config.TEST.USE_CALIBRATED_TH:
            accs, pred_label_all = compute_all_acc_wt_th(target_all, prob_all, pred_wt_all, use_val_th)
        else:
            pred_label_all = score2label(prob_all, config.TEST.SCORE_PARAM)
            accs = compute_all_acc_wt(target_all, pred_label_all, prob_all, pred_wt_all)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % default.frequent == 0:
            logger.info('Test: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        '{crit} {accs:.3f}'.format(
                            i,
                            len(val_loader),
                            batch_time=batch_time,
                            crit=config.TEST.CRITERION,
                            accs=accs[config.TEST.CRITERION]))

        print_accs(accs)
        accs['ex_neg'] = np.sum((target_all == 0) & pred_wt_all, axis=0)

        if use_val_th:  # only save for test set not val set
            save_acc_to_file(accs, val_loader, 'all_terms')
        if default.mode == 'infer' and use_val_th:
            save_test_scores_to_file(score_all, pred_label_all, target_all, accs, lesion_idx_all)

    return accs
Example #9
def evaluate(
    _log,
    _run,
    max_length=None,
    artifacts_dir="artifacts",
    load_params="model.pth",
    word_emb_path="wiki.id.vec",
    device="cpu",
):
    """Evaluate a trained self-attention graph-based parser."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)

    samples = {}
    try:
        samples["dev"] = list(
            read_samples(which="dev", max_length=max_length.get("dev")))
    except FileNotFoundError:
        _log.info("Dev set is not found, skipping")
    samples["test"] = list(
        read_samples(which="test", max_length=max_length.get("test")))

    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh,
                  n_toks)

    path = artifacts_dir / "vocab.yml"
    _log.info("Loading source vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab.keys():
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocab with target words")
    old_n_words = len(vocab["words"])
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = artifacts_dir / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = artifacts_dir / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    if len(vocab["words"]) > old_n_words:
        _log.info("Creating extended word embedding layer")
        if word_emb_path:
            kv = KeyedVectors.load_word2vec_format(word_emb_path)
            assert model.word_emb.embedding_dim == kv.vector_size
        else:
            _log.warning(
                "Word embedding file not specified; any extra target words will be treated as unks"
            )
            kv = None
        with torch.no_grad():
            model.word_emb = torch.nn.Embedding.from_pretrained(
                extend_word_embedding(
                    model.word_emb.weight,
                    vocab["words"],
                    kv,
                    vocab["words"].index(vocab.UNK_TOKEN),
                ))

    model.to(device)
    dev_accs = {}
    for wh in samples:
        _log.info("Evaluating on %s", wh)
        state = run_eval(model, vocab, samples[wh])
        accs = state["counts"].accs
        if wh == "dev":
            dev_accs = accs
        print_accs(accs, on=wh, run=_run)

        if "type2counts" in state:
            _log.info("Type-wise accuracies:")
            for type_, c in state["type2counts"].items():
                for key, acc in c.accs.items():
                    metric_name = f"{wh}_{type_}_{key}"
                    _log.info(f"{metric_name}: {acc:.2%}")
                    _run.log_scalar(metric_name, acc)

                for suffix in ("", "_nopunct"):
                    metric_name = f"{wh}_{type_}_n_arcs{suffix}"
                    _log.info("%s: %d", metric_name,
                              getattr(c, f"n_arcs{suffix}"))
                    _run.log_scalar(metric_name, getattr(c, f"n_arcs{suffix}"))

    return dev_accs.get("las_nopunct")
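
extend_word_embedding is not shown in these examples. The following is only a hypothetical sketch of what such a helper might do, assuming it keeps the original rows and fills rows for newly added words from the word-vector file when available, otherwise copies the UNK row:

import torch


def extend_word_embedding_sketch(weight, words, kv=None, unk_id=0):
    """Hypothetical stand-in for extend_word_embedding; not the actual implementation."""
    new_weight = weight.new_zeros((len(words), weight.size(1)))
    new_weight[: weight.size(0)] = weight  # keep the pretrained rows
    for i in range(weight.size(0), len(words)):
        if kv is not None and words[i] in kv:
            new_weight[i] = torch.from_numpy(kv[words[i]].copy())
        else:
            new_weight[i] = weight[unk_id]  # fall back to the UNK row
    return new_weight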
Example #10
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])
        state["dev_accs"] = accs