def evaluate_src_model(_log, _run, device="cpu"):
    """Evaluate the source model."""

    # Wrap the token classifier so its per-token emissions are exposed as
    # pairwise (nl x nl) scores over adjacent token positions.
    class Wrapper(torch.nn.Module):
        def __init__(self, model):
            super().__init__()
            self.model = model

        def forward(self, *args, **kwargs):
            emissions = self.model(*args, **kwargs)[0]
            bsz, slen, nl = emissions.shape
            scores = emissions[:, :-1].unsqueeze(2)
            assert scores.shape == (bsz, slen - 1, 1, nl)
            scores = scores.expand(bsz, slen - 1, nl, nl)
            scores = scores.clone()
            # Fold the last token's emissions into the final position.
            scores[:, -1] += emissions[:, -1].unsqueeze(2)
            assert scores.shape == (bsz, slen - 1, nl, nl)
            return scores

    model_name = "clulab/roberta-timex-semeval"
    _log.info("Loading %s", model_name)
    config = AutoConfig.from_pretrained(model_name)
    model = Wrapper(
        AutoModelForTokenClassification.from_pretrained(model_name, config=config))
    model.to(device)

    _log.info("Evaluating")
    eval_score, _ = run_eval(model, config.id2label, read_samples_())
    print_accs(eval_score)

    return eval_score["f1"]

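# --- Illustrative sketch (not part of the experiment above) ---
# Shows, with made-up toy sizes, the shape manipulation done in Wrapper.forward:
# per-token emissions of shape (bsz, slen, nl) are broadcast into pairwise
# scores of shape (bsz, slen - 1, nl, nl). The values of bsz, slen, and nl
# below are arbitrary and not taken from the real model.
import torch

bsz, slen, nl = 2, 5, 3
emissions = torch.randn(bsz, slen, nl)

scores = emissions[:, :-1].unsqueeze(2)                # (bsz, slen - 1, 1, nl)
scores = scores.expand(bsz, slen - 1, nl, nl).clone()  # broadcast over one label axis
scores[:, -1] += emissions[:, -1].unsqueeze(2)         # add the last token's emissions
assert scores.shape == (bsz, slen - 1, nl, nl)
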
def maybe_eval_on_test(state):
    if state["epoch"] != max_epoch:
        return

    _log.info("Evaluating on test")
    eval_state = run_eval(model, vocab, samples["test"])
    print_accs(eval_state["counts"].accs, on="test", run=_run, step=state["n_iters"])

def eval_on_dev(state):
    _log.info("Evaluating on dev")
    eval_state = run_eval(model, vocab, samples["dev"])
    accs = eval_state["counts"].accs
    print_accs(accs, run=_run, step=state["n_iters"])
    ppt_loss = eval_state["mean_ppt_loss"]
    _log.info("dev_ppt_loss: %.4f", ppt_loss)
    _run.log_scalar("dev_ppt_loss", ppt_loss, step=state["n_iters"])
    state["dev_accs"] = accs

def maybe_eval_on_test(state):
    if not state["better"]:
        return

    _log.info("Evaluating on test")
    eval_state = run_eval(model, vocab, samples["test"])
    state["test_accs"] = eval_state["counts"].accs
    print_accs(state["test_accs"], on="test", run=_run, step=state["n_iters"])

def evaluate(
        _log,
        _run,
        temperature=1.0,
        artifacts_dir="artifacts",
        load_params="model.pth",
        device="cpu",
        save_confusion_matrix=False,
):
    """Evaluate a trained target model."""
    model_name = "clulab/roberta-timex-semeval"
    _log.info("Loading %s", model_name)
    config = AutoConfig.from_pretrained(model_name)
    token_clf = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
    model = RoBERTagger(token_clf, config.num_labels, temperature)

    artifacts_dir = Path(artifacts_dir)
    _log.info("Loading model parameters from %s", artifacts_dir / load_params)
    model.load_state_dict(torch.load(artifacts_dir / load_params, "cpu"))
    model.to(device)

    _log.info("Evaluating")
    eval_score, _ = run_eval(
        model, config.id2label, read_samples_(), confusion=save_confusion_matrix)
    c = eval_score.pop("confusion", None)
    print_accs(eval_score, on="test", run=_run)

    if c is not None:
        # Collect the label set, keeping "O" as the first label.
        labels = set()
        for k in c.keys():
            labels.update(k)
        if "O" in labels:
            labels.remove("O")
        labels = sorted(labels)
        labels.insert(0, "O")

        # Turn the {(label, label): count} dict into a dense matrix.
        label2id = {l: i for i, l in enumerate(labels)}
        m = np.zeros((len(labels), len(labels)))
        for k, cnt in c.items():
            m[label2id[k[0]], label2id[k[1]]] = cnt

        _log.info("Saving labels list in %s", artifacts_dir / "labels.pkl")
        with open(artifacts_dir / "labels.pkl", "wb") as f:
            pickle.dump(labels, f)
        _log.info("Saving confusion matrix in %s", artifacts_dir / "confusion.npy")
        np.save(artifacts_dir / "confusion.npy", m)

    return eval_score["f1"]

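# --- Illustrative sketch (not from the original code base) ---
# Assuming the evaluate() command above has written artifacts/labels.pkl and
# artifacts/confusion.npy, this is one way to load the saved confusion matrix
# back and inspect it. Whether rows correspond to gold or predicted labels
# depends on the key order of the confusion dict built above.
import pickle
from pathlib import Path

import numpy as np

artifacts_dir = Path("artifacts")
with open(artifacts_dir / "labels.pkl", "rb") as f:
    labels = pickle.load(f)  # "O" first, then the remaining labels in sorted order
confusion = np.load(artifacts_dir / "confusion.npy")

for i, label in enumerate(labels):
    j = int(confusion[i].argmax())
    print(f"{label}: largest count is {int(confusion[i, j])}, paired with {labels[j]}")
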
def evaluate(state):
    _log.info("Evaluating on train")
    eval_score, loss = run_eval(model, config.id2label, samples, compute_loss=True)
    if eval_score is not None:
        print_accs(eval_score, on="train", run=_run, step=state["n_iters"])
    _log.info("train_ptst_loss: %.4f", loss)
    _run.log_scalar("train_ptst_loss", loss, step=state["n_iters"])

    _log.info("Evaluating on eval")
    eval_score, _ = run_eval(model, config.id2label, eval_samples)
    if eval_score is not None:
        print_accs(eval_score, on="eval", run=_run, step=state["n_iters"])
    state["eval_f1"] = None if eval_score is None else eval_score["f1"]

def eval_on_dev(state):
    _log.info("Evaluating on dev")
    eval_state = run_eval(model, vocab, samples["dev"])
    accs = eval_state["counts"].accs
    print_accs(accs, run=_run, step=state["n_iters"])
    scheduler.step(accs["las_nopunct"])

    # A result is better if it has more correct labeled arcs; ties are broken
    # by the number of correct unlabeled arcs.
    if eval_state["counts"].larcs_nopunct > state["dev_larcs_nopunct"]:
        state["better"] = True
    elif eval_state["counts"].larcs_nopunct < state["dev_larcs_nopunct"]:
        state["better"] = False
    elif eval_state["counts"].uarcs_nopunct > state["dev_uarcs_nopunct"]:
        state["better"] = True
    else:
        state["better"] = False

    if state["better"]:
        _log.info("Found new best result on dev!")
        state["dev_larcs_nopunct"] = eval_state["counts"].larcs_nopunct
        state["dev_uarcs_nopunct"] = eval_state["counts"].uarcs_nopunct
        state["dev_accs"] = accs
        state["dev_epoch"] = state["epoch"]
    else:
        _log.info("Not better, the best so far is epoch %d:", state["dev_epoch"])
        print_accs(state["dev_accs"])
        print_accs(state["test_accs"], on="test")

def validate(val_loader, model, use_val_th=False):
    batch_time = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (inputs, targets, infos) in enumerate(val_loader):
            if default.generate_features_all:
                logger.info('generating features, batch %d', i)
            filenames = [info[0] for info in infos]
            lesion_idxs = [info[1] for info in infos]
            inputs = [input.cuda() for input in inputs]
            unc_targets = targets[1]
            targets = targets[0]

            # compute output
            out = model(inputs)
            if config.SCORE_PROPAGATION:
                prob_np = out['class_prob2'].detach().cpu().numpy()
                scores_np = out['class_score2'].detach().cpu().numpy()
            else:
                prob_np = out['class_prob1'].detach().cpu().numpy()
                scores_np = out['class_score1'].detach().cpu().numpy()

            target1 = targets.numpy() > 0
            pred_wt = unc_targets.numpy() == 0

            # accumulate targets, probabilities, scores, and prediction weights
            # across batches
            if i == 0:
                target_all = target1
                prob_all = prob_np
                score_all = scores_np
                lesion_idx_all = lesion_idxs
                pred_wt_all = pred_wt
                if default.generate_features_all:
                    ft_all = out['emb']
            else:
                target_all = np.vstack((target_all, target1))
                prob_all = np.vstack((prob_all, prob_np))
                score_all = np.vstack((score_all, scores_np))
                pred_wt_all = np.vstack((pred_wt_all, pred_wt))
                lesion_idx_all.extend(lesion_idxs)
                if default.generate_features_all:
                    ft_all = np.vstack((ft_all, out['emb']))

        if default.generate_features_all:
            save_ft_to_file(ft_all)
            assert 0, 'all features have been generated and saved.'

        if config.TEST.USE_CALIBRATED_TH:
            accs, pred_label_all = compute_all_acc_wt_th(target_all, prob_all, pred_wt_all, use_val_th)
        else:
            pred_label_all = score2label(prob_all, config.TEST.SCORE_PARAM)
            accs = compute_all_acc_wt(target_all, pred_label_all, prob_all, pred_wt_all)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % default.frequent == 0:
            logger.info('Test: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        '{crit} {accs:.3f}'.format(
                            i, len(val_loader), batch_time=batch_time,
                            crit=config.TEST.CRITERION,
                            accs=accs[config.TEST.CRITERION]))

    print_accs(accs)
    accs['ex_neg'] = np.sum((target_all == 0) & pred_wt_all, axis=0)
    if use_val_th:
        # only save for the test set, not the val set
        save_acc_to_file(accs, val_loader, 'all_terms')
    if default.mode == 'infer' and use_val_th:
        save_test_scores_to_file(score_all, pred_label_all, target_all, accs, lesion_idx_all)
    return accs

def evaluate(
        _log,
        _run,
        max_length=None,
        artifacts_dir="artifacts",
        load_params="model.pth",
        word_emb_path="wiki.id.vec",
        device="cpu",
):
    """Evaluate a trained self-attention graph-based parser."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    samples = {}
    try:
        samples["dev"] = list(
            read_samples(which="dev", max_length=max_length.get("dev")))
    except FileNotFoundError:
        _log.info("Dev set is not found, skipping")
    samples["test"] = list(
        read_samples(which="test", max_length=max_length.get("test")))
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh, n_toks)

    path = artifacts_dir / "vocab.yml"
    _log.info("Loading source vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab.keys():
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocab with target words")
    old_n_words = len(vocab["words"])
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))
    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = artifacts_dir / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = artifacts_dir / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    if len(vocab["words"]) > old_n_words:
        # The vocabulary grew with target words, so the word embedding layer
        # must be extended to cover them.
        _log.info("Creating extended word embedding layer")
        if word_emb_path:
            kv = KeyedVectors.load_word2vec_format(word_emb_path)
            assert model.word_emb.embedding_dim == kv.vector_size
        else:
            _log.warning(
                "Word embedding file not specified; any extra target words will be treated as unks"
            )
            kv = None
        with torch.no_grad():
            model.word_emb = torch.nn.Embedding.from_pretrained(
                extend_word_embedding(
                    model.word_emb.weight,
                    vocab["words"],
                    kv,
                    vocab["words"].index(vocab.UNK_TOKEN),
                ))
    model.to(device)

    dev_accs = {}
    for wh in samples:
        _log.info("Evaluating on %s", wh)
        state = run_eval(model, vocab, samples[wh])
        accs = state["counts"].accs
        if wh == "dev":
            dev_accs = accs
        print_accs(accs, on=wh, run=_run)

        if "type2counts" in state:
            _log.info("Type-wise accuracies:")
            for type_, c in state["type2counts"].items():
                for key, acc in c.accs.items():
                    metric_name = f"{wh}_{type_}_{key}"
                    _log.info(f"{metric_name}: {acc:.2%}")
                    _run.log_scalar(metric_name, acc)
                for suffix in ("", "_nopunct"):
                    metric_name = f"{wh}_{type_}_n_arcs{suffix}"
                    _log.info("%s: %d", metric_name, getattr(c, f"n_arcs{suffix}"))
                    _run.log_scalar(metric_name, getattr(c, f"n_arcs{suffix}"))

    return dev_accs.get("las_nopunct")

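# --- Illustrative sketch (not the repo's extend_word_embedding) ---
# One plausible way an embedding-extension step like the one used above could
# work: keep the pretrained rows, and for every newly added word take its
# vector from a word2vec-format KeyedVectors file when available, otherwise
# fall back to the UNK row. The function name and signature here are
# assumptions for illustration only.
import torch
from gensim.models import KeyedVectors


def extend_embedding_sketch(old_weight, words, kv, unk_id):
    """old_weight: (n_old, dim) tensor; words: the extended word list; kv: KeyedVectors or None."""
    n_old, dim = old_weight.shape
    new_weight = old_weight.new_empty((len(words), dim))
    new_weight[:n_old] = old_weight
    for idx in range(n_old, len(words)):
        word = words[idx]
        if kv is not None and word in kv:
            new_weight[idx] = torch.from_numpy(kv[word].copy())
        else:
            new_weight[idx] = old_weight[unk_id]
    # The result could then be wrapped with torch.nn.Embedding.from_pretrained(...).
    return new_weight
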
def eval_on_dev(state):
    _log.info("Evaluating on dev")
    eval_state = run_eval(model, vocab, samples["dev"])
    accs = eval_state["counts"].accs
    print_accs(accs, run=_run, step=state["n_iters"])
    state["dev_accs"] = accs