def evaluate(model, dataset, dataloader, tokenizer, opt):
    """Run generation over ``dataloader`` and score predictions by exact match.

    Optionally writes per-example predictions to disk and attaches
    cross-attention passage scores to each example's contexts.

    Args:
        model: seq2seq model exposing ``generate`` (possibly DDP-wrapped).
        dataset: dataset providing ``get_example(idx)`` dicts with
            'question', 'answers', 'id' and (if scoring attention) 'ctxs'.
        dataloader: yields ``(idx, _, _, context_ids, context_mask)`` batches.
        tokenizer: tokenizer used to decode generated token ids.
        opt: options namespace (checkpoint_dir, name, global_rank,
            write_results, write_crossattention_scores, eval_print_freq,
            is_distributed, ...).

    Returns:
        (score, total): exact-match average and example count, aggregated
        across workers via ``src.util.weighted_average``.
    """
    model.eval()
    # Unwrap DistributedDataParallel so we can call custom methods directly.
    if hasattr(model, "module"):
        model = model.module
    if opt.write_crossattention_scores:
        model.overwrite_forward_crossattention()
        model.reset_score_storage()

    total = 0
    exactmatch = []
    fw = None
    if opt.write_results:
        # NOTE(review): assumes the test_results directory already exists —
        # confirm it is created by the caller.
        write_path = Path(opt.checkpoint_dir) / opt.name / 'test_results'
        fw = open(write_path / '%d.txt' % opt.global_rank, 'a')

    try:
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                (idx, _, _, context_ids, context_mask) = batch
                if opt.write_crossattention_scores:
                    # Clear scores accumulated by the previous batch.
                    model.reset_score_storage()

                outputs = model.generate(
                    input_ids=context_ids.cuda(),
                    attention_mask=context_mask.cuda(),
                    max_length=50,
                )

                if opt.write_crossattention_scores:
                    crossattention_scores = model.get_crossattention_scores(
                        context_mask.cuda())

                for k, o in enumerate(outputs):
                    ans = tokenizer.decode(o, skip_special_tokens=True)
                    example = dataset.get_example(idx[k])
                    gold = example['answers']
                    exid = example['id']
                    score = src.evaluation.ems(ans, gold)
                    exactmatch.append(score)

                    if opt.write_results:
                        fw.write(str(exid) + "\t" + ans + '\n')
                    if opt.write_crossattention_scores:
                        # Attach one score per passage of this example.
                        ctxs = example['ctxs']
                        for j in range(context_ids.size(1)):
                            ctxs[j]['score'] = crossattention_scores[k, j].item()
                    total += 1

                if (i + 1) % opt.eval_print_freq == 0:
                    log = f'Process rank:{opt.global_rank}, {i+1} / {len(dataloader)}'
                    log += f' -- average = {np.mean(exactmatch):.3f}'
                    logger.warning(log)
    finally:
        # Bug fix: the results file was never closed in the original, which
        # leaked the handle and could drop buffered writes on exit.
        if fw is not None:
            fw.close()

    logger.warning(
        f'Process rank:{opt.global_rank}, total {total} -- average = {np.mean(exactmatch):.3f}'
    )
    if opt.is_distributed:
        torch.distributed.barrier()
    score, total = src.util.weighted_average(np.mean(exactmatch), total, opt)
    return score, total
def evaluate(model, dataset, tokenizer, collator, opt):
    """Greedy-decode every example in ``dataset`` and return the average
    exact-match score, aggregated across distributed workers.

    Args:
        model: seq2seq model exposing ``generate`` (possibly DDP-wrapped).
        dataset: dataset providing ``get_example(idx)['answers']``.
        tokenizer: tokenizer used to decode generated token ids.
        collator: batch collation callable for the DataLoader.
        opt: options namespace (per_gpu_batch_size, distributed settings).

    Returns:
        The exact-match average after ``src.util.weighted_average``.
    """
    dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=opt.per_gpu_batch_size,
        drop_last=False,
        num_workers=10,
        collate_fn=collator,
    )

    model.eval()
    # Unwrap DistributedDataParallel if present.
    model = model.module if hasattr(model, "module") else model

    total = 0
    exactmatch = []
    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            idx, _, _, context_ids, context_mask = batch
            outputs = model.generate(
                input_ids=context_ids.cuda(),
                attention_mask=context_mask.cuda(),
                max_length=50
            )
            for k, generated in enumerate(outputs):
                prediction = tokenizer.decode(generated, skip_special_tokens=True)
                gold_answers = dataset.get_example(idx[k])['answers']
                exactmatch.append(src.evaluation.ems(prediction, gold_answers))
                total += 1

    exactmatch, total = src.util.weighted_average(np.mean(exactmatch), total, opt)
    return exactmatch