def validate_unofficial(args, data_loader, model, global_stats, mode): """Run one full unofficial validation. """ eval_time = utils.Timer() start_acc = utils.AverageMeter() end_acc = utils.AverageMeter() exact_match = utils.AverageMeter() # Make predictions examples = 0 for ex in data_loader: batch_size = ex[0].size(0) pred_s, pred_e, _ = model.predict(ex) target_s, target_e = ex[-3:-1] # We get metrics for independent start/end and joint start/end accuracies = eval_accuracies(pred_s, target_s, pred_e, target_e) start_acc.update(accuracies[0], batch_size) end_acc.update(accuracies[1], batch_size) exact_match.update(accuracies[2], batch_size) # If getting train accuracies, sample max 10k examples += batch_size if mode == 'train' and examples >= 1e4: break logger.info('%s valid unofficial: Epoch = %d | start = %.2f | ' % (mode, global_stats['epoch'], start_acc.avg) + 'end = %.2f | exact = %.2f | examples = %d | ' % (end_acc.avg, exact_match.avg, examples) + 'valid time = %.2f (s)' % eval_time.time()) return {'exact_match': exact_match.avg}
def eval_accuracies(pred_s, target_s, pred_e, target_e): """An unofficial evalutation helper. Compute exact start/end/complete match accuracies for a batch. """ # Convert 1D tensors to lists of lists (compatibility) if torch.is_tensor(target_s): target_s = [[e] for e in target_s] target_e = [[e] for e in target_e] # Compute accuracies from targets batch_size = len(pred_s) start = utils.AverageMeter() end = utils.AverageMeter() em = utils.AverageMeter() for i in range(batch_size): # Start matches if pred_s[i] in target_s[i]: start.update(1) else: start.update(0) # End matches if pred_e[i] in target_e[i]: end.update(1) else: end.update(0) # Both start and end match if any([1 for _s, _e in zip(target_s[i], target_e[i]) if _s == pred_s[i] and _e == pred_e[i]]): em.update(1) else: em.update(0) return start.avg * 100, end.avg * 100, em.avg * 100
def validate_official(args, data_loader, model, global_stats, offsets, texts, answers): """Run one full official validation. Uses exact spans and same exact match/F1 score computation. Extra arguments: offsets: The character start/end indices for the tokens in each context. texts: Map of qid --> raw text of examples context (matches offsets). answers: Map of qid --> list of accepted answers. """ eval_time = utils.Timer() f1 = utils.AverageMeter() exact_match = utils.AverageMeter() nil_count = [ans[0] for ans in answers.values() if ans[0] == 'NIL'] print("nil answers:", len(nil_count)) # Run through examples examples = 0 for ex in data_loader: ex_id, batch_size = ex[-1], ex[0].size(0) pred_s, pred_e, _ = model.predict(ex) for i in range(batch_size): try: s_offset = offsets[ex_id[i]][pred_s[i][0]][0] #print("s_offset:", s_offset) e_offset = offsets[ex_id[i]][pred_e[i][0]][1] #print("e_offset:", e_offset) prediction = texts[ex_id[i]][s_offset:e_offset] #How can I print the question and context here? print("pred:", prediction) # Compute metrics print(answers[ex_id[i]]) ground_truths = answers[ex_id[i]] exact_match.update(utils.metric_max_over_ground_truths( utils.exact_match_score, prediction, ground_truths)) f1.update(utils.metric_max_over_ground_truths( utils.f1_score, prediction, ground_truths)) except KeyError: print("key error at key ",[ex_id[i]]) continue examples += batch_size logger.info('dev valid official: Epoch = %d | EM = %.2f | ' % (global_stats['epoch'], exact_match.avg * 100) + 'F1 = %.2f | examples = %d | valid time = %.2f (s)' % (f1.avg * 100, examples, eval_time.time())) return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
def train(args, data_loader, model, global_stats): """Run through one epoch of model training with the provided data loader.""" # Initialize meters + timers train_loss = utils.AverageMeter() epoch_time = utils.Timer() # Run one epoch for idx, ex in enumerate(data_loader): try: train_loss.update(*model.update(ex)) if idx % args.display_iter == 0: logger.info('train: Epoch = %d | iter = %d/%d | ' % (global_stats['epoch'], idx, len(data_loader)) + 'loss = %.2f | elapsed time = %.2f (s)' % (train_loss.avg, global_stats['timer'].time())) train_loss.reset() except: logger.info(ex, idx) continue logger.info('train: Epoch %d done. Time for epoch = %.2f (s)' % (global_stats['epoch'], epoch_time.time())) # Checkpoint if args.checkpoint: model.checkpoint(args.model_file + '.checkpoint', global_stats['epoch'] + 1)
def validate_official(args, data_loader, model, global_stats, offsets, texts, answers): """Run one full official validation. Uses exact spans and same exact match/F1 score computation. Extra arguments: offsets: The character start/end indices for the tokens in each context. texts: Map of qid --> raw text of examples context (matches offsets). answers: Map of qid --> list of accepted answers. """ eval_time = utils.Timer() f1 = utils.AverageMeter() exact_match = utils.AverageMeter() # Run through examples examples = 0 for ex in data_loader: ex_id, batch_size = ex[-1], ex[0].size(0) pred_s, pred_e, _ = model.predict(ex) for i in range(batch_size): s_offset = offsets[ex_id[i]][pred_s[i][0]][0] e_offset = offsets[ex_id[i]][pred_e[i][0]][1] prediction = texts[ex_id[i]][s_offset:e_offset] # Compute metrics ground_truths = answers[ex_id[i]] exact_match.update( utils.metric_max_over_ground_truths(utils.exact_match_score, prediction, ground_truths)) f1.update( utils.metric_max_over_ground_truths(utils.f1_score, prediction, ground_truths)) examples += batch_size logger.info('dev valid official: Epoch = %d | EM = %.2f | ' % (global_stats['epoch'], exact_match.avg * 100) + 'F1 = %.2f | examples = %d | valid time = %.2f (s)' % (f1.avg * 100, examples, eval_time.time())) return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}