def set_sim(answer, prediction):
    ground_truths = []
    for a in answer:
        ground_truths.append(" ".join([w for w in a]))
    res = utils.metric_max_over_ground_truths(
        utils.f1_score, prediction, ground_truths)
    return res
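# Usage sketch (hypothetical data, not part of the pipeline): `answer` is a
# list of tokenized gold answers and `prediction` is a detokenized string, so
# set_sim reduces to the best token-level F1 over the gold answers, e.g.
#
#   set_sim([['New', 'York'], ['NYC']], 'new york city')   # -> 0.8
#
# (2 shared tokens, precision 2/3, recall 1, F1 = 0.8 against 'New York').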
def validate_with_doc(args, data_loader, model, global_stats,
                      exs_with_doc, docs_by_question, mode):
    '''Run one full unofficial validation with docs.
    Unofficial = doesn't use SQuAD script.
    '''
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()
    logger.info('validate_with_doc')

    # Initialize counters
    examples = 0
    aa = [0.0 for i in range(num_docs)]  # incremented only if the doc has the answer
    bb = [0.0 for i in range(num_docs)]  # incremented regardless

    for idx, ex_with_doc in enumerate(data_loader):
        ex = ex_with_doc[0]
        batch_size, ex_id = ex[0].size(0), ex[-1]

        # ---------------------------------------------------------------------
        # Document Selector
        # ---------------------------------------------------------------------
        '''
        ex_with_doc =
            [tensor] x1      = document word indices          [batch * len_d]
            [tensor] x1_f    = document word features indices [batch * len_d * nfeat]
            [tensor] x1_mask = document padding mask          [batch * len_d]
            [tensor] x2      = question word indices          [batch * len_q]
            [tensor] x2_mask = question padding mask          [batch * len_q]
            [list]   indices                                  [batch]
        '''
        scores_doc_num = model.predict_with_doc(ex_with_doc)
        scores = [{} for i in range(batch_size)]

        # ---------------------------------------------------------------------
        # Document Reader
        # ---------------------------------------------------------------------
        for idx_doc in range(0, num_docs):
            ex = ex_with_doc[idx_doc]
            pred_s, pred_e, pred_score = model.predict(ex, top_n=display_num)
            for i in range(batch_size):
                idx_doc_i = idx_doc % len(docs_by_question[ex_id[i]])
                doc_text = docs_by_question[ex_id[i]][idx_doc_i]['document']

                # Try to read the `display_num` best predicted spans (this may
                # trigger an 'index out of range' exception).
                for k in range(display_num):
                    try:
                        prediction = [doc_text[j]
                                      for j in range(pred_s[i][k], pred_e[i][k] + 1)]
                        prediction = ' '.join(prediction).lower()

                        # Update prediction scores: span score weighted by the
                        # selector's score for this document.
                        if prediction not in scores[i]:
                            scores[i][prediction] = 0
                        scores[i][prediction] += (pred_score[i][k] *
                                                  scores_doc_num[i][idx_doc])
                    except IndexError:
                        pass

        # Get the `display_num` most likely documents for each example and see
        # if the answer is actually in there.
        for i in range(batch_size):
            _, indices = scores_doc_num[i].sort(0, descending=True)
            for j in range(0, display_num):
                idx_doc = indices[j]
                idx_doc_i = idx_doc % len(docs_by_question[ex_id[i]])
                doc_text = docs_by_question[ex_id[i]][idx_doc_i]['document']
                ex_answer = exs_with_doc[ex_id[i]]['answer']

                # Looking for the answer in the document...
                if has_answer(args, ex_answer, doc_text)[0]:
                    aa[j] = aa[j] + 1
                bb[j] = bb[j] + 1

        # Update performance metrics
        for i in range(batch_size):
            best_score = 0
            prediction = ''
            for key in scores[i]:
                if scores[i][key] > best_score:
                    best_score = scores[i][key]
                    prediction = key

            # Ground truth answers
            ground_truths = []
            ex_answer = exs_with_doc[ex_id[i]]['answer']
            if args.dataset == 'CuratedTrec':
                # CuratedTrec answers are regex strings, not token lists; use as-is
                ground_truths = ex_answer
            else:
                for a in ex_answer:
                    ground_truths.append(' '.join([w for w in a]))
            exact_match.update(
                utils.metric_max_over_ground_truths(
                    utils.exact_match_score, prediction, ground_truths))
            f1.update(
                utils.metric_max_over_ground_truths(
                    utils.f1_score, prediction, ground_truths))

        examples += batch_size
        if mode == 'train' and examples >= 1000:
            break

    # Turn per-rank hit counts into cumulative top-(j+1) counts
    try:
        for j in range(display_num):
            if j > 0:
                aa[j] = aa[j] + aa[j - 1]
                bb[j] = bb[j] + bb[j - 1]
    except Exception:
        pass

    txt = '{} valid official with doc: Epoch = {} | EM = {:.2f} | '
    txt += 'F1 = {:.2f} | examples = {} | valid time = {:.2f} (s)'
    logger.info(txt.format(
        mode, global_stats['epoch'], exact_match.avg * 100,
        f1.avg * 100, examples, eval_time.time()))
    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
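# Aggregation sketch (illustrative only, not called by the pipeline): the loop
# above scores a candidate answer string by summing, over all retrieved
# documents, the reader's span score weighted by the selector's document
# score. The hypothetical helper below mirrors that reduction on plain Python
# lists, which can be handy for unit-testing the aggregation in isolation.
def _aggregate_answer_scores(span_predictions, doc_scores):
    """span_predictions: one list of (answer_text, span_score) pairs per doc.
    doc_scores: one selector score per document, aligned with the above."""
    scores = {}
    for doc_score, spans in zip(doc_scores, span_predictions):
        for answer_text, span_score in spans:
            scores[answer_text] = scores.get(answer_text, 0.0) + span_score * doc_score
    # Return the highest-scoring answer string (empty string if no candidates)
    return max(scores, key=scores.get) if scores else ''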
def validate_unofficial_with_doc(args, data_loader, model, global_stats,
                                 exs_with_doc, docs_by_question, mode):
    """Run one full unofficial validation with docs.
    Unofficial = doesn't use SQuAD script.
    """
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()
    out_set = set({33, 42, 45, 70, 39})  # debugging leftover (unused)
    logger.info("validate_unofficial_with_doc")

    # Run through examples
    examples = 0
    aa = [0.0 for i in range(vector.num_docs)]
    bb = [0.0 for i in range(vector.num_docs)]
    aa_sum = 0.0
    display_num = 10
    for idx, ex_with_doc in enumerate(data_loader):
        ex = ex_with_doc[0]
        batch_size, question, ex_id = ex[0].size(0), ex[3], ex[-1]
        scores_doc_num = model.predict_with_doc(ex_with_doc)
        scores = [{} for i in range(batch_size)]

        # Debugging leftovers (unused)
        tot_sum = [0.0 for i in range(batch_size)]
        tot_sum1 = [0.0 for i in range(batch_size)]
        neg_sum = [0.0 for i in range(batch_size)]
        min_sum = [[] for i in range(batch_size)]
        min_sum1 = [[] for i in range(batch_size)]

        # Score candidate spans from every retrieved document
        for idx_doc in range(0, vector.num_docs):
            ex = ex_with_doc[idx_doc]
            pred_s, pred_e, pred_score = model.predict(ex, top_n=10)
            for i in range(batch_size):
                doc_text = docs_by_question[ex_id[i]][
                    idx_doc % len(docs_by_question[ex_id[i]])]["document"]
                has_answer_t = has_answer(args, exs_with_doc[ex_id[i]]['answer'],
                                          doc_text)
                for k in range(10):
                    try:
                        prediction = []
                        for j in range(pred_s[i][k], pred_e[i][k] + 1):
                            prediction.append(doc_text[j])
                        prediction = " ".join(prediction).lower()
                        if prediction not in scores[i]:
                            scores[i][prediction] = 0
                        scores[i][prediction] += (pred_score[i][k] *
                                                  scores_doc_num[i][idx_doc])
                    except IndexError:
                        pass

        # Count how often the top-ranked documents actually contain the answer
        for i in range(batch_size):
            _, indices = scores_doc_num[i].sort(0, descending=True)
            for j in range(0, display_num):
                idx_doc = indices[j]
                doc_text = docs_by_question[ex_id[i]][
                    idx_doc % len(docs_by_question[ex_id[i]])]["document"]
                if has_answer(args, exs_with_doc[ex_id[i]]['answer'], doc_text)[0]:
                    aa[j] = aa[j] + 1
                bb[j] = bb[j] + 1

        for i in range(batch_size):
            best_score = 0
            prediction = ""
            for key in scores[i]:
                if scores[i][key] > best_score:
                    best_score = scores[i][key]
                    prediction = key

            # Compute metrics
            ground_truths = []
            answer = exs_with_doc[ex_id[i]]['answer']
            if args.dataset == "CuratedTrec":
                ground_truths = answer
            else:
                for a in answer:
                    ground_truths.append(" ".join([w for w in a]))
            #logger.info(prediction)
            #logger.info(ground_truths)
            exact_match.update(
                utils.metric_max_over_ground_truths(utils.exact_match_score,
                                                    prediction, ground_truths))
            f1.update(
                utils.metric_max_over_ground_truths(utils.f1_score,
                                                    prediction, ground_truths))
            a = sorted(scores[i].items(), key=lambda d: d[1], reverse=True)

        examples += batch_size
        if mode == "train" and examples >= 1000:
            break

    try:
        for j in range(0, display_num):
            if j > 0:
                aa[j] = aa[j] + aa[j - 1]
                bb[j] = bb[j] + bb[j - 1]
            # Fraction of the top-(j+1) ranked documents that contain an answer
            logger.info(aa[j] / bb[j])
    except Exception:
        pass
    logger.info(aa_sum)

    # NOTE: the original debug dump wrote a separator to a file handle `g`
    # that is never opened in this function; it is disabled here.
    # if (mode == 'dev' or mode == 'train'):
    #     g.write("*" * 50 + "\n")
    #     g.close()

    logger.info('%s valid official with doc: Epoch = %d | EM = %.2f | ' %
                (mode, global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))
    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
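# Retrieval-quality sketch (illustrative only, not called by the pipeline):
# after the cumulative pass above, aa[j] / bb[j] is the fraction of the
# top-(j+1) selector-ranked documents that contain a gold answer, averaged
# over examples. The hypothetical helper below computes the same quantity
# from a per-example boolean hit matrix, e.g. for a quick sanity check.
def _topk_doc_hit_rate(hits, k):
    """hits: list of per-example lists of booleans, ordered by selector rank."""
    num = sum(sum(h[:k]) for h in hits)
    den = sum(len(h[:k]) for h in hits)
    return num / den if den else 0.0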