def evaluate(self, data: List[ContextAndQuestion], true_len, **kargs):
    spans, model_scores, pred_losses = kargs["spans"], kargs["model_scores"], kargs["losses"]
    print("Len losses: %d, len spans: %d" % (len(pred_losses), len(spans)))
    pred_none_probs = kargs.get("none_probs", None)
    if pred_none_probs is not None:
        print("Len pred_none_probs: %d" % len(pred_none_probs))

    pred_f1s = np.zeros(len(data))
    pred_em = np.zeros(len(data))
    for i in tqdm(range(len(data)), total=len(data), ncols=80, desc="scoring"):
        point = data[i]
        f1 = 0
        em = False
        if point.answer is not None:
            pred_span = spans[i]
            pred_text = point.paragraph.get_original_text(pred_span[0], pred_span[1])
            # Score against every reference answer: keep the best F1 and any EM hit
            for answer in point.answer.answer_text:
                f1 = max(f1, squad_f1_score(pred_text, answer))
                if not em:
                    em = squad_em_score(pred_text, answer)
        pred_f1s[i] = f1
        pred_em[i] = em

    results = {
        "model_confs": model_scores,
        "predicted_spans": spans,
        "question_ids": [x.question_id for x in data],
        "text_f1": pred_f1s,
        "text_em": pred_em,
        "loss": pred_losses,
    }
    if pred_none_probs is not None:
        results["none_probs"] = pred_none_probs
    return Evaluation({}, results)
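# For reference, squad_f1_score / squad_em_score follow the standard SQuAD
# text metrics: token-level F1 and normalized exact match. A minimal sketch
# assuming the usual SQuAD normalization (lowercasing, stripping punctuation
# and articles); the repo's own implementations may differ in detail.
import re
import string
from collections import Counter

def _normalize(text: str) -> str:
    # Lowercase, drop punctuation and articles, collapse whitespace
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def squad_em_score(pred: str, answer: str) -> bool:
    return _normalize(pred) == _normalize(answer)

def squad_f1_score(pred: str, answer: str) -> float:
    pred_tokens = _normalize(pred).split()
    ans_tokens = _normalize(answer).split()
    n_common = sum((Counter(pred_tokens) & Counter(ans_tokens)).values())
    if n_common == 0:
        return 0.0
    precision = n_common / len(pred_tokens)
    recall = n_common / len(ans_tokens)
    return 2 * precision * recall / (precision + recall)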
def evaluate(self, data: List[ContextAndQuestion], true_len, **kargs):
    spans, model_scores = kargs["spans"], kargs["model_scores"]
    results = {
        "model_conf": model_scores,
        "predicted_span": spans,
        "question_id": [x.question_id for x in data],
    }
    return Evaluation({}, results)
def evaluate(self, data: List[DocumentParagraphQuestion], true_len, **kargs): print("Begining evaluation") spans, model_scores = np.array(kargs["spans"]), np.array( kargs["model_scores"]) pred_f1s = np.zeros(len(data)) pred_em = np.zeros(len(data)) text_answers = [] print("Scoring...") for i in tqdm(range(len(data)), total=len(data), ncols=80): point = data[i] if point.answer is None and not self.record_text_ans: continue text = point.get_context() pred_span = spans[i] pred_text = " ".join(text[pred_span[0]:pred_span[1] + 1]) if self.record_text_ans: text_answers.append(pred_text) if point.answer is None: continue f1 = 0 em = False for answer in data[i].answer.answer_text: f1 = max(f1, trivia_f1_score(pred_text, answer)) if not em: em = trivia_em_score(pred_text, answer) pred_f1s[i] = f1 pred_em[i] = em results = {} results["n_answers"] = [ 0 if x.answer is None else len(x.answer.answer_spans) for x in data ] if self.record_text_ans: results["text_answer"] = text_answers results["predicted_score"] = model_scores results["predicted_start"] = spans[:, 0] results["predicted_end"] = spans[:, 1] results["text_f1"] = pred_f1s results["rank"] = [x.rank for x in data] results["text_em"] = pred_em results["para_start"] = [x.para_range[0] for x in data] results["para_end"] = [x.para_range[1] for x in data] results["question_id"] = [x.question_id for x in data] results["doc_id"] = [x.doc_id for x in data] if "none_logit" in kargs: results["none_logit"] = kargs["none_logit"] results["none_prob"] = kargs["none_prob"] return Evaluation({}, results)
def evaluate(self, data: List[RankedParagraphQuestion], true_len, **kargs):
    spans = np.array(kargs["spans"])
    model_scores = np.array(kargs["model_scores"])

    pred_f1s = np.zeros(len(data))
    pred_em = np.zeros(len(data))
    text_answers = []

    for i in tqdm(range(len(data)), total=len(data), ncols=80, desc="scoring"):
        point = data[i]
        if point.answer is None and not self.record_text_ans:
            continue
        pred_span = spans[i]
        pred_text = point.paragraph.get_original_text(pred_span[0], pred_span[1])
        if self.record_text_ans:
            text_answers.append(pred_text)
        if point.answer is None:
            continue
        f1 = 0
        em = False
        for answer in point.answer.answer_text:
            f1 = max(f1, squad_f1_score(pred_text, answer))
            if not em:
                em = squad_em_score(pred_text, answer)
        pred_f1s[i] = f1
        pred_em[i] = em

    results = {}
    results["n_answers"] = [0 if x.answer is None else len(x.answer.answer_spans)
                            for x in data]
    if self.record_text_ans:
        results["text_answer"] = text_answers
    results["predicted_score"] = model_scores
    results["predicted_start"] = spans[:, 0]
    results["predicted_end"] = spans[:, 1]
    results["text_f1"] = pred_f1s
    results["rank"] = [x.rank for x in data]
    results["text_em"] = pred_em
    results["question_id"] = [x.question_id for x in data]
    return Evaluation({}, results)
def evaluate(self, data: List[DocumentParagraphQuestion], true_len, **kargs):
    spans = np.array(kargs["spans"])
    model_scores = np.array(kargs["model_scores"])

    pred_f1s = np.zeros(len(data))
    pred_em = np.zeros(len(data))
    text_answers = []
    spans_aggr = []
    scores_aggr = []

    for i in tqdm(range(len(data)), total=len(data), ncols=80, desc="scoring"):
        point = data[i]
        if point.answer is None and not self.record_text_ans:
            continue
        text = point.get_context()

        # spans[i] interleaves rows of start offsets (even j) with rows of
        # span lengths (odd j); pair each even row with the row below it to
        # recover (start, end) token spans
        all_spans = []
        r, c = spans[i].shape
        for j in range(0, r, 2):
            for k in range(c):
                all_spans.append((spans[i][j][k], spans[i][j][k] + spans[i][j + 1][k]))
        spans_aggr.append(all_spans)

        all_scores = np.reshape(model_scores[i], -1).tolist()
        scores_aggr.append(all_scores)

        all_texts = [" ".join(text[span[0]:span[1] + 1]) for span in all_spans]
        # Score only the highest-confidence span against the references
        pred_text = all_texts[np.argmax(all_scores)]
        if self.record_text_ans:
            text_answers.append(all_texts)
        if point.answer is None:
            continue
        f1 = 0
        em = False
        for answer in point.answer.answer_text:
            f1 = max(f1, trivia_f1_score(pred_text, answer))
            if not em:
                em = trivia_em_score(pred_text, answer)
        pred_f1s[i] = f1
        pred_em[i] = em

    results = {}
    results["n_answers"] = [0 if x.answer is None else len(x.answer.answer_spans)
                            for x in data]
    results["question"] = [" ".join(x.question) for x in data]
    if self.record_text_ans:
        results["text_answer"] = text_answers
    results["predicted_score"] = scores_aggr
    results["predicted_span"] = spans_aggr
    results["text_f1"] = pred_f1s
    results["rank"] = [x.rank for x in data]
    results["text_em"] = pred_em
    results["para_start"] = [x.para_range[0] for x in data]
    results["para_end"] = [x.para_range[1] for x in data]
    results["question_id"] = [x.question_id for x in data]
    results["doc_id"] = [x.doc_id for x in data]
    return Evaluation({}, results)
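# Illustration (made-up numbers) of the interleaved span layout unpacked in
# the method above: even rows hold start offsets, the following odd row holds
# span lengths, and columns index the top-k predictions.
import numpy as np

demo = np.array([[3, 10],   # starts, prediction group 0
                 [2, 1],    # lengths, group 0 -> ends 5 and 11
                 [7, 0],    # starts, prediction group 1
                 [4, 2]])   # lengths, group 1 -> ends 11 and 2
pairs = [(demo[j][k], demo[j][k] + demo[j + 1][k])
         for j in range(0, demo.shape[0], 2)
         for k in range(demo.shape[1])]
assert pairs == [(3, 5), (10, 11), (7, 11), (0, 2)]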