Exemplo n.º 1
0
    def evaluate(self, data: List[DocumentParagraphQuestion], true_len,
                 **kargs):
        """Score one predicted span per point against its answer texts.

        Args:
            data: Points to score. A point whose ``answer`` is None is never
                scored, so its F1/EM entries stay at zero.
            true_len: Not used by this method.
            **kargs: Must contain "spans" and "model_scores". When
                "none_logit" is present, "none_prob" is read as well.

        Returns:
            An ``Evaluation`` built from an empty dict and a dict of
            per-point result columns.
        """
        print("Begining evaluation")
        spans = np.array(kargs["spans"])
        model_scores = np.array(kargs["model_scores"])

        text_answers = []
        pred_em = np.zeros(len(data))
        pred_f1s = np.zeros(len(data))

        print("Scoring...")
        for idx in tqdm(range(len(data)), total=len(data), ncols=80):
            point = data[idx]
            unanswered = point.answer is None
            # Nothing to score and nothing to record for this point.
            if unanswered and not self.record_text_ans:
                continue

            context = point.get_context()
            span = spans[idx]
            # The predicted end index is inclusive, hence the +1.
            pred_text = " ".join(context[span[0]:span[1] + 1])

            if self.record_text_ans:
                text_answers.append(pred_text)
            if unanswered:
                continue

            # Best F1 over all answer texts; exact match as soon as any
            # answer text matches (later texts are then not checked).
            best_f1, exact = 0, False
            for gold in point.answer.answer_text:
                best_f1 = max(best_f1, trivia_f1_score(pred_text, gold))
                exact = exact or trivia_em_score(pred_text, gold)

            pred_f1s[idx] = best_f1
            pred_em[idx] = exact

        results = {
            "n_answers": [
                len(x.answer.answer_spans) if x.answer is not None else 0
                for x in data
            ],
        }
        if self.record_text_ans:
            results["text_answer"] = text_answers
        results.update({
            "predicted_score": model_scores,
            "predicted_start": spans[:, 0],
            "predicted_end": spans[:, 1],
            "text_f1": pred_f1s,
            "rank": [x.rank for x in data],
            "text_em": pred_em,
            "para_start": [x.para_range[0] for x in data],
            "para_end": [x.para_range[1] for x in data],
            "question_id": [x.question_id for x in data],
            "doc_id": [x.doc_id for x in data],
        })
        if "none_logit" in kargs:
            for key in ("none_logit", "none_prob"):
                results[key] = kargs[key]
        return Evaluation({}, results)
    def evaluate(self, data: List[DocumentParagraphQuestion], true_len,
                 **kargs):
        """Score multi-candidate span predictions against the answer texts.

        For every point, all candidate spans and their scores are collected.
        The text of the highest-scoring candidate is compared with each
        answer text to obtain the best F1 and an exact-match flag.

        Args:
            data: Points to score. A point whose ``answer`` is None is never
                scored, so its F1/EM entries stay at zero.
            true_len: Not used by this method.
            **kargs: Must contain "spans" and "model_scores". Each
                ``spans[i]`` is read as a 2-D array whose rows come in
                pairs: an even row holds start indices and the following
                row holds the amount added to the start to get the
                (inclusive) end index. ``model_scores[i]`` is flattened;
                presumably its row-major order matches the order of the
                candidate spans -- TODO confirm against the caller.

        Returns:
            An ``Evaluation`` built from an empty dict and a dict of
            per-point result columns.
        """
        spans, model_scores = np.array(kargs["spans"]), np.array(
            kargs["model_scores"])

        pred_f1s = np.zeros(len(data))
        pred_em = np.zeros(len(data))
        text_answers = []
        spans_aggr = []
        scores_aggr = []

        for i in tqdm(range(len(data)),
                      total=len(data),
                      ncols=80,
                      desc="scoring"):
            point = data[i]

            # Candidate spans and scores depend only on the model output, so
            # gather them for EVERY point before any skip below. Otherwise
            # "predicted_span"/"predicted_score" would end up shorter than
            # the other per-point columns whenever a point is skipped.
            n_rows, n_cols = spans[i].shape
            all_spans = [
                (spans[i][j][k], spans[i][j][k] + spans[i][j + 1][k])
                for j in range(0, n_rows, 2)
                for k in range(n_cols)
            ]
            spans_aggr.append(all_spans)
            all_scores = np.reshape(model_scores[i], -1).tolist()
            scores_aggr.append(all_scores)

            # Nothing to score and nothing to record for this point.
            if point.answer is None and not self.record_text_ans:
                continue
            text = point.get_context()

            # The end index of a span is inclusive, hence the +1.
            all_texts = [
                " ".join(text[start:end + 1]) for start, end in all_spans
            ]

            # The prediction is the text of the best-scoring candidate.
            pred_text = all_texts[np.argmax(all_scores)]
            if self.record_text_ans:
                text_answers.append(all_texts)
                if point.answer is None:
                    continue

            f1 = 0
            em = False
            for answer in point.answer.answer_text:
                f1 = max(f1, trivia_f1_score(pred_text, answer))
                # Once one answer text matches exactly, stop checking.
                if not em:
                    em = trivia_em_score(pred_text, answer)

            pred_f1s[i] = f1
            pred_em[i] = em

        results = {}
        results["n_answers"] = [
            0 if x.answer is None else len(x.answer.answer_spans) for x in data
        ]
        results["question"] = [' '.join(x.question) for x in data]
        if self.record_text_ans:
            results["text_answer"] = text_answers
        results["predicted_score"] = scores_aggr
        results["predicted_span"] = spans_aggr
        results["text_f1"] = pred_f1s
        results["rank"] = [x.rank for x in data]
        results["text_em"] = pred_em
        results["para_start"] = [x.para_range[0] for x in data]
        results["para_end"] = [x.para_range[1] for x in data]
        results["question_id"] = [x.question_id for x in data]
        results["doc_id"] = [x.doc_id for x in data]
        return Evaluation({}, results)