Example #1
    def annotation_to_answer_and_type(
            self, annotation) -> Tuple[Tuple[str, ...], str]:
        if self._dataset_name == 'quoref':
            # quoref uses a drop-ified annotation format, so the same conversion applies
            return answer_json_to_strings(annotation)
        else:
            return answer_json_to_strings(annotation)
    def call(self, prediction: Union[str, List], ground_truths: List,
             predicted_ability: str) -> Tuple[Tuple[float, float], dict]:
        # Unlike the stock DROP metric, keep both the answer strings and the
        # answer type here, so EM/F1 can be broken down by answer type below.
        gold_answers = []
        for annotation in ground_truths:
            if 'yesno' in annotation and annotation['yesno']:
                gold_answer = tuple([str(annotation['yesno'])]), 'yesno'
            else:
                gold_answer = answer_json_to_strings(annotation)
            gold_answers.append(gold_answer)

        ground_truth_answer_strings, ground_truth_answer_types = list(zip(*gold_answers))
        (exact_match, f1_score), maximizing_ground_truth_index = CustomDropEmAndF1.metric_max_over_ground_truths(
                drop_em_and_f1,
                prediction,
                ground_truth_answer_strings
        )
        self._total_em += exact_match
        self._total_f1 += f1_score
        self._count += 1

        # Best answer type is selected, just as in drop_eval
        answer_type = ground_truth_answer_types[maximizing_ground_truth_index]
        self._answer_type_head_em[answer_type][predicted_ability] += exact_match
        self._answer_type_head_f1[answer_type][predicted_ability] += f1_score
        self._answer_type_head_count[answer_type][predicted_ability] += 1

        return (exact_match, f1_score), ground_truths[maximizing_ground_truth_index]
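
For orientation, here is a minimal usage sketch of the metric above. The annotation dictionaries follow the DROP answer-JSON layout (number/spans/date, plus the custom yesno field handled above); the metric instance and the 'counting' ability label are hypothetical.

# Minimal usage sketch (assumed setup): `metric` is a CustomDropEmAndF1
# instance; the annotation shape mirrors DROP answer JSON, and 'counting'
# is just an illustrative ability name.
ground_truths = [
    {'number': '4', 'spans': [], 'date': {'day': '', 'month': '', 'year': ''}},
    {'number': '', 'spans': ['four touchdowns'], 'date': {'day': '', 'month': '', 'year': ''}},
]
(em, f1), best_annotation = metric.call(
    prediction='4',
    ground_truths=ground_truths,
    predicted_ability='counting',
)
# `best_annotation` is the gold annotation that maximized EM/F1 for this prediction.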
Example #3
    def evaluate_logical_form(self, logical_form: str,
                              answer_json: JsonDict) -> Tuple[float, float]:
        """
        Takes a logical form, and the answer dict from the original dataset, and returns exact match
        and f1 measures, according to the two official metrics.
        """
        answer_string, _ = drop_eval.answer_json_to_strings(answer_json)
        if not self.verbose:
            executor_logger = logging.getLogger(
                'semparse.language.drop_executor')
            executor_logger.setLevel(logging.ERROR)
        try:
            denotation = self.execute(logical_form)
        except Exception:  # pylint: disable=broad-except
            if self.verbose:
                LOGGER.warning(f'Failed to execute: {logical_form}')
            return 0.0, 0.0
        if isinstance(denotation, list):
            denotation_list = [
                str(denotation_item) for denotation_item in denotation
            ]
        else:
            denotation_list = [str(denotation)]
        em_score, f1_score = drop_eval.get_metrics(denotation_list,
                                                   answer_string)
        return em_score, f1_score
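
The gold-answer conversion and scoring at the end of this method can also be reproduced with the DROP evaluation helpers alone; a small sketch, assuming the standard answer-JSON shape (the concrete values are made up).

# Sketch of the conversion and scoring used above; values are illustrative.
answer_json = {'number': '3', 'spans': [], 'date': {'day': '', 'month': '', 'year': ''}}
gold_strings, answer_type = drop_eval.answer_json_to_strings(answer_json)  # (('3',), 'number')
em_score, f1_score = drop_eval.get_metrics(['3'], gold_strings)  # exact match -> (1.0, 1.0)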
Example #4
def f1metric(prediction: Union[str, List],
             ground_truths: List):  # type: ignore
    """
    Parameters
    ----------
    prediction: ``Union[str, List]``
        The predicted answer from the model being evaluated. This can be a string, or a list of
        strings when multiple spans are predicted as the answer.
    ground_truths: ``List``
        All the ground truth answer annotations.
    """
    # If you wanted to split this out by answer type, you could look at [1] here and group by
    # that, instead of only keeping [0].
    ground_truth_answer_strings = [
        answer_json_to_strings(annotation)[0] for annotation in ground_truths
    ]
    exact_match, f1_score = metric_max_over_ground_truths(
        drop_em_and_f1, prediction, ground_truth_answer_strings)

    return (exact_match, f1_score)
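
A small, self-contained call for reference; the gold annotation below is made up but follows the DROP answer-JSON layout the helper expects.

# Hypothetical call: one gold annotation in DROP answer-JSON form.
ground_truths = [
    {'number': '', 'spans': ['Henry Hudson'], 'date': {'day': '', 'month': '', 'year': ''}},
]
em, f1 = f1metric('Henry Hudson', ground_truths)  # an exact match gives (1.0, 1.0)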