def annotation_to_answer_and_type(self, annotation) -> Tuple[Tuple[str, ...], str]:
    if self._dataset_name == 'quoref':
        # because we use a dropified version
        return answer_json_to_strings(annotation)
    else:
        return answer_json_to_strings(annotation)
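# Illustrative only: what answer_json_to_strings returns for a span-style ("dropified" Quoref)
# annotation. The import path assumes AllenNLP's DROP evaluation module (it lives at
# allennlp_models.rc.tools.drop in later releases); the annotation values are made up.
from allennlp.tools.drop_eval import answer_json_to_strings

example_annotation = {'number': '', 'date': {'day': '', 'month': '', 'year': ''},
                      'spans': ['Henry V']}
answer_strings, answer_type = answer_json_to_strings(example_annotation)
# answer_strings == ('Henry V',), answer_type == 'span'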
def __call__(self, prediction: Union[str, List], ground_truths: List, predicted_ability: str):
    # Unlike the stock drop_eval conversion, we keep the answer type ([1]) as well as the
    # answer strings ([0]), so the metrics can be grouped by answer type below.
    gold_answers = []
    for annotation in ground_truths:
        if 'yesno' in annotation and annotation['yesno']:
            gold_answer = tuple([str(annotation['yesno'])]), 'yesno'
        else:
            gold_answer = answer_json_to_strings(annotation)
        gold_answers.append(gold_answer)
    ground_truth_answer_strings, ground_truth_answer_types = list(zip(*gold_answers))
    (exact_match, f1_score), maximizing_ground_truth_index = CustomDropEmAndF1.metric_max_over_ground_truths(
        drop_em_and_f1, prediction, ground_truth_answer_strings
    )
    self._total_em += exact_match
    self._total_f1 += f1_score
    self._count += 1
    # The answer type of the best-matching gold answer is selected, just as in drop_eval.
    answer_type = ground_truth_answer_types[maximizing_ground_truth_index]
    self._answer_type_head_em[answer_type][predicted_ability] += exact_match
    self._answer_type_head_f1[answer_type][predicted_ability] += f1_score
    self._answer_type_head_count[answer_type][predicted_ability] += 1
    return (exact_match, f1_score), ground_truths[maximizing_ground_truth_index]
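# CustomDropEmAndF1.metric_max_over_ground_truths is not shown here. A hypothetical sketch of
# what such a helper might look like, assuming it mirrors the SQuAD-style max-over-gold-answers
# helper but also returns the index of the best-matching gold answer (the name and the
# lexicographic (em, f1) comparison are assumptions):
def _max_over_ground_truths_sketch(metric_fn, prediction, ground_truth_answer_strings):
    # Score the prediction against every gold answer and keep the best (em, f1) pair.
    scores = [metric_fn(prediction, gold) for gold in ground_truth_answer_strings]
    best_index = max(range(len(scores)), key=lambda i: scores[i])
    return scores[best_index], best_index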
def evaluate_logical_form(self, logical_form: str, answer_json: JsonDict) -> Tuple[float, float]:
    """
    Takes a logical form and the answer dict from the original dataset, and returns exact
    match and F1 measures according to the two official metrics.
    """
    answer_string, _ = drop_eval.answer_json_to_strings(answer_json)
    if not self.verbose:
        executor_logger = logging.getLogger('semparse.language.drop_executor')
        executor_logger.setLevel(logging.ERROR)
    try:
        denotation = self.execute(logical_form)
    except Exception:  # pylint: disable=broad-except
        if self.verbose:
            LOGGER.warning(f'Failed to execute: {logical_form}')
        return 0.0, 0.0
    if isinstance(denotation, list):
        denotation_list = [str(denotation_item) for denotation_item in denotation]
    else:
        denotation_list = [str(denotation)]
    em_score, f1_score = drop_eval.get_metrics(denotation_list, answer_string)
    return em_score, f1_score
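# Illustrative only: how a list-valued denotation is scored against a multi-span gold answer
# with the same drop_eval helpers (import path assumed as above; values are made up).
from allennlp.tools.drop_eval import answer_json_to_strings, get_metrics

gold_json = {'number': '', 'date': {'day': '', 'month': '', 'year': ''},
             'spans': ['Tom Brady', 'Drew Brees']}
gold_strings, _ = answer_json_to_strings(gold_json)
denotation_strings = [str(item) for item in ['Tom Brady', 'Drew Brees']]
# get_metrics aligns predicted and gold spans as bags, so an exact set match scores (1.0, 1.0).
em, f1 = get_metrics(denotation_strings, gold_strings)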
from typing import List, Union

# Assumed import path for the DROP evaluation helpers used below (AllenNLP; these moved to
# allennlp_models.rc.tools.drop / .squad in later releases):
from allennlp.tools.drop_eval import answer_json_to_strings, get_metrics as drop_em_and_f1
from allennlp.tools.squad_eval import metric_max_over_ground_truths


def f1metric(prediction: Union[str, List], ground_truths: List):  # type: ignore
    """
    Parameters
    ----------
    prediction: ``Union[str, List]``
        The predicted answer from the model being evaluated. This could be a string, or a list
        of strings when multiple spans are predicted as the answer.
    ground_truths: ``List``
        All the ground truth answer annotations.
    """
    # If you wanted to split this out by answer type, you could look at [1] here and group by
    # that, instead of only keeping [0].
    ground_truth_answer_strings = [
        answer_json_to_strings(annotation)[0] for annotation in ground_truths
    ]
    exact_match, f1_score = metric_max_over_ground_truths(
        drop_em_and_f1, prediction, ground_truth_answer_strings
    )
    return (exact_match, f1_score)
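# Hedged usage sketch for f1metric: the annotation follows the DROP answer JSON layout and the
# values are illustrative.
if __name__ == '__main__':
    gold_annotations = [{'number': '40', 'date': {'day': '', 'month': '', 'year': ''},
                         'spans': []}]
    print(f1metric('40', gold_annotations))  # expected (1.0, 1.0) for an exact number match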