def compute(self, guess: str, answers: Iterable[str]) -> F1Metric:
    """
    Compute F1 over only the rare words in ``guess`` and ``answers``.

    Words at or above the frequency cutoff are filtered out of both sides
    before the standard F1 computation runs.
    """
    if guess is None or answers is None:
        return F1Metric(0, 0)
    guess = RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, guess)
    answers = [
        RareWordF1Calculator._filter(self._freq_dist, self._cutoff_count, a)
        for a in answers
    ]
    if not any(len(a) for a in answers):
        # no rare words in labels, so set the denominator to zero
        return F1Metric(0, 0)
    return F1Metric.compute(guess, answers)
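# Usage sketch for the method above (not part of the original module).
# Assumptions, stated plainly: the calculator is constructed from a
# reference corpus string plus a top-p frequency cutoff, and it needs
# nltk for the frequency distribution. Treat the constructor arguments
# here as assumptions about the surrounding class, not a confirmed API.
from parlai.core.metrics import RareWordF1Calculator

calc = RareWordF1Calculator(
    corpus='gardening is relaxing . gardening is fun . hydrangeas bloom late',
    top_p=0.9,  # assumed cutoff: the most frequent ~90% of tokens count as common
)
# Only words rarer than the cutoff survive the filter, so the resulting
# F1 rewards matching infrequent content words rather than stopwords.
metric = calc.compute(
    guess='my hydrangeas bloom late in june',
    answers=['the hydrangeas bloom late every year'],
)
print(metric.value())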
def test_custom_eval(self):
    """
    Test whether custom evaluation works.
    """
    with testing_utils.capture_output():
        parser = setup_args()
        opt = parser.parse_args(
            [
                '--task',
                'wizard_of_wikipedia',
                '--datatype',
                'valid',
                '--label-type',
                'chosen_sent',
            ]
        )
        teacher = create_task_agent_from_taskname(opt)[0]

    title = 'Gardening'
    cands = list('four')
    text = "Gardening\nI like Gardening, even when I've only been doing it for a short time."
    response = 'I live on a farm, we garden all year long, it is very relaxing.'
    checked_sent = 'Gardening is considered by many people to be a relaxing activity.'
    checked_sent_label = f'{title}{TOKEN_KNOWLEDGE}{checked_sent}'

    retrieval_metric_keys = ['passage_r@1', 'passage_r@5', 'title_r@1', 'title_r@5']

    chosen_sent_teacher_action = Message(
        {
            'text': text,
            'labels': [checked_sent_label],
            'title': [title],
            'checked_sentence': [checked_sent],
        }
    )
    correct_chosen_sent_response = Message(
        {
            'text': checked_sent_label,
            'title_candidates': [title] + cands,
            'text_candidates': [checked_sent_label] + cands,
        }
    )
    top5_chosen_sent_response = Message(
        {
            'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
            'title_candidates': cands + [title],
            'text_candidates': cands + [checked_sent_label],
        }
    )
    incorrect_chosen_sent_response = Message(
        {
            'text': f'hello{TOKEN_KNOWLEDGE}goodbye',
            'title_candidates': cands,
            'text_candidates': cands,
        }
    )

    response_teacher_action = Message(
        {'text': text, 'labels': [response], 'checked_sentence': checked_sent}
    )
    high_f1_response = Message({'text': checked_sent})
    low_f1_response = Message({'text': 'incorrect'})

    # 1) Test with correct top sentence
    teacher.reset_metrics()
    teacher.custom_evaluation(
        chosen_sent_teacher_action,
        [checked_sent_label],
        correct_chosen_sent_response,
    )
    report = teacher.report()
    for k in retrieval_metric_keys:
        assert k in report
        assert report[k] == AverageMetric(1)

    # 2) Test with top sentence in top 5
    teacher.reset_metrics()
    teacher.custom_evaluation(
        chosen_sent_teacher_action, [checked_sent_label], top5_chosen_sent_response
    )
    report = teacher.report()
    for k in retrieval_metric_keys:
        assert k in report
        # parenthesized so the conditional selects the expected value;
        # without the parentheses the else branch asserted a bare (truthy)
        # AverageMetric(0) and the check always passed
        assert report[k] == (AverageMetric(1) if '5' in k else AverageMetric(0))

    # 3) Test with no top sentences
    teacher.reset_metrics()
    teacher.custom_evaluation(
        chosen_sent_teacher_action,
        [checked_sent_label],
        incorrect_chosen_sent_response,
    )
    report = teacher.report()
    for k in retrieval_metric_keys:
        assert k in report
        assert report[k] == AverageMetric(0)

    # 4) Test knowledge f1 with high f1
    teacher.label_type = 'response'
    teacher.reset_metrics()
    teacher.custom_evaluation(response_teacher_action, [response], high_f1_response)
    report = teacher.report()
    assert 'knowledge_f1' in report
    assert report['knowledge_f1'] == F1Metric(1)

    # 5) Test knowledge f1 with low f1
    teacher.reset_metrics()
    teacher.custom_evaluation(response_teacher_action, [response], low_f1_response)
    report = teacher.report()
    assert 'knowledge_f1' in report
    assert report['knowledge_f1'] == F1Metric(0)
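# Stand-alone sanity check of the F1 behaviour the knowledge_f1 assertions
# above rely on. A minimal sketch, assuming the ParlAI import path below
# and that F1Metric compares by value: an exact token overlap between
# guess and answer yields F1Metric(1), a disjoint guess yields F1Metric(0).
from parlai.core.metrics import F1Metric

exact = F1Metric.compute(
    guess='it is very relaxing',
    answers=['it is very relaxing'],
)
assert exact == F1Metric(1)  # full token overlap -> F1 of 1

disjoint = F1Metric.compute(guess='incorrect', answers=['it is very relaxing'])
assert disjoint == F1Metric(0)  # no shared tokens -> F1 of 0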