def _compute_nltk_bleu(self, batch: Batch, texts: List[str]):
    """
    Compute BLEU score between text and label(s), using the NLTK BLEU Scorer.

    Note this differs from BLEU in ParlAI metrics in that the answers are
    unnormalized (no removal of stop words, etc.).

    :param batch: Batch of observations
    :param texts: list of string predictions
    """
    results = {}
    for i, p in enumerate(texts):
        obs = batch.observations[i]
        references = []
        for lbl in obs['eval_labels']:
            # Round-trip each label through the dictionary (vectorize, then
            # de-vectorize) so references are tokenized the same way as the
            # model's predictions.
            references.append(
                self._v2t(
                    self._vectorize_text(
                        lbl, True, True, self.label_truncate, False
                    )
                )
            )
        for k in range(1, 5):
            b = BleuMetric.compute(p, references, k)
            if b is None:
                b = 0
            results.setdefault(k, []).append(b)

    for k in range(1, 5):
        self.record_local_metric(f'nltk_bleu{k}', results[k])
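# Illustrative sketch only: one way to compute the same per-prediction,
# k-gram BLEU directly with NLTK, assuming `nltk` is installed. This is not
# ParlAI's BleuMetric implementation; it just shows the shape of the
# computation that BleuMetric.compute(p, references, k) performs above
# (without ParlAI's answer normalization).
from typing import List

from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu


def nltk_bleu_sketch(prediction: str, references: List[str], k: int) -> float:
    # Uniform weights over 1..k-grams, e.g. k=4 -> (0.25, 0.25, 0.25, 0.25).
    weights = [1.0 / k] * k
    return sentence_bleu(
        [ref.split() for ref in references],
        prediction.split(),
        weights=weights,
        smoothing_function=SmoothingFunction().method1,
    )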
def custom_evaluation(
    self, teacher_action: Message, labels, model_response: Message
):
    resp = model_response.get('text')
    if not resp:
        return
    if teacher_action['type'] == 'apicall' and resp.startswith('apicall: '):
        # Parse "slot = value" pairs out of the generated API call.
        gold = teacher_action['slots']
        slot_strs = resp[len('apicall: '):].split(' ; ')
        parsed = {}
        for slot_str in slot_strs:
            if ' = ' not in slot_str:
                if slot_str != '':
                    # syntactically invalid generations should count against us
                    self.metrics.add('slot_p', AverageMetric(0))
                continue
            name, value = slot_str.split(' = ', 1)
            parsed[name] = value
        # slot precision
        for k, v in parsed.items():
            self.metrics.add('slot_p', AverageMetric(v == gold.get(k)))
        # slot recall
        for k, v in gold.items():
            self.metrics.add('slot_r', AverageMetric(v == parsed.get(k)))
    elif teacher_action['type'] == 'apiresp':
        # Delexicalize both response and label before computing BLEU.
        delex_resp = self._delex(resp, teacher_action['slots'])
        delex_label = self._delex(labels[0], teacher_action['slots'])
        self.metrics.add(
            'delex_bleu', BleuMetric.compute(delex_resp, [delex_label])
        )
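# The _delex helper is not shown in this excerpt. A minimal sketch, assuming
# it mirrors the inline delexicalization used in the apiresp branch of the
# last method below (replace every gold slot value in the text with its slot
# name):
from typing import Dict


def _delex_sketch(text: str, slots: Dict[str, str]) -> str:
    for slot_name, slot_value in slots.items():
        text = text.replace(slot_value, slot_name)
    return text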
def __init__(
    self,
    guess: str,
    labels: Optional[List[str]],
    prefixes: Optional[List[str]] = None,
    shared: Optional[Dict[str, Any]] = None,
) -> None:
    super().__init__(shared=shared)
    self.prefixes = prefixes if prefixes else []
    bleu = BleuMetric.compute(guess, labels)
    f1 = F1Metric.compute(guess, labels)
    self.add_with_prefixes("nlg_bleu", bleu)
    self.add_with_prefixes("nlg_f1", f1)
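# add_with_prefixes is defined on the parent class and not shown here. A
# hypothetical, self-contained sketch of the assumed behaviour: record the
# metric under its bare key and once more under every configured prefix
# (e.g. 'nlg_bleu' plus 'restaurant/nlg_bleu'). The dict-based store and the
# '/' separator are assumptions for illustration only.
from typing import Any, Dict, List


def add_with_prefixes_sketch(
    store: Dict[str, List[Any]], prefixes: List[str], key: str, value: Any
) -> None:
    store.setdefault(key, []).append(value)
    for prefix in prefixes:
        store.setdefault(f'{prefix}/{key}', []).append(value)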
def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
) -> None:
    """
    Various F1 metrics for the generated model response.
    """
    if not model_response.get('text'):
        # No response generated by model.
        return
    resp = model_response['text']

    # F1 metrics over the *selected* knowledge.
    self.metrics.add(
        'knowledge_f1_docs',
        F1Metric.compute(resp, teacher_action[CONST.SELECTED_DOCS]),
    )
    self.metrics.add(
        'knowledge_f1_sentences',
        F1Metric.compute(resp, teacher_action[CONST.SELECTED_SENTENCES]),
    )

    # F1 metrics over the *retrieved* docs.
    self.metrics.add(
        'f1_retrieved_docs',
        F1Metric.compute(resp, [' '.join(teacher_action[CONST.RETRIEVED_DOCS])]),
    )
    self.metrics.add(
        'max_f1_retrieved_docs',
        F1Metric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
    )

    selected_doc_sentences = teacher_action[CONST.SELECTED_DOCS][0].split('\n')
    all_doc_sentences = []
    for doc in teacher_action[CONST.RETRIEVED_DOCS]:
        all_doc_sentences.extend(doc.split('\n'))

    # Copy-detection metrics against the retrieved knowledge.
    self.metrics.add(
        'exact_copied_sentences', ExactMatchMetric.compute(resp, all_doc_sentences)
    )
    self.metrics.add(
        'max_substring_copied_sentences',
        CopiedSubstringMetric.compute(resp, all_doc_sentences),
    )
    self.metrics.add(
        'max_substring_copied_docs',
        CopiedSubstringMetric.compute(resp, teacher_action[CONST.RETRIEVED_DOCS]),
    )
    self.metrics.add(
        'substring_copied_docs',
        CopiedSubstringMetric.compute(
            resp, [''.join(teacher_action[CONST.RETRIEVED_DOCS])]
        ),
    )
    self.metrics.add(
        'max_f1_selected_docs_senetences',
        F1Metric.compute(resp, selected_doc_sentences),
    )
    self.metrics.add(
        'max_f1_docs_senetences', F1Metric.compute(resp, all_doc_sentences)
    )

    # N-gram matching metrics (BLEU-1 through BLEU-4, plus ROUGE).
    for k in range(1, 5):
        self.metrics.add(
            f'max_bleu_selected_docs_senetences-{k}',
            BleuMetric.compute(resp, selected_doc_sentences, k),
        )
    r1, r2, rL = RougeMetric.compute_many(resp, selected_doc_sentences)
    self.metrics.add('max_rouge_selected_docs_senetences_1', r1)
    self.metrics.add('max_rouge_selected_docs_senetences_2', r2)
    self.metrics.add('max_rouge_selected_docs_senetences_L', rL)
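# Illustrative only: the "max_*" metrics above rely on the metric classes
# taking the best score over a list of candidate references. A self-contained
# sketch of max unigram F1 over candidate sentences, assuming plain
# whitespace tokenization (ParlAI's own answer normalization is richer):
from collections import Counter
from typing import List


def max_unigram_f1_sketch(guess: str, candidates: List[str]) -> float:
    def f1(pred: str, ref: str) -> float:
        p_toks, r_toks = pred.split(), ref.split()
        common = sum((Counter(p_toks) & Counter(r_toks)).values())
        if common == 0:
            return 0.0
        precision = common / len(p_toks)
        recall = common / len(r_toks)
        return 2 * precision * recall / (precision + recall)

    return max((f1(guess, c) for c in candidates), default=0.0)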
def custom_evaluation(
    self,
    teacher_action: Message,
    labels: Optional[Tuple[str]],
    model_response: Message,
):
    if 'metrics' in model_response and 'type' in teacher_action:
        # keep copies of metrics across both api calls/responses
        prefix = teacher_action['type']
        keys = list(model_response['metrics'].keys())
        for k in keys:
            self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])

    if 'text' not in model_response or not labels or 'type' not in teacher_action:
        return

    domain = teacher_action['domain']

    if teacher_action['type'] == 'apicall':
        # also count slot accuracy
        text = model_response['text']
        slot_guesses = set(
            text.replace(CALL_TOKEN + " ", "").split(' ; ')
        )  # prevent cheating via repeated guesses
        correct = 0
        for slot_guess in slot_guesses:
            if ' = ' not in slot_guess:
                continue
            try:
                slot, guess = slot_guess.split(' = ')
            except ValueError:
                continue
            if teacher_action['slots'].get(slot) == guess:
                self.metrics.add('slot_p', AverageMetric(1))
                self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
                correct += 1
            else:
                self.metrics.add('slot_p', AverageMetric(0))
                self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
                logging.debug(
                    f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
                )
        if teacher_action['slots']:
            self.metrics.add(
                'slot_r', AverageMetric(correct, len(teacher_action['slots']))
            )
            self.metrics.add(
                f'{domain}_slot_r',
                AverageMetric(correct, len(teacher_action['slots'])),
            )
            self.metrics.add(
                'jga', AverageMetric(correct == len(teacher_action['slots']))
            )
    elif teacher_action['type'] == 'apiresp':
        # keep track of statistics by domain
        f1_metric = F1Metric.compute(model_response['text'], labels)
        bleu_metric = BleuMetric.compute(model_response['text'], labels)
        self.metrics.add(f'{domain}_lex_f1', f1_metric)
        self.metrics.add(f'{domain}_lex_bleu', bleu_metric)

        delex_text = model_response['text']
        delex_label = labels[0]
        # compute delexicalized string metrics
        for slot, value in teacher_action['slots'].items():
            delex_text = delex_text.replace(value, slot)
            delex_label = delex_label.replace(value, slot)
        f1_metric = F1Metric.compute(delex_text, (delex_label,))
        self.metrics.add('delex_f1', f1_metric)
        self.metrics.add(f'{domain}_delex_f1', f1_metric)
        bleu_metric = BleuMetric.compute(delex_text, [delex_label])
        self.metrics.add('delex_bleu', bleu_metric)
        self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
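# A small, self-contained sketch of the slot bookkeeping in the apicall
# branch above, run outside the metrics framework. CALL_TOKEN's exact value
# is not shown in this excerpt, so the prefix handling is simplified; the
# function name and return shape are illustrative assumptions.
from typing import Dict, Tuple


def slot_scores_sketch(
    call_text: str, gold_slots: Dict[str, str]
) -> Tuple[float, float, bool]:
    # e.g. call_text = "APICALL: name = foo ; time = 7pm", gold = {'name': 'foo'}
    guesses = set(call_text.split(': ', 1)[-1].split(' ; '))  # dedupe repeated guesses
    precision_hits = []
    correct = 0
    for guess in guesses:
        if ' = ' not in guess:
            continue
        slot, value = guess.split(' = ', 1)
        hit = gold_slots.get(slot) == value
        precision_hits.append(hit)
        correct += int(hit)
    slot_p = sum(precision_hits) / len(precision_hits) if precision_hits else 0.0
    slot_r = correct / len(gold_slots) if gold_slots else 0.0
    # Joint goal accuracy, as computed above: every gold slot was guessed correctly.
    jga = correct == len(gold_slots)
    return slot_p, slot_r, jga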