def test_rouge(self):
    # Stress-test parallel ROUGE computation by replicating the fixtures.
    evaluator = language_evaluation.RougeEvaluator(num_parallel_calls=5)
    sample_predictions = SAMPLE_PREDICTIONS * 5000
    sample_answers = SAMPLE_ANSWERS * 5000
    results = evaluator.run_evaluation(sample_predictions, sample_answers)
    # Single-copy variant:
    # results = evaluator.run_evaluation(SAMPLE_PREDICTIONS, SAMPLE_ANSWERS)
    pprint(results)
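For reference, a self-contained sketch of the same call with stand-in fixtures; the real SAMPLE_PREDICTIONS and SAMPLE_ANSWERS are defined elsewhere in the test module, and the strings below are purely illustrative. Example #4 later indexes the result by 'rouge1', 'rouge2', and 'rougeL', so the returned dict carries those keys.

# Minimal stand-alone sketch; the fixture strings are hypothetical.
from pprint import pprint

import language_evaluation

SAMPLE_PREDICTIONS = ["the cat sat on the mat"]    # stand-in fixture
SAMPLE_ANSWERS = ["a cat was sitting on the mat"]  # stand-in fixture

evaluator = language_evaluation.RougeEvaluator(num_parallel_calls=1)
results = evaluator.run_evaluation(SAMPLE_PREDICTIONS, SAMPLE_ANSWERS)
pprint(results)  # dict of ROUGE scores, e.g. 'rouge1', 'rouge2', 'rougeL'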
Example #2
def test_rouge(self):
    # Default evaluator (single worker) on the small fixture set.
    evaluator = language_evaluation.RougeEvaluator()
    results = evaluator.run_evaluation(SAMPLE_PREDICTIONS, SAMPLE_ANSWERS)
    pprint(results)
Example #3
def run_wow_evaluation(results_dict, checkpoint_dir, mode):
    global_step = int(tf.compat.v1.train.get_global_step())
    if 'episode_mask' in results_dict:
        episode_mask = results_dict['episode_mask']
    else:
        episode_mask = None

    trim_fn = _trim_after_eos
    knowledge_separator = BERT_KNOWLEDGE_SEPARATOR

    predictions = trim_fn(results_dict['predictions'], mask=episode_mask)
    answers = trim_fn(results_dict['answers'], mask=episode_mask)
    contexts = trim_fn(results_dict['context'], mask=episode_mask)
    knowledge_sent_gts = trim_fn(results_dict['knowledge_sent_gt'],
                                 mask=episode_mask)
    knowledge_sent_preds = trim_fn(results_dict['knowledge_sent_pred'],
                                   mask=episode_mask)

    # XXX: Dump outputs

    # Show examples
    show_indices = random.sample(range(len(predictions)),
                                 min(10, len(predictions)))
    for index in show_indices:
        prediction = predictions[index]
        answer = answers[index]
        knowledge_sent_gt = knowledge_sent_gts[index]
        knowledge_sent_pred = knowledge_sent_preds[index]
        tqdm.write(f"{index} ({mode}).")
        tqdm.write(f"(knowledge_gt) {knowledge_sent_gt}")
        tqdm.write(f"(knowledge_pred) {knowledge_sent_pred}")
        tqdm.write(f"(gt) {answer}")
        tqdm.write(f"(pred) {prediction}\n\n")

    # Evaluation
    rouge_evaluator = language_evaluation.RougeEvaluator(
        num_parallel_calls=1, tokenization_fn=normalize_answer)
    perplexity = np.exp(np.mean(results_dict['gen_loss']))
    total_loss = np.mean(results_dict['loss'])
    # The ground-truth knowledge sentence sits at index 0, so knowledge
    # selection is correct exactly when the predicted index is 0.
    knowledge_accuracy = accuracy_score(
        np.zeros(results_dict['knowledge_predictions'].shape, dtype=np.int32),
        results_dict['knowledge_predictions'],
        sample_weight=episode_mask)

    rouge_result = rouge_evaluator.run_evaluation(predictions, answers)
    loss_result = {
        'perplexity': perplexity,
        'total_loss': total_loss,
        'accuracy': knowledge_accuracy,
    }

    # Optional metrics
    if 'knowledge_loss' in results_dict:
        knowledge_loss = np.mean(results_dict['knowledge_loss'])
        loss_result['knowledge_loss'] = knowledge_loss
    if 'kl_loss' in results_dict:
        kl_loss = np.mean(results_dict['kl_loss'])
        loss_result['kl_loss'] = kl_loss

    log_dict = {}
    log_dict.update(rouge_result)
    log_dict.update(loss_result)

    summaries = {
        f"{mode}_test_loss": loss_result,
        f"{mode}_rouge": rouge_result
    }

    return summaries, log_dict
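For orientation, a short sketch of the return structure; the inner keys follow directly from the function body above, with mode == 'valid' as an illustrative value:

# Shape of the return value (illustrative; actual values come from the run):
#
# summaries == {
#     'valid_test_loss': {'perplexity': ..., 'total_loss': ..., 'accuracy': ...},
#                        # plus 'knowledge_loss' / 'kl_loss' when present
#     'valid_rouge': {'rouge1': ..., 'rouge2': ..., 'rougeL': ...},
# }
# log_dict is the flat union of the two inner dicts.
summaries, log_dict = run_wow_evaluation(results_dict, checkpoint_dir, mode='valid')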
Example #4
def add_multi_results(results_dict, rouge_result, loss_result, predictions,
                      episode_mask, trim_fn):
    multi_responses = results_dict['multi_responses']
    num_responses = results_dict['num_responses'][episode_mask]
    multi_gt_knowledge_sentences = results_dict['multi_gt_knowledge_sentences']
    knowledge_sent_preds = results_dict['knowledge_sent_pred'][episode_mask]

    multi_rouge_evaluator = language_evaluation.RougeEvaluator(
        num_parallel_calls=1, tokenization_fn=normalize_answer, average=False)
    multi_rouge_results_list = []
    multi_accuracy_list = []
    for i in range(multi_responses.shape[1]):
        # Score the prediction against the i-th reference response.
        responses = trim_fn(multi_responses[:, i], mask=episode_mask)
        multi_rouge_result = multi_rouge_evaluator.run_evaluation(
            predictions, responses)
        # Zero out scores for examples with fewer than i + 1 responses,
        # so padding responses can never win the per-example max below.
        for key in ('rouge1', 'rouge2', 'rougeL'):
            multi_rouge_result[key][0] = multi_rouge_result[key][0] * (
                num_responses > i)
        multi_rouge_results_list.append(multi_rouge_result)

        # Knowledge accuracy: the i-th reference knowledge sentence counts as
        # matched when it equals the prediction token-for-token, up to the
        # shorter of the two sequence lengths.
        gt_knowledge_sentences = multi_gt_knowledge_sentences[:, i][episode_mask]
        knowledge_min_length = min(gt_knowledge_sentences.shape[-1],
                                   knowledge_sent_preds.shape[-1])
        num_mismatches = np.logical_not(
            gt_knowledge_sentences[:, :knowledge_min_length] ==
            knowledge_sent_preds[:, :knowledge_min_length]).sum(axis=1)
        multi_accuracy_list.append(num_mismatches == 0)
    # Stack per-response scores and transpose to (num_examples, num_responses).
    multi_rouge1_results = np.stack(
        [x['rouge1'][0] for x in multi_rouge_results_list], axis=0)
    multi_rouge2_results = np.stack(
        [x['rouge2'][0] for x in multi_rouge_results_list], axis=0)
    multi_rougeL_results = np.stack(
        [x['rougeL'][0] for x in multi_rouge_results_list], axis=0)
    multi_rouge1_results = np.transpose(multi_rouge1_results, [1, 0])
    multi_rouge2_results = np.transpose(multi_rouge2_results, [1, 0])
    multi_rougeL_results = np.transpose(multi_rougeL_results, [1, 0])
    # For each example, pick the reference that maximizes ROUGE-1, then report
    # the ROUGE-2/ROUGE-L scores of that same reference.
    multi_rouge1_max_indices = np.argmax(multi_rouge1_results, axis=1)
    max_multi_rouge1_results = np.max(multi_rouge1_results, axis=1)

    range_indices = np.arange(len(multi_rouge1_max_indices))
    max_multi_rouge2_results = multi_rouge2_results[range_indices,
                                                    multi_rouge1_max_indices]
    max_multi_rougeL_results = multi_rougeL_results[range_indices,
                                                    multi_rouge1_max_indices]

    rouge_result['rouge1_multi_responses'] = np.mean(max_multi_rouge1_results)
    rouge_result['rouge2_multi_responses'] = np.mean(max_multi_rouge2_results)
    rouge_result['rougeL_multi_responses'] = np.mean(max_multi_rougeL_results)

    # Accuracy over multiple references: correct if the predicted knowledge
    # sentence matched any reference.
    multi_accuracies = np.transpose(np.stack(multi_accuracy_list, axis=0),
                                    [1, 0])
    multi_accuracies = multi_accuracies.sum(axis=1).astype(bool)
    loss_result['accuracy_multi_responses'] = np.mean(multi_accuracies)

    # perplexity
    multi_perplexity = np.exp(np.mean(results_dict['multi_gen_loss']))
    loss_result['perplexity_multi_responses'] = multi_perplexity

    return rouge_result, loss_result
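To make the best-of-multi selection concrete, a toy worked example of the argmax-on-ROUGE-1 pairing used above (all numbers invented for illustration):

import numpy as np

# Toy (num_examples=2, num_responses=3) score matrices; values are made up.
rouge1 = np.array([[0.10, 0.40, 0.25],
                   [0.30, 0.20, 0.50]])
rouge2 = np.array([[0.05, 0.30, 0.20],
                   [0.25, 0.10, 0.45]])

best = np.argmax(rouge1, axis=1)    # -> [1, 2]: best reference per example
rows = np.arange(len(best))
best_rouge1 = rouge1[rows, best]    # -> [0.40, 0.50]
best_rouge2 = rouge2[rows, best]    # ROUGE-2 of the same chosen reference
print(best_rouge1.mean(), best_rouge2.mean())  # 0.45 0.375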