Example #1
def eval_batch_output(tgt_tensor_or_tokens: Union[torch.Tensor,
                                                  List[List[str]]],
                      vocab: Vocab, oov_dict: dict,
                      *pred_tensors: torch.Tensor) -> List[Dict[str, float]]:
    """
  :param tgt_tensor_or_tokens: the gold standard, either as indices or textual tokens
  :param vocab: the fixed-size vocab
  :param oov_dict: out-of-vocab dict
  :param pred_tensors: one or more systems' prediction (output tensors)
  :return: two-level score lookup (system index => ROUGE metric => value)

  Evaluate one or more systems' output.
  """
    oov_idx2word, ext_vocab_size = reverse_dict(oov_dict)
    decoded_batch = [
        decode_batch_output(pred_tensor, vocab, oov_idx2word)
        for pred_tensor in pred_tensors
    ]
    if isinstance(tgt_tensor_or_tokens, torch.Tensor):
        gold_summaries = decode_batch_output(tgt_tensor_or_tokens, vocab,
                                             oov_idx2word)
    else:
        gold_summaries = tgt_tensor_or_tokens
    scores = rouge(gold_summaries, *decoded_batch)
    return scores
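The helpers used above (`reverse_dict`, `decode_batch_output`, `rouge`) are project-internal. As a hedged illustration only, here is a minimal, self-contained sketch of the extended-vocabulary decoding idea that `decode_batch_output` appears to implement; every name below is hypothetical:

def decode_ids(ids, vocab_words, oov_idx2word):
    # Ids below the fixed vocab size hit the vocab; larger ids are
    # per-batch OOV ids looked up in oov_idx2word.
    return [vocab_words[i] if i < len(vocab_words)
            else oov_idx2word.get(i, '<unk>') for i in ids]

vocab_words = ['<pad>', '<unk>', 'the', 'cat']
oov_idx2word = {4: 'Schroedinger'}  # OOV ids start after the fixed vocab
print(decode_ids([2, 4, 3], vocab_words, oov_idx2word))  # ['the', 'Schroedinger', 'cat']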
Example #2
def sample_rouges(self, pairs, idx_excepts, hh_vertices_rouge,
                  wf_vertices_rouge):

    def flatten(sents):
        # Flatten a list of token lists into a single token list.
        return [item for sub in sents for item in sub]

    idxs_include = [
        item for item in range(len(pairs)) if item not in idx_excepts
    ]
    gold_summaries = np.array([ex.tgt_sents
                               for ex in pairs])[idxs_include].tolist()
    src_include = np.array([ex.src_words
                            for ex in pairs])[idxs_include].tolist()
    tgt_size = np.array([len(ex.tgt_sents)
                         for ex in pairs])[idxs_include].tolist()
    sample_idxs = np.random.choice(len(src_include), size=10, replace=False)
    for idx in sample_idxs:
        # Baselines: the first/last k source sentences, where k is the number
        # of sentences in the reference summary.
        first_k = src_include[idx][:tgt_size[idx]]
        last_k = src_include[idx][-tgt_size[idx]:]
        # Sentences picked by the HH and WF vertex rankings, in document order.
        hh = np.array(src_include[idx])[sorted(hh_vertices_rouge[idx])].tolist()
        wf = np.array(src_include[idx])[sorted(wf_vertices_rouge[idx])].tolist()
        summary = gold_summaries[idx]
        # Write the sample and its ROUGE scores to a text file.
        sample_path = os.path.join(
            '../../figure/sample/',
            'sample_%s_%s_%s.txt' % (self.dataset, self.dataset_type, str(idx)))
        with open(sample_path, 'a') as txt:
            txt.write('%s\n' % flatten(summary))
            txt.write('%s\n' % flatten(first_k))
            txt.write('%s\n' % flatten(last_k))
            txt.write('%s\n' % flatten(hh))
            txt.write('%s\n' % flatten(wf))
            txt.write('Rouge(First,Last,HH,WF): %s,%s,%s,%s\n' %
                      (rouge(flatten(summary), flatten(first_k)),
                       rouge(flatten(summary), flatten(last_k)),
                       rouge(flatten(summary), flatten(hh)),
                       rouge(flatten(summary), flatten(wf))))
            txt.write('%s\n' % flatten(src_include[idx]))

    print("Done saving samples!")
Example #3
def rouge_print(gold_summaries, src_examples, src_vertices):
    # Keep only the selected source sentences, in document order.
    src_rouge = [
        np.array(x)[sorted(y)].tolist()
        for x, y in zip(src_examples, src_vertices)
    ]
    score = rouge([sum(x, []) for x in gold_summaries],
                  [sum(y, []) for y in src_rouge])[0]
    # ROUGE-1/2/L/SU4 F1 scores, as percentages.
    return (score['1_f'] * 100, score['2_f'] * 100,
            score['l_f'] * 100, score['su4_f'] * 100)
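Note that `sum(x, [])` flattens a list of lists by copying the accumulator at every step, so it is quadratic in the total number of tokens; `itertools.chain.from_iterable` produces the same result in linear time:

from itertools import chain

nested = [['the', 'cat'], ['sat']]
assert sum(nested, []) == list(chain.from_iterable(nested))
print(list(chain.from_iterable(nested)))  # ['the', 'cat', 'sat']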
Example #4
def eval_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
               show_cover_loss=False) -> Tuple[float, float]:
    """Test the `model` on the `batch`, return the ROUGE score and the loss."""
    decoded_batch, out = decode_batch(batch, model, vocab, criterion=criterion, pack_seq=pack_seq,
                                      show_cover_loss=show_cover_loss)
    examples = batch[0]  # the first field of the Batch tuple holds the examples
    gold_summaries = [ex.tgt for ex in examples]
    scores = rouge(gold_summaries, decoded_batch)
    return out.loss_value, scores[0]['l_f']
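A typical validation loop would average the per-batch loss and ROUGE-L F1 returned by `eval_batch`. A hedged sketch, assuming an iterable of batches and a trained model (neither is defined here):

def validate(batches, model, vocab, criterion):
    # Average loss and ROUGE-L F1 over a dataset using eval_batch above.
    total_loss = total_rouge = n = 0
    for batch in batches:
        loss, rouge_l = eval_batch(batch, model, vocab, criterion)
        total_loss += loss
        total_rouge += rouge_l
        n += 1
    return total_loss / n, total_rouge / n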
Example #5
def eval_bs_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, *, pack_seq=True, beam_size=4,
                  min_out_len=1, max_out_len=None, len_in_words=True, best_only=True,
                  details: bool = True) -> Tuple[Optional[List[Dict[str, float]]], Optional[str]]:
    """
    :param batch: a test batch of a single example
    :param model: a trained summarizer
    :param vocab: vocabulary of the trained summarizer
    :param pack_seq: currently has no effect as batch size is 1
    :param beam_size: the beam size
    :param min_out_len: required minimum output length
    :param max_out_len: required maximum output length (if None, use the model's own value)
    :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count
                         punctuations)
    :param best_only: if True, run ROUGE only on the best hypothesis instead of all `beam size` many
    :param details: if True, also return a string containing the result of this document
    :return: two-level score lookup (hypothesis index => ROUGE metric => value)

    Test a trained summarizer on a document using the beam search decoder.
    """
    assert len(batch.examples) == 1
    with torch.no_grad():
        input_tensor = batch.input_tensor.to(DEVICE)  # (src_len, 1)

        if not pack_seq:
            input_lengths = None
            mask = None  # no padding mask needed
        else:
            # mask out PAD positions in the input
            input_lengths = batch.input_lengths
            mask = create_mask(input_lengths)
        # beam search returns `beam_size` hypotheses; hypotheses[0] is the best
        hypotheses = model.beam_search(input_tensor, input_lengths,
                                       batch.ext_vocab_size, beam_size, min_out_len=min_out_len,
                                       max_out_len=max_out_len, len_in_words=len_in_words, mask=mask)
    if best_only:
        to_decode = [hypotheses[0].tokens]
    else:
        to_decode = [h.tokens for h in hypotheses]
    decoded_batch = decode_batch_output(to_decode, vocab, batch.oov_dict)
    if details:
        file_content = "[System Summary]\n" + format_tokens(decoded_batch[0])
    else:
        file_content = None
    if batch.examples[0].tgt is not None:  # run ROUGE if gold standard summary exists
        gold_summaries = [batch.examples[0].tgt for _ in range(len(decoded_batch))]
        scores = rouge(gold_summaries, decoded_batch)
        if details:
            file_content += "\n\n\n[Reference Summary]\n" + format_tokens(batch.examples[0].tgt)
            file_content += "\n\n\n[ROUGE Scores]\n" + format_rouge_scores(scores[0]) + "\n"
    else:
        scores = None
    if details:
        file_content += "\n\n\n[Source Text]\n" + format_tokens(batch.examples[0].src)
    return scores, file_content
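A hypothetical driver that runs `eval_bs_batch` over a test set, averages the best hypothesis' ROUGE-L F1, and saves each per-document report; the output directory and file naming below are assumptions:

import os

def eval_bs(test_batches, model, vocab, save_dir='results'):
    os.makedirs(save_dir, exist_ok=True)
    rouge_l_sum, n = 0.0, 0
    for i, batch in enumerate(test_batches):
        scores, file_content = eval_bs_batch(batch, model, vocab, best_only=True)
        if scores is not None:  # a gold summary existed, so ROUGE was computed
            rouge_l_sum += scores[0]['l_f']
            n += 1
        if file_content is not None:
            with open(os.path.join(save_dir, '%d.txt' % i), 'w') as f:
                f.write(file_content)
    return rouge_l_sum / max(n, 1)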
Example #6
    hyp_list = []
    ref_list = []
    for batch in tqdm(test_dataloader, desc="Iteration"):
        batch = tuple(t.to(device) for t in batch)
        pred, _ = model.beam_decode(batch[0], batch[1], 3, 3)
        src, tgt = batch[0], batch[2]
        for i in range(src.size(0)):  # use the actual batch size; the last batch may be smaller than BATCH_SIZE
            sample_src = "".join(tokenizer.convert_ids_to_tokens(src[i].cpu().numpy())).split('[CLS]')[1].split('[SEP]')[0] + '\n'
            sample_tgt = "".join(tokenizer.convert_ids_to_tokens(tgt[i].cpu().numpy())).split('[CLS]')[1].split('[SEP]')[0] + '\n'
            sample_pred = "".join(tokenizer.convert_ids_to_tokens(pred[i][0])).split('[SEP]')[0] + '\n'
            f_log.write('\n**********\n')
            f_log.write('Source: ' + sample_src)
            f_log.write('Gold: ' + sample_tgt)
            f_log.write('Hypothesis: ' + sample_pred)
            # f_hyp.write(sample_pred)
            # f_ref.write(sample_tgt)
            hyp_list.append(sample_pred)
            ref_list.append(sample_tgt)
    rouge_1 = rouge(hyp_list, ref_list, 1)
    rouge_2 = rouge(hyp_list, ref_list, 2)
    logger.info('******Results******')
    logger.info(f'Rouge-1: {rouge_1}')
    logger.info(f'Rouge-2: {rouge_2}')
    f_log.write('**********\n')
    f_log.write(f'Rouge-1: {rouge_1}\n')
    f_log.write(f'Rouge-2: {rouge_2}\n')   
    logger.info('Evaluation finished.')
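The id-to-text trimming repeated above can be isolated into one helper. A self-contained sketch with plain strings (the tokenizer itself is assumed):

def trim_special(text):
    # Keep only the span between the first '[CLS]' and the following '[SEP]'.
    if '[CLS]' in text:
        text = text.split('[CLS]')[1]
    return text.split('[SEP]')[0]

print(trim_special('[CLS]some summary text[SEP][PAD][PAD]'))  # some summary text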