def eval_batch_output(tgt_tensor_or_tokens: Union[torch.Tensor, List[List[str]]],
                      vocab: Vocab, oov_dict: dict,
                      *pred_tensors: torch.Tensor) -> List[Dict[str, float]]:
    """
    Evaluate one or more systems' output.

    :param tgt_tensor_or_tokens: the gold standard, either as indices or textual tokens
    :param vocab: the fixed-size vocab
    :param oov_dict: out-of-vocab dict
    :param pred_tensors: one or more systems' predictions (output tensors)
    :return: two-level score lookup (system index => ROUGE metric => value)
    """
    oov_idx2word, ext_vocab_size = reverse_dict(oov_dict)
    decoded_batch = [decode_batch_output(pred_tensor, vocab, oov_idx2word)
                     for pred_tensor in pred_tensors]
    if isinstance(tgt_tensor_or_tokens, torch.Tensor):
        gold_summaries = decode_batch_output(tgt_tensor_or_tokens, vocab, oov_idx2word)
    else:
        gold_summaries = tgt_tensor_or_tokens
    scores = rouge(gold_summaries, *decoded_batch)
    return scores
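# A minimal usage sketch for `eval_batch_output`, under hypothetical inputs:
# `greedy_out` and `beam_out` stand for (out_len, batch_size) index tensors
# produced by two decoding strategies, and `batch` is assumed to expose a
# `tgt_tensor` and an `oov_dict` as in the functions in this file.
def compare_decoders(batch, vocab: Vocab,
                     greedy_out: torch.Tensor, beam_out: torch.Tensor):
    scores = eval_batch_output(batch.tgt_tensor, vocab, batch.oov_dict,
                               greedy_out, beam_out)
    # scores[i][metric] holds system i's value for that ROUGE metric,
    # e.g. ROUGE-L F1 of each system:
    return scores[0]['l_f'], scores[1]['l_f']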
def sample_rouges(self, pairs, idx_excepts, hh_vertices_rouge, wf_vertices_rouge):
    """Dump sampled documents with First-k / Last-k / HH / WF extracts and their ROUGE scores."""
    # Keep only the examples whose indices are not excluded.
    idxs_include = [i for i in range(len(pairs)) if i not in idx_excepts]
    gold_summaries = np.array([ex.tgt_sents for ex in pairs])[idxs_include].tolist()
    src_include = np.array([ex.src_words for ex in pairs])[idxs_include].tolist()
    tgt_size = np.array([len(ex.tgt_sents) for ex in pairs])[idxs_include].tolist()
    sample_idxs = np.random.choice(len(src_include), size=10, replace=False)

    def flatten(sents):
        return [tok for sent in sents for tok in sent]

    for idx in sample_idxs:
        # Baselines: the first/last `tgt_size` sentences, plus the sentences
        # selected by the HH and WF extractors.
        first_k = src_include[idx][:tgt_size[idx]]
        last_k = src_include[idx][-tgt_size[idx]:]
        hh = np.array(src_include[idx])[sorted(hh_vertices_rouge[idx])].tolist()
        wf = np.array(src_include[idx])[sorted(wf_vertices_rouge[idx])].tolist()
        summary = gold_summaries[idx]
        # Append this sample to a text file for inspection.
        path = os.path.join(
            '../../figure/sample/',
            'sample_%s_%s_%s.txt' % (self.dataset, self.dataset_type, str(idx)))
        with open(path, 'a') as txt:
            txt.write('%s\n' % flatten(summary))
            txt.write('%s\n' % flatten(first_k))
            txt.write('%s\n' % flatten(last_k))
            txt.write('%s\n' % flatten(hh))
            txt.write('%s\n' % flatten(wf))
            txt.write('Rouge(First,Last,HH,WF): %s,%s,%s,%s\n' %
                      (rouge(flatten(summary), flatten(first_k)),
                       rouge(flatten(summary), flatten(last_k)),
                       rouge(flatten(summary), flatten(hh)),
                       rouge(flatten(summary), flatten(wf))))
            txt.write('%s\n' % flatten(src_include[idx]))
    print("Done saving samples!")
def rouge_print(gold_summaries, src_examples, src_vertices):
    # Select the extracted sentences (by vertex index) from each source document.
    src_rouge = [np.array(x)[sorted(y)].tolist()
                 for x, y in zip(src_examples, src_vertices)]
    # Flatten each document's sentences into one token list before scoring.
    score = rouge([sum(x, []) for x in gold_summaries],
                  [sum(y, []) for y in src_rouge])[0]
    # Return ROUGE-1/2/L/SU4 F1 as percentages.
    return (score['1_f'] * 100, score['2_f'] * 100,
            score['l_f'] * 100, score['su4_f'] * 100)
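# A toy sketch of `rouge_print` inputs (hypothetical data, assuming the same
# `rouge` scorer as above): each gold summary and each source document is a
# list of tokenized sentences, and each vertex list gives the indices of the
# sentences an extractor selected.
def _rouge_print_demo():
    gold = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
    src = [[['the', 'cat', 'sat'], ['on', 'the', 'mat'], ['dogs', 'bark']]]
    vertices = [[0, 1]]  # the extractor picked the first two sentences
    r1, r2, rl, rsu4 = rouge_print(gold, src, vertices)
    print('ROUGE-1/2/L/SU4 F1 (%%): %.2f %.2f %.2f %.2f' % (r1, r2, rl, rsu4))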
def eval_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *,
               pack_seq=True, show_cover_loss=False) -> Tuple[float, float]:
    """Test the `model` on the `batch`; return the loss and the ROUGE-L F1 score."""
    decoded_batch, out = decode_batch(batch, model, vocab, criterion=criterion,
                                      pack_seq=pack_seq,
                                      show_cover_loss=show_cover_loss)
    examples = batch[0]
    gold_summaries = [ex.tgt for ex in examples]
    scores = rouge(gold_summaries, decoded_batch)
    return out.loss_value, scores[0]['l_f']
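# A sketch of a validation loop built on `eval_batch` (hypothetical
# `val_batches` iterable of `Batch` objects; returns mean loss and mean
# ROUGE-L F1 over the set).
def validate(val_batches, model: Seq2Seq, vocab: Vocab,
             criterion) -> Tuple[float, float]:
    total_loss, total_rl, n = 0.0, 0.0, 0
    for batch in val_batches:
        loss, rl = eval_batch(batch, model, vocab, criterion)
        total_loss += loss
        total_rl += rl
        n += 1
    return total_loss / n, total_rl / n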
def eval_bs_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, *, pack_seq=True,
                  beam_size=4, min_out_len=1, max_out_len=None, len_in_words=True,
                  best_only=True, details: bool = True
                  ) -> Tuple[Optional[List[Dict[str, float]]], Optional[str]]:
    """
    Test a trained summarizer on a document using the beam search decoder.

    :param batch: a test batch of a single example
    :param model: a trained summarizer
    :param vocab: vocabulary of the trained summarizer
    :param pack_seq: currently has no effect, as the batch size is 1
    :param beam_size: the beam size
    :param min_out_len: required minimum output length
    :param max_out_len: required maximum output length (if None, use the model's own value)
    :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count punctuation)
    :param best_only: if True, run ROUGE only on the best hypothesis instead of all `beam_size` of them
    :param details: if True, also return a string containing the result of this document
    :return: two-level score lookup (hypothesis index => ROUGE metric => value)
    """
    assert len(batch.examples) == 1
    with torch.no_grad():
        input_tensor = batch.input_tensor.to(DEVICE)  # (src_len, 1)
        if not pack_seq:
            input_lengths = None
        else:
            input_lengths = batch.input_lengths  # lengths of the padded inputs
        mask = create_mask(input_lengths)
        # Beam search returns `beam_size` hypotheses, best first.
        hypotheses = model.beam_search(input_tensor, input_lengths,
                                       batch.ext_vocab_size, beam_size,
                                       min_out_len=min_out_len,
                                       max_out_len=max_out_len,
                                       len_in_words=len_in_words, mask=mask)
    if best_only:
        to_decode = [hypotheses[0].tokens]
    else:
        to_decode = [h.tokens for h in hypotheses]
    decoded_batch = decode_batch_output(to_decode, vocab, batch.oov_dict)
    if details:
        file_content = "[System Summary]\n" + format_tokens(decoded_batch[0])
    else:
        file_content = None
    if batch.examples[0].tgt is not None:  # run ROUGE only if a gold standard summary exists
        gold_summaries = [batch.examples[0].tgt for _ in range(len(decoded_batch))]
        scores = rouge(gold_summaries, decoded_batch)
        if details:
            file_content += "\n\n\n[Reference Summary]\n" + format_tokens(batch.examples[0].tgt)
            file_content += "\n\n\n[ROUGE Scores]\n" + format_rouge_scores(scores[0]) + "\n"
    else:
        scores = None
    if details:
        file_content += "\n\n\n[Source Text]\n" + format_tokens(batch.examples[0].src)
    return scores, file_content
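# A sketch of a test loop around `eval_bs_batch` (hypothetical `test_batches`
# iterable of single-example batches and `save_dir` output path; `os` is
# assumed to be imported, as in `sample_rouges` above). Saves each
# per-document report and collects the best hypothesis's scores.
def beam_eval(test_batches, model: Seq2Seq, vocab: Vocab, save_dir: str):
    all_scores = []
    for i, batch in enumerate(test_batches):
        scores, file_content = eval_bs_batch(batch, model, vocab, beam_size=4)
        if scores is not None:
            all_scores.append(scores[0])  # ROUGE of the best hypothesis
        if file_content is not None:
            with open(os.path.join(save_dir, 'doc_%d.txt' % i), 'w') as f:
                f.write(file_content)
    return all_scores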
hyp_list = []
ref_list = []
for batch in tqdm(test_dataloader, desc="Iteration"):
    batch = tuple(t.to(device) for t in batch)
    pred, _ = model.beam_decode(batch[0], batch[1], 3, 3)
    src, tgt = batch[0], batch[2]
    for i in range(BATCH_SIZE):
        # Recover the text between [CLS] and [SEP] for source/reference,
        # and everything before the first [SEP] for the hypothesis.
        sample_src = "".join(tokenizer.convert_ids_to_tokens(src[i].cpu().numpy())).split('[CLS]')[1].split('[SEP]')[0] + '\n'
        sample_tgt = "".join(tokenizer.convert_ids_to_tokens(tgt[i].cpu().numpy())).split('[CLS]')[1].split('[SEP]')[0] + '\n'
        sample_pred = "".join(tokenizer.convert_ids_to_tokens(pred[i][0])).split('[SEP]')[0] + '\n'
        f_log.write('\n**********\n')
        f_log.write('Source: ' + sample_src)
        f_log.write('Gold: ' + sample_tgt)
        f_log.write('Hypothesis: ' + sample_pred)
        # f_hyp.write(sample_pred)
        # f_ref.write(sample_tgt)
        hyp_list.append(sample_pred)
        ref_list.append(sample_tgt)

rouge_1 = rouge(hyp_list, ref_list, 1)
rouge_2 = rouge(hyp_list, ref_list, 2)
logger.info('******Results******')
logger.info(f'Rouge-1: {rouge_1}')
logger.info(f'Rouge-2: {rouge_2}')
f_log.write('**********\n')
f_log.write(f'Rouge-1: {rouge_1}\n')
f_log.write(f'Rouge-2: {rouge_2}\n')
logger.info('Evaluation finished.')