Example #1
def eval_bs_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, *, pack_seq=True, beam_size=4,
                  min_out_len=1, max_out_len=None, len_in_words=True, best_only=True,
                  details: bool = True) -> Tuple[Optional[List[Dict[str, float]]], Optional[str]]:
    """
    :param batch: a test batch of a single example
    :param model: a trained summarizer
    :param vocab: vocabulary of the trained summarizer
    :param pack_seq: currently has no effect as batch size is 1
    :param beam_size: the beam size
    :param min_out_len: required minimum output length
    :param max_out_len: required maximum output length (if None, use the model's own value)
    :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count
                         punctuations)
    :param best_only: if True, run ROUGE only on the best hypothesis instead of all `beam size` many
    :param details: if True, also return a string containing the result of this document
    :return: two-level score lookup (hypothesis index => ROUGE metric => value)

    Test a trained summarizer on a document using the beam search decoder.
    """
    assert len(batch.examples) == 1
    with torch.no_grad():
        input_tensor = batch.input_tensor.to(DEVICE)  # (src_len, 1)

        if not pack_seq:
            input_lengths = None
            mask = None
        else:
            # input lengths are needed to pack the padded sequence and to build the attention mask
            input_lengths = batch.input_lengths
            mask = create_mask(input_lengths)
        # beam search returns `beam_size` hypotheses
        hypotheses = model.beam_search(input_tensor, input_lengths,
                                       batch.ext_vocab_size, beam_size, min_out_len=min_out_len,
                                       max_out_len=max_out_len, len_in_words=len_in_words, mask=mask)
    if best_only:
        to_decode = [hypotheses[0].tokens]
    else:
        to_decode = [h.tokens for h in hypotheses]
    decoded_batch = decode_batch_output(to_decode, vocab, batch.oov_dict)
    if details:
        file_content = "[System Summary]\n" + format_tokens(decoded_batch[0])
    else:
        file_content = None
    if batch.examples[0].tgt is not None:  # run ROUGE if gold standard summary exists
        gold_summaries = [batch.examples[0].tgt for _ in range(len(decoded_batch))]
        scores = rouge(gold_summaries, decoded_batch)
        if details:
            file_content += "\n\n\n[Reference Summary]\n" + format_tokens(batch.examples[0].tgt)
            file_content += "\n\n\n[ROUGE Scores]\n" + format_rouge_scores(scores[0]) + "\n"
    else:
        scores = None
    if details:
        file_content += "\n\n\n[Source Text]\n" + format_tokens(batch.examples[0].src)
    return scores, file_content
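# A minimal driver sketch for eval_bs_batch above (hypothetical; it assumes an
# iterable of single-example test batches plus the trained `model` and `vocab`
# from the surrounding codebase):
def eval_bs(test_batches, model, vocab, beam_size=4):
    model.eval()
    best_scores = []
    for batch in test_batches:  # each batch holds exactly one example
        scores, _ = eval_bs_batch(batch, model, vocab, beam_size=beam_size,
                                  best_only=True, details=False)
        if scores is not None:  # only documents that have a gold summary
            best_scores.append(scores[0])
    if not best_scores:
        return {}
    # average every ROUGE metric over the evaluated documents
    return {metric: sum(s[metric] for s in best_scores) / len(best_scores)
            for metric in best_scores[0]}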
Example #2
def eval_texts_folders(args):
    files1 = sorted(os.listdir(args.ref_dir))
    files2 = sorted(os.listdir(args.cand_dir))
    assert files1 == files2, "reference and candidate folders must contain the same file names"
    ref = []
    cand = []
    for fn in files1:
        with open(os.path.join(args.ref_dir, fn),
                  'r',
                  encoding="utf-8",
                  errors='ignore') as f:
            ref.append([line.strip() for line in f.readlines()])
        with open(os.path.join(args.cand_dir, fn),
                  'r',
                  encoding="utf-8",
                  errors='ignore') as f:
            cand.append([line.strip() for line in f.readlines()])

    rouge = Rouge()
    rouge_score = rouge.get_scores([" ".join(p) for p in cand],
                                   [" ".join(g) for g in ref],
                                   avg=True)
    formatted_rouge_score = format_rouge_scores(rouge_score)
    rouge_table = format_rouge_table(rouge_score)
    similarity_score = calc_sbert_similarity(ref, cand)
    sentence_overlap = calc_sentence_coverage(cand, ref)
    print(formatted_rouge_score)
    print("Similarity score(sbert): %.3f" % similarity_score)
    print("Sentence overlap: %.3f" % sentence_overlap)
    print(rouge_table + " & %.3f & %.3f" %
          (similarity_score, sentence_overlap))
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    rouge_output_file = os.path.join(
        args.output_dir,
        "text_eval_results_{}_{}.txt".format(os.path.basename(args.cand_dir),
                                             os.path.basename(args.ref_dir)))
    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write(formatted_rouge_score)
        f.write("Sentence overlap: %.3f\n" % sentence_overlap)
        f.write("Similarity score(sbert): %.3f\n" % similarity_score)
        f.write(rouge_table + " & %.3f & %.3f \\\\" %
                (similarity_score, sentence_overlap))
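# A hypothetical CLI wrapper for eval_texts_folders, assuming only the argument
# names the function actually reads (ref_dir, cand_dir, output_dir):
import argparse

def parse_eval_args():
    parser = argparse.ArgumentParser(
        description="Score candidate summaries against reference summaries")
    parser.add_argument("--ref_dir", required=True,
                        help="folder of reference summaries, one file per document")
    parser.add_argument("--cand_dir", required=True,
                        help="folder of candidate summaries with matching file names")
    parser.add_argument("--output_dir", default="eval_output",
                        help="folder where the result file is written")
    return parser.parse_args()

if __name__ == "__main__":
    eval_texts_folders(parse_eval_args())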
Example #3
def cnndm_lead_n(args, logger):
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    eval_dataset = CNNDMBlobNoTokens(prefix='test', data_path=args.data_dir)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size,
                                 collate_fn=lambda x: x,
                                 num_workers=args.num_workers)
    logger.info("***** Running evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))

    gold = []
    pred = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        stories = [x[0] for x in batch]
        summaries = [x[1] for x in batch]

        for story, summary in zip(stories, summaries):
            pred.append(story[:3])
            gold.append(summary)
    rouge = Rouge()
    simple_rouge_score_lead3 = np.mean(
        [simple_rouge(pred[i], gold[i]) for i in range(len(gold))]) * 100
    rouge_score_lead3 = format_rouge_scores(
        rouge.get_scores([" ".join(p) for p in pred],
                         [" ".join(g) for g in gold],
                         avg=True))
    similarity_score_lead3 = calc_sbert_similarity(pred, gold)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    rouge_output_file = os.path.join(args.output_dir, "cnndm_lead3.txt")
    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write("lead-3\n")
        f.write(rouge_score_lead3)
        f.write("\nSimple rouge score: %f\n" % simple_rouge_score_lead3)
        f.write("Similarity score(sbert): %f\n" % similarity_score_lead3)
Example #4
def cnndm_test_full(args, model, logger):
    model = AutoExtSummarizer(args)
    model.to(args.device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    eval_dataset = CNNDMBlobNoTokens(prefix='test',
                                     data_path=args.data_dir,
                                     label_key=args.label_key)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=1,
                                 collate_fn=single_collate,
                                 num_workers=args.num_workers)

    logger.info("***** Running CNNDM evaluation  *****")
    logger.info("  Num examples = %d", len(train_dataset))

    gold = []
    pred = []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):

        summary = batch[0][2]
        story = batch[0][0]

        blocks = create_labeled_blocks(args, batch[0], tokenizer)
        block_scores = []
        memory = None
        for block in blocks:
            _batch = Batch([block], pad_token_id=tokenizer.pad_token_id)
            source = _batch.src.to(args.device)
            encoder_mask = _batch.mask.to(args.device)
            clss = _batch.clss.to(args.device)
            cls_mask = _batch.mask_cls.to(args.device).bool()

            with torch.no_grad():
                sent_scores, mask, memory = model(source,
                                                  encoder_mask,
                                                  clss,
                                                  cls_mask,
                                                  memory=memory)
                # Separates padding positions from scores that are actually 0
                sent_scores = sent_scores + mask.float()
                sent_scores = sent_scores.cpu().data.numpy()
                block_scores.extend(sent_scores[0])
        selected_ids = np.argsort(block_scores)[::-1]
        _pred = []
        for i in selected_ids:
            candidate = story[i].strip()
            if not _block_tri(candidate, _pred):
                _pred.append(candidate)
            if len(_pred) == 3:
                break
        pred.append(_pred)
        gold.append(summary)

    #python rouge implementation
    rouge = Rouge()
    rouge_score = rouge.get_scores([" ".join(p) for p in pred],
                                   [" ".join(g) for g in gold],
                                   avg=True)
    rouge_score_formatted = format_rouge_scores(rouge_score)
    rouge_table = format_rouge_table(rouge_score)

    similarity_score = calc_sbert_similarity(pred, gold)

    print(rouge_score_formatted)
    print("Similarity score(sbert): %.3f" % similarity_score)
    print(rouge_table + " & %.3f" % similarity_score)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    rouge_output_file = os.path.join(
        args.output_dir, "cnndm_test_full_results_{}_{}.txt".format(
            args.model_name,
            os.path.basename(args.model_path).split(".")[0]))
    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write(rouge_score_formatted)
        f.write("Similarity score(sbert): %.3f\n" % similarity_score)
        f.write(rouge_table + " & %.3f" % (similarity_score))
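# `_block_tri` is defined elsewhere in the repo; the selection loop above uses
# it for trigram blocking. A sketch of the standard BertSumExt-style check
# (assumed, the repo's version may differ):
def _get_ngrams(n, tokens):
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def _block_tri(candidate, selected):
    # reject a candidate sentence that shares any trigram with an already selected one
    tri_c = _get_ngrams(3, candidate.lower().split())
    for sent in selected:
        if tri_c & _get_ngrams(3, sent.lower().split()):
            return True
    return False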
Example #5
def cnndm_test(args, model, logger):
    #Init model
    model = AutoExtSummarizer(args)
    model.to(args.device)
    model.eval()

    if "score" in args.label_key:
        criterion = torch.nn.BCELoss(reduction='sum')
    else:
        criterion = torch.nn.MSELoss(reduction='sum')

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    eval_dataset = CNNDMBlob(prefix='test',
                             data_path=args.data_dir,
                             tokenizer=tokenizer)
    eval_sampler = SequentialSampler(eval_dataset)

    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size,
                                 collate_fn=functools.partial(
                                     collate,
                                     pad_token_id=tokenizer.pad_token_id,
                                     is_test=True),
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    logger.info("***** Running CNNDM evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))

    gold = []
    pred = []
    avg_loss = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        source = batch.src.to(args.device)
        encoder_mask = batch.mask.to(args.device)
        clss = batch.clss.to(args.device)
        cls_mask = batch.mask_cls.to(args.device).bool()
        target = batch.labels.to(args.device)

        with torch.no_grad():
            sent_scores, mask, memory = model(
                source,
                encoder_mask,
                clss,
                cls_mask,
            )
            loss = criterion(sent_scores, target.float())
            avg_loss += loss.item()
            # Separates padding positions from scores that are actually 0
            sent_scores = sent_scores + mask.float()
            sent_scores = sent_scores.cpu().data.numpy()
            # Returns sentence indexes sorted by descending score
            selected_ids = np.argsort(-sent_scores, 1)

            for idx, row in enumerate(selected_ids):
                _pred = []
                for i in row:
                    if i >= len(batch.src_str[idx]):
                        continue
                    candidate = batch.src_str[idx][i].strip()
                    if args.block_tri:
                        if not _block_tri(candidate, _pred):
                            _pred.append(candidate)
                    else:
                        _pred.append(candidate)
                    if len(_pred) == 3:
                        break
                pred.append(_pred)
                _gold = batch.tgt_str[idx]
                gold.append(_gold)
    simple_rouge_score = []
    for i in range(len(gold)):
        simple_rouge_score.append(simple_rouge(pred[i], gold[i]))
    simple_rouge_score = np.mean(simple_rouge_score) * 100
    avg_loss = avg_loss / len(eval_dataset)  # per-example average (last batch may be smaller than batch_size)

    #python rouge implementation
    rouge = Rouge()
    rouge_score = rouge.get_scores([" ".join(p) for p in pred],
                                   [" ".join(g) for g in gold],
                                   avg=True)
    rouge_score_formatted = format_rouge_scores(rouge_score)
    rouge_table = format_rouge_table(rouge_score)

    similarity_score = calc_sbert_similarity(pred, gold)

    print(rouge_score_formatted)
    print("Simple rouge score: %.3f" % simple_rouge_score)
    print("Avg loss: %.3f" % avg_loss)
    print("Similarity score(sbert): %.3f" % similarity_score)
    print(rouge_table + " & %.3f & %.3f" %
          (simple_rouge_score, similarity_score))

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    rouge_output_file = os.path.join(
        args.output_dir, "cnndm_test_results_{}_{}.txt".format(
            args.model_name,
            os.path.basename(args.model_path).split(".")[0]))
    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write(rouge_score_formatted)
        f.write("\nSimple rouge score: %.3f\n" % simple_rouge_score)
        f.write("Avg loss: %.3f\n" % avg_loss)
        f.write("Similarity score(sbert): %.3f\n" % similarity_score)
        f.write(rouge_table + " & %.3f & %.3f" %
                (simple_rouge_score, similarity_score))
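# A tiny standalone numpy illustration (made-up values) of the
# `sent_scores + mask` trick and the descending argsort used above:
import numpy as np

sent_scores = np.array([[0.9, 0.0, 0.4, 0.0]])  # raw scores; last column is padding
mask = np.array([[1.0, 1.0, 1.0, 0.0]])         # 1 = real sentence, 0 = padding
shifted = sent_scores + mask                    # real sentences move into [1, 2], padding stays at 0
order = np.argsort(-shifted, 1)                 # descending: real sentences always rank before padding
print(order)                                    # [[0 2 1 3]]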
Example #6
def cnndm_label_ref(args, logger):
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    eval_dataset = CNNDMBlobNoTokens(prefix='test',
                                     data_path=args.data_dir,
                                     tokenizer=tokenizer,
                                     label_key=args.label_key,
                                     max_pos=9999)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size,
                                 collate_fn=functools.partial(
                                     collate,
                                     pad_token_id=tokenizer.pad_token_id,
                                     is_test=True),
                                 num_workers=args.num_workers)
    logger.info("***** Running evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))

    gold = []
    lead3pred = []
    greedyselectpred = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):

        scores = batch.labels

        # rank sentence indexes by descending label score
        ranked = np.argsort(-scores, 1)

        for idx, row in enumerate(ranked):
            _pred = []

            for i in row:
                if batch.labels[idx][i] == 0:
                    break

                if i >= len(batch.src_str[idx]):
                    continue
                candidate = batch.src_str[idx][i].strip()

                _pred.append(candidate)
                if len(_pred) == 3:
                    break

            greedyselectpred.append(_pred)
            lead3pred.append(batch.src_str[idx][:3])
            gold.append(batch.tgt_str[idx])

    rouge = Rouge()
    simple_rouge_score_greed = np.mean(
        [simple_rouge(greedyselectpred[i], gold[i])
         for i in range(len(gold))]) * 100
    rouge_score_greed = format_rouge_scores(
        rouge.get_scores([" ".join(p) for p in greedyselectpred],
                         [" ".join(g) for g in gold],
                         avg=True))
    similarity_score_greed = calc_sbert_similarity(greedyselectpred, gold)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    rouge_output_file = os.path.join(args.output_dir, "cnndm_ref_results.txt")
    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write("GREED")
        f.write(rouge_score_greed)
        f.write("\nSimple rouge score: %f\n" % simple_rouge_score_greed)
        f.write("Similarity score(sbert): %f\n" % similarity_score_greed)