Example #1
def evaluate(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True)
    model = BertAbs.from_pretrained(
        "remi/bertabs-finetuned-extractive-abstractive-summarization")
    model.to(args.device)
    model.eval()

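    # Map the decoder's control tokens onto spare slots in the BERT
    # vocabulary: [unused0] and [unused1] serve as BOS and EOS.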
    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import nltk
        import rouge

        nltk.download("punkt")
        rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # These (unused) arguments are defined to keep compatibility with the
    # legacy code and will be deleted in a future iteration.
    args.result_path = ""
    args.temp_dir = ""

    data_iterator = build_data_iterator(args, tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model)

    logger.info("***** Running evaluation *****")
    logger.info("  Number examples = %d", len(data_iterator.dataset))
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info("  Beam size = %d", args.beam_size)
    logger.info("  Minimum length = %d", args.min_length)
    logger.info("  Maximum length = %d", args.max_length)
    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
    logger.info("  Trigrams %s be blocked",
                ("will" if args.block_trigram else "will NOT"))

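    # Translate each batch with beam search, format the beam output into
    # plain-text summaries, and save one summary file per source document.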
    for batch in tqdm(data_iterator):
        batch_data = predictor.translate_batch(batch)
        translations = predictor.from_batch(batch_data)
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir,
                       batch.document_names)

        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

    if args.compute_rouge:
        scores = rouge_evaluator.get_scores(generated_summaries,
                                            reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
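
For reference, a minimal driver for evaluate() might look like the sketch below. This is an assumption rather than part of the original script: the flag names simply mirror the attributes the function reads (device, batch_size, beam_size, min_length, max_length, alpha, block_trigram, compute_rouge, summaries_output_dir), and the defaults are illustrative only.

import argparse

import torch

parser = argparse.ArgumentParser()
parser.add_argument("--documents_dir", required=True)         # assumed to be read by build_data_iterator
parser.add_argument("--summaries_output_dir", required=True)
parser.add_argument("--compute_rouge", action="store_true")
parser.add_argument("--batch_size", type=int, default=4)
parser.add_argument("--beam_size", type=int, default=5)
parser.add_argument("--min_length", type=int, default=50)
parser.add_argument("--max_length", type=int, default=200)
parser.add_argument("--alpha", type=float, default=0.95)
parser.add_argument("--block_trigram", action="store_true")
args = parser.parse_args()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

evaluate(args)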
Example #2
def evaluate(args):
    # Load the tokenizer and the model from a local checkpoint directory
    # rather than the hub checkpoints used in Example #1.
    # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tokenizer = BertTokenizer.from_pretrained(
        "/data/wanyao/ghproj_d/transformers/summarization/")
    print('load model...')
    # model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model = BertAbs.from_pretrained(
        "/data/wanyao/ghproj_d/transformers/summarization/")

    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import rouge
        import nltk

        # nltk.download("punkt")  # commented out: punkt is assumed to be installed already
        rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # These (unused) arguments are defined to keep compatibility with the
    # legacy code and will be deleted in a future iteration.
    args.result_path = ""
    args.temp_dir = ""

    data_iterator = build_data_iterator(args, tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model)

    logger.info("***** Running evaluation *****")
    logger.info("  Number examples = %d", len(data_iterator.dataset))
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info("  Beam size = %d", args.beam_size)
    logger.info("  Minimum length = %d", args.min_length)
    logger.info("  Maximum length = %d", args.max_length)
    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
    logger.info("  Trigrams %s be blocked",
                ("will" if args.block_trigram else "will NOT"))

    # Debug run: inspect and process only the first batch.
    batch_count = 0
    for batch in data_iterator:
        print('batch-src: ', batch.src.size())
        print(batch.src)
        batch_data = predictor.translate_batch(batch)
        translations = predictor.from_batch(batch_data)
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir,
                       batch.document_names)

        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

        logger.info('batch: {}'.format(batch_count))
        batch_count += 1

        # Stop after the first batch (debugging shortcut).
        if batch_count >= 1:
            break

    if args.compute_rouge:
        print('generated_summaries: ', generated_summaries)
        print('reference_summaries: ', reference_summaries)
        scores = rouge_evaluator.get_scores(generated_summaries,
                                            reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
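
Example #2 loads both the tokenizer and the model from a local directory. For from_pretrained() to accept a directory path like this, the directory must contain the files the loaders look for (vocab.txt for the tokenizer; config.json plus the weights for the model). Below is a hedged sketch of how such a directory could be produced with the standard save_pretrained API, assuming BertAbs inherits it from PreTrainedModel as its from_pretrained usage suggests.

# Sketch (assumption): materialize a hub checkpoint as a local directory so
# that later runs can load it offline, as Example #2 does.
local_dir = "/data/wanyao/ghproj_d/transformers/summarization/"  # path from the snippet

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
tokenizer.save_pretrained(local_dir)  # writes vocab.txt and the tokenizer config

model = BertAbs.from_pretrained(
    "remi/bertabs-finetuned-extractive-abstractive-summarization")
model.save_pretrained(local_dir)  # writes config.json and the model weights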
Example #3
def evaluate(args):
    # The tokenizer stores the vocabulary and provides methods for encoding /
    # decoding strings into lists of token ids to be fed to the model.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True)
    # Load the bertabs-finetuned-cnndm checkpoint (an extractive-abstractive
    # summarization model).
    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import rouge
        import nltk

        nltk.download("punkt")
        # Configure the ROUGE evaluator used to score generated summaries
        # against the references.
        rouge_evaluator = rouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type="words",
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # These (unused) arguments are defined to keep compatibility with the
    # legacy code and will be deleted in a future iteration.
    args.result_path = ""
    args.temp_dir = ""

    # Build the iterator that yields batches of tokenized source documents.
    data_iterator = build_data_iterator(args, tokenizer)
    # Build the beam-search predictor; candidate sequences are scored with the
    # GNMTGlobalScorer class in modeling_bertabs.py.
    predictor = build_predictor(args, tokenizer, symbols, model)

    # Log the evaluation setup and beam-search parameters.
    logger.info("***** Running evaluation *****")
    logger.info("  Number examples = %d", len(data_iterator.dataset))
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info("  Beam size = %d", args.beam_size)
    logger.info("  Minimum length = %d", args.min_length)
    logger.info("  Maximum length = %d", args.max_length)
    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
    logger.info("  Trigrams %s be blocked",
                ("will" if args.block_trigram else "will NOT"))

    for batch in tqdm(data_iterator):
        # Generates summaries from one batch of data.
        batch_data = predictor.translate_batch(batch)
        # Convert the raw beam-search output into Translation objects.
        translations = predictor.from_batch(batch_data)
        # Transform the output of `from_batch` into nicely formatted summaries.
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir,
                       batch.document_names)

        # Collect generated and reference summaries for ROUGE scoring.
        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

    # Compute ROUGE scores of generated vs. reference summaries.
    if args.compute_rouge:
        scores = rouge_evaluator.get_scores(generated_summaries,
                                            reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
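
All three examples configure ROUGE via rouge.Rouge(...), whose keyword arguments match the py-rouge package (pip install py-rouge); the nltk.download("punkt") calls in the snippets suggest py-rouge relies on NLTK's sentence tokenizer. Note also that the snippets pass length_limit=args.beam_size, which truncates scored summaries to beam_size words; whether that is intended is not clear from the code. Below is a minimal standalone sketch of the scoring step, with toy strings standing in for real summaries.

import nltk
import rouge

nltk.download("punkt")

evaluator = rouge.Rouge(
    metrics=["rouge-n", "rouge-l"],
    max_n=2,
    apply_avg=True,
    stemming=True,
)
generated = ["the cat sat on the mat ."]
references = ["a cat was sitting on the mat ."]
# With apply_avg=True, get_scores returns averaged precision/recall/F1 per
# metric, e.g. {"rouge-1": {"f": ..., "p": ..., "r": ...}, ...}.
scores = evaluator.get_scores(generated, references)
print(scores["rouge-l"]["f"])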