Example #1
import pandas as pd
import json
from evaluation_script import normalize_answer, f1_score, exact_match_score, rougel_score
import numpy as np
from IPython import embed

# NarrativeQA question/answer pairs and the model's test predictions.
nqfile = "../narrativeqa/qaps.csv"
inputfile = "out/NarrativeQA9062233-first-only/test_predictions.json"
df = pd.read_csv(nqfile)
test_df = df[df['set'] == 'test']

test_prd = json.load(open(inputfile, 'r'))

# Evaluate F1, exact match, and ROUGE-L at each paragraph cutoff.
n_paragraphs = [10, 15, 20]
f1s, ems, rougels = [[] for _ in n_paragraphs], [[] for _ in n_paragraphs], [[] for _ in n_paragraphs]
# Assumes the JSON preserves insertion order matching the test split's row order.
for j, predictions in enumerate(test_prd.values()):
    # Columns 3 and 4 of qaps.csv are the two reference answers (answer1, answer2).
    groundtruth = [test_df.iloc[j, 3], test_df.iloc[j, 4]]
    # The last element of each prediction list is the gold-answer list; drop it.
    predictions = predictions[:-1]
    if len(groundtruth) == 0:
        for i in range(len(n_paragraphs)):
            f1s[i].append(0)
            ems[i].append(0)
            rougels[i].append(0)
        continue
    for i, prediction in enumerate(predictions):
        f1s[i].append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems[i].append(max([exact_match_score(prediction, gt) for gt in groundtruth]))
        rougels[i].append(max([rougel_score(prediction, gt) for gt in groundtruth]))
for n, f1s_, ems_, rougels_ in zip(n_paragraphs, f1s, ems, rougels):
    print("n=%d\tF1 %.2f\tEM %.2f\tR-L %.2f" % (n, np.mean(f1s_) * 100, np.mean(ems_) * 100, np.mean(rougels_) * 100))
Example #2
def write_predictions(logger,
                      all_examples,
                      all_features,
                      all_results,
                      n_best_size,
                      do_lower_case,
                      output_prediction_file,
                      output_nbest_file,
                      verbose_logging,
                      write_prediction=True,
                      n_paragraphs=None):
    """Write final predictions to the json file."""

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "paragraph_index", "feature_index", "start_index", "end_index",
            "logit", "no_answer_logit"
        ])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()

    if verbose_logging:
        all_examples = tqdm(enumerate(all_examples))
    else:
        all_examples = enumerate(all_examples)

    for (example_index, example) in all_examples:
        features = example_index_to_features[example_index]
        if len(features) == 0 and n_paragraphs is None:
            # No features for this example: fall back to an "empty" placeholder.
            pred = _NbestPrediction(text="empty",
                                    logit=-1000,
                                    no_answer_logit=1000)
            all_predictions[example.qas_id] = ("empty", example.all_answers)
            all_nbest_json[example.qas_id] = [pred]
            continue

        prelim_predictions = []
        yn_predictions = []

        if n_paragraphs is None:
            # Keep only the feature (paragraph) with the lowest no-answer logit.
            results = sorted(
                enumerate(features),
                key=lambda f: unique_id_to_result[f[1].unique_id].switch[3])[:1]
        else:
            results = enumerate(features)
        for (feature_index, feature) in results:
            result = unique_id_to_result[feature.unique_id]
            scores = []
            start_logits = result.start_logits[:len(feature.tokens)]
            end_logits = result.end_logits[:len(feature.tokens)]
            # Enumerate candidate answer spans of at most 10 tokens.
            for (i, s) in enumerate(start_logits):
                for (j, e) in enumerate(end_logits[i:i + 10]):
                    scores.append(((i, i + j), s + e))

            scores = sorted(scores, key=lambda x: x[1], reverse=True)

            cnt = 0
            for (start_index, end_index), score in scores:
                if start_index >= len(feature.tokens):
                    continue
                if end_index >= len(feature.tokens):
                    continue
                if start_index not in feature.token_to_orig_map:
                    continue
                if end_index not in feature.token_to_orig_map:
                    continue
                if not feature.token_is_max_context.get(start_index, False):
                    continue
                if end_index < start_index:
                    continue
                prelim_predictions.append(
                    _PrelimPrediction(
                        paragraph_index=feature.paragraph_index,
                        feature_index=feature_index,
                        start_index=start_index,
                        end_index=end_index,
                        logit=-result.switch[3],  # rank candidates by the negated no-answer logit
                        no_answer_logit=result.switch[3]))
                if n_paragraphs is None:
                    # Stop once n_best_size candidates are collected; a single
                    # candidate is enough when predictions are not written out.
                    if write_prediction and len(
                            prelim_predictions) >= n_best_size:
                        break
                    elif not write_prediction:
                        break
                cnt += 1

        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x: x.logit,
                                    reverse=True)
        no_answer_logit = result.switch[3]

        def get_nbest_json(prelim_predictions):

            seen_predictions = {}
            nbest = []
            for pred in prelim_predictions:
                if len(nbest) >= n_best_size:
                    break

                # Sentinel indices (-1, -2) mark yes/no answers.
                if pred.start_index == pred.end_index == -1:
                    final_text = "yes"
                elif pred.start_index == pred.end_index == -2:
                    final_text = "no"
                else:
                    feature = features[pred.feature_index]

                    tok_tokens = feature.tokens[pred.start_index:(
                        pred.end_index + 1)]
                    orig_doc_start = feature.token_to_orig_map[
                        pred.start_index]
                    orig_doc_end = feature.token_to_orig_map[pred.end_index]
                    orig_tokens = feature.doc_tokens[orig_doc_start:(
                        orig_doc_end + 1)]
                    tok_text = " ".join(tok_tokens)

                    # De-tokenize WordPieces that have been split off.
                    tok_text = tok_text.replace(" ##", "")
                    tok_text = tok_text.replace("##", "")

                    # Clean whitespace
                    tok_text = tok_text.strip()
                    tok_text = " ".join(tok_text.split())
                    orig_text = " ".join(orig_tokens)

                    final_text = get_final_text(tok_text, orig_text, do_lower_case, \
                                                logger, verbose_logging)

                if final_text in seen_predictions:
                    continue

                nbest.append(
                    _NbestPrediction(text=final_text,
                                     logit=pred.logit,
                                     no_answer_logit=no_answer_logit))

            # In very rare edge cases we could have no valid predictions. So we
            # just create a nonce prediction in this case to avoid failure.
            if not nbest:
                nbest.append(
                    _NbestPrediction(text="empty",
                                     logit=0.0,
                                     no_answer_logit=no_answer_logit))

            assert len(nbest) >= 1

            total_scores = []
            for entry in nbest:
                total_scores.append(entry.logit)

            probs = _compute_softmax(total_scores)
            nbest_json = []
            for (i, entry) in enumerate(nbest):
                output = collections.OrderedDict()
                output['text'] = entry.text
                output['probability'] = probs[i]
                output['logit'] = entry.logit
                output['no_answer_logit'] = entry.no_answer_logit
                nbest_json.append(output)

            assert len(nbest_json) >= 1
            return nbest_json

        if n_paragraphs is None:
            nbest_json = get_nbest_json(prelim_predictions)
            all_predictions[example.qas_id] = (nbest_json[0]["text"],
                                               example.all_answers)
            all_nbest_json[example.qas_id] = nbest_json
        else:
            all_predictions[example.qas_id] = []
            all_nbest_json[example.qas_id] = []
            for n in n_paragraphs:
                nbest_json = get_nbest_json([pred for pred in prelim_predictions if \
                                             pred.paragraph_index<n])
                all_predictions[example.qas_id].append(nbest_json[0]["text"])
            all_predictions[example.qas_id].append(example.all_answers)

    if write_prediction:
        logger.info("Writing predictions to: %s" % (output_prediction_file))
        logger.info("Writing nbest to: %s" % (output_nbest_file))

        with open(output_prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")

        with open(output_nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if n_paragraphs is None:
        f1s, ems, rougels = [], [], []
        for prediction, groundtruth in all_predictions.values():
            if len(groundtruth) == 0:
                f1s.append(0)
                ems.append(0)
                rougels.append(0)
                continue
            f1s.append(max([f1_score(prediction, gt)[0]
                            for gt in groundtruth]))
            ems.append(
                max([exact_match_score(prediction, gt) for gt in groundtruth]))
            rougels.append(
                max([rougel_score(prediction, gt) for gt in groundtruth]))
        final_f1, final_em, final_rougel = np.mean(f1s), np.mean(ems), np.mean(
            rougels)
    else:
        f1s, ems, rougels = ([[] for _ in n_paragraphs],
                             [[] for _ in n_paragraphs],
                             [[] for _ in n_paragraphs])
        for predictions in all_predictions.values():
            # The gold-answer list was appended as the last element.
            groundtruth = predictions[-1]
            predictions = predictions[:-1]
            if len(groundtruth) == 0:
                for i in range(len(n_paragraphs)):
                    f1s[i].append(0)
                    ems[i].append(0)
                    rougels[i].append(0)
                continue
            for i, prediction in enumerate(predictions):
                f1s[i].append(
                    max([f1_score(prediction, gt)[0] for gt in groundtruth]))
                ems[i].append(
                    max([
                        exact_match_score(prediction, gt) for gt in groundtruth
                    ]))
                # Compute ROUGE-L once, and log mismatches for inspection.
                rougel = max([rougel_score(prediction, gt) for gt in groundtruth])
                rougels[i].append(rougel)
                if rougel != 1:
                    print(f'prediction: {prediction}')
                    print(f'groundtruth: {groundtruth}')
                    print(f'{rougel}\n')
        for n, f1s_, ems_, rougels_ in zip(n_paragraphs, f1s, ems, rougels):
            logger.info("n=%d\tF1 %.2f\tEM %.2f\tR-L %.2f" %
                        (n, np.mean(f1s_) * 100, np.mean(ems_) * 100,
                         np.mean(rougels_) * 100))
        final_f1, final_em, final_rougel = np.mean(f1s[-1]), np.mean(
            ems[-1]), np.mean(rougels[-1])
    return final_em, final_f1, final_rougel
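The function relies on several module-level names that are not part of this snippet (collections, json, np, tqdm, get_final_text, _NbestPrediction, _compute_softmax, and the metric functions f1_score, exact_match_score, rougel_score). A minimal sketch of the two simplest ones, inferred from how they are used above; the field names and the softmax behaviour are assumptions, not the original definitions:

import collections
import math

# Assumed definition: fields inferred from the keyword arguments used above.
_NbestPrediction = collections.namedtuple(
    "NbestPrediction", ["text", "logit", "no_answer_logit"])


def _compute_softmax(scores):
    """Numerically stable softmax over a list of logits (assumed behaviour)."""
    if not scores:
        return []
    max_score = max(scores)
    exps = [math.exp(s - max_score) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]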