Example #1
def read_ir_result(path, prependlinum=False, concatev=False):
    """
    Returns
    instances: list of dictionary
    update instance['predicted_sentences'] with list of evidences (list of str)
    """
    instances = read_jsonl(path)
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    titles = list()

    # make list of titles
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [" ".join(get_evidence_sentence_list(
                instance["predicted_sentences"], t2l2s, prependlinum=prependlinum))]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"], t2l2s, prependlinum=prependlinum)

    return instances
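
A minimal usage sketch for this variant, assuming the project helpers above (read_jsonl, titles_to_jsonl_num, load_doclines, get_evidence_sentence_list) are importable and that the input path is a hypothetical sentence-retrieval output file:

# Hypothetical path and printing loop, for illustration only.
instances = read_ir_result("data/dev.sentences.jsonl", prependlinum=True, concatev=False)
for instance in instances[:3]:
    print(instance["claim"])
    for sentence in instance["evidence"]:  # one string per retrieved (title, line) pair
        print("   ", sentence)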
Example #2
def read_ir_result(path, n_sentences=5):
    '''
    Read the sentence retrieval results.
    '''
    short_evidences_counter = 0

    instances = read_jsonl(path)
    for instance in instances:
        if len(instance['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        instance['predicted_sentences'] = instance[
            'predicted_sentences'][:n_sentences]  # keep only the first n sentences
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    titles = list()
    # collect the list of all titles
    for instance in instances:
        titles.extend([title for title, _ in instance['predicted_sentences']])

    t2l2s = load_doclines(titles, t2jnum)

    # evidence sentences
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)

    return instances
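
Every example here relies on read_jsonl (and later save_jsonl) to move instances in and out of JSON Lines files. The project's actual implementation is not shown; a minimal sketch of what such helpers typically look like, written only for orientation, is:

import json

def read_jsonl(path):
    # Parse one JSON object per line into a list of dicts.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_jsonl(instances, path):
    # Write one JSON object per line.
    with open(path, "w", encoding="utf-8") as f:
        for instance in instances:
            f.write(json.dumps(instance, ensure_ascii=False) + "\n")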
Example #3
    def __init__(self, jsonl_file, n_sentences=5, sampling=False, use_ev_scores=False, test=False):
        instances = read_jsonl(jsonl_file)

        if sampling:
            instances = sample(instances)

        self.instances = instances
        self.n_sentences = n_sentences
        self.test = test
        self.use_ev_scores = use_ev_scores
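
The snippet shows only the constructor; the enclosing class name is not given. A hedged usage sketch, with FeverReader as a purely illustrative name and a hypothetical input path:

# FeverReader and the file name are placeholders; only the __init__ shown
# above comes from the original example.
reader = FeverReader("data/dev.sentences.jsonl", n_sentences=5, sampling=False)
print(len(reader.instances), "instances loaded")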
Example #4
def read_ir_result(path,
                   n_sentences=5,
                   prependlinum=False,
                   prependtitle=False,
                   concatev=False):
    """
    Returns
    instances: list of dictionary
    update instance['predicted_sentences'] with list of evidences (list of str)
    """
    short_evidences_counter = 0
    instances = read_jsonl(path)
    # keep only the first n_sentences per instance
    for instance in instances:
        if len(instance["predicted_sentences"]) < n_sentences:
            short_evidences_counter += 1
        instance["predicted_sentences"] = instance[
            "predicted_sentences"][:n_sentences]
    print("short_evidences: {} / {}".format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    titles = list()

    # make list of titles
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [
                " ".join(
                    get_evidence_sentence_list(instance["predicted_sentences"],
                                               t2l2s,
                                               prependlinum=prependlinum,
                                               prependtitle=prependtitle))
            ]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"],
                t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle)

    return instances
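
A hedged illustration of how the flags are presumed to shape instance["evidence"]; the exact prefix format produced by get_evidence_sentence_list is an assumption, and the path is a placeholder:

instances = read_ir_result("data/dev.sentences.jsonl",  # hypothetical path
                           n_sentences=5,
                           prependtitle=True,
                           prependlinum=True,
                           concatev=False)
# concatev=False -> one string per retrieved (title, line) pair, presumably
#   something like "Neal_Schon 0 Neal George Joseph Schon is an American ...".
# concatev=True  -> the same sentences joined into a single string, returned
#   as a one-element list.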
Example #5

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("src")
    parser.add_argument("tar")
    parser.add_argument("--use_ir_pred", action="store_true")
    parser.add_argument("--prependlinum", action="store_true")
    parser.add_argument("--prependtitle", action="store_true")
    parser.add_argument("--convert_test", action="store_true")
    # parser.add_argument("--testset", help="turn on when you convert test data", action="store_true")
    args = parser.parse_args()
    print(args)

    if args.convert_test:
        test_in = '''[{"id": 15812, "verifiable": "VERIFIABLE", "label": "REFUTES", "claim": "Peggy Sue Got Married is a Egyptian film released in 1986.", "evidence": [[[31205, 37902, "Peggy_Sue_Got_Married", 0], [31205, 37902, "Francis_Ford_Coppola", 0]], [[31211, 37908, "Peggy_Sue_Got_Married", 0]]], "predicted_pages": ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", "Peggy_Sue_Got_Married_-LRB-song-RRB-", "Peggy_Sue_Got_Married", "Peggy_Sue", "Peggy_Sue_-LRB-band-RRB-"], "predicted_sentences": [["Peggy_Sue_Got_Married", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 0], ["Peggy_Sue_Got_Married_-LRB-song-RRB-", 0], ["Peggy_Sue", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 2]]}, {"id": 229289, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Neal Schon was named in 1954.", "evidence": [[[273626, null, null, null]]], "predicted_pages": ["Neal_Schon", "Neal", "Named", "Was_-LRB-Not_Was-RRB-", "Was"], "predicted_sentences": [["Neal_Schon", 0], ["Neal_Schon", 6], ["Neal_Schon", 5], ["Neal_Schon", 1], ["Neal_Schon", 2]]}]'''

        print("input:\n", test_in)
        fever_format = json.loads(test_in)
        snli_format_instances = convert(fever_format, prependlinum=args.prependlinum, prependtitle=args.prependtitle, use_ir_prediction=args.use_ir_pred)
        print("\noutput:\n", json.dumps(snli_format_instances, indent=4))

    else:
        assert not os.path.exists(args.tar), "file {} already exists".format(
            args.tar)
        keyerr_count = 0

        instances = read_jsonl(args.src)
        snli_format_instances = convert(instances, prependlinum=args.prependlinum, prependtitle=args.prependtitle, use_ir_prediction=args.use_ir_pred)
        save_jsonl(snli_format_instances, args.tar)
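
The same conversion can be sketched programmatically; the file names below are placeholders, and convert, read_jsonl, and save_jsonl are the project helpers used in the script above:

# Hypothetical input/output paths, mirroring the script's non-test branch.
instances = read_jsonl("data/dev.sentences.jsonl")
snli_format_instances = convert(instances,
                                prependlinum=False,
                                prependtitle=True,
                                use_ir_prediction=True)
save_jsonl(snli_format_instances, "data/dev.snli.jsonl")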
Example #6
def save_wrong_instances(actual_file, predicted_labels_file,
                         predicted_evidence_file, out_file):
    label_predictions = read_jsonl(predicted_labels_file)
    ev_predictions = read_jsonl(predicted_evidence_file)
    actual = read_jsonl(actual_file)

    all_titles = list()
    for ev_pred, act in zip(ev_predictions, actual):
        ev_titles = [title for title, _ in ev_pred["predicted_sentences"]]
        act_titles = [
            title for evidence_set in act["evidence"]
            for _, _, title, _ in evidence_set
        ]
        titles = ev_titles + act_titles
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    counter = 0
    observations = list()

    print("loading vocabulary list...")
    import pickle
    with open("vocab_list.db", "rb") as f:
        vocab = pickle.load(f)

    pos_counter = 0
    neg_counter = 0
    print("processing predictions...")
    for label_pred, ev_pred, act in tqdm(
            zip(label_predictions, ev_predictions, actual)):
        actual_label = act["label"]
        assert actual_label == label_pred["actual"]

        pred_label = label_pred["predicted"]
        if pred_label != actual_label:
            continue

        counter += 1
        actual_ev = act["evidence"]
        pred_labels = label_pred["prediction_list"]
        pred_ev = ev_pred["predicted_sentences"]
        pred_ev_sent = resolve_evidences(pred_ev, t2l2s, actual=False)

        claim = act["claim"]
        ev_contained = convert(compare_evidences(actual_ev, pred_ev))
        actual_ev_sent = resolve_evidences(actual_ev, t2l2s)
        assert not (actual_label != "NOT ENOUGH INFO"
                    and len(actual_ev_sent) != len(actual_ev))

        pred_sentence = " ".join(pred_ev_sent)
        ac_sentence = " ".join(sent for sentences in actual_ev_sent
                               for sent in sentences
                               if sent != "**Not Found**")
        unk_words = find_unk(pred_sentence + " " + ac_sentence, vocab)

        if pred_label == actual_label:
            pos_counter += 1
        else:
            neg_counter += 1

        # overwrite when label is NEI
        if actual_label == "NOT ENOUGH INFO":
            ev_contained = ["-" for e in ev_contained]

        # # skip for NEI or no correct evidence.
        # if ev_contained == ["X"] * 5 and ev_contained != ["-"] * 5:
        #     continue

        label_pred_ev = [
            "<{}> <{}> {}".format(label, contained, ev) for label, contained,
            ev in zip(shorten_labels(pred_labels), ev_contained, pred_ev)
        ]
        actual_ev = ev_pred["evidence"]

        observations.append({
            "id": act["id"],
            "claim": claim,
            "predicted_evidences": label_pred_ev,
            "predicted_sentences": pred_ev_sent,
            "predicted_label": pred_label,
            "actual_evidence": actual_ev,
            "actual_sentences": actual_ev_sent,
            "actual_label": actual_label,
            "unk_words": unk_words
        })

    random.shuffle(observations)
    save_jsonl_pretty_print(observations, out_file)
    print("pos_counter", pos_counter)
    print("neg_counter", neg_counter)
    print("wrong labels:", counter)