예제 #1
0
def resolve_evidences(evidences, t2l2s, actual=True):
    """Convert evidence references into lists of evidence sentences.

    Args:
        evidences: when ``actual`` is True, a list of evidence sets, each a
            list of ``[id, id, title, linum]`` entries (gold FEVER format);
            otherwise a flat list of ``(title, linum)`` pairs (predictions).
        t2l2s: mapping of title -> line number -> sentence text.
        actual: selects which of the two input formats above is used.

    Returns:
        When ``actual``: one sentence list per evidence set, with
        ``["**Not Found**"]`` substituted for sets whose titles are all
        missing from ``t2l2s``.  Otherwise a single list of sentences.
    """
    if not actual:
        # Predicted evidences are already (title, linum) pairs.
        return get_evidence_sentence_list(evidences, t2l2s)

    evidence_sentences = list()
    for evidence_set in evidences:
        # Keep only evidence whose title was actually loaded into t2l2s;
        # titles can be absent when the wiki dump lacks the page.
        # (The original also had a no-op loop that only `pass`ed over
        # missing titles with a commented-out debug print — removed.)
        evidence_linum = [(title, linum)
                          for _, _, title, linum in evidence_set
                          if title in t2l2s]

        # Mark evidence sets that could not be resolved at all.
        if not evidence_linum:
            evidence_sentences.append(["**Not Found**"])
            continue

        evidence_sentences.append(
            get_evidence_sentence_list(evidence_linum, t2l2s))

    return evidence_sentences
예제 #2
0
def read_ir_result(path, prependlinum=False, concatev=False):
    """Read IR output and attach evidence sentences to each instance.

    Args:
        path: jsonl file of instances, each carrying
            ``predicted_sentences`` as (title, linum) pairs.
        prependlinum: forwarded to ``get_evidence_sentence_list``.
        concatev: when True, join all evidence sentences into a single
            string (kept inside a one-element list).

    Returns:
        The instances, each with an ``evidence`` field added.
    """
    instances = read_jsonl(path)
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # Collect every predicted title so only the needed pages get loaded.
    titles = [title
              for instance in instances
              for title, _ in instance["predicted_sentences"]]

    # title -> line number -> sentence
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        sentences = get_evidence_sentence_list(
            instance["predicted_sentences"], t2l2s, prependlinum=prependlinum)
        instance["evidence"] = [" ".join(sentences)] if concatev else sentences

    return instances
예제 #3
0
def _convert_instance(instance, t2l2s, prependlinum, prependtitle, use_ir_prediction):
    """convert single instance to either one or multiple instances
    Args
    instance: instance of FEVER dataset.
    t2l2s: output of titles_to_jsonl_num

    Returns
    list of converted instances
    """

    def _evidence_format(evidences):
        """return evidence sentence from (possibly) multiple evidence sentences"""
        return " ".join(evidences)

    converted_instances = list()
    # assert instance["evidence"] == [[[hoge, hoge, title, linum], [hoge, hoge, title, linum]], [[..],[..],..], ...]
    if use_ir_prediction:
        evidence_linum = [(title, linum) for title, linum in instance["predicted_sentences"]
                          if title in t2l2s]
        for eidx, (title, linum) in enumerate(evidence_linum):

            converted_instances.append(
                snli_format(
                    id="{}-{}".format(instance["id"], str(eidx)),
                    pair_id="{}-{}".format(instance["id"], str(eidx)),
                    label=convert_label(instance["label"]),
                    evidence=_evidence_format(
                        get_evidence_sentence_list(
                            [(title, linum)], t2l2s, prependlinum=prependlinum, prependtitle=prependtitle)),
                    claim=instance["claim"]))

    else:
        for eidx, evidence_set in enumerate(instance["evidence"]):
            evidence_linum = [(title, linum)
                              for _, _, title, linum in evidence_set
                              if title in t2l2s]

            # continue if evidence_linum is empty
            if not evidence_linum:
                continue
            converted_instances.append(
                snli_format(
                    id="{}-{}".format(instance["id"], str(eidx)),
                    pair_id="{}-{}".format(instance["id"], str(eidx)),
                    label=convert_label(instance["label"]),
                    evidence=_evidence_format(
                        get_evidence_sentence_list(
                            evidence_linum, t2l2s, prependlinum=prependlinum, prependtitle=prependtitle)),
                    claim=instance["claim"]))
    return converted_instances
예제 #4
0
def read_ir_result(path,
                   n_sentences=5,
                   prependlinum=False,
                   prependtitle=False,
                   concatev=False):
    """Read IR output, truncate to n_sentences, and attach evidence text.

    Args:
        path: jsonl file of instances with ``predicted_sentences`` as
            (title, linum) pairs.
        n_sentences: keep at most this many predicted sentences.
        prependlinum, prependtitle: forwarded to get_evidence_sentence_list.
        concatev: when True, join the evidence into one string
            (kept inside a one-element list).

    Returns:
        instances, each with an ``evidence`` field added.
    """
    instances = read_jsonl(path)

    # Truncate predictions and count instances with fewer than n_sentences.
    short_evidences_counter = 0
    for instance in instances:
        predicted = instance["predicted_sentences"]
        if len(predicted) < n_sentences:
            short_evidences_counter += 1
        instance["predicted_sentences"] = predicted[:n_sentences]
    print("short_evidences: {} / {}".format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # Gather every predicted title so only the required pages are loaded.
    titles = [title
              for instance in instances
              for title, _ in instance["predicted_sentences"]]

    # title -> line number -> sentence
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        sentences = get_evidence_sentence_list(
            instance["predicted_sentences"],
            t2l2s,
            prependlinum=prependlinum,
            prependtitle=prependtitle)
        instance["evidence"] = [" ".join(sentences)] if concatev else sentences

    return instances
def read_ir_result(path, n_sentences=5):
    '''
    Read the sentence-retrieval results and attach evidence sentences.

    Keeps at most ``n_sentences`` predicted sentences per instance and
    reports how many instances had fewer than that.
    '''
    instances = read_jsonl(path)

    # Truncate predictions to the first n sentences, counting short ones.
    short_evidences_counter = 0
    for instance in instances:
        predicted = instance['predicted_sentences']
        if len(predicted) < n_sentences:
            short_evidences_counter += 1
        instance['predicted_sentences'] = predicted[:n_sentences]
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    # All titles referenced by the predictions.
    titles = [title
              for instance in instances
              for title, _ in instance['predicted_sentences']]

    t2l2s = load_doclines(titles, t2jnum)

    # Attach the resolved evidence sentences.
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)

    return instances