def read_ir_result(path, n_sentences=5):
    '''
    Read the results of sentence retrieval.
    '''
    short_evidences_counter = 0

    instances = read_jsonl(path)
    for instance in instances:
        if len(instance['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        instance['predicted_sentences'] = instance[
            'predicted_sentences'][:n_sentences]  # keep only the first n sentences
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    titles = list()
    # collect the titles from every instance
    for instance in instances:
        titles.extend([title for title, _ in instance['predicted_sentences']])

    t2l2s = load_doclines(titles, t2jnum)

    # attach the evidence sentences
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)

    return instances
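
A minimal usage sketch for read_ir_result, assuming the sentence-retrieval predictions were saved as JSONL; the path below is an assumption for illustration, not part of the original code:

# Hypothetical usage of read_ir_result; the path is illustrative only.
instances = read_ir_result('results/sent_ret/dev_predictions.jsonl', n_sentences=5)
for instance in instances[:3]:
    print(instance['predicted_sentences'])  # the kept (title, line number) pairs
    print(instance['evidence'])             # the resolved evidence sentences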
Example #2
def run_sent_ret(config):
    train, dev = load_paper_dataset()

    with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)

    with open(config['doc_ret_model'], 'rb') as rb:
        dmodel = pickle.load(rb)

    t2jnum = titles_to_jsonl_num()

    try:
        with open(config['sent_ret_model'], 'rb') as rb:
            model = pickle.load(rb)  # load the trained sentence-retrieval model
    except BaseException:
        try:
            selected = load_selected(config['sent_ret_line'])  # load the sampled training data
        except BaseException:
            docs = doc_ret(train, edocs, model=dmodel)
            selected = select_lines(docs, t2jnum, train,
                                    config['sent_ret_line'])

        model = sent_ret_model()
        X, y = model.process_train(selected, train)  # build training features and labels
        model.fit(X, y)  # train the model

        with open(config['sent_ret_model'], 'wb') as wb:
            pickle.dump(model, wb)

    docs = doc_ret(dev, edocs, model=dmodel)  # run document retrieval
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(dev, docs, lines, best=config['n_best'],
                        model=model)  # run sentence retrieval
    line_hits(dev, evidence)  # evaluate the results
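
run_sent_ret reads only four keys from config. A sketch of a matching config dict, with illustrative (assumed) path values:

# Hypothetical config for run_sent_ret; the path values are assumptions.
config = {
    'doc_ret_model': 'results/doc_ret/doc_ret_model.bin',    # pickled document-retrieval model
    'sent_ret_model': 'results/sent_ret/sent_ret_model.bin',  # pickled sentence-retrieval model
    'sent_ret_line': 'results/sent_ret/selected_lines',       # sampled training lines
    'n_best': 5,                                               # number of sentences to keep
}
run_sent_ret(config)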
Example #3
def get_evidence(data=dict()):
    with open("data/edocs.bin", "rb") as rb:
        edocs = pickle.load(rb)
    with open("data/doc_ir_model.bin", "rb") as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    with open("data/line_ir_model.bin", "rb") as rb:
        lmodel = pickle.load(rb)
    docs = doc_ir(data, edocs, model=dmodel)
    lines = load_doc_lines(docs, t2jnum)
    evidence = line_ir(data, docs, lines, model=lmodel)
    return docs, evidence
Example #4
def evi_ret(data=dict(), n_docs=5, n_sents=5):
    with open('./data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)

    with open('./results/doc_ret/doc_ret_model.bin', 'rb') as rb:
        dmodel = pickle.load(rb)

    t2jnum = titles_to_jsonl_num()

    with open('./results/sent_ret/sent_ret_model.bin', 'rb') as rb:
        lmodel = pickle.load(rb)

    docs = doc_ret(data, edocs, model=dmodel, best=n_docs)
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(data, docs, lines, model=lmodel, best=n_sents)

    return docs, evidence
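
A usage sketch for evi_ret. The expected shape of data is an assumption here: an iterable of FEVER-style examples, each with at least an 'id' and a 'claim':

# Hypothetical usage; the example record is an assumption about the input shape.
data = [{'id': 0, 'claim': 'Example claim to verify.'}]
docs, evidence = evi_ret(data, n_docs=5, n_sents=5)
print(docs)      # retrieved documents per claim
print(evidence)  # retrieved evidence sentences per claim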
Example #5
def convert(instances, prependlinum=False, prependtitle=False, use_ir_prediction=False):
    """convert FEVER format to jack SNLI format
    Arg
    instances: list of dictionary of FEVER format

    Returns
    instances: list of dictionary of jack SNLI format
    """
    # get all titles and load t2l2s
    all_titles = list()

    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if instance["label"] == "NOT ENOUGH INFO":
            evidences = instance["predicted_sentences"]
            # assert evidences == [(title, linum), (title, linum), ...]

            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [title for title, _ in instance["predicted_sentences"]]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(
                instance, t2l2s, prependlinum=prependlinum, prependtitle=prependtitle, use_ir_prediction=use_ir_prediction))

    return converted_instances
Example #6
def run_doc_ret(config):
    train, dev = load_paper_dataset()

    if os.path.exists(config['doc_ret_model']):
        with open(config['doc_ret_model'], 'rb') as rb:
            model = pickle.load(rb)
    else:
        if os.path.exists(config['doc_ret_docs']):
            selected = load_selected(config['doc_ret_docs'])
        else:
            selected = sample_docs(train, config['doc_ret_docs'])

        # build the model
        model = doc_ret_model()
        # preprocess the training data
        X, y = model.process_train(selected, train)
        # train the model
        model.fit(X, y)
        # save the trained model
        with open(config['doc_ret_model'], 'wb') as wb:
            pickle.dump(model, wb)

    if os.path.exists('data/preprocessed_data/edocs.bin'):
        with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
            edocs = pickle.load(rb)
    else:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open('data/preprocessed_data/edocs.bin', 'wb') as wb:
            pickle.dump(edocs, wb)

    print(len(model.f2v))
    # run document retrieval on the dev set with the trained model
    docs = doc_ret(dev, edocs, best=config['n_best'], model=model)
    # evaluate the retrieval results
    title_hits(dev, docs)
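
As with run_sent_ret above, run_doc_ret only needs a handful of config keys. A sketch with assumed path values:

# Hypothetical config for run_doc_ret; the path values are assumptions.
config = {
    'doc_ret_model': 'results/doc_ret/doc_ret_model.bin',  # where the trained model is stored
    'doc_ret_docs': 'results/doc_ret/sampled_docs',         # where the sampled documents are stored
    'n_best': 5,                                             # number of documents to retrieve
}
run_doc_ret(config)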
Example #7
def select_docs(train):
    samp_size = 25000
    tots = {"SUPPORTS": 0, "REFUTES": 0}
    sofar = {"SUPPORTS": 0, "REFUTES": 0}
    try:
        with open("data/edocs.bin", "rb") as rb:
            edocs = pickle.load(rb)
    except:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open("data/edocs.bin", "wb") as wb:
            pickle.dump(edocs, wb)
    examples = Counter()
    id2titles = dict()
    for example in train:
        cid = example["id"]
        claim = example["claim"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        docs = set()
        for ev in all_evidence:
            evid = ev[2]
            if evid is not None:
                docs.add(evid)
        t2phrases = find_titles_in_claim(example["claim"], edocs)
        id2titles[cid] = t2phrases
        flag = False
        for title in t2phrases:
            if title in docs:
                flag = True
        if flag:
            tots[l] += 1
    selected = dict()
    for example in tqdm(train):
        yn = 0
        cid = example["id"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        docs = set()
        for ev in all_evidence:
            evid = ev[2]
            if evid is not None:
                docs.add(evid)
        #t2phrases=find_titles_in_claim(example["claim"],edocs)
        t2phrases = id2titles[cid]
        for title in t2phrases:
            if title in docs:
                yn = 1
        prob = (samp_size - sofar[l]) / (tots[l])
        if yn == 1 and random() < prob:
            titles = list(t2phrases.keys())
            shuffle(titles)
            flagy = False
            flagn = False
            for t in titles:
                if not flagy and t in docs:
                    ty = t
                    flagy = True
                if not flagn and t not in docs:
                    tn = t
                    flagn = True
                if flagy and flagn:
                    selected[cid] = dict()
                    for t, y_n in [(ty, 1), (tn, 0)]:
                        ps = t2phrases[t]
                        shuffle(ps)
                        p, s = ps[0]
                        selected[cid][y_n] = [t, p, s]
                    sofar[l] += 1
                    break
        if yn == 1:
            tots[l] -= 1
    with open("data/doc_ir_docs", "w") as w:
        for cid in selected:
            for yn in selected[cid]:
                [t, p, s] = selected[cid][yn]
                w.write(
                    str(cid) + "\t" + str(yn) + "\t" + t + "\t" + p + "\t" +
                    str(s) + "\n")
    for l in sofar:
        print(l, sofar[l])
    return selected
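
The sampling above keeps roughly samp_size claims per label in a single pass: each eligible claim is taken with probability (remaining quota) / (remaining eligible claims), so no second pass over the data is needed. A standalone sketch of the same selection-sampling scheme (the function name and inputs are made up for illustration):

from random import random

def sample_uniformly(items, quota):
    # Single-pass selection sampling: take each remaining item with
    # probability (remaining quota) / (remaining items).
    remaining = len(items)
    taken = []
    for item in items:
        if len(taken) < quota and random() < (quota - len(taken)) / remaining:
            taken.append(item)
        remaining -= 1
    return taken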
Example #8
            if cid in selected:
                claim = example["claim"]
                ctoks = word_tokenize(claim.lower())
                titles = list()
                for yn in selected[cid]:
                    [title, phrase, start] = selected[cid][yn]
                    titles.append(title)
                rdocs[example["id"]] = (titles, ctoks)
        try:
            t2tf = titles_to_tf()
            doctf = load_doc_tf(rdocs, t2tf)
        except:
            term_and_doc_freqs()
            t2tf = titles_to_tf()
            doctf = load_doc_tf(rdocs, t2tf)
        X, y = model.process_train(selected, train, doctf)
        model.fit(X, y)
        with open("data/doc_ir_model.bin", "wb") as wb:
            pickle.dump(model, wb)
    try:
        with open("data/edocs.bin", "rb") as rb:
            edocs = pickle.load(rb)
    except:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open("data/edocs.bin", "wb") as wb:
            pickle.dump(edocs, wb)
    print(len(model.f2v))
    docs = doc_ir(dev, edocs, model=model)
    title_hits(dev, docs)
Example #9
def convert(instances,
            prependlinum=False,
            prependtitle=False,
            use_ir_prediction=False,
            n_sentences=5,
            depparse_batch_size=32,
            num_samples=None):
    """convert FEVER format to jack SNLI format
    Arg
    instances: list of dictionary of FEVER format

    Returns
    instances: list of dictionary of jack SNLI format
    """

    if not ("label" in instances[0]):
        test = True
    else:
        test = False

    # get all titles and load t2l2s
    all_titles = list()

    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if ("label" not in instance) or (instance["label"]
                                         == "NOT ENOUGH INFO"):
            evidences = instance["predicted_sentences"][:n_sentences]
            # assert evidences == [(title, linum), (title, linum), ...]

            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [
                title
                for title, _ in instance["predicted_sentences"][:n_sentences]
            ]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(instance,
                              t2l2s,
                              prependlinum=prependlinum,
                              prependtitle=prependtitle,
                              use_ir_prediction=use_ir_prediction,
                              n_sentences=n_sentences))

    print("evaluating dependency...", file=sys.stderr)
    dep_type_invalid_cnt = 0

    if num_samples is None:
        num_samples = len(converted_instances)
    for i in tqdm(range(0, num_samples, depparse_batch_size)):
        nlp_input = ""
        n_sent = 0
        for j in range(i, min(len(converted_instances),
                              i + depparse_batch_size)):
            question = converted_instances[j]["sentence2"]
            support = converted_instances[j]["sentence1"]
            converted_instances[j]["q_tokenized"] = pattern.findall(question)
            converted_instances[j]["s_tokenized"] = pattern.findall(support)
            nlp_input += (" ".join(converted_instances[j]["q_tokenized"]) + "\n" +
                          " ".join(converted_instances[j]["s_tokenized"]) + "\n")
            n_sent += 2
        doc = nlp(nlp_input)
        assert len(doc.sentences) == n_sent
        for j in range(i, min(len(converted_instances),
                              i + depparse_batch_size)):
            converted_instances[j]["q_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2].tokens
            ]
            converted_instances[j]["s_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2 + 1].tokens
            ]
            converted_instances[j]["q_dep_i"] = [None] * (len(
                converted_instances[j]["q_tokenized"]))
            converted_instances[j]["q_dep_j"] = [None] * (len(
                converted_instances[j]["q_tokenized"]))
            converted_instances[j]["q_dep_type"] = [None] * (len(
                converted_instances[j]["q_tokenized"]))
            converted_instances[j]["s_dep_i"] = [None] * (len(
                converted_instances[j]["s_tokenized"]))
            converted_instances[j]["s_dep_j"] = [None] * (len(
                converted_instances[j]["s_tokenized"]))
            converted_instances[j]["s_dep_type"] = [None] * (len(
                converted_instances[j]["s_tokenized"]))

            for idx, d in enumerate(doc.sentences[(j - i) * 2].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["q_dep_i"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["q_dep_j"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["q_dep_type"][
                        idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["q_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["q_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["q_dep_type"][idx] = type2id.unit2id(
                    d[1])
                idx += 1
            idx = 0
            for idx, d in enumerate(doc.sentences[(j - i) * 2 +
                                                  1].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["s_dep_i"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["s_dep_j"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["s_dep_type"][
                        idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["s_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["s_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["s_dep_type"][idx] = type2id.unit2id(
                    d[1])
                idx += 1

    print('Number of invalid dependency type',
          dep_type_invalid_cnt,
          file=sys.stderr)

    return converted_instances
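
A usage sketch for this convert variant, assuming the instances were loaded from a FEVER-format JSONL file with predicted_sentences attached (the input path is an assumption; read_jsonl is the helper used in Examples #1 and #10):

# Hypothetical usage; the input path is illustrative only.
instances = read_jsonl('data/dev.with_predicted_sentences.jsonl')
snli_instances = convert(instances,
                         prependtitle=True,
                         use_ir_prediction=True,
                         n_sentences=5,
                         depparse_batch_size=32)
print(snli_instances[0]['sentence2'])  # the claim (hypothesis)
print(snli_instances[0]['sentence1'])  # the retrieved evidence text (premise)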
Example #10
def save_wrong_instances(actual_file, predicted_labels_file,
                         predicted_evidence_file, out_file):
    label_predictions = read_jsonl(predicted_labels_file)
    ev_predictions = read_jsonl(predicted_evidence_file)
    actual = read_jsonl(actual_file)

    all_titles = list()
    for ev_pred, act in zip(ev_predictions, actual):
        ev_titles = [title for title, _ in ev_pred["predicted_sentences"]]
        act_titles = [
            title for evidence_set in act["evidence"]
            for _, _, title, _ in evidence_set
        ]
        titles = ev_titles + act_titles
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    counter = 0
    observations = list()

    print("loading vocabulary list...")
    import pickle
    with open("vocab_list.db", "rb") as f:
        vocab = pickle.load(f)

    pos_counter = 0
    neg_counter = 0
    print("processing predictions...")
    for label_pred, ev_pred, act in tqdm(
            zip(label_predictions, ev_predictions, actual)):
        actual_label = act["label"]
        assert actual_label == label_pred["actual"]

        pred_label = label_pred["predicted"]
        if pred_label != actual_label:
            continue

        counter += 1
        actual_ev = act["evidence"]
        pred_labels = label_pred["prediction_list"]
        pred_ev = ev_pred["predicted_sentences"]
        pred_ev_sent = resolve_evidences(pred_ev, t2l2s, actual=False)

        claim = act["claim"]
        ev_contained = convert(compare_evidences(actual_ev, pred_ev))
        actual_ev_sent = resolve_evidences(actual_ev, t2l2s)
        assert not (actual_label != "NOT ENOUGH INFO"
                    and len(actual_ev_sent) != len(actual_ev))

        pred_sentence = " ".join(pred_ev_sent)
        ac_sentence = " ".join(sent for sentences in actual_ev_sent
                               for sent in sentences
                               if sent != "**Not Found**")
        unk_words = find_unk(pred_sentence + " " + ac_sentence, vocab)

        if pred_label == actual_label:
            pos_counter += 1
        else:
            neg_counter += 1

        # overwrite when label is NEI
        if actual_label == "NOT ENOUGH INFO":
            ev_contained = ["-" for e in ev_contained]

        # # skip for NEI or no correct evidence.
        # if ev_contained == ["X"] * 5 and ev_contained != ["-"] * 5:
        #     continue

        label_pred_ev = [
            "<{}> <{}> {}".format(label, contained, ev) for label, contained,
            ev in zip(shorten_labels(pred_labels), ev_contained, pred_ev)
        ]
        actual_ev = ev_pred["evidence"]

        observations.append({
            "id": act["id"],
            "claim": claim,
            "predicted_evidences": label_pred_ev,
            "predicted_sentences": pred_ev_sent,
            "predicted_label": pred_label,
            "actual_evidence": actual_ev,
            "actual_sentences": actual_ev_sent,
            "actual_label": actual_label,
            "unk_words": unk_words
        })

    random.shuffle(observations)
    save_jsonl_pretty_print(observations, out_file)
    print("pos_counter", pos_counter)
    print("neg_counter", neg_counter)
    print("wrong labels:", counter)
Example #11
def sample_docs(train, save_file):
    '''
    Sample documents from the training data.

    Returns:
        selected[cid][yn] = [title, phrase, start]
    '''
    samp_size = 25000
    tots = {'SUPPORTS': 0, 'REFUTES': 0}
    sofar = {'SUPPORTS': 0, 'REFUTES': 0}

    # load the document-title dictionary
    if os.path.exists('data/preprocessed_data/edocs.bin'):
        with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
            edocs = pickle.load(rb)
    else:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open('data/preprocessed_data/edocs.bin', 'wb') as wb:
            pickle.dump(edocs, wb)

    id2titles = dict()

    for example in train:
        cid = example['id']
        label = example['label']

        if label == 'NOT ENOUGH INFO':
            continue

        # build the set of documents referenced by the gold evidence
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        docs = set()
        for evi in all_evidence:
            evi_doc = evi[2]
            if evi_doc is not None:
                docs.add(evi_doc)

        # map the titles found in the claim to their matching phrases
        t2phrases = find_titles_in_claim(example['claim'], edocs)

        id2titles[cid] = t2phrases
        flag = False
        for title in t2phrases:
            if title in docs:
                flag = True

        # if a title appearing in the claim is also in the evidence document set, the claim's evidence can be reached through that title
        if flag:
            tots[label] += 1

    selected = dict()

    # sample the selected data
    for example in tqdm(train):
        yn = 0  # yn marks the sample type: 1 = positive, 0 = negative
        cid = example['id']
        label = example['label']

        if label == 'NOT ENOUGH INFO':
            continue

        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        docs = set()
        for evi in all_evidence:
            evi_doc = evi[2]
            if evi_doc is not None:
                docs.add(evi_doc)

        t2phrases = id2titles[cid]  # fetch the cached title dict for this claim id directly
        for title in t2phrases:
            if title in docs:
                yn = 1

        prob = (samp_size - sofar[label]) / (tots[label])

        if yn == 1 and random() < prob:
            titles = list(t2phrases.keys())
            shuffle(titles)
            flagy = False
            flagn = False

            for t in titles:
                if not flagy and t in docs:
                    ty = t
                    flagy = True
                if not flagn and t not in docs:
                    tn = t
                    flagn = True
                if flagy and flagn:
                    selected[cid] = dict()
                    for t, y_n in [(ty, 1), (tn, 0)]:
                        ps = t2phrases[t]
                        shuffle(ps)
                        p, s = ps[0]
                        selected[cid][y_n] = [t, p, s]
                    sofar[label] += 1
                    break

        if yn == 1:
            tots[label] -= 1

    # write the sampling results to a file
    with open(save_file, 'w') as w:
        for cid in selected:
            for yn in selected[cid]:
                [t, p, s] = selected[cid][yn]
                w.write(
                    str(cid) + '\t' + str(yn) + '\t' + t + '\t' + p + '\t' +
                    str(s) + '\n')

    for label in sofar:
        print(label, sofar[label])

    return selected
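
The file written above is a plain TSV with one row per sampled (claim, polarity) pair: cid, yn, title, phrase, start, separated by tabs. A minimal sketch of reading it back into the same nested-dict shape (the reader below is hypothetical, not the repository's load_selected):

def load_sampled_docs(path):
    # Hypothetical reader for the TSV produced by sample_docs;
    # rebuilds selected[cid][yn] = [title, phrase, start].
    selected = dict()
    with open(path) as f:
        for line in f:
            cid, yn, title, phrase, start = line.rstrip('\n').split('\t')
            # start is kept as the string written by sample_docs; cast if needed.
            selected.setdefault(int(cid), dict())[int(yn)] = [title, phrase, start]
    return selected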