Example #1
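Loads the document-retrieval artifacts, trains (or loads) a sentence-retrieval model, then retrieves and scores evidence lines on the dev split. These snippets assume `import pickle` and the project's helper functions (`load_paper_dataset`, `doc_ret`, `sent_ret`, etc.) are already in scope.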
def run_sent_ret(config):
    train, dev = load_paper_dataset()

    with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)

    with open(config['doc_ret_model'], 'rb') as rb:
        dmodel = pickle.load(rb)

    t2jnum = titles_to_jsonl_num()

    try:
        with open(config['sent_ret_model'], 'rb') as rb:
            model = pickle.load(rb)  # load a previously trained model
    except Exception:
        try:
            selected = load_selected(config['sent_ret_line'])  # load sampled lines
        except Exception:
            docs = doc_ret(train, edocs, model=dmodel)
            selected = select_lines(docs, t2jnum, train,
                                    config['sent_ret_line'])

        model = sent_ret_model()
        X, y = model.process_train(selected, train)  # train the model
        model.fit(X, y)

        with open(config['sent_ret_model'], 'wb') as wb:
            pickle.dump(model, wb)

    docs = doc_ret(dev, edocs, model=dmodel)  # document retrieval
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(dev, docs, lines, best=config['n_best'],
                        model=model)  # sentence retrieval
    line_hits(dev, evidence)  # evaluate the retrieved lines
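
A hypothetical `config` for this entry point; the keys are taken from the function body above, while the paths are illustrative (they follow the layout used in Example #3):

config = {
    'doc_ret_model': './results/doc_ret/doc_ret_model.bin',
    'sent_ret_model': './results/sent_ret/sent_ret_model.bin',
    'sent_ret_line': './results/sent_ret/sent_ret_lines',  # assumed filename
    'n_best': 5,
}
run_sent_ret(config)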
Example #2
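Loads pickled document- and line-retrieval models and returns retrieved documents plus evidence lines for the given claims.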
def get_evidence(data=dict()):
    with open("data/edocs.bin", "rb") as rb:
        edocs = pickle.load(rb)
    with open("data/doc_ir_model.bin", "rb") as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    with open("data/line_ir_model.bin", "rb") as rb:
        lmodel = pickle.load(rb)
    docs = doc_ir(data, edocs, model=dmodel)
    lines = load_doc_lines(docs, t2jnum)
    evidence = line_ir(data, docs, lines, model=lmodel)
    return docs, evidence
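
Here `docs` maps each claim id to scored `(title, score)` pairs (Examples #4 and #6 unpack `docs[cid]` that way), and `evidence` maps each claim id to the best lines chosen by `line_ir` (see Example #5).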
Example #3
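The same pipeline as Example #2 with configurable cutoffs: the `n_docs` best documents and `n_sents` best sentences are kept per claim.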
def evi_ret(data=dict(), n_docs=5, n_sents=5):
    with open('./data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)

    with open('./results/doc_ret/doc_ret_model.bin', 'rb') as rb:
        dmodel = pickle.load(rb)

    t2jnum = titles_to_jsonl_num()

    with open('./results/sent_ret/sent_ret_model.bin', 'rb') as rb:
        lmodel = pickle.load(rb)

    docs = doc_ret(data, edocs, model=dmodel, best=n_docs)
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(data, docs, lines, model=lmodel, best=n_sents)

    return docs, evidence
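
A minimal, hypothetical call; the claim dicts follow the `id`/`claim` fields the other examples index into:

claims = [{'id': 0, 'claim': 'Some claim to verify.'}]
docs, evidence = evi_ret(claims, n_docs=5, n_sents=5)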
Example #4
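Samples up to `samp_size` claims per label from the training set and pairs one gold evidence line (positive) with one random non-evidence line (negative) from the retrieved documents.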
def select_lines(docs, t2jnum, train, save_file):
    '''
    Sample claims from the training data and generate negative examples.

    Returns:
    selected[cid][yn] = [title, l_id, l_txt, score]
    '''
    selected = dict()
    rlines = load_doc_lines(docs, t2jnum)
    samp_size = 20000  # number of claims to sample per label

    tots = {'SUPPORTS': 0, 'REFUTES': 0}  # retrieval-hit counts over the full training set
    sofar = {'SUPPORTS': 0, 'REFUTES': 0}  # claims sampled so far per label

    examples = Counter()
    for example in train:
        cid = example['id']
        label = example['label']

        if label == 'NOT ENOUGH INFO':
            continue

        # gather all evidence items attached to this claim
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        evi_set = set()  # titles of this claim's evidence documents
        for evi in all_evidence:
            evi_d = evi[2]  # evidence document title
            if evi_d is not None:
                evi_set.add(evi_d)

        flag = False  # did document retrieval find any evidence document?
        for doc, score in docs[cid]:  # docs: retrieval results; doc: title; score: title score
            if doc in evi_set:
                flag = True
        if flag:
            tots[label] += 1  # claims whose evidence was found by document retrieval
            examples[label] += 1

    for example in train:
        cid = example['id']
        label = example['label']

        if label == 'NOT ENOUGH INFO':
            continue

        # gather all evidence items attached to this claim
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        lines = dict()  # evi_d -> set of evidence line numbers
        for evi in all_evidence:
            evi_d = evi[2]  # evidence document title
            evi_line = evi[3]  # evidence line number
            if evi_d is not None:
                if evi_d not in lines:
                    lines[evi_d] = set()
                lines[evi_d].add(evi_line)

        flag = False  # did document retrieval find any evidence document?
        for doc, score in docs[cid]:
            if doc in lines:
                flag = True
        if flag:
            # sampling probability: draws still needed / candidates not yet seen
            prob = (samp_size - sofar[label]) / tots[label]
            if random() < prob:
                ylines = list()
                nlines = list()
                for title, score in docs[cid]:
                    for l_id in rlines[title]:
                        l_txt = rlines[title][l_id]
                        if title in lines and l_id in lines[title]:
                            ylines.append([title, l_id, l_txt, score])  # positive line
                        elif l_txt != '':
                            nlines.append([title, l_id, l_txt, score])  # negative line
                selected[cid] = dict()
                for yn, ls in [(1, ylines), (0, nlines)]:
                    shuffle(ls)
                    selected[cid][yn] = ls[0]
                sofar[label] += 1
            tots[label] -= 1

    with open(save_file, 'w') as w:
        for cid in selected:
            for yn in selected[cid]:
                [title, l_id, l_txt, score] = selected[cid][yn]
                w.write(
                    str(cid) + '\t' + str(yn) + '\t' + title + '\t' +
                    str(l_id) + '\t' + str(l_txt) + '\t' + str(score) + '\n')

    for label in sofar:
        print(label, sofar[label])

    return selected
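
Example #1 reads these samples back with `load_selected`, which is not shown on this page. A minimal sketch of a reader matching the tab-separated format written above; the helper's real signature and return shape are assumptions:

def load_selected(fname):
    # Hypothetical reader for the file written by select_lines:
    # cid \t yn \t title \t line_id \t line_text \t score
    selected = dict()
    with open(fname) as f:
        for row in f:
            cid, yn, title, l_id, l_txt, score = row.rstrip('\n').split('\t')
            selected.setdefault(int(cid), dict())[int(yn)] = [
                title, int(l_id), l_txt, float(score)]
    return selected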
Example #5
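A `line_ir` function (truncated in the source; its header is reconstructed below with assumed defaults), followed by a script entry point that builds or loads the title index, retrieves documents and lines for the dev and train splits, and reports line-level hits.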
def line_ir(data=list(), docs=dict(), lines=dict(), best=5, model=None):
    # Signature reconstructed from the call sites below and in Example #2;
    # the snippet was truncated above this point, and the defaults are assumptions.
    evidence = dict()
    for example in data:
        cid = example["id"]
        claim = example["claim"]
        tscores = docs[cid]
        evidence[cid] = best_lines(claim, tscores, lines, best, model)
    return evidence



if __name__ == "__main__":
    t2jnum = titles_to_jsonl_num()
    try:
        with open("data/edocs.bin", "rb") as rb:
            edocs = pickle.load(rb)
    except Exception:  # build the title index once and cache it
        edocs = title_edict(t2jnum)
        with open("data/edocs.bin", "wb") as wb:
            pickle.dump(edocs, wb)
    train, dev = load_split_trainset(9999)
    docs = doc_ir(dev, edocs)
    print(len(docs))
    lines = load_doc_lines(docs, t2jnum)
    print(len(lines))
    evidence = line_ir(dev, docs, lines)
    line_hits(dev, evidence)
    docs = doc_ir(train, edocs)
    print(len(docs))
    lines = load_doc_lines(docs, t2jnum)
    print(len(lines))
    evidence = line_ir(train, docs, lines)
    line_hits(train, evidence)
Example #6
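An earlier variant of `select_lines` from Example #4: the same sampling logic, but with a hardcoded output path and per-label candidate counts printed before sampling.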
def select_lines(docs, t2jnum, train):
    selected = dict()
    rlines = load_doc_lines(docs, t2jnum)
    samp_size = 20000
    tots = {"SUPPORTS": 0, "REFUTES": 0}
    sofar = {"SUPPORTS": 0, "REFUTES": 0}
    examples = Counter()
    for example in train:
        cid = example["id"]
        claim = example["claim"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        evset = set()
        for ev in all_evidence:
            evid = ev[2]
            if evid is not None:
                evset.add(evid)
        flag = False
        for doc, score in docs[cid]:
            if doc in evset:
                flag = True
        if flag:
            tots[l] += 1
            examples[l] += 1
    for l, c in examples.most_common():
        print(l, c)
    for example in train:
        cid = example["id"]
        claim = example["claim"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        lines = dict()
        for ev in all_evidence:
            evid = ev[2]
            evline = ev[3]
            if evid is not None:
                if evid not in lines:
                    lines[evid] = set()
                lines[evid].add(evline)
        flag = False
        for doc, score in docs[cid]:
            if doc in lines:
                flag = True
        if flag:
            prob = (samp_size - sofar[l]) / (tots[l])
            if random() < prob:
                ylines = list()
                nlines = list()
                for title, score in docs[cid]:
                    for l_id in rlines[title]:
                        l_txt = rlines[title][l_id]
                        if title in lines and l_id in lines[title]:
                            ylines.append([title, l_id, l_txt, score])
                        elif l_txt != "":
                            nlines.append([title, l_id, l_txt, score])
                selected[cid] = dict()
                for yn, ls in [(1, ylines), (0, nlines)]:
                    shuffle(ls)
                    selected[cid][yn] = ls[0]
                sofar[l] += 1
            tots[l] -= 1
    with open("data/line_ir_lines", "w") as w:
        for cid in selected:
            for yn in selected[cid]:
                [t, i, l, s] = selected[cid][yn]
                w.write(
                    str(cid) + "\t" + str(yn) + "\t" + t + "\t" + str(i) +
                    "\t" + str(l) + "\t" + str(s) + "\n")
    for l in sofar:
        print(l, sofar[l])
    return selected
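
A note on the sampling rule shared by Examples #4 and #6: `prob = (samp_size - sofar[l]) / tots[l]` keeps each remaining candidate with probability (draws still needed) / (candidates not yet visited), essentially Knuth's selection sampling. Illustratively, if 50,000 SUPPORTS claims survive the retrieval check, the first candidate is kept with probability 20000/50000 = 0.4, and the ratio self-adjusts as `sofar` grows and `tots` shrinks, yielding roughly `samp_size` uniformly drawn claims per label.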