import pickle
from collections import Counter
from random import random, shuffle

# Project-specific helpers (load_paper_dataset, doc_ret, sent_ret, etc.)
# are provided by the repository's own modules.


def run_sent_ret(config):
    train, dev = load_paper_dataset()
    with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)
    with open(config['doc_ret_model'], 'rb') as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    try:
        with open(config['sent_ret_model'], 'rb') as rb:
            model = pickle.load(rb)  # load trained model parameters
    except BaseException:
        try:
            selected = load_selected(config['sent_ret_line'])  # load sampled lines
        except BaseException:
            docs = doc_ret(train, edocs, model=dmodel)
            selected = select_lines(docs, t2jnum, train, config['sent_ret_line'])
        model = sent_ret_model()
        X, y = model.process_train(selected, train)  # train the model
        model.fit(X, y)
        with open(config['sent_ret_model'], 'wb') as wb:
            pickle.dump(model, wb)
    docs = doc_ret(dev, edocs, model=dmodel)  # document retrieval
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(dev, docs, lines, best=config['n_best'],
                        model=model)  # sentence retrieval
    line_hits(dev, evidence)  # evaluate the results
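# Illustrative config for run_sent_ret. The four keys are exactly the ones
# the function reads; the two model paths match those used by evi_ret below,
# while the 'sent_ret_line' path and 'n_best' value are placeholders:
config = {
    'doc_ret_model': './results/doc_ret/doc_ret_model.bin',
    'sent_ret_model': './results/sent_ret/sent_ret_model.bin',
    'sent_ret_line': './results/sent_ret/sent_ret.lines',  # placeholder path
    'n_best': 5,
}
run_sent_ret(config)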
def get_evidence(data=dict()):
    with open("data/edocs.bin", "rb") as rb:
        edocs = pickle.load(rb)
    with open("data/doc_ir_model.bin", "rb") as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    with open("data/line_ir_model.bin", "rb") as rb:
        lmodel = pickle.load(rb)
    docs = doc_ir(data, edocs, model=dmodel)
    lines = load_doc_lines(docs, t2jnum)
    evidence = line_ir(data, docs, lines, model=lmodel)
    return docs, evidence
def evi_ret(data=dict(), n_docs=5, n_sents=5):
    with open('./data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)
    with open('./results/doc_ret/doc_ret_model.bin', 'rb') as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    with open('./results/sent_ret/sent_ret_model.bin', 'rb') as rb:
        lmodel = pickle.load(rb)
    docs = doc_ret(data, edocs, model=dmodel, best=n_docs)
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(data, docs, lines, model=lmodel, best=n_sents)
    return docs, evidence
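# Illustrative call to evi_ret. The input format is inferred from how
# examples are consumed elsewhere in this file (each needs at least an
# "id" and a "claim" field); the claim text here is made up:
claims = [{'id': 0, 'claim': 'Fox 2000 Pictures released the film Soul Food.'}]
docs, evidence = evi_ret(claims, n_docs=5, n_sents=5)
print(docs[0])      # list of (page_title, score) pairs for claim 0
print(evidence[0])  # top-scoring evidence lines for claim 0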
def select_lines(docs, t2jnum, train, save_file):
    '''Sample examples from the training data and generate negative examples.

    Returns: selected[cid][yn] = [title, l_id, l_txt, score]
    '''
    selected = dict()
    rlines = load_doc_lines(docs, t2jnum)
    samp_size = 20000  # number of examples to sample
    tots = {'SUPPORTS': 0, 'REFUTES': 0}   # counts over the full training set
    sofar = {'SUPPORTS': 0, 'REFUTES': 0}  # counts sampled so far
    examples = Counter()
    for example in train:
        cid = example['id']
        label = example['label']
        if label == 'NOT ENOUGH INFO':
            continue
        # collect all evidence annotated for this example
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        evi_set = set()  # set of evidence document titles for this example
        for evi in all_evidence:
            evi_d = evi[2]  # evidence document title
            if evi_d is not None:
                evi_set.add(evi_d)
        flag = False  # whether any retrieved document is in the evidence set
        for doc, score in docs[cid]:
            # docs: document-retrieval results; doc: title; score: title score
            if doc in evi_set:
                flag = True
        if flag:
            tots[label] += 1  # examples where retrieval found a gold document
            examples[label] += 1
    for example in train:
        cid = example['id']
        label = example['label']
        if label == 'NOT ENOUGH INFO':
            continue
        # collect all evidence annotated for this example
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        lines = dict()  # evi_d -> evi_line
        for evi in all_evidence:
            evi_d = evi[2]     # evidence document title
            evi_line = evi[3]  # line number of the evidence
            if evi_d is not None:
                if evi_d not in lines:
                    lines[evi_d] = set()
                lines[evi_d].add(evi_line)  # gold line numbers per document
        flag = False  # whether any retrieved document is in the evidence set
        for doc, score in docs[cid]:
            if doc in lines:
                flag = True
        if flag:
            # fraction still needed, i.e. the probability of sampling this one
            prob = (samp_size - sofar[label]) / tots[label]
            if random() < prob:
                ylines = list()
                nlines = list()
                for title, score in docs[cid]:
                    for l_id in rlines[title]:
                        l_txt = rlines[title][l_id]
                        if title in lines and l_id in lines[title]:
                            ylines.append([title, l_id, l_txt, score])  # positive
                        elif l_txt != '':
                            nlines.append([title, l_id, l_txt, score])  # negative
                if ylines and nlines:  # guard: skip if either pool is empty
                    selected[cid] = dict()
                    for yn, ls in [(1, ylines), (0, nlines)]:
                        shuffle(ls)
                        selected[cid][yn] = ls[0]  # keep one random line per class
                    sofar[label] += 1
            tots[label] -= 1
    with open(save_file, 'w') as w:
        for cid in selected:
            for yn in selected[cid]:
                [title, l_id, l_txt, score] = selected[cid][yn]
                w.write(str(cid) + '\t' + str(yn) + '\t' + title + '\t' +
                        str(l_id) + '\t' + str(l_txt) + '\t' + str(score) + '\n')
    for l in sofar:
        print(l, sofar[l])
    return selected
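# load_selected (called in run_sent_ret above) is not shown in this section.
# A minimal reader consistent with the tab-separated format written by
# select_lines might look like this (illustrative sketch; assumes the line
# text itself contains no tab characters):
def load_selected(fname):
    selected = dict()
    with open(fname) as f:
        for line in f:
            cid, yn, title, l_id, l_txt, score = line.rstrip('\n').split('\t')
            selected.setdefault(int(cid), dict())[int(yn)] = [
                title, int(l_id), l_txt, float(score)]
    return selected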
def line_ir(data, docs, lines, best=5, model=None):
    # (function header and loop reconstructed from the call sites above;
    # the original listing begins mid-function at the loop body below)
    evidence = dict()
    for example in data:
        cid = example["id"]
        evidence[cid] = list()
        tscores = docs[cid]
        claim = example["claim"]
        evidence[cid] = best_lines(claim, tscores, lines, best, model)
    return evidence


if __name__ == "__main__":
    t2jnum = titles_to_jsonl_num()
    try:
        with open("data/edocs.bin", "rb") as rb:
            edocs = pickle.load(rb)
    except FileNotFoundError:  # build and cache the edocs index on first run
        edocs = title_edict(t2jnum)
        with open("data/edocs.bin", "wb") as wb:
            pickle.dump(edocs, wb)
    train, dev = load_split_trainset(9999)
    docs = doc_ir(dev, edocs)
    print(len(docs))
    lines = load_doc_lines(docs, t2jnum)
    print(len(lines))
    evidence = line_ir(dev, docs, lines)
    line_hits(dev, evidence)
    docs = doc_ir(train, edocs)
    print(len(docs))
    lines = load_doc_lines(docs, t2jnum)
    print(len(lines))
    evidence = line_ir(train, docs, lines)
    line_hits(train, evidence)
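# The __main__ block above uses a build-or-load pattern for the edocs cache.
# The same pattern as a generic helper (illustrative sketch, not part of the
# original code):
def load_or_build(path, build_fn):
    """Load a pickled object from path, or build it and cache it."""
    try:
        with open(path, 'rb') as rb:
            return pickle.load(rb)
    except FileNotFoundError:
        obj = build_fn()
        with open(path, 'wb') as wb:
            pickle.dump(obj, wb)
        return obj

# e.g.: edocs = load_or_build("data/edocs.bin", lambda: title_edict(t2jnum))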
def select_lines(docs, t2jnum, train):
    selected = dict()
    rlines = load_doc_lines(docs, t2jnum)
    samp_size = 20000
    tots = {"SUPPORTS": 0, "REFUTES": 0}
    sofar = {"SUPPORTS": 0, "REFUTES": 0}
    examples = Counter()
    for example in train:
        cid = example["id"]
        claim = example["claim"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        evset = set()
        for ev in all_evidence:
            evid = ev[2]
            if evid is not None:
                evset.add(evid)
        flag = False
        for doc, score in docs[cid]:
            if doc in evset:
                flag = True
        if flag:
            tots[l] += 1
            examples[l] += 1
    for l, c in examples.most_common():
        print(l, c)
    for example in train:
        cid = example["id"]
        claim = example["claim"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        lines = dict()
        for ev in all_evidence:
            evid = ev[2]
            evline = ev[3]
            if evid is not None:
                if evid not in lines:
                    lines[evid] = set()
                lines[evid].add(evline)
        flag = False
        for doc, score in docs[cid]:
            if doc in lines:
                flag = True
        if flag:
            prob = (samp_size - sofar[l]) / tots[l]
            if random() < prob:
                ylines = list()
                nlines = list()
                for title, score in docs[cid]:
                    for l_id in rlines[title]:
                        l_txt = rlines[title][l_id]
                        if title in lines and l_id in lines[title]:
                            ylines.append([title, l_id, l_txt, score])
                        elif l_txt != "":
                            nlines.append([title, l_id, l_txt, score])
                selected[cid] = dict()
                for yn, ls in [(1, ylines), (0, nlines)]:
                    shuffle(ls)
                    selected[cid][yn] = ls[0]
                sofar[l] += 1
            tots[l] -= 1
    with open("data/line_ir_lines", "w") as w:
        for cid in selected:
            for yn in selected[cid]:
                [t, i, l, s] = selected[cid][yn]
                w.write(str(cid) + "\t" + str(yn) + "\t" + t + "\t" +
                        str(i) + "\t" + str(l) + "\t" + str(s) + "\n")
    for l in sofar:
        print(l, sofar[l])
    return selected
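# Both select_lines variants draw their 20,000 training examples with the
# same one-pass rule: accept each remaining candidate with probability
# (still_needed / still_remaining). This is selection sampling (Knuth's
# Algorithm S), which returns exactly k items, uniformly at random, provided
# at least k candidates exist. Self-contained demonstration:
from random import random

def selection_sample(items, k):
    """Uniformly sample exactly k of the items in a single pass."""
    chosen = []
    remaining, needed = len(items), k
    for item in items:
        if random() < needed / remaining:
            chosen.append(item)
            needed -= 1
        remaining -= 1
    return chosen

print(len(selection_sample(range(100000), 20000)))  # -> 20000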