def resolve_evidences(evidences, t2l2s, actual=True):
    """Resolve evidence references into lists of evidence sentences.

    Args:
        evidences: when ``actual`` is True, gold evidence in FEVER format:
            ``[[[id, id, "title", linum], ...], [[id, id, "title", linum]], ...]``
            (a list of evidence *sets*, each a list of 4-element records).
            When ``actual`` is False (i.e. predicted evidence), a flat list
            of ``(title, linum)`` pairs.
        t2l2s: mapping ``title -> linum -> sentence`` (output of
            ``load_doclines``); titles absent from it are skipped.
        actual: selects between the two input formats above.

    Returns:
        For gold evidence: one list of sentences per evidence set; a set
        whose titles are all missing from ``t2l2s`` yields
        ``["**Not Found**"]``. For predicted evidence: a single flat list
        of sentences.
    """
    if not actual:
        # predicted evidence is already a flat list of (title, linum) pairs
        return get_evidence_sentence_list(evidences, t2l2s)

    evidence_sentences = []
    for evidence_set in evidences:
        # keep only references whose title is actually loaded
        # (the original also had a dead debug loop here that only `pass`ed
        # and recomputed this same list twice; both removed)
        evidence_linum = [(title, linum)
                          for _, _, title, linum in evidence_set
                          if title in t2l2s]
        if not evidence_linum:
            # nothing in this evidence set could be resolved
            evidence_sentences.append(["**Not Found**"])
            continue
        evidence_sentences.append(
            get_evidence_sentence_list(evidence_linum, t2l2s))
    return evidence_sentences
def read_ir_result(path, prependlinum=False, concatev=False):
    """Load IR output and attach evidence sentences to every instance.

    Args:
        path: jsonl file of instances carrying a "predicted_sentences"
            field of (title, linum) pairs.
        prependlinum: forwarded to ``get_evidence_sentence_list``.
        concatev: if True, join all evidence sentences into one string
            (wrapped in a one-element list).

    Returns:
        list of instance dicts, each updated with an "evidence" field.

    NOTE(review): this name is redefined later in the file by an extended
    variant — confirm which definition callers actually see.
    """
    instances = read_jsonl(path)
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # collect every title referenced by any prediction
    needed_titles = []
    for inst in instances:
        needed_titles.extend(title for title, _ in inst["predicted_sentences"])

    # title -> linum -> sentence, restricted to the titles we need
    t2l2s = load_doclines(needed_titles, t2jnum)

    for inst in instances:
        sentences = get_evidence_sentence_list(
            inst["predicted_sentences"], t2l2s, prependlinum=prependlinum)
        inst["evidence"] = [" ".join(sentences)] if concatev else sentences
    return instances
def _convert_instance(instance, t2l2s, prependlinum, prependtitle,
                      use_ir_prediction):
    """Convert one FEVER instance into one or more SNLI-format instances.

    Args:
        instance: FEVER dataset instance (dict with "id", "label", "claim",
            gold "evidence" and/or "predicted_sentences").
        t2l2s: output of ``titles_to_jsonl_num`` (title -> linum -> sentence).
        prependlinum: forwarded to ``get_evidence_sentence_list``.
        prependtitle: forwarded to ``get_evidence_sentence_list``.
        use_ir_prediction: if True, emit one instance per predicted
            (title, linum) pair; otherwise one per gold evidence set.

    Returns:
        list of converted (SNLI-format) instances.
    """

    def _make(eidx, evidence_linum):
        """Build one SNLI-format instance for the eidx-th evidence group.

        Factored out because the original duplicated this construction
        verbatim in both branches.
        """
        sub_id = "{}-{}".format(instance["id"], str(eidx))
        sentences = get_evidence_sentence_list(
            evidence_linum, t2l2s,
            prependlinum=prependlinum, prependtitle=prependtitle)
        return snli_format(
            id=sub_id,
            pair_id=sub_id,
            label=convert_label(instance["label"]),
            # multiple evidence sentences are joined into one string
            evidence=" ".join(sentences),
            claim=instance["claim"])

    converted_instances = []
    if use_ir_prediction:
        # one converted instance per individually-predicted sentence,
        # skipping titles that were not loaded
        predicted = [(title, linum)
                     for title, linum in instance["predicted_sentences"]
                     if title in t2l2s]
        for eidx, (title, linum) in enumerate(predicted):
            converted_instances.append(_make(eidx, [(title, linum)]))
    else:
        # gold evidence: one converted instance per evidence *set*.
        # eidx indexes the ORIGINAL list, so ids keep gaps when a set has
        # no resolvable titles (behavior preserved from the original).
        for eidx, evidence_set in enumerate(instance["evidence"]):
            evidence_linum = [(title, linum)
                              for _, _, title, linum in evidence_set
                              if title in t2l2s]
            if not evidence_linum:
                continue
            converted_instances.append(_make(eidx, evidence_linum))
    return converted_instances
def read_ir_result(path, n_sentences=5, prependlinum=False,
                   prependtitle=False, concatev=False):
    """Load IR output, cap predictions at ``n_sentences``, and attach
    evidence sentences to every instance.

    Args:
        path: jsonl file of instances carrying "predicted_sentences".
        n_sentences: keep at most this many predicted sentences each.
        prependlinum: forwarded to ``get_evidence_sentence_list``.
        prependtitle: forwarded to ``get_evidence_sentence_list``.
        concatev: if True, join the evidence into a single string
            (wrapped in a one-element list).

    Returns:
        list of instance dicts, each updated with an "evidence" field.
    """
    instances = read_jsonl(path)

    # truncate predictions; count instances that arrived with fewer
    # sentences than requested
    short_evidences_counter = 0
    for inst in instances:
        if len(inst["predicted_sentences"]) < n_sentences:
            short_evidences_counter += 1
        inst["predicted_sentences"] = inst["predicted_sentences"][:n_sentences]
    print("short_evidences: {} / {}".format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # every title referenced by any (truncated) prediction
    needed_titles = []
    for inst in instances:
        needed_titles.extend(title for title, _ in inst["predicted_sentences"])

    # title -> linum -> sentence
    t2l2s = load_doclines(needed_titles, t2jnum)

    for inst in instances:
        sentences = get_evidence_sentence_list(
            inst["predicted_sentences"], t2l2s,
            prependlinum=prependlinum, prependtitle=prependtitle)
        inst["evidence"] = [" ".join(sentences)] if concatev else sentences
    return instances
def read_ir_result(path, n_sentences=5):
    """Read sentence-retrieval results and attach evidence sentences.

    Args:
        path: jsonl file of instances carrying a 'predicted_sentences'
            field of (title, linum) pairs.
        n_sentences: keep at most this many predicted sentences each.

    Returns:
        list of instance dicts, each updated with an 'evidence' field.
    """
    instances = read_jsonl(path)

    # keep only the first n sentences; count instances that came in
    # with fewer than requested
    short_evidences_counter = 0
    for inst in instances:
        if len(inst['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        inst['predicted_sentences'] = inst['predicted_sentences'][:n_sentences]
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    # list of every title referenced by any prediction
    needed_titles = []
    for inst in instances:
        needed_titles.extend(title for title, _ in inst['predicted_sentences'])
    t2l2s = load_doclines(needed_titles, t2jnum)

    # resolve each instance's (title, linum) pairs into sentences
    for inst in instances:
        inst['evidence'] = get_evidence_sentence_list(
            inst['predicted_sentences'], t2l2s)
    return instances