def read_ir_result(path, prependlinum=False, concatev=False):
    """
    Returns
    instances: list of dict
        each updated with instance['evidence']: list of evidence sentences
        (list of str), resolved from instance['predicted_sentences']
    """
    instances = read_jsonl(path)
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # make list of titles
    titles = list()
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [" ".join(
                get_evidence_sentence_list(
                    instance["predicted_sentences"],
                    t2l2s,
                    prependlinum=prependlinum))]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"],
                t2l2s,
                prependlinum=prependlinum)

    return instances
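# A minimal sketch of the evidence-resolution step, assuming t2l2s maps
# title -> line number -> sentence text (per the "title2line2sentences"
# comment above). The real `get_evidence_sentence_list` lives elsewhere in
# the repo and its tag format may differ; this illustrative version is
# suffixed `_sketch` to avoid shadowing it.
def get_evidence_sentence_list_sketch(predicted_sentences, t2l2s,
                                      prependlinum=False):
    sentences = []
    for title, linum in predicted_sentences:
        # look up the sentence for this (title, line) pair, empty if missing
        sentence = t2l2s.get(title, {}).get(linum, "")
        if prependlinum:
            # hypothetical line-number tag; the actual markup may differ
            sentence = "<{}> {}".format(linum, sentence)
        sentences.append(sentence)
    return sentences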
def read_ir_result(path, n_sentences=5):
    """Read the results of sentence retrieval."""
    short_evidences_counter = 0
    instances = read_jsonl(path)
    for instance in instances:
        if len(instance['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        # keep only the first n_sentences sentences
        instance['predicted_sentences'] = instance[
            'predicted_sentences'][:n_sentences]
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    # make a list of all titles
    titles = list()
    for instance in instances:
        titles.extend([title for title, _ in instance['predicted_sentences']])
    t2l2s = load_doclines(titles, t2jnum)

    # attach the resolved evidence sentences
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)
    return instances
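# Example usage (path is illustrative): truncate each instance's retrieved
# evidence to the top five (title, line) pairs and attach the resolved text.
#
# instances = read_ir_result("data/dev.sentences.jsonl", n_sentences=5)
# instances[0]["predicted_sentences"]  # e.g. [["Neal_Schon", 0], ...]
# instances[0]["evidence"]             # the corresponding sentence strings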
def __init__(self,
             jsonl_file,
             n_sentences=5,
             sampling=False,
             use_ev_scores=False,
             test=False):
    instances = read_jsonl(jsonl_file)
    if sampling:
        instances = sample(instances)
    self.instances = instances
    self.n_sentences = n_sentences
    self.test = test
    self.use_ev_scores = use_ev_scores
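# Hypothetical usage, assuming this __init__ belongs to a dataset-style
# reader class (the class name below is illustrative, not from the repo):
#
# dataset = FEVERDataset("data/dev.snli.jsonl", n_sentences=5, sampling=False)
# print(len(dataset.instances))  # number of loaded claims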
def read_ir_result(path, n_sentences=5, prependlinum=False,
                   prependtitle=False, concatev=False):
    """
    Returns
    instances: list of dict
        each updated with instance['evidence']: list of evidence sentences
        (list of str), resolved from instance['predicted_sentences']
    """
    short_evidences_counter = 0
    instances = read_jsonl(path)

    # only read n_sentences
    for instance in instances:
        if len(instance["predicted_sentences"]) < n_sentences:
            short_evidences_counter += 1
        instance["predicted_sentences"] = instance[
            "predicted_sentences"][:n_sentences]
    print("short_evidences: {} / {}".format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # make list of titles
    titles = list()
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [
                " ".join(
                    get_evidence_sentence_list(
                        instance["predicted_sentences"],
                        t2l2s,
                        prependlinum=prependlinum,
                        prependtitle=prependtitle))
            ]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"],
                t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle)

    return instances
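# Rough illustration of the formatting flags, assuming the helper prefixes
# each evidence sentence with its source title and/or line number (the
# exact markup is defined by `get_evidence_sentence_list`):
#
#   prependtitle=True, prependlinum=True ->
#       evidence strings like "<title> <line> sentence text ..."
#   concatev=True ->
#       the n_sentences evidence strings joined into one string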
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("src") parser.add_argument("tar") parser.add_argument("--use_ir_pred", action="store_true") parser.add_argument("--prependlinum", action="store_true") parser.add_argument("--prependtitle", action="store_true") parser.add_argument("--convert_test", action="store_true") # parser.add_argument("--testset", help="turn on when you convert test data", action="store_true") args = parser.parse_args() print(args) if args.convert_test: test_in = '''[{"id": 15812, "verifiable": "VERIFIABLE", "label": "REFUTES", "claim": "Peggy Sue Got Married is a Egyptian film released in 1986.", "evidence": [[[31205, 37902, "Peggy_Sue_Got_Married", 0], [31205, 37902, "Francis_Ford_Coppola", 0]], [[31211, 37908, "Peggy_Sue_Got_Married", 0]]], "predicted_pages": ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", "Peggy_Sue_Got_Married_-LRB-song-RRB-", "Peggy_Sue_Got_Married", "Peggy_Sue", "Peggy_Sue_-LRB-band-RRB-"], "predicted_sentences": [["Peggy_Sue_Got_Married", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 0], ["Peggy_Sue_Got_Married_-LRB-song-RRB-", 0], ["Peggy_Sue", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 2]]}, {"id": 229289, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Neal Schon was named in 1954.", "evidence": [[[273626, null, null, null]]], "predicted_pages": ["Neal_Schon", "Neal", "Named", "Was_-LRB-Not_Was-RRB-", "Was"], "predicted_sentences": [["Neal_Schon", 0], ["Neal_Schon", 6], ["Neal_Schon", 5], ["Neal_Schon", 1], ["Neal_Schon", 2]]}]''' print("input:\n", test_in) fever_format = json.loads(test_in) snli_format_instances = convert(fever_format, prependlinum=args.prependlinum, prependtitle=args.prependtitle, use_ir_prediction=args.use_ir_pred) print("\noutput:\n", json.dumps(snli_format_instances, indent=4)) else: assert not os.path.exists(args.tar), "file {} alreadly exists".format( args.tar) keyerr_count = 0 instances = read_jsonl(args.src) snli_format_instances = convert(instances, prependlinum=args.prependlinum, prependtitle=args.prependtitle, use_ir_prediction=args.use_ir_pred) save_jsonl(snli_format_instances, args.tar)
def save_wrong_instances(actual_file, predicted_labels_file,
                         predicted_evidence_file, out_file):
    import pickle

    label_predictions = read_jsonl(predicted_labels_file)
    ev_predictions = read_jsonl(predicted_evidence_file)
    actual = read_jsonl(actual_file)

    # collect every title referenced by predicted or gold evidence
    all_titles = list()
    for ev_pred, act in zip(ev_predictions, actual):
        ev_titles = [title for title, _ in ev_pred["predicted_sentences"]]
        act_titles = [
            title for evidence_set in act["evidence"]
            for _, _, title, _ in evidence_set
        ]
        titles = ev_titles + act_titles
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    counter = 0
    observations = list()

    print("loading vocabulary list...")
    with open("vocab_list.db", "rb") as f:
        vocab = pickle.load(f)

    pos_counter = 0
    neg_counter = 0
    print("processing predictions...")
    for label_pred, ev_pred, act in tqdm(
            zip(label_predictions, ev_predictions, actual)):
        actual_label = act["label"]
        assert actual_label == label_pred["actual"]
        pred_label = label_pred["predicted"]

        # keep only instances whose predicted label matches the gold label;
        # their evidence is inspected below
        if pred_label != actual_label:
            continue
        counter += 1

        actual_ev = act["evidence"]
        pred_labels = label_pred["prediction_list"]
        pred_ev = ev_pred["predicted_sentences"]
        pred_ev_sent = resolve_evidences(pred_ev, t2l2s, actual=False)
        claim = act["claim"]
        ev_contained = convert(compare_evidences(actual_ev, pred_ev))
        actual_ev_sent = resolve_evidences(actual_ev, t2l2s)
        assert not (actual_label != "NOT ENOUGH INFO"
                    and len(actual_ev_sent) != len(actual_ev))

        pred_sentence = " ".join(pred_ev_sent)
        ac_sentence = " ".join(sent for sentences in actual_ev_sent
                               for sent in sentences
                               if sent != "**Not Found**")
        unk_words = find_unk(pred_sentence + " " + ac_sentence, vocab)

        # NOTE: the early `continue` above guarantees pred_label ==
        # actual_label here, so neg_counter can never be incremented
        if pred_label == actual_label:
            pos_counter += 1
        else:
            neg_counter += 1

        # overwrite when label is NEI
        if actual_label == "NOT ENOUGH INFO":
            ev_contained = ["-" for e in ev_contained]

        # # skip for NEI or no correct evidence.
        # if ev_contained == ["X"] * 5 and ev_contained != ["-"] * 5:
        #     continue

        label_pred_ev = [
            "<{}> <{}> {}".format(label, contained, ev)
            for label, contained, ev in zip(
                shorten_labels(pred_labels), ev_contained, pred_ev)
        ]
        actual_ev = ev_pred["evidence"]
        observations.append({
            "id": act["id"],
            "claim": claim,
            "predicted_evidences": label_pred_ev,
            "predicted_sentences": pred_ev_sent,
            "predicted_label": pred_label,
            "actual_evidence": actual_ev,
            "actual_sentences": actual_ev_sent,
            "actual_label": actual_label,
            "unk_words": unk_words
        })

    random.shuffle(observations)
    save_jsonl_pretty_print(observations, out_file)
    print("pos_counter", pos_counter)
    print("neg_counter", neg_counter)
    # `counter` tracks instances whose label was predicted correctly
    print("correctly labeled:", counter)
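# Example call (file names are illustrative). Writes a shuffled,
# pretty-printed JSONL of the inspected instances to out_file:
#
# save_wrong_instances(
#     actual_file="data/dev.jsonl",
#     predicted_labels_file="results/dev_labels.jsonl",
#     predicted_evidence_file="results/dev_sentences.jsonl",
#     out_file="results/dev_analysis.jsonl")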