def score_converter_scaled(org_data_file, full_sent_list, scale_prob=0.5, delete_prob=True): """ :param org_data_file: :param full_sent_list: append full_sent_score list to evidence of original data file :param delete_prob: delete the probability for sanity check :param scale_prob: 0.5 :return: """ d_list = common.load_jsonl(org_data_file) augmented_dict = dict() print("Build selected sentences file:", len(full_sent_list)) for sent_item in tqdm(full_sent_list): selection_id = sent_item[ 'selection_id'] # The id for the current one selection. org_id = int(selection_id.split('<##>')[0]) if org_id in augmented_dict: augmented_dict[org_id].append(sent_item) else: augmented_dict[org_id] = [sent_item] for item in d_list: if int(item['id']) not in augmented_dict: # cur_predicted_sentids = [] cur_adv_predicted_sentids = [] else: # cur_predicted_sentids = [] # formating doc_id + c_score.SENTLINT + line_number cur_adv_predicted_sentids = [] sents = augmented_dict[int(item['id'])] # Modify some mechaism here to selection sentence whether by some score or label for sent_i in sents: if sent_i['prob'] >= scale_prob: cur_adv_predicted_sentids.append( (sent_i['sid'], sent_i['score'], sent_i['prob'])) # del sent_i['prob'] cur_adv_predicted_sentids = sorted(cur_adv_predicted_sentids, key=lambda x: -x[1]) item[ 'scored_sentids'] = cur_adv_predicted_sentids[: 5] # Important sentences for scaling training. Jul 21. item['predicted_sentids'] = [ sid for sid, _, _ in item['scored_sentids'] ][:5] item['predicted_evidence'] = convert_evidence2scoring_format( item['predicted_sentids']) item['predicted_label'] = item['label'] # give ground truth label # Removing all score and prob if delete_prob: for sent_item in full_sent_list: if 'score' in sent_item.keys(): del sent_item['score'] del sent_item['prob'] return d_list
def threshold_sampler_insure_unique_list(org_data_file, full_sent_list, prob_threshold=0.5, logist_threshold=None, top_n=5): """ Providing samples to the Training set by a probability threshold on the upstream selected sentences. """ d_list = org_data_file augmented_dict: Dict[int, Dict[str, Dict]] = dict() print("Build selected sentences file:", len(full_sent_list)) for sent_item in tqdm(full_sent_list): selection_id = sent_item[ 'selection_id'] # The id for the current one selection. org_id = int(selection_id.split('<##>')[0]) remain_str = selection_id.split('<##>')[1] # doc_id = remain_str.split(c_scorer.SENT_LINE)[0] # ln = int(remain_str.split(c_scorer.SENT_LINE)[1]) if org_id in augmented_dict: if remain_str not in augmented_dict[org_id]: augmented_dict[org_id][remain_str] = sent_item else: print("Exist") else: augmented_dict[org_id] = {remain_str: sent_item} for item in d_list: if int(item['id']) not in augmented_dict: # print("Potential error?") cur_predicted_sentids = [] else: cur_predicted_sentids = [ ] # formating doc_id + c_score.SENTLINT + line_number sents = augmented_dict[int(item['id'])].values() # Modify some mechaism here to selection sentence whether by some score or label for sent_i in sents: if sent_i['prob'] >= prob_threshold: cur_predicted_sentids.append( (sent_i['sid'], sent_i['score'], sent_i['prob']) ) # Important sentences for scaling training. Jul 21. # del sent_i['prob'] cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1]) item[ 'scored_sentids'] = cur_predicted_sentids[: top_n] # Important sentences for scaling training. Jul 21. item['predicted_sentids'] = [ sid for sid, _, _ in item['scored_sentids'] ][:top_n] item['predicted_evidence'] = convert_evidence2scoring_format( item['predicted_sentids']) # item['predicted_label'] = item['label'] # give ground truth label return d_list
def threshold_sampler(org_data_file, full_sent_list, prob_threshold=0.5, logist_threshold=None, top_n=5): """ Providing samples to the Training set by a probability threshold on the upstream selected sentences. """ d_list = common.load_jsonl(org_data_file) augmented_dict = dict() print("Build selected sentences file:", len(full_sent_list)) for sent_item in tqdm(full_sent_list): selection_id = sent_item[ 'selection_id'] # The id for the current one selection. org_id = int(selection_id.split('<##>')[0]) if org_id in augmented_dict: # change some logic to remove duplicate. augmented_dict[org_id].append(sent_item) else: augmented_dict[org_id] = [sent_item] for item in d_list: if int(item['id']) not in augmented_dict: cur_predicted_sentids = [] else: cur_predicted_sentids = [ ] # formating doc_id + c_score.SENTLINT + line_number sents = augmented_dict[int(item['id'])] # Modify some mechaism here to selection sentence whether by some score or label for sent_i in sents: if sent_i['prob'] >= prob_threshold: cur_predicted_sentids.append( (sent_i['sid'], sent_i['score'], sent_i['prob']) ) # Important sentences for scaling training. Jul 21. # del sent_i['prob'] cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1]) item[ 'scored_sentids'] = cur_predicted_sentids[: top_n] # Important sentences for scaling training. Jul 21. item['predicted_sentids'] = [ sid for sid, _, _ in item['scored_sentids'] ][:top_n] item['predicted_evidence'] = convert_evidence2scoring_format( item['predicted_sentids']) # item['predicted_label'] = item['label'] # give ground truth label return d_list
def score_converter(org_data_file, full_sent_list, top_k=5, prob_thr=0.5): """ Combines sentences of same claim :param org_data_file: :param full_sent_list: append full_sent_score list to evidence of original data file :param top_k: top k sentences to be retrieved :param prob_thr: probability threshold for retrieved sentences :return: """ d_list = common.load_jsonl(org_data_file) augmented_dict = dict() print("Build selected sentences file:", len(full_sent_list)) for sent_item in tqdm(full_sent_list): selection_id = sent_item['selection_id'] org_id = int(selection_id.split('<##>')[0]) if org_id in augmented_dict: augmented_dict[org_id].append(sent_item) else: augmented_dict[org_id] = [sent_item] for item in d_list: if int(item['id']) not in augmented_dict: cur_predicted_sentids = [] else: cur_predicted_sentids = [] sents = augmented_dict[int(item['id'])] for sent_i in sents: if sent_i['prob'] >= prob_thr: cur_predicted_sentids.append( (sent_i['sid'], sent_i['score'])) cur_predicted_sentids = sorted(cur_predicted_sentids, key=lambda x: -x[1]) item['scored_sentids'] = cur_predicted_sentids item['predicted_sentids'] = [sid for sid, _ in item['scored_sentids'] ][:top_k] item['predicted_evidence'] = convert_evidence2scoring_format( item['predicted_sentids']) item['predicted_label'] = item[ 'label'] # give ground truth label (for OFEVER calculation) # Removing all score and prob for sent_item in full_sent_list: if 'score' in sent_item.keys(): del sent_item['score'] del sent_item['prob'] return d_list
def navie_results_builder_for_sanity_check(org_data_file, full_sent_list): """ :param org_data_file: :param full_sent_list: append full_sent_score list to evidence of original data file :return: """ d_list = common.load_jsonl(org_data_file) augmented_dict = dict() print("Build selected sentences file") for sent_item in tqdm(full_sent_list): selection_id = sent_item[ 'selection_id'] # The id for the current one selection. org_id = int(selection_id.split('<##>')[0]) if org_id in augmented_dict: augmented_dict[org_id].append(sent_item) else: augmented_dict[org_id] = [sent_item] for item in d_list: if int(item['id']) not in augmented_dict: cur_predicted_sentids = [] else: cur_predicted_sentids = [ ] # formating doc_id + c_score.SENTLINT + line_number sents = augmented_dict[int(item['id'])] # Modify some mechaism here to selection sentence whether by some score or label for sent_i in sents: if sent_i['selection_label'] == "true": cur_predicted_sentids.append(sent_i['sid']) item['predicted_sentids'] = cur_predicted_sentids item['predicted_evidence'] = convert_evidence2scoring_format( item['predicted_sentids']) item['predicted_label'] = item['label'] return d_list
def select_sent_with_prob_for_eval_list(input_file, additional_file, prob_dict_file, tokenized=False, pipeline=False, is_demo=False): """ This method select sentences with upstream sentence retrieval. :param input_file: This should be the file with 5 sentences selected. :return: """ cursor = fever_db.get_cursor() if isinstance(additional_file, list): additional_d_list = additional_file else: additional_d_list = load_data(additional_file) additional_data_dict = dict() for add_item in additional_d_list: additional_data_dict[add_item['id']] = add_item d_list = input_file for item in tqdm(d_list): e_list = additional_data_dict[item['id']]['predicted_sentids'] if not pipeline: assert additional_data_dict[item['id']]['label'] == item['label'] assert additional_data_dict[ item['id']]['verifiable'] == item['verifiable'] assert additional_data_dict[item['id']]['id'] == item['id'] pred_evidence_list = [] for i, cur_e in enumerate(e_list): doc_id = cur_e.split(c_scorer.SENT_LINE)[0] ln = int(cur_e.split( c_scorer.SENT_LINE)[1]) # Important changes Bugs: July 21 pred_evidence_list.append((doc_id, ln)) pred_evidence = check_sentences.Evidences(pred_evidence_list) evidence_text_list = evidence_list_to_text_list(cursor, pred_evidence, contain_head=True, id_tokenized=tokenized) evidences = sorted(pred_evidence, key=lambda x: (x[0], x[1])) item_id = int(item['id']) evidence_text_list_with_prob = [] for text, (doc_id, ln) in zip(evidence_text_list, evidences): ssid = (item_id, doc_id, int(ln)) if ssid not in prob_dict_file: print("Some sentence pair don't have 'prob'.") prob = 0.5 else: prob = prob_dict_file[ssid]['prob'] assert item['claim'] == prob_dict_file[ssid]['claim'] evidence_text_list_with_prob.append((text, prob)) if tokenized: pass else: item['claim'] = ' '.join(easy_tokenize(item['claim'])) item['evid'] = evidence_text_list_with_prob item['predicted_evidence'] = convert_evidence2scoring_format(e_list) item['predicted_sentids'] = e_list # This change need to be saved. # item['predicted_label'] = additional_data_dict[item['id']]['label'] return d_list
def score_converter(org_data_file, full_sent_list, upstream_file, top_k=5, prob_thr=0.5): """ Combines sentences of same claim and retrieves only top k sentences with probability greater than threshold :param org_data_file: :param full_sent_list: append full_sent_score list to evidence of original data file :param upstream_file: sentences missed during hyperlink sentences generation are gathered from this file :param top_k: top k sentences to be retrieved :param prob_thr: probability threshold for retrieved sentences :return: """ d_list = common.load_jsonl(org_data_file) ans_list = common.load_jsonl(upstream_file) augmented_dict = dict() print("Build selected sentences file:", len(full_sent_list)) for sent_item in tqdm(full_sent_list): selection_id = sent_item['selection_id'][0] org_id = int(selection_id.split('<##>')[0]) if org_id in augmented_dict: augmented_dict[org_id].append(sent_item) else: augmented_dict[org_id] = [sent_item] for item, ans in zip(d_list, ans_list): if int(item['id']) not in augmented_dict: cur_predicted_sentids = [] else: cur_predicted_sentids_dict = dict() cur_predicted_sentids = [] sents = augmented_dict[int(item['id'])] for sent_i in sents: if (sent_i['fsid'], sent_i['fscore']) not in cur_predicted_sentids_dict: cur_predicted_sentids_dict[(sent_i['fsid'], sent_i['fscore'])] = [] assert len(sent_i['selection_id']) == len(sent_i['score']) assert len(sent_i['selection_id']) == len(sent_i['prob']) for sid, score, prob in zip(sent_i['selection_id'],sent_i['score'],sent_i['prob']): if prob >= prob_thr: cur_predicted_sentids_dict[(sent_i['fsid'], sent_i['fscore'])].append((sid.split('<##>')[1], score)) # if a first iteration sentence does not contain hyperlink, it would have been missed before, but added here for sid1, score1, _ in ans['scored_sentids']: isthere = False for sid2, score2 in cur_predicted_sentids_dict: if sid1 == sid2: isthere = True if score1 != score2: print("Something wrong!") break if not isthere: cur_predicted_sentids_dict[(sid1, score1)] = [] sorted_keys = sorted(cur_predicted_sentids_dict, key=lambda x: (-x[1])) for k in sorted_keys: cps_tmp = cur_predicted_sentids_dict[k] cps_tmp = sorted(cps_tmp, key=lambda x: (-x[1])) cur_predicted_sentids.append(k) cur_predicted_sentids.extend(cps_tmp[:1]) item['scored_sentids'] = cur_predicted_sentids # if none of the first iteration sentences contain hyperlinks, they would have been missed before, but added here for sid1, score1, _ in ans['scored_sentids']: isthere = False for sid2, _ in item['scored_sentids']: if sid1 == sid2: isthere = True break if not isthere: item['scored_sentids'].append((sid1, score1)) item['predicted_sentids'] = [sid for sid, _ in item['scored_sentids']][:top_k] item['predicted_evidence'] = convert_evidence2scoring_format(item['predicted_sentids']) item['predicted_label'] = item['label'] # give ground truth label # Removing all score and prob for sent_item in full_sent_list: if 'score' in sent_item.keys(): del sent_item['score'] del sent_item['prob'] return d_list