def enum_true_instance(sel_per_review=0) -> Iterable[Tuple[Claim, Claim]]:
    reviews: List[Review] = load_parsed()

    # rank claim pairs by lexical overlap between the two claim texts
    def rank_fn(e: Tuple[Claim, Claim]):
        claim1, claim2 = e
        return num_common_terms(claim1.text, claim2.text)

    for review in reviews:
        pair_per_review = []
        yes_claim_list = lfilter(lambda c: c.assertion == "YS", review.claim_list)
        no_claim_list = lfilter(lambda c: c.assertion == "NO", review.claim_list)
        for yes_claim in yes_claim_list:
            for no_claim in no_claim_list:
                e = yes_claim, no_claim
                pair_per_review.append(e)
        pair_per_review.sort(key=rank_fn, reverse=True)
        # sel_per_review == 0 means "keep all pairs"
        if sel_per_review == 0:
            pairs = pair_per_review
        else:
            pairs = pair_per_review[:sel_per_review]
        for claim1, claim2 in pairs:
            yield claim1, claim2
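# --- Hedged sketch (not from the original source): the functional helpers used
# throughout this module (lfilter, lmap, lfilter_not, lflatten, left, right,
# foreach) are imported from elsewhere. Assuming they are thin list-returning
# wrappers around the builtins, minimal stand-in implementations would be:
from typing import Callable, Iterable, List, Tuple, TypeVar

A = TypeVar("A")
B = TypeVar("B")

def lmap(fn: Callable[[A], B], xs: Iterable[A]) -> List[B]:
    return list(map(fn, xs))

def lfilter(pred: Callable[[A], bool], xs: Iterable[A]) -> List[A]:
    return list(filter(pred, xs))

def lfilter_not(pred: Callable[[A], bool], xs: Iterable[A]) -> List[A]:
    return [x for x in xs if not pred(x)]

def lflatten(xss: Iterable[Iterable[A]]) -> List[A]:
    return [x for xs in xss for x in xs]

def left(pairs: Iterable[Tuple[A, B]]) -> List[A]:
    return [a for a, _ in pairs]

def right(pairs: Iterable[Tuple[A, B]]) -> List[B]:
    return [b for _, b in pairs]

def foreach(fn: Callable[[A], None], xs: Iterable[A]) -> None:
    for x in xs:
        fn(x)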
def enum_true_instance() -> Iterable[Tuple[Claim, Claim, str]]:
    reviews: List[Review] = load_parsed()
    for review in reviews:
        yes_claim_list = lfilter(lambda c: c.assertion == "YS", review.claim_list)
        no_claim_list = lfilter(lambda c: c.assertion == "NO", review.claim_list)
        for yes_claim in yes_claim_list:
            for no_claim in no_claim_list:
                yield yes_claim, no_claim, "Yes/No from a same review"
                yield no_claim, yes_claim, "No/Yes from a same review"
def __init__(self, split):
    super(ProcessedResourcePredict10, self).__init__(split)
    candidate_docs_d: Dict[QueryID, List[str]] = top100_doc_ids(split)
    new_candidate_docs_d: Dict[QueryID, List[str]] = {}
    for qid, doc_ids in candidate_docs_d.items():
        pos_doc_ids = lfilter(lambda doc_id: self.get_label(qid, doc_id), doc_ids)
        neg_doc_ids = lfilter(lambda doc_id: not self.get_label(qid, doc_id), doc_ids)
        # keep all positives, fill the remaining slots (up to 10) with random negatives
        n_neg = 10 - len(pos_doc_ids)
        random.shuffle(neg_doc_ids)
        doc_ids_selected = pos_doc_ids + neg_doc_ids[:n_neg]
        assert len(doc_ids_selected) <= 10
        new_candidate_docs_d[qid] = doc_ids_selected
    self.candidate_doc_d = new_candidate_docs_d
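# --- Hedged sanity check (toy data, not from the original source): the
# selection rule in __init__ above keeps every positive doc and fills the
# remaining slots with shuffled negatives, capped at 10 total.
import random

pos_doc_ids = ["p1", "p2"]
neg_doc_ids = ["n{}".format(i) for i in range(20)]
random.shuffle(neg_doc_ids)
doc_ids_selected = pos_doc_ids + neg_doc_ids[:10 - len(pos_doc_ids)]
assert len(doc_ids_selected) <= 10
assert all(p in doc_ids_selected for p in pos_doc_ids)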
def enum_neg_instance2() -> Iterable[Tuple[Claim, Claim, str]]:
    reviews: List[Review] = load_parsed()
    for review in reviews:
        yes_claim_list = lfilter(lambda c: c.assertion == "YS", review.claim_list)
        no_claim_list = lfilter(lambda c: c.assertion == "NO", review.claim_list)
        for c1, c2 in combinations(yes_claim_list, 2):
            yield c1, c2, "{}/{} from a same review".format(c1.assertion, c2.assertion)
        for c1, c2 in combinations(no_claim_list, 2):
            yield c1, c2, "{}/{} from a same review".format(c1.assertion, c2.assertion)
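# --- Note on combinations above (itertools.combinations): each unordered
# same-polarity pair is yielded exactly once, unlike the yes/no cross product
# in enum_true_instance, which emits both orders. A quick standalone check:
from itertools import combinations
assert list(combinations("ABC", 2)) == [("A", "B"), ("A", "C"), ("B", "C")]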
def combine_subjectivity_annotation(doc: MPQARawDoc,
                                    ann_list: List[MPQAAnnLine]) -> MPQADocSubjectiveInfo:
    def is_sentence_annot(ann: MPQAAnnLine) -> bool:
        return ann.ann_type == "GATE_sentence"

    # identify sentences
    sentences = lfilter(is_sentence_annot, ann_list)
    sentences.sort(key=lambda s: s.span[0])
    if not sentences:
        print(ann_list)
    assert sentences

    def is_it_about_subjective(ann: MPQAAnnLine) -> bool:
        return ann.ann_type in [EXPRESSIVE_SUBJECTIVITY, DIRECT_SUBJECTIVITY]

    # filter subjectivity related ones
    ann_about_subjectivity = lfilter(is_it_about_subjective, ann_list)

    def find_sentence(span) -> int:
        st, ed = span
        for s in sentences:
            st_s, ed_s = s.span
            if st_s <= st and ed <= ed_s:
                return s.id
        raise KeyError()

    # Match sentence with annotation
    global num_error
    s_list_to_ann_list: Dict[int, List] = defaultdict(list)
    for annot in ann_about_subjectivity:
        try:
            if annot.span == (0, 0):
                continue
            sentence_id = find_sentence(annot.span)
            s_list_to_ann_list[sentence_id].append(annot)
        except KeyError:
            num_error += 1

    annot_sent_list = []
    for raw_sent in sentences:
        sent_anns = s_list_to_ann_list[raw_sent.id]
        tags = [a.ann_type for a in sent_anns]
        annot_sent = Sentence(raw_sent.id, raw_sent.span, tags, sent_anns)
        annot_sent_list.append(annot_sent)
    return MPQADocSubjectiveInfo(doc.doc_id, doc.content, annot_sent_list)
def filter_avail(claims):
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(FileName("resolute_dict_580_606"))
    cid_list: List[int] = lmap(lambda x: int(x.split("_")[0]), cpid_resolute.values())
    cid_set: Set[int] = set(cid_list)
    return lfilter(lambda x: x['cId'] in cid_set, claims)
def generate_instances(self, claim: Dict, data_id_manager) -> List[Payload]:
    cid = claim['cId']
    claim_text = claim['text']
    perspectives = self.candidate_perspective[cid]
    passages = self.cid_to_passages[cid]
    if self.filter_good:
        filter_condition = score_over_zero
    else:
        def filter_condition(dummy):
            return True

    good_passages: List[List[str]] = left(lfilter(filter_condition, passages))

    output = []
    for pid in perspectives:
        is_correct = any(pid in cluster for cluster in self.gold[cid])
        for passage_idx, passage in enumerate(good_passages):
            perspective = perspective_getter(pid)
            info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
            p = Payload(passage, claim_text, perspective, data_id_manager.assign(info), is_correct)
            output.append(p)
    return output
def generate_instances(self, claim: Dict, data_id_manager: DataIDManager) -> List[Instance]:
    cid = claim['cId']
    claim_text = claim['text']
    passages = self.cid_to_passages[cid]
    good_passages: List[List[str]] = left(lfilter(score_over_zero, passages))
    not_good_passages: List[List[str]] = left(lfilter_not(score_over_zero, passages))
    n_good = len(good_passages)
    n_not_good = len(not_good_passages)
    random_passage = [self.random_sample(cid) for _ in range(10)]

    def make_instance(passage, label):
        info = {'cid': cid}
        return Instance(claim_text, passage, label, data_id_manager.assign(info))

    # positives: passages scored over zero; negatives: the rest, plus random samples
    l1 = lmap(lambda p: make_instance(p, 1), good_passages)
    l2 = lmap(lambda p: make_instance(p, 0), not_good_passages)
    l3 = lmap(lambda p: make_instance(p, 0), random_passage)
    print("g : ng : rand = {} : {} : {}".format(len(l1), len(l2), len(l3)))
    return l1 + l2 + l3
def main():
    train_data = load_argu_data_from_pickle("training")
    averager = Averager()
    for text, label in train_data[:200]:
        print(label)
        raw_text = text.text
        text_list: List[str] = raw_text.split("\n\n")

        # keep lines that contain non-whitespace content
        def is_non_empty_line(l):
            return l.strip()

        text_list = lfilter(is_non_empty_line, text_list)
        sentence_list = lflatten(lmap(sent_tokenize, text_list))

        # drop bracketed citation markers and raw URLs
        def is_reference(l):
            if len(l) < 3:
                return False
            if l[0] == "[" and l[1] == "i":
                return True
            if l[0] == "[" and l[2] == "]":
                return True
            if "http://" in l:
                return True
            return False

        sentence_list = lfilter_not(is_reference, sentence_list)
        averager.append(len(sentence_list))
    print(averager.get_average())
def work(st, ed):
    st = int(st)
    ed = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    ci = DynRankedListInterface(make_doc_query, q_config_id)
    all_data_points = load_train_data_point()
    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    num_request = 10000
    todo = all_data_points[st:ed]
    # skip data points whose results are already in the DB
    not_done = lfilter(partial(db_not_contains, q_config_id), todo)
    queries: List[DocQuery] = lmap(datapoint_to_doc_query, not_done)
    print("Executing {} queries".format(len(queries)))
    ranked_list_dict: Dict[str, List[SimpleRankedListEntry]] = \
        send_doc_queries(ci.disk_path, num_request, queries, 600)
    qid_list = lmap(dp_to_qid, not_done)
    print("{} of {} succeeded".format(len(ranked_list_dict), len(queries)))

    def add_to_db(query_id: str):
        if query_id in ranked_list_dict:
            r = ranked_list_dict[query_id]
            q_res_id: str = "{}_{}".format(query_id, q_config_id)
            if not has_key(QueryResult, q_res_id):
                save(QueryResult, q_res_id, r)

    foreach(add_to_db, qid_list)
    flush()
def collect_good_passages(data_id_to_info: Dict[int, Dict],
                          passage_score_path: FilePath,
                          config: Dict):
    recover_subtokens = get_recover_subtokens()
    score_cut = config['score_cut']
    top_k = config['top_k']
    grouped_scores: Dict[int, List[Dict]] = read_passage_scores(passage_score_path,
                                                                data_id_to_info,
                                                                recover_subtokens)

    def get_score_from_logit(logits):
        return scipy.special.softmax(logits)[1]

    def is_good(d: Dict):
        score = get_score_from_logit(d['logits'])
        return score >= score_cut

    output = []
    num_passages = []
    for cid, passages in grouped_scores.items():
        good_passages = lfilter(is_good, passages)
        good_passages.sort(key=lambda d: get_score_from_logit(d['logits']), reverse=True)
        num_passages.append(len(good_passages))
        if good_passages:
            output.append((cid, good_passages[:top_k]))
        else:
            # no passage passed the cut; sorted scores left here for inspection
            scores = list([get_score_from_logit(d['logits']) for d in passages])
            scores.sort(reverse=True)

    print(num_passages)
    print("{} of {} claims have passages".format(len(output), len(grouped_scores)))
    return output
def select_vertices_edges(counter) -> Tuple[Edges, List[Any]]:
    def is_not_funct(word):
        if len(word) > 2:
            return True
        return word not in ",.)(:'\"`-?''``,%"

    # print("total pairs", len(counter))
    vertice_counter = get_vertices_info(counter)
    # print("total terms", len(vertice_counter))
    common_vertices = list([(k, cnt) for k, cnt in vertice_counter.items() if cnt > 100])
    common_vertices.sort(key=lambda x: x[1], reverse=True)
    # print(left(common_vertices[:20]))
    # print("Terms with more than 100 appearance : ", len(common_vertices))
    valid_vertices: List[Any] = lfilter(is_not_funct, left(common_vertices))
    valid_vertex_set = set(valid_vertices)  # set for O(1) membership tests
    valid_pairs = list([((a, b), cnt) for (a, b), cnt in counter.items()
                        if a in valid_vertex_set and b in valid_vertex_set])
    # print("valid pairs", len(valid_pairs))
    unnormalized_edges: Dict[Any, Dict] = {}
    for (a, b), cnt in valid_pairs:
        if a not in unnormalized_edges:
            unnormalized_edges[a] = Counter()
        unnormalized_edges[a][b] += cnt

    # normalize outgoing edge counts into per-vertex probabilities
    edges = {}
    for vertex_a, raw_edges in unnormalized_edges.items():
        total = sum(raw_edges.values())
        local_edges = Counter()
        for vertex_b, cnt in raw_edges.items():
            prob = cnt / total
            local_edges[vertex_b] = prob
        edges[vertex_a] = local_edges
    return Edges(edges), valid_vertices
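# --- Hedged illustration (toy data; not the repository's Edges or
# get_vertices_info): the normalization step above turns raw co-occurrence
# counts into per-vertex transition probabilities that sum to 1.
from collections import Counter

pair_counter = Counter({("dog", "cat"): 3, ("dog", "bird"): 1})
unnormalized = {}
for (a, b), cnt in pair_counter.items():
    unnormalized.setdefault(a, Counter())[b] += cnt
normalized = {a: Counter({b: c / sum(raw.values()) for b, c in raw.items()})
              for a, raw in unnormalized.items()}
assert abs(sum(normalized["dog"].values()) - 1.0) < 1e-9
assert normalized["dog"]["cat"] == 0.75  # 3 of 4 "dog" co-occurrences are with "cat"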
def extract_qk_unit(info_path, pred_path, config_path) -> Iterable[QKUnit]:
    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions = join_prediction_with_info(pred_path, info)
    grouped: Dict[str, List[Dict]] = group_by(predictions, lambda x: x['query'].query_id)
    config = json.load(open(config_path, "r"))
    score_cut = config['score_cut']
    top_k = config['top_k']

    def is_good(entry):
        return get_regression_score(entry) > score_cut

    select_rate_list = []
    qk_units = []
    for qid, entries in grouped.items():
        any_entry = entries[0]
        query = any_entry['query']
        good_entries = lfilter(is_good, entries)
        good_entries.sort(key=get_regression_score, reverse=True)
        selected_entries = good_entries[:top_k]
        if not selected_entries:
            continue
        kd_list = lmap(lambda x: x['kdp'], selected_entries)
        qk_units.append((query, kd_list))
        select_rate = len(selected_entries) / len(entries)
        select_rate_list.append(select_rate)
    print("{} of {} qk units selected".format(len(qk_units), len(grouped)))
    print("average select rate", average(select_rate_list))
    return qk_units
def get_feature_binary_model(claim_id,
                             perspective_id,
                             claim_text,
                             perspective_text,
                             ci: DynRankedListInterface,
                             is_mention_fn: Callable[[Counter[str], str, str], bool],
                             ) -> Tuple[Counter, int]:
    def is_mention(doc: Counter) -> bool:
        return is_mention_fn(doc, claim_text, perspective_text)

    print(claim_id, perspective_id)
    ranked_docs: List[SimpleRankedListEntry] = ci.query(claim_id, perspective_id,
                                                        claim_text, perspective_text)
    ranked_docs = ranked_docs[:100]
    print("{} docs in ranked list".format(len(ranked_docs)))
    doc_id_list: List[str] = lmap(get_doc_id, ranked_docs)
    tf_d = load_multiple(CluewebDocTF, doc_id_list, True)
    not_found = []
    for idx, doc_id in enumerate(doc_id_list):
        if doc_id not in tf_d:
            not_found.append(idx)
    ranked_docs_tf = tf_d.values()
    mentioned_docs: List[Counter] = lfilter(is_mention, ranked_docs_tf)
    print("Found doc", len(tf_d), "mentioned doc", len(mentioned_docs))
    # average relative term frequencies over the mentioning documents
    docs_rel_freq: List[Counter] = lmap(div_by_doc_len, mentioned_docs)
    num_doc: int = len(docs_rel_freq)
    p_w_m: Counter = average_tf_over_docs(docs_rel_freq, num_doc)
    return p_w_m, num_doc
def extract_predictions(score_d, split):
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))

    def make_decisions(e: Tuple[int, List[Dict]]):
        cid, p_list = e
        decisions = []
        for p in p_list:
            pid = int(p['pid'])
            query_id = CPIDPair((cid, pid))
            # claims without a score fall back to 0
            score = score_d[query_id] if query_id in score_d else 0
            binary = 1 if score > 0.5 else 0
            decisions.append((cid, pid, binary))
        return cid, decisions

    predictions = lmap(make_decisions, candidates)
    return predictions
def stats():
    entries = list(read())
    print("Total items", len(entries))
    unique = set()
    for e in entries:
        unique.add((e.doc_id, e.part_idx))
    print("Unique passages", len(unique))
    avg_value_list = lmap(lambda x: x.avg_value, entries)
    predicted_score_list = lmap(lambda x: x.predicted_score, entries)
    good_doc = set()
    for e in entries:
        if e.predicted_score > 0.9:
            good_doc.add(e.doc_id)
    r = get_correlation(avg_value_list, predicted_score_list)
    print(r)
    over_09 = lfilter(lambda x: x.predicted_score > 0.9, entries)
    under_01 = lfilter(lambda x: x.predicted_score < 0.1, entries)
    doc_over_09 = lfilter(lambda x: x.doc_id in good_doc, entries)
    doc_over_09_under_01 = lfilter(lambda x: x.doc_id in good_doc, under_01)

    def is_good(x):
        return x.avg_value > 0.01

    def is_bad(x):
        return x.avg_value < -0.01

    for criteria in [is_good, is_bad]:
        good_global = lfilter(criteria, entries)
        good_over_09 = lfilter(criteria, over_09)
        good_under_01 = lfilter(criteria, under_01)
        good_doc_over_09 = lfilter(criteria, doc_over_09)
        good_doc_over_09_under_01 = lfilter(criteria, doc_over_09_under_01)
        job = criteria.__name__
        print("global {} rate".format(job), get_rate_str(len(good_global), len(entries)))
        print("over 09 {} rate".format(job), get_rate_str(len(good_over_09), len(over_09)))
        print("under 01 {} rate".format(job), get_rate_str(len(good_under_01), len(under_01)))
        print("doc over 09 {} rate".format(job),
              get_rate_str(len(good_doc_over_09), len(doc_over_09)))
        print("doc over 09 under 01 {} rate".format(job),
              get_rate_str(len(good_doc_over_09_under_01), len(doc_over_09_under_01)))
def paragraph_scorer(idf_fn: Callable[[str], float],
                     q_terms: Set[str],
                     paragraph: List[str]) -> float:
    paragraph_terms = set(paragraph)
    mentioned_terms = lfilter(lambda x: x in paragraph_terms, q_terms)
    mentioned_terms = re_tokenize(mentioned_terms)
    score = sum(lmap(idf_fn, mentioned_terms))
    return score
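# --- Hedged usage sketch for paragraph_scorer above. idf_fn is supplied by the
# caller; a toy dict-backed idf is enough to see that the score is the summed
# idf of query terms found in the paragraph. (Assumes re_tokenize leaves
# single-word terms unchanged, which holds for this toy input.)
toy_idf = {"climate": 3.2, "tax": 2.1, "the": 0.1}
score = paragraph_scorer(lambda t: toy_idf.get(t, 0.0),
                         q_terms={"climate", "tax"},
                         paragraph=["the", "climate", "is", "changing"])
# only "climate" is mentioned, so score == 3.2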
def a_relevant(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.5
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        # average log-odds over passage terms, ignoring stopwords
        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1
        all_passages.extend(passages)
        entries.append((c, passages))
    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum, num_pos_exists))
    data = entries, all_passages
    save_to_pickle(data, save_name)
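# --- Hedged standalone sketch of the scoring used above (toy, self-contained):
# smooth() presumably interpolates the claim LM with the background LM,
# P(t | topic) = alpha * P_claim(t) + (1 - alpha) * P_bg(t), and
# get_passage_score averages log P(t | topic) - log P(t | bg) over the
# passage's terms. This is an illustration, not the repository's helpers.
import math
from collections import Counter

def log_odds_passage_score(passage, claim_lm: Counter, bg_lm: Counter, alpha=0.5):
    def log_odd(t):
        p_topic = alpha * claim_lm[t] + (1 - alpha) * bg_lm[t]
        if p_topic <= 0 or bg_lm[t] <= 0:
            return 0.0
        return math.log(p_topic) - math.log(bg_lm[t])
    return sum(log_odd(t) for t in passage) / len(passage) if passage else 0.0

claim_lm = Counter({"vaccine": 0.3, "safe": 0.2, "the": 0.5})
bg_lm = Counter({"vaccine": 0.01, "safe": 0.04, "the": 0.95})
assert log_odds_passage_score(["vaccine", "safe"], claim_lm, bg_lm) > 0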
def generate_instances(self, claim: Dict, data_id_manager) -> List[PairedInstance]:
    cid = claim['cId']
    perspective_clusters: List[List[int]] = self.gold[cid]
    passages = self.cid_to_passages[cid]
    gold_candidate_texts: List[str] = flatten_map(perspective_getter, perspective_clusters)
    good_passages: List[List[str]] = left(lfilter(score_over_zero, passages))
    not_good_passages: List[List[str]] = left(lfilter_not(score_over_zero, passages))
    # print("good/not_good passages : {}/{}".format(len(good_passages), len(not_good_passages)))
    # make good vs not_good pairs (about 100 items)
    pair_list_g_ng: List[Tuple[List[str], List[str]]] = generate_pairwise_combinations(
        not_good_passages, good_passages, True)
    # make not_good vs random pairs (about 100 items)
    pair_list_ng_rand: List[Tuple[List[str], List[str]]] = list([
        (inst, self.random_sample(cid)) for inst in not_good_passages
    ])
    # generate (candidate_texts) x (two pair lists), limiting to 5 * len(two pair lists) = 1000
    max_insts = 100 * 2 * 5

    def infinite_passage_iterator():
        while True:
            for pair in pair_list_g_ng:
                strict_good = 1
                strict_bad = 0
                yield pair, strict_good, strict_bad
            for pair in pair_list_ng_rand:
                strict_good = 0
                strict_bad = 1
                yield pair, strict_good, strict_bad

    itr = infinite_passage_iterator()
    all_passage_pair_len = len(pair_list_g_ng) + len(pair_list_ng_rand)
    n_passage_per_inst = int(max_insts / len(gold_candidate_texts)) + 1
    n_passage_per_inst = min(all_passage_pair_len, n_passage_per_inst)
    all_insts = []
    for candidate in gold_candidate_texts:
        for _ in range(n_passage_per_inst):
            passage_pair, strict_good, strict_bad = next(itr)
            passage_good, passage_worse = passage_pair
            insts = PairedInstance(passage_good, passage_worse, candidate,
                                   strict_good, strict_bad)
            all_insts.append(insts)
    return all_insts
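# --- Hedged sketch of generate_pairwise_combinations, inferred only from the
# call site above (the real helper is imported from elsewhere and not shown).
# The unpacking "passage_good, passage_worse = passage_pair" suggests
# (better, worse) tuples; the boolean plausibly controls shuffling. This is an
# assumption, not the actual implementation.
import itertools
import random

def generate_pairwise_combinations_sketch(worse_list, better_list, shuffle=True):
    pairs = [(better, worse)
             for better, worse in itertools.product(better_list, worse_list)]
    if shuffle:
        random.shuffle(pairs)
    return pairs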
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    stopwords = load_stopwords_for_query()
    alpha = 0.3
    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)
        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]
            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        all_passages.extend(passages)
        # keep only passages with positive log-odds score
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)
        entries.append((c, a_rel_passages))
    data = entries, all_passages
    save_to_pickle(data, "pc_train_a_passages")
def filter_map(qk_unit: QKUnit):
    query, kdp_list = qk_unit
    good_doc_list = good_doc_list_d[query.query_id]

    def is_good(kdp):
        return kdp.doc_id in good_doc_list

    new_kdp_list = lfilter(is_good, kdp_list)
    print("{} -> {}".format(len(kdp_list), len(new_kdp_list)))
    if not new_kdp_list:
        stat_count["no kdp"] += 1
    return query, new_kdp_list
def get_ap_list_from_score_d(score_d, split):
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))
    predictions = predict_from_dict(score_d, sub_candidates, 50)
    cids = left(predictions)
    ap_list = get_average_precision_list(predictions, False)
    return ap_list, cids
def eval_map(split, score_d: Dict[CPIDPair, float], debug=False):
    # load pre-computed perspectives
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(split)
    # only evaluate what's available
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))
    print(left(sub_candidates))
    predictions = predict_from_dict(score_d, sub_candidates, 50)
    return evaluate_map(predictions, debug)
def work(self, job_id):
    features: List[ParagraphFeature] = pickle.load(
        open(os.path.join(self.input_dir, str(job_id)), "rb"))

    def include(f: ParagraphFeature) -> bool:
        return f.datapoint.id in self.dp_id_set

    features = lfilter(include, features)
    if features:
        self.write(features, job_id)
    else:
        print("No features")
def filter_with_ranked_list(
        qk_units: List[QKUnit],
        ranked_list_d: Dict[str, List[TrecRankedListEntry]],
        threshold,
        top_k,
) -> List[QKUnit]:
    out_qk_units = []
    for q, k_list in qk_units:
        try:
            cur_ranked_list = ranked_list_d[q.query_id]
            entries: Dict[str, TrecRankedListEntry] = {e.doc_id: e for e in cur_ranked_list}
            n_k_list = len(k_list)
            not_found_set = set()

            def get_score(k: KDP):
                key = k.to_str()
                if key in entries:
                    s: TrecRankedListEntry = entries[key]
                    return s.score
                else:
                    not_found_set.add(key)
                    return -1e10

            k_list.sort(key=get_score, reverse=True)

            def higher(k: KDP) -> bool:
                return get_score(k) >= threshold

            if threshold is not None:
                k_list = lfilter(higher, k_list)
            if top_k is not None and top_k != -1:
                k_list = k_list[:top_k]
            out_qk_units.append((q, k_list))
            if not_found_set:
                print("For query {}, {} of {} do not have scores".format(
                    q.query_id, len(not_found_set), n_k_list))
        except KeyError as e:
            print(e, "KeyError", q.query_id)
    print(lmap(len, right(out_qk_units)))
    return out_qk_units
def paragraph_scorer(paragraph: Paragraph) -> ScoreParagraph:
    paragraph_terms = set(paragraph.tokens)
    mentioned_terms = lfilter(lambda x: x in paragraph_terms, cp_tokens)
    mentioned_terms = re_tokenize(mentioned_terms)

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    score = sum(lmap(idf, mentioned_terms))
    max_score = sum(lmap(idf, cp_tokens))
    return ScoreParagraph(paragraph=paragraph, score=score)
def featurize_fn(voca, voca2idx, datapoint):
    rm_list, label = datapoint
    nonzero = lfilter(lambda x: x > 0, right(rm_list))
    if nonzero:
        nonzero_min = min(nonzero)
    else:
        nonzero_min = 0
    terms = left(rm_list)
    term_ids = lmap(lambda x: voca2idx[x], terms)
    # floor non-positive scores at 0.2x the smallest positive score
    scores = list([s if s > 0 else 0.2 * nonzero_min for s in right(rm_list)])
    v = np.zeros([len(voca)])
    for idx, score in zip(term_ids, scores):
        v[idx] = score
    return v, label
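# --- Hedged usage sketch for featurize_fn above, with toy data:
voca = ["good", "bad", "neutral"]
voca2idx = {t: i for i, t in enumerate(voca)}
datapoint = ([("good", 0.5), ("bad", -1.0)], 1)
v, label = featurize_fn(voca, voca2idx, datapoint)
# v == [0.5, 0.1, 0.0]: "bad" (-1.0) is floored to 0.2 * 0.5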
def idf_scorer(doc: Counter, claim_text: str, perspective_text: str) -> bool:
    cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(perspective_text)
    cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
    cp_tokens = set(cp_tokens)
    mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
    mentioned_terms = re_tokenize(mentioned_terms)

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    score = sum(lmap(idf, mentioned_terms))
    max_score = sum(lmap(idf, cp_tokens))
    # "mentioned": the doc covers at least 80% of the claim+perspective idf mass
    return score > max_score * 0.8
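# --- Note on the idf used above: it is the add-0.5 smoothed inverse document
# frequency, idf(t) = log((N + 0.5) / (df_t + 0.5)), with N from the global
# cdf and df_t from clue12_13_df. A tiny self-contained check:
import math

def smoothed_idf(df_t: int, n_docs: int) -> float:
    return math.log((n_docs + 0.5) / (df_t + 0.5))

assert smoothed_idf(9, 99) > smoothed_idf(49, 99)  # rarer terms weigh more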
def collect_good_passages(data_id_to_info: Dict[str, Dict],
                          passage_score_path: FilePath,
                          config: Dict
                          ) -> List[Tuple[str, List[QKOutEntry]]]:
    global recover_subtokens
    recover_subtokens = get_recover_subtokens()
    score_cut = config['score_cut']
    top_k = config['top_k']
    score_type = config['score_type']
    fetch_field_list = ["logits", "input_ids", "data_id"]
    data: List[Dict] = join_prediction_with_info(passage_score_path,
                                                 data_id_to_info,
                                                 fetch_field_list)
    qk_out_entries: List[QKOutEntry] = lmap(QKOutEntry.from_dict, data)
    grouped: Dict[str, List[QKOutEntry]] = group_by(qk_out_entries, lambda x: x.query.query_id)

    def get_score_from_logit_local(logits) -> float:
        return get_score_from_logit(score_type, logits)

    def get_score(entry: QKOutEntry):
        return get_score_from_logit_local(entry.logits)

    def is_good(qk_out_entry: QKOutEntry):
        score = get_score_from_logit_local(qk_out_entry.logits)
        return score >= score_cut

    output = []
    num_passages = []
    for cid, passages in grouped.items():
        good_passages = lfilter(is_good, passages)
        good_passages.sort(key=get_score, reverse=True)
        num_passages.append(len(good_passages))
        if good_passages:
            output.append((cid, good_passages[:top_k]))
        else:
            # no passage passed the cut; sorted scores left here for inspection
            scores = lmap(get_score, passages)
            scores.sort(reverse=True)

    print(num_passages)
    print("{} of {} queries have passages".format(len(output), len(grouped)))
    return output
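# --- Hedged sketch of get_score_from_logit as called above; the real helper is
# imported elsewhere. Assuming "softmax" takes the positive-class probability
# (as the sibling collect_good_passages does inline) and "regression" reads a
# raw score, it might look like this:
import scipy.special

def get_score_from_logit_sketch(score_type: str, logits) -> float:
    if score_type == "softmax":
        return float(scipy.special.softmax(logits)[1])
    if score_type == "regression":
        return float(logits[0])
    raise ValueError("unknown score_type: {}".format(score_type))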
def main():
    qk_list = load_from_pickle("pc_evidence_qk")
    split = "train"
    split = "dev"
    tprint("Building query lms")
    query_lms = get_query_lms(split)
    split_query_ids = list(query_lms.keys())

    def is_split(qk: QKUnit):
        q, k = qk
        return q.query_id in split_query_ids

    qk_for_split = lfilter(is_split, qk_list)
    tprint("start filtering")
    filtered_qk = filter_qk(qk_for_split, query_lms)
    save_to_pickle(filtered_qk, "pc_evi_filtered_qk_{}".format(split))