import math
import string
from collections import Counter
from typing import Callable, Dict, Iterable, List, Set, Tuple

import nltk

# lfilter, lmap, lmap_pairing, flatten, left, re_tokenize, enum_paragraph,
# TimeEstimator, Paragraph, and ScoreParagraph are assumed to come from this
# project's own utility modules.


def paragraph_scorer(idf_fn: Callable[[str], float],
                     q_terms: Set[str],
                     paragraph: List[str]) -> float:
    """Score a paragraph as the summed idf of the query terms it mentions."""
    paragraph_terms = set(paragraph)
    mentioned_terms = lfilter(lambda x: x in paragraph_terms, q_terms)
    mentioned_terms = re_tokenize(mentioned_terms)
    score = sum(lmap(idf_fn, mentioned_terms))
    return score
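# Hedged usage sketch: the df table, corpus size, and tokens below are toy
# stand-ins (not the project's real ClueWeb12-13 statistics), and the scoring
# is re-stated inline so the sketch runs without the project's list helpers.
def _demo_paragraph_score() -> float:
    toy_df = Counter({"vaccine": 120, "safety": 300, "record": 900})
    toy_cdf = 1000  # toy collection size

    def toy_idf(term: str) -> float:
        return math.log((toy_cdf + 0.5) / (toy_df[term] + 0.5))

    q_terms = {"vaccine", "safety"}
    paragraph = ["the", "vaccine", "safety", "record", "is", "strong"]
    # Same idea as paragraph_scorer: sum idf over the query terms mentioned.
    return sum(toy_idf(t) for t in q_terms if t in set(paragraph))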
def select_paragraph(docs: Dict[str, List[List[str]]],
                     clue12_13_df: Counter,
                     claim_list: List[Dict],
                     strategy: str = "topk",
                     ) -> List[Tuple[str, List[List[str]]]]:
    claim_id_to_text: Dict[int, str] = {c['cId']: c['text'] for c in claim_list}
    cdf = 50 * 1000 * 1000  # approximate document count of the ClueWeb12-13 collection
    top_k = 100  # used by the commented-out "topk" strategy below
    not_found_set = set()

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        # clue12_13_df is a Counter, so unseen terms fall back to a zero count
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    # NOTE: loop variable renamed from `docs` to avoid shadowing the parameter
    for claim_id, claim_docs in docs.items():
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        max_score = sum(lmap(idf, q_terms))  # used by the commented-out "cutoff" strategy

        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str], float]] = lmap_pairing(
                scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(
            flatten(lmap(get_best_per_doc, claim_docs)))
        # Earlier selection strategies, kept for reference:
        # if strategy == "topk":
        #     selected = paragraph_scored_list[:top_k]
        # elif strategy == "cutoff":
        #     cut_off = max_score * 0.6
        #     selected = lfilter(lambda x: x[1] > cut_off, paragraph_scored_list)
        # else:
        #     assert False
        e = claim_id, left(selected)
        r.append(e)
        ticker.tick()
    return r
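# Hedged sketch of the per-document selection step in select_paragraph: score
# every paragraph of a document and keep the single best one. The paragraph
# split and idf function are illustrative assumptions, not project code.
def _best_paragraph(paragraphs: List[List[str]],
                    idf_fn: Callable[[str], float],
                    q_terms: Set[str]) -> Tuple[List[str], float]:
    def score(para: List[str]) -> float:
        terms = set(para)
        return sum(idf_fn(t) for t in q_terms if t in terms)

    # max() replicates the sort-then-take-first logic of get_best_per_doc.
    return max(((p, score(p)) for p in paragraphs), key=lambda x: x[1])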
def paragraph_scorer(paragraph: Paragraph) -> ScoreParagraph:
    # Closure variant: cp_tokens, clue12_13_df, cdf, and not_found_set are
    # captured from the enclosing scope.
    paragraph_terms = set(paragraph.tokens)
    mentioned_terms = lfilter(lambda x: x in paragraph_terms, cp_tokens)
    mentioned_terms = re_tokenize(mentioned_terms)

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    score = sum(lmap(idf, mentioned_terms))
    max_score = sum(lmap(idf, cp_tokens))  # unused in this variant
    return ScoreParagraph(paragraph=paragraph, score=score)
def idf_scorer(doc: Counter, claim_text: str, perspective_text: str) -> bool:
    # A document is judged relevant if the claim+perspective terms it mentions
    # carry more than 80% of those terms' total idf mass.
    cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(perspective_text)
    cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
    cp_tokens = set(cp_tokens)
    mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
    mentioned_terms = re_tokenize(mentioned_terms)

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    score = sum(lmap(idf, mentioned_terms))
    max_score = sum(lmap(idf, cp_tokens))
    return score > max_score * 0.8
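# Hedged sketch of the relevance cutoff used by idf_scorer: a document passes
# when the terms it mentions cover more than 80% of the total idf mass of the
# claim+perspective tokens. All values below are toy numbers for illustration.
def _demo_idf_cutoff() -> bool:
    toy_df = Counter({"climate": 40, "change": 90, "hoax": 5})
    toy_cdf = 1000

    def toy_idf(term: str) -> float:
        return math.log((toy_cdf + 0.5) / (toy_df[term] + 0.5))

    cp_terms = {"climate", "change", "hoax"}
    doc_terms = {"climate", "change"}  # terms the toy document mentions
    score = sum(toy_idf(t) for t in cp_terms & doc_terms)
    max_score = sum(toy_idf(t) for t in cp_terms)
    return score > max_score * 0.8  # same cutoff as idf_scorer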