def optimizeAdding(self, precomputed_query, terms, weights, retrieval_model):
    """
    Greedily add terms one at a time, measuring the retrieval rank after
    each addition, and return the best-scoring prefix of terms.

    :param precomputed_query: precomputed query dict from the testing pipeline
    :param terms: list of (term, weight) tuples, presumably pre-sorted by
        score — TODO confirm against caller
    :param weights: retrieval field weights for this experiment
    :param retrieval_model: retrieval instance used for ranking
    :returns: tuple (selected_terms, best_rank, "Adding", seconds_taken)
    """
    index = 0
    selected_terms = []
    rank = BIG_VALUE
    start_time = time.time()
    history = []
    # NOTE(review): `rank` is never reassigned inside the loop, so the
    # `rank != -1 and rank > 1` guards are always true and the loop simply
    # walks every term — confirm whether rank was meant to be updated from
    # `scores` after each query.
    while rank != -1 and rank > 1 and index < len(terms):
        selected_terms.append(terms[index])
        if terms[index][0] not in self.all_term_scores:
            # debug output: term missing from the precomputed score table
            print(terms[index][0], "not in", self.all_term_scores)
        # rebuild the (term, score) list for the current prefix of terms
        selected_kws = [(term[0], self.all_term_scores.get(term[0], 0))
                        for term in selected_terms]
        scores = self.getQueryRank(precomputed_query, selected_kws,
                                   retrieval_model, weights)
        # snapshot: (terms so far, rank w/ kw boost=1, rank w/ weights, avg rank)
        history.append((deepcopy(selected_terms), scores["rank_kw"],
                        scores["rank_kw_weight"], scores["rank_avg"]))
        index += 1
    if len(history) == 0:
        pick = ([], 0)
    else:
        # print("Chosen terms:", terms,"\n\n")
        # NOTE(review): tokenWeight is applied to a 4-tuple history entry
        # here, whereas elsewhere it takes (term, weight) pairs — presumably
        # it reads element [1] (the rank); verify against tokenWeight's
        # definition.
        pick = min(history, key=lambda x: tokenWeight(x))
        if pick[1] == BIG_VALUE:
            # no prefix ever ranked: fall back to the shortest term list
            pick = min(history, key=lambda x: len(x[0]))
    took = time.time() - start_time
    return pick[0], float(pick[1]), "Adding", took
def takeMinMaxDivW(kws, param=None):
    """
    Scale the weights of the first ``param["num"]`` keywords (ordered by
    weight) by ``param["mul"]``, mutating ``kws`` in place.

    :param kws: dict mapping keyword -> weight; modified in place
    :param param: dict with required keys "num" (how many keywords to scale)
        and "mul" (multiplier), and optional "reverse" (sort descending when
        True; default False, i.e. the lowest-weighted keywords are scaled)
    :returns: None (``kws`` is mutated)
    """
    # Fix: the original used a mutable default argument (param={}).
    if param is None:
        param = {}
    sorted_list = sorted(kws.items(), key=lambda x: tokenWeight(x),
                         reverse=param.get("reverse", False))
    for kw in sorted_list[:param["num"]]:
        kws[kw[0]] *= param["mul"]
def takeMinW(kws, param=1):
    """Boost the ``param`` lowest-weighted keywords by a factor of 10.

    Mutates ``kws`` (keyword -> weight dict) in place; returns None.
    """
    by_weight_ascending = sorted(kws.items(), key=lambda item: tokenWeight(item))
    for keyword, _ in by_weight_ascending[:param]:
        kws[keyword] *= 10
def tweakW(kws, param=None):
    """
    Scale the top- and bottom-weighted keywords in ``kws`` in place.

    The ``param["max_num"]`` highest-weighted keywords are multiplied by
    ``param["max_mul"]``, then the ``param["min_num"]`` lowest-weighted
    keywords (by the original ordering) are multiplied by ``param["min_mul"]``.

    :param kws: dict mapping keyword -> weight; modified in place
    :param param: dict with required keys "max_num", "max_mul", "min_num",
        "min_mul"
    :returns: None (``kws`` is mutated)
    """
    # Fix: the original used a mutable default argument (param={}).
    if param is None:
        param = {}
    max_list = sorted(kws.items(), key=lambda x: tokenWeight(x), reverse=True)
    min_list = list(reversed(max_list))
    for kw in max_list[:param["max_num"]]:
        kws[kw[0]] *= param["max_mul"]
    for kw in min_list[:param["min_num"]]:
        kws[kw[0]] *= param["min_mul"]
def getSortedTerms(term_scores, precomputed_query, options=None):
    """
    Aggregate per-document term scores, sort terms by descending score and
    collect per-term counts from the query.

    :param term_scores: per-document term score dicts, passed to
        addUpAllTermScores
    :param precomputed_query: precomputed query dict from the testing pipeline
    :param options: options dict forwarded to addUpAllTermScores
    :returns: tuple (all_term_scores dict, list of (term, weight) sorted by
        descending aggregated score, counts from the query)
    """
    # Fix: the original used a mutable default argument (options={}).
    if options is None:
        options = {}
    all_term_scores = addUpAllTermScores(term_scores, options=options)
    terms = sorted(six.iteritems(all_term_scores), key=lambda x: x[1], reverse=True)
    counts = getCountsInQueryForMatchingTerms(precomputed_query)
    terms = [(term[0], tokenWeight(term)) for term in terms]
    return all_term_scores, terms, counts
def makeQueryFromContext(context):
    """Build a structured-query token list from a context's best keywords.

    Each entry follows the layout (text, count, boost, bool, field, distance),
    where count is the keyword's frequency among the context tokens and boost
    is its tokenWeight.
    """
    token_counts = Counter(tok["text"].lower() for tok in context["tokens"])
    structured_tokens = []
    for kw in context["best_kws"]:
        text = kw[0]
        structured_tokens.append(
            (text, token_counts[text], tokenWeight(kw), None, None, None))
    return structured_tokens
def adjustDistW(kws, param=1):
    """
    Boost (x10) the weight of every keyword whose weight falls below
    ``param`` times a reference value, mutating ``kws`` in place.

    The reference value is max(weights) / len(kws).
    NOTE(review): that is max-over-count, not the arithmetic mean of the
    weights — presumably intentional as a cheap threshold; confirm.

    :param kws: dict mapping keyword -> weight; modified in place
    :param param: multiplier applied to the reference value to form the
        boosting threshold
    :returns: None (``kws`` is mutated)
    """
    # Fix: guard against an empty dict — the original raised ValueError on
    # max() (and would divide by zero) for empty input.
    if not kws:
        return
    sorted_list = sorted(kws.items(), key=lambda x: tokenWeight(x), reverse=False)
    max_val = max(kws.values())
    mean_val = max_val / len(kws)
    for kw in sorted_list:
        if kws[kw[0]] < mean_val * param:
            kws[kw[0]] *= 10
        else:
            # list is sorted ascending, so no later entry can qualify
            break
def runQueryAndMeasureKeywordSelection(precomputed_query, selected_keywords, retrieval_model, weights, kw_data):
    """
    Run queries to compute scores for the provided keywords, adds them to the [kw_data] dict

    :param precomputed_query: original precomputed_query as provided by testing pipeline
    :param selected_keywords: tuples of (keyword,weight) coming out of exp["keyword_selector"].selectKeywords()
    :param retrieval_model: retrieval instance (e.g. ElasticRetrievalBoost)
    :param weights: the weights used for retrieval at this point, configurable in the experiment
    :param kw_data: dict that will eventually be stored, containing the precomputed_query, selected kws, etc.
    """
    kw_counts = getCountsInQuery(precomputed_query, selected_keywords)

    def measure_with_boost(boost_of):
        # Build a fresh copy of the query whose structured tokens take their
        # boost from boost_of, then run and score it.
        # StructuredToken(token, count, boost, bool, field, distance)
        query = deepcopy(precomputed_query)
        query["structured_query"] = StructuredQuery([{
            "token": kw[0],
            "count": kw_counts.get(kw[0], 0),
            "boost": boost_of(kw)
        } for kw in selected_keywords])
        return runAndMeasureOneQuery(query, weights, retrieval_model)

    # first pass: every selected keyword boosted equally
    kw_data["kw_selection_scores"] = measure_with_boost(lambda kw: 1)
    # second pass: keywords boosted by their selection weight
    kw_data["kw_selection_weight_scores"] = measure_with_boost(tokenWeight)
def runMod(query, mod):
    """Apply the weight-modifier named by ``mod["name"]`` to a query's best
    keywords and return a new query with rebuilt keyword weights.

    Returns None when the query has no best keywords. The modifier function
    is looked up in this module's globals and called with the keyword->weight
    dict (plus ``mod["param"]`` when that is truthy).
    """
    if not query["best_kws"]:
        return None
    weight_map = {kw[0]: tokenWeight(kw) for kw in query["best_kws"]}
    modified_query = deepcopy(query)
    modifier = globals()[mod["name"]]
    if mod.get("param"):
        modifier(weight_map, mod["param"])
    else:
        modifier(weight_map)
    modified_query["best_kws"] = list(weight_map.items())
    modified_query["structured_query"] = makeQueryFromContext(modified_query)
    return modified_query
def getOneContextFeatures(context):
    """
    Prepares a single context's data for any nn.

    Takes ["token_features"] from list of sentences and returns a single
    list of token features, storing extract/weight masks on the context.
    """
    keyword_weights = {kw[0]: tokenWeight(kw) for kw in context["best_kws"]}
    annotated = getAnnotationListsForContext(context["tokens"], keyword_weights)
    tokens, extract_flags, token_weights = zip(*annotated)

    # normalise weights into [0, 1] unless every weight is non-positive
    top_weight = max(token_weights)
    if top_weight > 0:
        token_weights = [w / top_weight for w in token_weights]

    context["extract_mask"] = extract_flags
    context["tokens"] = tokens
    context["weight_mask"] = token_weights
    return context
def listAllKeywordsToExtractFromReader(reader):
    """
    Lists all keywords that are marked as extract:true in a list or reader object
    """
    extracted = []
    for entry in reader:
        if isinstance(entry, dict):
            # dict entries carry contexts; collect tokens matching best_kws
            keyword_weights = {t[0]: tokenWeight(t) for t in entry["best_kws"]}
            for sentence in entry["context"]:
                for token in sentence["token_features"]:
                    lowered = token["text"].lower()
                    if lowered in keyword_weights:
                        extracted.append(lowered)
        elif isinstance(entry, tuple):
            # tuple entries are (token_dict, extract_flag)
            if entry[1]:
                extracted.append(entry[0]["text"])
    return Counter(extracted)
def selectKeywords(self, precomputed_query, doc_list, retrieval_model, parameters, cit, weights, norm_term_scores=None, docFreq=None, maxDocs=None, rawScores=None):
    """
    Select scored keywords (and optionally keyphrases) for a query from
    per-document normalised term scores.

    :param precomputed_query: precomputed query dict from the testing pipeline
    :param parameters: experiment parameters controlling stopword filtering,
        weighting, and keyphrase handling ("use_kps", "kp_method", ...)
    :param norm_term_scores: per-document normalised term score dicts
    :param docFreq: document-frequency data used for stopword filtering
    :returns: list of (term_or_keyphrase, score) tuples
    :raises ValueError: if parameters["kp_method"] is neither "add" nor "sub"
    """
    if parameters.get("use_c3_stopword_list", False):
        norm_term_scores = filterC3Stopwords(norm_term_scores)
    if parameters.get("filter_stopwords", True):
        norm_term_scores = filterStopwords(retrieval_model, norm_term_scores, docFreq)
    all_term_scores, terms, counts = getSortedTerms(norm_term_scores, precomputed_query, options=parameters)
    if not parameters.get("use_weights", True):
        # if parameters.get("use_counts", True):
        #     # (term, count, weight)
        #     terms2 = [(term[0], term[2], 1) for term in terms]
        # else:
        # uniform weighting: every term gets weight 1
        terms2 = [(term[0], 1) for term in terms]
        terms = terms2
    use_kps = parameters.get("use_kps", False)
    if not use_kps:
        # plain term selection, no keyphrase handling
        return [(term[0], tokenWeight(term)) for term in terms]
    kp_method = parameters.get("kp_method", "add")
    # rebind norm_term_scores as a flat term -> weight lookup for KP scoring
    norm_term_scores = {term[0]: tokenWeight(term) for term in terms}
    if kp_method == "add":
        # "add": append each fully-matching keyphrase alongside all terms
        res = []
        for kp in precomputed_query.get("keyphrases", []):
            if any([term not in norm_term_scores for term in kp]):
                # KP doesn't fully match, can't add it
                continue
            score = getKPScore(parameters, norm_term_scores, kp)
            kp = " ".join(kp)
            res.append((kp, score))
        return terms + res
    elif kp_method == "sub":
        # "sub": append keyphrases and drop terms they fully absorb
        res = []
        # NOTE(review): term_counts starts from tokenWeight values (weights,
        # not occurrence counts) yet is decremented by 1 per keyphrase use and
        # filtered with > 0 — this looks like a counts/weights mixup; confirm
        # intended semantics before changing.
        term_counts = {term[0]: tokenWeight(term) for term in terms}
        for kp in precomputed_query.get("keyphrases", []):
            if any([term not in norm_term_scores for term in kp]):
                # KP doesn't fully match, can't add it
                continue
            score = getKPScore(parameters, norm_term_scores, kp)
            for term in kp:
                term_counts[term] -= 1
            kp = " ".join(kp)
            res.append((kp, score))
        terms = [(term, norm_term_scores[term]) for term in term_counts if term_counts[term] > 0]
        return terms + res
    else:
        raise ValueError("Unknown kp_method")
def selectKeywords(self, precomputed_query, doc_list, retrieval_model,
                   parameters, cit, weights, norm_term_scores=None,
                   docFreq=None, maxDocs=None, rawScores=None):
    """Select every word of the query text as a keyword, scored either from
    the per-document normalised term scores or uniformly (1.0).

    Side effect: replaces precomputed_query["structured_query"] with a
    StructuredQuery built from the aggregated term scores.

    :returns: list of (term, weight) tuples sorted by descending score
    """
    # choose the text source: full visible text (with citation/author
    # placeholders blanked out) or just the query text
    if parameters.get("use_all_original_text", False):
        source_text = precomputed_query["vis_text"]
        source_text = source_text.replace("__cit", " ")
        source_text = source_text.replace("__author", " ")
    else:
        source_text = precomputed_query["query_text"]

    all_tokens = re.findall(r"\w+", source_text.lower())
    unique_terms = list(set(all_tokens))
    counts = Counter(all_tokens)

    # build per-document term score tables for every matched document
    if parameters.get("use_weights", True):
        norm_term_scores = {
            guid: {t: norm_term_scores[guid].get(t, 0.0) for t in unique_terms}
            for guid in precomputed_query["match_guids"]
        }
    else:
        norm_term_scores = {
            guid: {t: 1.0 for t in unique_terms}
            for guid in precomputed_query["match_guids"]
        }

    if parameters.get("use_c3_stopword_list", False):
        norm_term_scores = filterC3Stopwords(norm_term_scores)
    if parameters.get("filter_stopwords", True):
        norm_term_scores = filterStopwords(retrieval_model, norm_term_scores, docFreq)

    all_term_scores = addUpAllTermScores(norm_term_scores)
    ranked = sorted(six.iteritems(all_term_scores),
                    key=lambda pair: pair[1], reverse=True)

    precomputed_query["structured_query"] = StructuredQuery([{
        "token": term,
        "boost": all_term_scores[term],
        "count": counts.get(term, 0)
    } for term in all_term_scores])

    return [(pair[0], tokenWeight(pair)) for pair in ranked]