def __init__(self, exp_dir, params=None, train_data_filename="feature_data.json.gz",
             test_data_filename="feature_data_test.json.gz"):
    # Avoid a mutable default argument for params
    if params is None:
        params = {}
    self.exp_dir = exp_dir
    self.features_data_filename = os.path.join(self.exp_dir, train_data_filename)
    self.test_data_filename = os.path.join(self.exp_dir, test_data_filename)
    self.dtype = np.float32
    self.dict_vectorizer = DictVectorizer(dtype=self.dtype, sparse=False)
    self.all_context_features = []
    self.regression = params.get("regression", False)
    self.stopwords = getStopwords(self.exp_dir)
    self.features_to_filter_out = ["lemma", "dep", "pos", "text", "lemma_"]
    # Infer the corpus from the experiment directory path
    self.corpus_label = ""
    if "pmc" in self.exp_dir:
        self.corpus_label = "PMC"
        self.corpus = "pmc"
    elif "aac" in self.exp_dir:
        self.corpus = "aac"
        self.corpus_label = "AAC"
def selectKeywords(self, precomputed_query, doc_list, retrieval_model, parameters,
                   cit, weights, norm_term_scores=None, docFreq=None, maxDocs=None,
                   rawScores=None):
    stopwords = getStopwords(retrieval_model.index_name)
    stopwords = [t for t in list(stopwords) if len(t) > 1]

    norm_term_scores = filterTermScores(norm_term_scores,
                                        docFreq,
                                        min_docs_to_match=0,
                                        max_docs_to_match=None,
                                        min_term_len=2)

    this_pct_of_formulas = termsPercentOfFormulas(rawScores["formulas"], stopwords)
    this_pct_of_match_formulas = termsPercentOfFormulas(rawScores["match_formulas"], stopwords)

    # Normalised term scores are the scores for the matches (added up) divided by the total term value
    all_term_scores, terms, counts = getSortedTerms(norm_term_scores,
                                                    precomputed_query,
                                                    options=parameters)
    this_pct_of_norm_term_scores = termsPercentOfTermScores(all_term_scores, stopwords)

    # The accumulator dicts (self.pct_of_*) are expected to be initialised elsewhere;
    # on the first call, lazily add one list per stopword to each of them
    if len(self.pct_of_formulas) == 0:
        for term2 in stopwords:
            self.pct_of_formulas[term2] = []
            self.pct_of_match_formulas[term2] = []
            self.pct_of_norm_term_scores[term2] = []

    # Append this query's average percentage for each stopword
    for term in this_pct_of_formulas:
        self.pct_of_formulas[term].append(
            sum(this_pct_of_formulas[term]) / len(this_pct_of_formulas[term]))
        self.pct_of_match_formulas[term].append(
            sum(this_pct_of_match_formulas[term]) / len(this_pct_of_match_formulas[term]))
        self.pct_of_norm_term_scores[term].append(
            sum(this_pct_of_norm_term_scores[term]) / len(this_pct_of_norm_term_scores[term]))

    self.num_results += 1

    print("pct_of_formulas - the: %0.2f%%" %
          (sum(self.pct_of_formulas["the"]) / len(self.pct_of_formulas["the"]) * 100))
    print("pct_of_norm_term_scores - the: %0.2f%%" %
          (sum(self.pct_of_norm_term_scores["the"]) / len(self.pct_of_norm_term_scores["the"]) * 100))

    return []
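# Illustrative sketch (not part of the original pipeline): the accumulators above map each
# stopword to a list of per-query average percentages, and reporting just means taking the
# mean of that list, as the prints in selectKeywords() do for "the". The helper name and the
# toy data below are hypothetical.
def _mean_pct(accumulator, term):
    """Return the overall mean, in percent, of the per-query averages stored for `term`."""
    values = accumulator.get(term, [])
    if not values:
        return 0.0
    return sum(values) / len(values) * 100


# Example: _mean_pct({"the": [0.12, 0.18, 0.15]}, "the") -> 15.0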
def filterStopwords(retrieval_model, term_scores, docFreq):
    stopwords = getStopwords(retrieval_model.index_name)
    term_scores = filterTermScores(term_scores,
                                   docFreq,
                                   min_docs_to_match=1,
                                   max_docs_to_match=None,
                                   min_term_len=3,
                                   stopword_list=stopwords)
    return term_scores
def makeBaselineQueries(queries_filename):
    stopwords = getStopwords(getCorpusFromPath(queries_filename))
    with open(queries_filename, "r") as f:
        queries = json.load(f)

    for precomputed_query in queries:
        # Strip citation/author/reference placeholders from the visible query text
        original_query = precomputed_query["vis_text"]
        original_query = original_query.replace("__cit", " ")
        original_query = original_query.replace("__author", " ")
        original_query = original_query.replace("__ref", " ")

        all_tokens = re.findall(r"\w+", original_query.lower())
        terms = list(set(all_tokens))
        counts = Counter(all_tokens)

        # Keep terms that are longer than one character and not stopwords
        terms_to_extract = [t for t in terms if len(t) > 1 and t not in stopwords]
        precomputed_query["structured_query"] = [(t, counts[t], 1) for t in terms_to_extract]

    with open(queries_filename.replace(".json", "_baseline.json"), "w") as f:
        json.dump(queries, f)
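# Minimal sketch of the baseline query construction above on a toy context string, with a
# toy stopword set (the real stopwords come from getStopwords()); the function name and
# inputs are hypothetical. It relies on the module's existing re and Counter imports.
def _example_baseline_structured_query():
    text = "We follow the approach of __cit for parsing parsing errors"
    text = text.replace("__cit", " ")
    tokens = re.findall(r"\w+", text.lower())
    counts = Counter(tokens)
    toy_stopwords = {"we", "the", "of", "for"}
    terms = [t for t in set(tokens) if len(t) > 1 and t not in toy_stopwords]
    # e.g. [("follow", 1, 1), ("approach", 1, 1), ("parsing", 2, 1), ("errors", 1, 1)]
    # (order depends on set iteration; the final element of each tuple is a fixed weight of 1)
    return [(t, counts[t], 1) for t in terms]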
def addManualKPsToQueries(queries_filename, annot_filename=None):
    with open(queries_filename, "r") as f:
        queries = json.load(f)
    if not annot_filename:
        annot_filename = getAnnotFilename(queries_filename)
    all_kps, _ = readAnnotationFile(annot_filename)

    q_dir = os.path.dirname(queries_filename)
    q_name = os.path.splitext(os.path.basename(queries_filename))[0]
    annot_q_file = os.path.join(q_dir, q_name + "_annot.json")

    res = []
    stopwords = getStopwords(q_dir)
    for query in queries:
        query_id = query["file_guid"] + "_" + query["citation_id"]
        context_kw = []
        context_kp = []
        kws = []
        for kp in all_kps[query_id]:
            # Split each annotated keyphrase into individual keywords
            kws = re.split(r"\W+", kp.strip())
            kws = [kw.lower() for kw in kws if kw not in stopwords and len(kw) > 2]
            context_kw.extend(kws)
            # Multi-word keyphrases are also kept whole
            if " " in kp:
                context_kp.append(kp)
        if len(kws) == 0:
            continue
        print(context_kw)
        new_q = makeEvaluationQuery(query, context_kw, context_kp)
        res.append(new_q)

    with open(annot_q_file, "w") as f:
        json.dump(res, f)
    print("Queries written:", len(res))
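# Minimal sketch of the keyphrase handling above: each annotated keyphrase contributes its
# individual words to context_kw, and multi-word phrases are also kept whole in context_kp.
# The keyphrases, stopword set, and function name below are toy examples.
def _example_split_keyphrases():
    toy_kps = ["dependency parsing", "treebank"]
    toy_stopwords = {"the", "of"}
    context_kw, context_kp = [], []
    for kp in toy_kps:
        kws = re.split(r"\W+", kp.strip())
        kws = [kw.lower() for kw in kws if kw not in toy_stopwords and len(kw) > 2]
        context_kw.extend(kws)
        if " " in kp:
            context_kp.append(kp)
    # context_kw -> ["dependency", "parsing", "treebank"], context_kp -> ["dependency parsing"]
    return context_kw, context_kp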
def statsOnPredictionsFile(filename):
    with open(filename, "r") as f:
        queries = json.load(f)
    statsOnPredictions(queries, getStopwords(getCorpusFromPath(filename)))
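# Example usage (the path is hypothetical):
#   statsOnPredictionsFile("experiments/aac_full_text/predictions_test.json")
# getCorpusFromPath() determines which corpus the file belongs to, and getStopwords()
# loads that corpus's stopword list before computing the prediction statistics.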