def processOneQuery(self, precomputed_query):
    """
    Run retrieval and evaluation for a single precomputed query.
    """
    if self.exp.get("queries_classification", "") not in ["", None]:
        q_type = precomputed_query[self.exp.get("queries_classification")]
        if self.per_class_count[q_type] < self.max_per_class_results:
            self.per_class_count[q_type] += 1
        else:
            print("Too many queries of type %s already" % q_type)
            return

    guid = precomputed_query["file_guid"]
    self.logger.total_citations += self.files_dict[guid]["resolvable_citations"]

    all_doc_methods = deepcopy(self.main_all_doc_methods)

    # If we're running per-file resolution and we are now on a different file, load its model
    if not self.exp["full_corpus"] and guid != self.previous_guid:
        self.previous_guid = guid
        self.loadModel(guid)

    # create a dict where every field gets a weight of 1
    for method in self.main_all_doc_methods:
        all_doc_methods[method]["runtime_parameters"] = {
            x: 1 for x in self.main_all_doc_methods[method]["runtime_parameters"]
        }

    self.current_all_doc_methods = all_doc_methods

    # for every method used for extracting BOWs
    for doc_method in all_doc_methods:
        # Log everything if the logger is enabled
        ## self.logger.logReport("Citation: "+precomputed_query["citation_id"]+"\n Query method:"+precomputed_query["query_method"]+" \nDoc method: "+doc_method +"\n")
        ## self.logger.logReport(precomputed_query["query_text"]+"\n")

        # ACTUAL RETRIEVAL HAPPENING - run query
        retrieved = self.tfidfmodels[doc_method].runQuery(
            precomputed_query,
            addExtraWeights(all_doc_methods[doc_method]["runtime_parameters"], self.exp),
            guid,
            max_results=self.exp.get("max_results_recall", MAX_RESULTS_RECALL),
        )

        if not retrieved:  # the query was empty or returned no results
            self.addEmptyResult(guid, precomputed_query, doc_method)
        else:
            self.addResult(guid, precomputed_query, doc_method, retrieved)

    if self.exp.get("add_random_control_result", False):
        self.addRandomControlResult(guid, precomputed_query)

    self.logger.showProgressReport(guid)  # prints out info on how it's going
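
# The per-class capping at the top of processOneQuery() (per_class_count /
# max_per_class_results) is the piece most easily misread. Below is a minimal,
# self-contained sketch of the same pattern; it is illustrative only and not used
# by the pipeline, and the names `queries`, `class_key` and `max_per_class` are
# assumptions made for this example.
from collections import defaultdict


def cap_queries_per_class(queries, class_key, max_per_class):
    """Yield at most `max_per_class` queries of each class, in input order."""
    per_class_count = defaultdict(int)
    for query in queries:
        q_type = query[class_key]
        if per_class_count[q_type] < max_per_class:
            per_class_count[q_type] += 1
            yield query
        # queries over the cap are silently skipped, like the early return above
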
def dynamicWeightValues(self, split_fold):
    """
    Find a good combination of weights using a greedy heuristic: rather than
    testing every possible combination, keep the single best change at each stage.
    """
    all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
    annotated_boost_methods = [x for x in all_doc_methods if all_doc_methods[x]["type"] in ["annotated_boost"]]

    initialization_methods = [1]
    ## initialization_methods=[1,"random"]
    MIN_WEIGHT = 0
    ## self.exp["movements"]=[-1,3]
    self.exp["movements"] = [-1, 6, -2]

    best_weights = {}

    numfolds = self.exp.get("cross_validation_folds", 2)
    ## counter=weightCounterList(exp["weight_values"])

    print("Processing zones ", self.exp["train_weights_for"])

    for query_type in self.exp["train_weights_for"]:
        best_weights[query_type] = {}
        results_compare = []

        retrieval_results = self.loadPrecomputedFormulas(query_type)
        if len(retrieval_results) == 0:
            print("No precomputed formulas for ", query_type)
            continue

        if len(retrieval_results) < numfolds:
            print("Number of results is smaller than number of folds for zone type ", query_type)
            continue

        cv = cross_validation.KFold(
            len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None
        )  # indices=True, k=None
        cv = [k for k in cv]

        traincv, testcv = cv[split_fold]
        if isinstance(retrieval_results, ResultIncrementalReader):
            train_set = retrieval_results.subset(traincv)
        elif isinstance(retrieval_results, list):
            train_set = [retrieval_results[i] for i in traincv]
        else:
            raise ValueError("Unknown class of results")
        ## train_set=retrieval_results.subset(traincv)
        ## train_set=[retrieval_results[i] for i in traincv]

        if len(train_set) == 0:
            print("Training set len is 0!")
            return defaultdict(lambda: 1)

        print("Training for citations in ", query_type, "zones:", len(train_set), "/", len(retrieval_results))
        for method in annotated_boost_methods:
            res = {}

            for weight_initialization in initialization_methods:
                if weight_initialization == 1:
                    ## counter.initWeights(all_doc_methods[method]["runtime_parameters"])
                    weights = {x: 1 for x in all_doc_methods[method]["runtime_parameters"]}
                elif weight_initialization == "random":
                    weights = {x: random.randint(-10, 10) for x in all_doc_methods[method]["runtime_parameters"]}
                    ## counter.weights={x:random.randint(-10,10) for x in all_doc_methods[method]["runtime_parameters"]}

                all_doc_methods[method]["runtime_parameters"] = weights

                print("Computing initial score...")
                scores = self.measurePrecomputedResolution(
                    train_set, method, addExtraWeights(weights, self.exp), query_type
                )

                score_baseline = scores[0][self.exp["metric"]]
                previous_score = score_baseline
                first_baseline = score_baseline
                score_progression = [score_baseline]

                global GLOBAL_FILE_COUNTER
                ## drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
                ## drawScoreProgression(self.exp,score_progression,query_type+"_"+str(GLOBAL_FILE_COUNTER))
                GLOBAL_FILE_COUNTER += 1

                overall_improvement = score_baseline
                passes = 0

                print("Finding best weights...")
                while passes < 3 or overall_improvement > 0:
                    for direction in self.exp["movements"]:  # [-1,6,-2]
                        print("Direction: ", direction)
                        for index in range(len(weights)):
                            ## print("Weight: ", index)
                            weight_name = list(weights.keys())[index]
                            prev_weight = weights[weight_name]
                            # hard lower limit of 0 for weights
                            weights[weight_name] = max(MIN_WEIGHT, weights[weight_name] + direction)

                            scores = self.measurePrecomputedResolution(
                                train_set, method, addExtraWeights(weights, self.exp), query_type
                            )
                            this_score = scores[0][self.exp["metric"]]

                            if this_score <= previous_score:
                                weights[weight_name] = prev_weight
                            else:
                                previous_score = this_score

                    overall_improvement = this_score - score_baseline
                    score_baseline = this_score
                    score_progression.append(this_score)

                    # This is to export the graphs as weights are trained
                    ## drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
                    ## drawScoreProgression(self.exp,{self.exp["metric"]:score_progression},query_type+"_"+str(GLOBAL_FILE_COUNTER))
                    GLOBAL_FILE_COUNTER += 1

                    passes += 1

                scores = self.measurePrecomputedResolution(
                    train_set, method, addExtraWeights(weights, self.exp), query_type
                )
                this_score = scores[0][self.exp["metric"]]

                ## if split_fold is not None:
                ##     split_set_str="_s"+str(split_fold)
                ## else:
                ##     split_set_str=""

                ## print "Weight initialization:",weight_initialization
                improvement = (
                    100 * ((this_score - first_baseline) / float(first_baseline)) if first_baseline > 0 else 0
                )
                print(
                    " Weights found, with score: {:.5f}".format(this_score),
                    " Improvement: {:.2f}%".format(improvement),
                )
                best_weights[query_type][method] = addExtraWeights(weights, self.exp)
                print(" ", list(weights.values()))

                if self.exp.get("smooth_weights", None):
                    # smooth the weights a bit in case they are too extreme
                    for weight in best_weights[query_type][method]:
                        amount = abs(min(1, best_weights[query_type][method][weight]) / float(3))
                        if best_weights[query_type][method][weight] > 1:
                            best_weights[query_type][method][weight] -= amount
                        elif best_weights[query_type][method][weight] < 1:
                            best_weights[query_type][method][weight] += amount

                res[weight_initialization] = this_score
            results_compare.append(res)

        ## better=0
        ## diff=0
        ## for res in results_compare:
        ##     if res["random"] > res[1]:
        ##         better+=1
        ##     diff+=res[1]-res["random"]
        ## print "Random initialization better than dynamic setting",better,"times"
        ## print "Avg difference between methods:",diff/float(len(results_compare))
        for init_method in initialization_methods:
            if len(results_compare) > 0:
                avg = sum([res[init_method] for res in results_compare]) / float(len(results_compare))
            else:
                avg = 0
            print("Avg for ", init_method, ":", avg)

        ## if split_set is not None:
        ##     split_set_str="_s"+str(split_set)
        ## else:
        ##     split_set_str=""
        ## filename=getSafeFilename(self.exp["exp_dir"]+"weights_"+query_type+"_"+str(counter.getPossibleValues())+split_set_str+filename_add+".csv")
        ## data.to_csv(filename)

    return best_weights
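
# Minimal standalone sketch of the greedy search used in dynamicWeightValues() above:
# sweep over the movement steps, nudge one weight at a time, and keep a change only if
# the score improves. `score_fn` stands in for measurePrecomputedResolution(); the
# function and parameter names here are illustrative assumptions, not pipeline API.
def greedy_weight_search(weights, score_fn, movements=(-1, 6, -2), min_weight=0, min_passes=3):
    """Hill-climb `weights` in place, one coordinate at a time, and return the best score."""
    best_score = score_fn(weights)
    baseline = best_score
    passes = 0
    overall_improvement = best_score
    while passes < min_passes or overall_improvement > 0:
        for direction in movements:
            for weight_name in list(weights.keys()):
                prev_weight = weights[weight_name]
                weights[weight_name] = max(min_weight, prev_weight + direction)
                score = score_fn(weights)
                if score <= best_score:
                    weights[weight_name] = prev_weight  # revert: no improvement
                else:
                    best_score = score
        overall_improvement = best_score - baseline
        baseline = best_score
        passes += 1
    return best_score

# e.g. greedy_weight_search({"title": 1, "abstract": 1}, lambda w: -abs(w["title"] - 7))
# drives "title" towards 7 and leaves "abstract" untouched.
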
def measureScoresOfWeights(self, best_weights):
    """
    Apply the weights trained on each fold's training split and report
    how they score on the held-out test split.
    """
    numfolds = self.exp.get("cross_validation_folds", 2)

    results = []
    fold_results = []
    metrics = ["avg_mrr", "avg_ndcg", "avg_precision", "precision_total"]

    print("Experiment:", self.exp["name"])
    print("Metric:", self.exp["metric"])
    print("Weight movements:", self.exp.get("movements", None))

    for split_fold in range(numfolds):
        weights = best_weights[split_fold]
        improvements = []
        better_zones = []
        better_zones_details = []

        for query_type in self.exp["train_weights_for"]:
            retrieval_results = self.loadPrecomputedFormulas(query_type)
            if len(retrieval_results) == 0:
                continue

            if len(retrieval_results) < numfolds:
                print("Number of results is smaller than number of folds for zone type ", query_type)
                continue

            cv = cross_validation.KFold(len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None)
            cv = [k for k in cv]  # run the generator
            traincv, testcv = cv[split_fold]
            if isinstance(retrieval_results, ResultIncrementalReader):
                test_set = retrieval_results.subset(testcv)
            elif isinstance(retrieval_results, list):
                test_set = [retrieval_results[i] for i in testcv]
            else:
                raise ValueError("Unknown class of results")

            for method in weights[query_type]:
                weights_baseline = addExtraWeights(
                    {x: 1 for x in self.all_doc_methods[method]["runtime_parameters"]}, self.exp
                )

                scores = self.measurePrecomputedResolution(test_set, method, weights_baseline, query_type)
                baseline_score = scores[0][self.exp["metric"]]
                ## print "Score for "+query_type+" weights=1:", baseline_score
                result = {
                    "query_type": query_type,
                    "fold": split_fold,
                    "score": baseline_score,
                    "method": method,
                    "type": "baseline",
                    "improvement": None,
                    "pct_improvement": None,
                    "num_data_points": len(retrieval_results),
                }
                for metric in metrics:
                    result[metric] = scores[0][metric]
                for weight in weights[query_type][method]:
                    result[weight] = 1
                results.append(result)

                scores = self.measurePrecomputedResolution(
                    test_set, method, weights[query_type][method], query_type
                )
                this_score = scores[0][self.exp["metric"]]
                ## print "Score with trained weights:",this_score

                impro = this_score - baseline_score
                pct_impro = 100 * (impro / baseline_score) if baseline_score != 0 else 0
                improvements.append((impro * len(test_set)) / len(retrieval_results))

                result = {
                    "query_type": query_type,
                    "fold": split_fold,
                    "score": this_score,
                    "method": method,
                    "type": "weight",
                    "improvement": impro,
                    "pct_improvement": pct_impro,
                    "num_data_points": len(retrieval_results),
                }
                if impro > 0:
                    better_zones.append(query_type)
                    better_zones_details.append((query_type, pct_impro))

                for metric in metrics:
                    result[metric] = scores[0][metric]
                for weight in weights[query_type][method]:
                    result[weight] = weights[query_type][method][weight]
                results.append(result)

        fold_result = {
            "fold": split_fold,
            "avg_improvement": sum(improvements) / float(len(improvements)) if len(improvements) > 0 else 0,
            "num_improved_zones": len([x for x in improvements if x > 0]),
            "num_zones": len(improvements),
            "better_zones": better_zones,
            "better_zones_details": better_zones_details,
        }
        fold_results.append(fold_result)
        print("For fold", split_fold)
        print("Average improvement:", fold_result["avg_improvement"])
        print("Weights better than default in", fold_result["num_improved_zones"], "/", fold_result["num_zones"])
        ## print("Better zones:",better_zones)
        print("Better zones, pct improvement:", better_zones_details)

    xtra = "_".join(self.exp["train_weights_for"])
    data = pd.DataFrame(results)
    data.to_csv(self.exp["exp_dir"] + self.exp["name"] + "_improvements_" + xtra + ".csv")
    fold_data = pd.DataFrame(fold_results)
    fold_data.to_csv(self.exp["exp_dir"] + self.exp["name"] + "_folds_" + xtra + ".csv")
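
# Hedged sketch of how the two methods above are likely wired together; the real driver
# is not part of this section, and `pipeline` (an instance of the class these methods
# belong to) is an assumption made for the example.
def train_and_evaluate_weights(pipeline):
    """Train weights per fold with dynamicWeightValues(), then score them with measureScoresOfWeights()."""
    numfolds = pipeline.exp.get("cross_validation_folds", 2)
    best_weights = {fold: pipeline.dynamicWeightValues(fold) for fold in range(numfolds)}
    pipeline.measureScoresOfWeights(best_weights)
    return best_weights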