Example #1
    def processOneQuery(self, precomputed_query):
        """
            Runs the retrieval and evaluation for a single query
        """
        if self.exp.get("queries_classification", "") not in ["", None]:
            q_type = precomputed_query[self.exp.get("queries_classification")]
            if self.per_class_count[q_type] < self.max_per_class_results:
                self.per_class_count[q_type] += 1
            else:
                print("Too many queries of type %s already" % q_type)
                return

        guid = precomputed_query["file_guid"]
        self.logger.total_citations += self.files_dict[guid]["resolvable_citations"]

        all_doc_methods = deepcopy(self.main_all_doc_methods)

        # If we're running per-file resolution and we are now on a different file, load its model
        if not self.exp["full_corpus"] and guid != self.previous_guid:
            self.previous_guid = guid
            self.loadModel(guid)

        # create a dict where every field gets a weight of 1
        for method in self.main_all_doc_methods:
            all_doc_methods[method]["runtime_parameters"] = {
                x: 1 for x in self.main_all_doc_methods[method]["runtime_parameters"]
            }

        self.current_all_doc_methods = all_doc_methods

        # for every method used for extracting BOWs
        for doc_method in all_doc_methods:
            # Detailed per-citation logging (currently disabled)
            ## self.logger.logReport("Citation: " + precomputed_query["citation_id"] + "\n Query method: " + precomputed_query["query_method"] + " \nDoc method: " + doc_method + "\n")
            ## self.logger.logReport(precomputed_query["query_text"] + "\n")

            # ACTUAL RETRIEVAL HAPPENING - run query
            retrieved = self.tfidfmodels[doc_method].runQuery(
                precomputed_query,
                addExtraWeights(all_doc_methods[doc_method]["runtime_parameters"], self.exp),
                guid,
                max_results=self.exp.get("max_results_recall", MAX_RESULTS_RECALL),
            )

            if not retrieved:  # the query produced no results (e.g. the query text was empty)
                self.addEmptyResult(guid, precomputed_query, doc_method)
            else:
                self.addResult(guid, precomputed_query, doc_method, retrieved)

        if self.exp.get("add_random_control_result", False):
            self.addRandomControlResult(guid, precomputed_query)

        self.logger.showProgressReport(guid)  # print a periodic progress report
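
A standalone sketch of the per-class cap applied at the top of processOneQuery(). The "az" key and the "AIM"/"OWN" labels below are invented for illustration; in the real code the key comes from exp["queries_classification"] and the cap from self.max_per_class_results.

from collections import defaultdict

def cap_queries_per_class(queries, class_key, max_per_class):
    """Yield queries, skipping any class once its cap has been reached."""
    per_class_count = defaultdict(int)
    for query in queries:
        q_type = query[class_key]
        if per_class_count[q_type] >= max_per_class:
            print("Too many queries of type %s already" % q_type)
            continue
        per_class_count[q_type] += 1
        yield query

# Only the first two "AIM" queries are kept; the third is skipped.
queries = [{"az": "AIM"}, {"az": "AIM"}, {"az": "AIM"}, {"az": "OWN"}]
kept = list(cap_queries_per_class(queries, "az", max_per_class=2))
print(len(kept))  # 3
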
Example #2
    def dynamicWeightValues(self, split_fold):
        """
            Find a good combination of weights with a greedy heuristic: instead of
            testing every possible combination, adjust one weight at a time and
            keep only the changes that improve the score.
        """
        all_doc_methods = getDictOfTestingMethods(self.exp["doc_methods"])
        annotated_boost_methods = [x for x in all_doc_methods if all_doc_methods[x]["type"] in ["annotated_boost"]]

        initialization_methods = [1]
        ##    initialization_methods=[1,"random"]
        MIN_WEIGHT = 0
        ##    self.exp["movements"]=[-1,3]
        self.exp["movements"] = [-1, 6, -2]

        best_weights = {}

        numfolds = self.exp.get("cross_validation_folds", 2)
        ##    counter=weightCounterList(exp["weight_values"])

        print("Processing zones ", self.exp["train_weights_for"])

        for query_type in self.exp["train_weights_for"]:
            best_weights[query_type] = {}
            results_compare = []

            retrieval_results = self.loadPrecomputedFormulas(query_type)
            if len(retrieval_results) == 0:
                print("No precomputed formulas for ", query_type)
                continue

            if len(retrieval_results) < numfolds:
                print("Number of results is smaller than number of folds for zone type ", query_type)
                continue

            cv = cross_validation.KFold(
                len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None
            )  # indices=True, k=None
            cv = [k for k in cv]  # run the generator

            traincv, testcv = cv[split_fold]
            if isinstance(retrieval_results, ResultIncrementalReader):
                train_set = retrieval_results.subset(traincv)
            elif isinstance(retrieval_results, list):
                train_set = [retrieval_results[i] for i in traincv]
            else:
                raise ValueError("Unknown class of results")
            ##            train_set=retrieval_results.subset(traincv)
            ##            train_set=[retrieval_results[i] for i in traincv]
            if len(train_set) == 0:
                print("Training set len is 0!")
                return defaultdict(lambda: 1)

            print("Training for citations in ", query_type, "zones:", len(train_set), "/", len(retrieval_results))
            for method in annotated_boost_methods:
                res = {}

                for weight_initalization in initialization_methods:
                    if weight_initalization == 1:
                        ##                    counter.initWeights(all_doc_methods[method]["runtime_parameters"])
                        weights = {x: 1 for x in all_doc_methods[method]["runtime_parameters"]}
                    elif weight_initalization == "random":
                        weights = {x: random.randint(-10, 10) for x in all_doc_methods[method]["runtime_parameters"]}
                    ##                    counter.weights={x:random.randint(-10,10) for x in all_doc_methods[method]["runtime_parameters"]}

                    all_doc_methods[method]["runtime_parameters"] = weights
                    print("Computing initial score...")
                    scores = self.measurePrecomputedResolution(
                        train_set, method, addExtraWeights(weights, self.exp), query_type
                    )

                    score_baseline = scores[0][self.exp["metric"]]
                    previous_score = score_baseline
                    first_baseline = score_baseline
                    score_progression = [score_baseline]

                    global GLOBAL_FILE_COUNTER
                    ##                    drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
                    ##                    drawScoreProgression(self.exp,score_progression,query_type+"_"+str(GLOBAL_FILE_COUNTER))
                    GLOBAL_FILE_COUNTER += 1

                    overall_improvement = score_baseline
                    passes = 0

                    print("Finding best weights...")
                    # Run at least 3 passes; after that, keep going while the last pass improved the score
                    while passes < 3 or overall_improvement > 0:
                        for direction in self.exp["movements"]:  # [-1,6,-2]
                            print("Direction: ", direction)
                            for index in range(len(weights)):
                                ##                                print("Weight: ", index)
                                weight_name = list(weights.keys())[index]  # list() so the keys are indexable in Python 3
                                prev_weight = weights[weight_name]
                                # hard lower limit of 0 for weights
                                weights[weight_name] = max(MIN_WEIGHT, weights[weight_name] + direction)

                                scores = self.measurePrecomputedResolution(
                                    train_set, method, addExtraWeights(weights, self.exp), query_type
                                )
                                this_score = scores[0][self.exp["metric"]]

                                if this_score <= previous_score:
                                    weights[weight_name] = prev_weight
                                else:
                                    previous_score = this_score

                        overall_improvement = this_score - score_baseline
                        score_baseline = this_score
                        score_progression.append(this_score)

                        # Optionally export graphs as weights are trained (drawing is currently disabled)
                        ##                        drawWeights(self.exp,weights,query_type+"_weights_"+str(GLOBAL_FILE_COUNTER))
                        ##                        drawScoreProgression(self.exp,{self.exp["metric"]:score_progression},query_type+"_"+str(GLOBAL_FILE_COUNTER))
                        GLOBAL_FILE_COUNTER += 1

                        passes += 1

                    scores = self.measurePrecomputedResolution(
                        train_set, method, addExtraWeights(weights, self.exp), query_type
                    )
                    this_score = scores[0][self.exp["metric"]]

                    ##                if split_fold is not None:
                    ##                    split_set_str="_s"+str(split_fold)
                    ##                else:
                    ##                    split_set_str=""

                    ##                print "Weight inialization:",weight_initalization
                    improvement = (
                        100 * ((this_score - first_baseline) / float(first_baseline)) if first_baseline > 0 else 0
                    )
                    print(
                        "   Weights found, with score: {:.5f}".format(this_score),
                        " Improvement: {:.2f}%".format(improvement),
                    )
                    best_weights[query_type][method] = addExtraWeights(weights, self.exp)
                    print("   ", list(weights.values()))

                    if self.exp.get("smooth_weights", None):
                        # Smooth extreme weights slightly by nudging each one towards 1
                        for weight in best_weights[query_type][method]:
                            amount = abs(min(1, best_weights[query_type][method][weight]) / float(3))
                            if best_weights[query_type][method][weight] > 1:
                                best_weights[query_type][method][weight] -= amount
                            elif best_weights[query_type][method][weight] < 1:
                                best_weights[query_type][method][weight] += amount

                    res[weight_initalization] = this_score

                results_compare.append(res)

        ##        better=0
        ##        diff=0
        ##    for res in results_compare:
        ##        if res["random"] > res[1]:
        ##            better+=1
        ##        diff+=res[1]-res["random"]

        ##    print "Random inialization better than dynamic setting",better,"times"
        ##    print "Avg difference between methods:",diff/float(len(results_compare))
        for init_method in initialization_methods:
            if len(results_compare) > 0:
                avg = sum([res[init_method] for res in results_compare]) / float(len(results_compare))
            else:
                avg = 0
            print("Avg for ", init_method, ":", avg)
        ##        if split_set is not None:
        ##            split_set_str="_s"+str(split_set)
        ##        else:
        ##            split_set_str=""
        ##        filename=getSafeFilename(self.exp["exp_dir"]+"weights_"+query_type+"_"+str(counter.getPossibleValues())+split_set_str+filename_add+".csv")
        ##        data.to_csv(filename)

        return best_weights
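
The heart of dynamicWeightValues() is a greedy coordinate search: each weight is nudged by the configured movements and a change is kept only if it raises the metric. The sketch below reproduces that loop against a toy quadratic score; the "title"/"abstract" field names and toy_score() are invented stand-ins for measurePrecomputedResolution(), not the original code.

MIN_WEIGHT = 0

def toy_score(weights):
    # Stand-in scorer: peaks at title=7, abstract=5.
    return -((weights["title"] - 7) ** 2 + (weights["abstract"] - 5) ** 2)

def greedy_weight_search(weights, score_fn, movements=(-1, 6, -2), min_passes=3):
    """Greedily adjust one weight at a time, keeping only changes that improve the score."""
    previous_score = baseline = score_fn(weights)
    passes = 0
    overall_improvement = baseline
    while passes < min_passes or overall_improvement > 0:
        for direction in movements:
            for weight_name in list(weights.keys()):
                prev_weight = weights[weight_name]
                # hard lower limit of 0 for weights, as in the original
                weights[weight_name] = max(MIN_WEIGHT, weights[weight_name] + direction)
                this_score = score_fn(weights)
                if this_score <= previous_score:
                    weights[weight_name] = prev_weight  # revert: no improvement
                else:
                    previous_score = this_score
        overall_improvement = previous_score - baseline
        baseline = previous_score
        passes += 1
    return weights

print(greedy_weight_search({"title": 1, "abstract": 1}, toy_score))
# {'title': 7, 'abstract': 5}

With these movements the toy optimum is reached in the first pass; the loop then stops once the mandatory three passes no longer improve the score.
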
Example #3
    def measureScoresOfWeights(self, best_weights):
        """
            Apply the weights precomputed on another split fold and report the resulting scores.
        """

        numfolds = self.exp.get("cross_validation_folds", 2)

        results = []
        fold_results = []
        metrics = ["avg_mrr", "avg_ndcg", "avg_precision", "precision_total"]

        print("Experiment:", self.exp["name"])
        print("Metric:", self.exp["metric"])
        print("Weight movements:", self.exp.get("movements", None))

        for split_fold in range(numfolds):
            weights = best_weights[split_fold]
            improvements = []
            better_zones = []
            better_zones_details = []

            for query_type in self.exp["train_weights_for"]:
                retrieval_results = self.loadPrecomputedFormulas(query_type)
                if len(retrieval_results) == 0:
                    continue

                if len(retrieval_results) < numfolds:
                    print("Number of results is smaller than number of folds for zone type ", query_type)
                    continue

                cv = cross_validation.KFold(len(retrieval_results), n_folds=numfolds, shuffle=False, random_state=None)
                cv = [k for k in cv]  # run the generator
                traincv, testcv = cv[split_fold]
                if isinstance(retrieval_results, ResultIncrementalReader):
                    test_set = retrieval_results.subset(testcv)
                elif isinstance(retrieval_results, list):
                    test_set = [retrieval_results[i] for i in testcv]
                else:
                    raise ValueError("Unknown class of results")

                for method in weights[query_type]:
                    weights_baseline = addExtraWeights(
                        {x: 1 for x in self.all_doc_methods[method]["runtime_parameters"]}, self.exp
                    )

                    scores = self.measurePrecomputedResolution(test_set, method, weights_baseline, query_type)
                    baseline_score = scores[0][self.exp["metric"]]
                    ##            print "Score for "+query_type+" weights=1:", baseline_score
                    result = {
                        "query_type": query_type,
                        "fold": split_fold,
                        "score": baseline_score,
                        "method": method,
                        "type": "baseline",
                        "improvement": None,
                        "pct_improvement": None,
                        "num_data_points": len(retrieval_results),
                    }
                    for metric in metrics:
                        result[metric] = scores[0][metric]
                    for weight in weights[query_type][method]:
                        result[weight] = 1
                    results.append(result)

                    scores = self.measurePrecomputedResolution(
                        test_set, method, weights[query_type][method], query_type
                    )
                    this_score = scores[0][self.exp["metric"]]
                    ##            print "Score with trained weights:",this_score
                    impro = this_score - baseline_score
                    pct_impro = 100 * (impro / baseline_score) if baseline_score != 0 else 0
                    improvements.append((impro * len(test_set)) / len(retrieval_results))

                    result = {
                        "query_type": query_type,
                        "fold": split_fold,
                        "score": this_score,
                        "method": method,
                        "type": "weight",
                        "improvement": impro,
                        "pct_improvement": pct_impro,
                        "num_data_points": len(retrieval_results),
                    }
                    if impro > 0:
                        better_zones.append(query_type)
                        better_zones_details.append((query_type, pct_impro))

                    for metric in metrics:
                        result[metric] = scores[0][metric]
                    for weight in weights[query_type][method]:
                        result[weight] = weights[query_type][method][weight]
                    results.append(result)

            fold_result = {
                "fold": split_fold,
                "avg_improvement": sum(improvements) / float(len(improvements)) if len(improvements) > 0 else 0,
                "num_improved_zones": len([x for x in improvements if x > 0]),
                "num_zones": len(improvements),
                "better_zones": better_zones,
                "better_zones_details": better_zones_details,
            }
            fold_results.append(fold_result)
            print("For fold", split_fold)
            print("Average improvement:", fold_result["avg_improvement"])
            print("Weights better than default in", fold_result["num_improved_zones"], "/", fold_result["num_zones"])
            ##            print("Better zones:",better_zones)
            print("Better zones, pct improvement:", better_zones_details)

        xtra = "_".join(self.exp["train_weights_for"])
        data = pd.DataFrame(results)
        data.to_csv(self.exp["exp_dir"] + self.exp["name"] + "_improvements_" + xtra + ".csv")

        fold_data = pd.DataFrame(fold_results)
        fold_data.to_csv(self.exp["exp_dir"] + self.exp["name"] + "_folds_" + xtra + ".csv")
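
Both dynamicWeightValues() and measureScoresOfWeights() use the legacy sklearn cross_validation.KFold(n, n_folds=...) interface, which was removed in later scikit-learn releases. A minimal sketch of the same fold splitting with the current sklearn API, on dummy data, is shown below; it is an assumed substitute, not the original code.

from sklearn.model_selection import KFold

retrieval_results = list(range(10))  # stand-in for the precomputed formulas
numfolds = 2

# list() materialises the generator, mirroring "cv = [k for k in cv]" above
cv = list(KFold(n_splits=numfolds, shuffle=False).split(retrieval_results))
for split_fold, (traincv, testcv) in enumerate(cv):
    test_set = [retrieval_results[i] for i in testcv]
    print("fold", split_fold, "test set size:", len(test_set))
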