Example 1
    def optimizeAdding(self, precomputed_query, terms, weights,
                       retrieval_model):
        index = 0
        selected_terms = []
        rank = BIG_VALUE

        start_time = time.time()
        history = []
        # greedily add terms one at a time, recording the rank achieved at each step
        while rank != -1 and rank > 1 and index < len(terms):
            selected_terms.append(terms[index])

            if terms[index][0] not in self.all_term_scores:
                # debug aid: flag terms missing from the score table
                print(terms[index][0], "not in", self.all_term_scores)

            selected_kws = [(term[0], self.all_term_scores.get(term[0], 0))
                            for term in selected_terms]

            scores = self.getQueryRank(precomputed_query, selected_kws,
                                       retrieval_model, weights)
            # update the rank so the loop's early-stopping condition can trigger
            rank = scores["rank_kw"]

            history.append((deepcopy(selected_terms), scores["rank_kw"],
                            scores["rank_kw_weight"], scores["rank_avg"]))
            index += 1

        if len(history) == 0:
            pick = ([], 0)
        else:
            # choose the run with the best (lowest) rank
            pick = min(history, key=lambda x: tokenWeight(x))
            if pick[1] == BIG_VALUE:
                # nothing ever ranked: fall back to the shortest term list
                pick = min(history, key=lambda x: len(x[0]))

        took = time.time() - start_time

        return pick[0], float(pick[1]), "Adding", took
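All of these snippets lean on project-level helpers that this page does not show (getQueryRank, StructuredQuery, addUpAllTermScores, and above all tokenWeight). A minimal tokenWeight sketch that is consistent with how it is called below, assuming items are (term, weight) pairs or (term, count, weight) triples:

def tokenWeight(item):
    # Hypothetical reconstruction, not the project's actual code:
    # weight of a (term, weight) pair, or count * weight for a
    # (term, count, weight) triple.
    if len(item) == 3:
        return item[1] * item[2]
    return item[1]

The usage sketches after the following examples reuse this stand-in.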
Example 2
def takeMinMaxDivW(kws, param=None):
    # Multiply the weights of the `num` lowest-weighted keywords by `mul`
    # (the highest-weighted ones when param["reverse"] is True).
    param = param or {}
    sorted_list = sorted(kws.items(),
                         key=lambda x: tokenWeight(x),
                         reverse=param.get("reverse", False))

    for kw in sorted_list[:param["num"]]:
        kws[kw[0]] *= param["mul"]
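A quick usage sketch with made-up weights: with reverse=True, the single highest-weighted keyword is halved.

kws = {"lstm": 0.9, "parser": 0.4, "token": 0.1}
takeMinMaxDivW(kws, {"num": 1, "mul": 0.5, "reverse": True})
print(kws)  # {'lstm': 0.45, 'parser': 0.4, 'token': 0.1}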
Example 3
def takeMinW(kws, param=1):
    # Boost the weights of the `param` lowest-weighted keywords tenfold.
    sorted_list = sorted(kws.items(),
                         key=lambda x: tokenWeight(x),
                         reverse=False)

    for kw in sorted_list[:param]:
        kws[kw[0]] *= 10
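The same made-up weights, boosting the two weakest keywords:

kws = {"lstm": 0.9, "parser": 0.4, "token": 0.1}
takeMinW(kws, 2)
print(kws)  # {'lstm': 0.9, 'parser': 4.0, 'token': 1.0}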
Example 4
def tweakW(kws, param=None):
    # Boost the `max_num` strongest keywords by `max_mul` and the
    # `min_num` weakest ones by `min_mul`.
    param = param or {}
    max_list = sorted(kws.items(), key=lambda x: tokenWeight(x), reverse=True)
    min_list = list(reversed(max_list))

    for kw in max_list[:param["max_num"]]:
        kws[kw[0]] *= param["max_mul"]

    for kw in min_list[:param["min_num"]]:
        kws[kw[0]] *= param["min_mul"]
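Amplifying both ends of the weight distribution in one call:

kws = {"lstm": 0.9, "parser": 0.4, "token": 0.1}
tweakW(kws, {"max_num": 1, "max_mul": 2, "min_num": 1, "min_mul": 0.5})
print(kws)  # {'lstm': 1.8, 'parser': 0.4, 'token': 0.05}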
Example 5
def getSortedTerms(term_scores, precomputed_query, options=None):
    # Merge the per-document term scores, then sort the terms by descending score.
    options = options or {}
    all_term_scores = addUpAllTermScores(term_scores, options=options)

    terms = sorted(six.iteritems(all_term_scores),
                   key=lambda x: x[1],
                   reverse=True)
    counts = getCountsInQueryForMatchingTerms(precomputed_query)
    terms = [(term[0], tokenWeight(term)) for term in terms]

    return all_term_scores, terms, counts
Example 6
from collections import Counter

def makeQueryFromContext(context):
    # Count each token's occurrences in the context, then emit one
    # structured-query tuple per selected keyword.
    counts = Counter([t["text"].lower() for t in context["tokens"]])

    query = []
    for kw in context["best_kws"]:
        # (text, count, boost, bool, field, distance)
        new_token = (kw[0], counts[kw[0]], tokenWeight(kw), None, None, None)
        query.append(new_token)

    return query
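A toy context, again relying on the tokenWeight stand-in from above:

context = {
    "tokens": [{"text": "Dependency"}, {"text": "parsing"},
               {"text": "with"}, {"text": "parsing"}],
    "best_kws": [("parsing", 0.8)],
}
print(makeQueryFromContext(context))
# [('parsing', 2, 0.8, None, None, None)]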
Example 7
def adjustDistW(kws, param=1):
    # Boost weak keywords tenfold until they clear a threshold. Note that
    # mean_val is not the arithmetic mean: it is the maximum weight divided
    # by the number of keywords, a crude threshold scaled by `param`.
    sorted_list = sorted(kws.items(),
                         key=lambda x: tokenWeight(x),
                         reverse=False)
    max_val = max(kws.values())
    mean_val = max_val / len(kws)

    for kw in sorted_list:
        if kws[kw[0]] < mean_val * param:
            kws[kw[0]] *= 10
        else:
            # the list is sorted ascending, so no later keyword can qualify
            break
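With param=2, everything below two "means" of the top weight gets boosted; the top keyword itself stops the loop:

kws = {"lstm": 1.0, "parser": 0.4, "token": 0.1}
adjustDistW(kws, 2)  # threshold: (1.0 / 3) * 2, about 0.67
print(kws)  # {'lstm': 1.0, 'parser': 4.0, 'token': 1.0}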
Example 8
from copy import deepcopy

def runQueryAndMeasureKeywordSelection(precomputed_query, selected_keywords,
                                       retrieval_model, weights, kw_data):
    """
        Runs queries to compute scores for the provided keywords and adds them
        to the kw_data dict.

        :param precomputed_query: original precomputed_query as provided by the testing pipeline
        :param selected_keywords: tuples of (keyword, weight) coming out of exp["keyword_selector"].selectKeywords()
        :param retrieval_model: retrieval instance (e.g. ElasticRetrievalBoost)
        :param weights: the weights used for retrieval at this point, configurable in the experiment
        :param kw_data: dict that will eventually be stored, containing the precomputed_query, selected kws, etc.
    """
    kw_counts = getCountsInQuery(precomputed_query, selected_keywords)

    # first run: every selected keyword gets a flat boost of 1
    query = deepcopy(precomputed_query)

    # StructuredToken(token, count, boost, bool, field, distance)
    query["structured_query"] = StructuredQuery([{
        "token": kw[0],
        "count": kw_counts.get(kw[0], 0),
        "boost": 1
    } for kw in selected_keywords])
    kw_data["kw_selection_scores"] = runAndMeasureOneQuery(
        query, weights, retrieval_model)

    # second run: each keyword is boosted by its own weight
    query = deepcopy(precomputed_query)
    query["structured_query"] = StructuredQuery([{
        "token": kw[0],
        "count": kw_counts.get(kw[0], 0),
        "boost": tokenWeight(kw)
    } for kw in selected_keywords])
    kw_data["kw_selection_weight_scores"] = runAndMeasureOneQuery(
        query, weights, retrieval_model)
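The two boost schemes side by side, with made-up keywords and counts; plain dicts stand in for the project's StructuredQuery here:

selected_keywords = [("parsing", 0.7), ("treebank", 0.3)]
kw_counts = {"parsing": 2, "treebank": 1}

flat = [{"token": kw[0], "count": kw_counts.get(kw[0], 0), "boost": 1}
        for kw in selected_keywords]
weighted = [{"token": kw[0], "count": kw_counts.get(kw[0], 0), "boost": kw[1]}
            for kw in selected_keywords]
print(flat[0])      # {'token': 'parsing', 'count': 2, 'boost': 1}
print(weighted[0])  # {'token': 'parsing', 'count': 2, 'boost': 0.7}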
Example 9
from copy import deepcopy

def runMod(query, mod):
    # Apply one of the weight-tweaking functions above, looked up by name,
    # to a copy of the query's best keywords.
    if len(query["best_kws"]) == 0:
        return None
    kws = {kw[0]: tokenWeight(kw) for kw in query["best_kws"]}
    newq = deepcopy(query)

    func = globals()[mod["name"]]
    if mod.get("param"):
        func(kws, mod["param"])
    else:
        func(kws)

    newq["best_kws"] = list(kws.items())
    newq["structured_query"] = makeQueryFromContext(newq)
    return newq
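Dispatching to one of the tweak functions above by name, with a made-up query dict:

query = {"best_kws": [("parser", 0.4), ("token", 0.1)],
         "tokens": [{"text": "parser"}, {"text": "token"}]}
newq = runMod(query, {"name": "takeMinW", "param": 1})
print(newq["best_kws"])  # [('parser', 0.4), ('token', 1.0)]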
Example 10
def getOneContextFeatures(context):
    """
        Prepares a single context's data for any neural network. Takes
        ["token_features"] from a list of sentences and returns a single
        list of token features.
    """
    all_keywords = {t[0]: tokenWeight(t) for t in context["best_kws"]}

    featureset = getAnnotationListsForContext(context["tokens"], all_keywords)
    tokens, to_extract, weights = zip(*featureset)
    # normalize the weights to [0, 1]
    max_weight = max(weights)
    if max_weight > 0:
        weights = [w / max_weight for w in weights]

    context["extract_mask"] = to_extract
    context["tokens"] = tokens
    context["weight_mask"] = weights

    return context
Example 11
from collections import Counter

def listAllKeywordsToExtractFromReader(reader):
    """
        Lists all keywords that are marked as extract:true in a list or
        reader object.
    """
    to_extract = []

    for kw_data in reader:
        if isinstance(kw_data, dict):
            best_kws = {t[0]: tokenWeight(t) for t in kw_data["best_kws"]}
            for sent in kw_data["context"]:
                for token in sent["token_features"]:
                    if token["text"].lower() in best_kws:
                        to_extract.append(token["text"].lower())
        elif isinstance(kw_data, tuple):
            if kw_data[1]:
                to_extract.append(kw_data[0]["text"])

    return Counter(to_extract)
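The tuple branch is the easiest to exercise: each tuple is (token dict, extract flag):

reader = [({"text": "parsing"}, True), ({"text": "the"}, False),
          ({"text": "parsing"}, True)]
print(listAllKeywordsToExtractFromReader(reader))
# Counter({'parsing': 2})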
Example 12
    def selectKeywords(self,
                       precomputed_query,
                       doc_list,
                       retrieval_model,
                       parameters,
                       cit,
                       weights,
                       norm_term_scores=None,
                       docFreq=None,
                       maxDocs=None,
                       rawScores=None):

        if parameters.get("use_c3_stopword_list", False):
            norm_term_scores = filterC3Stopwords(norm_term_scores)

        if parameters.get("filter_stopwords", True):
            norm_term_scores = filterStopwords(retrieval_model,
                                               norm_term_scores, docFreq)

        all_term_scores, terms, counts = getSortedTerms(norm_term_scores,
                                                        precomputed_query,
                                                        options=parameters)

        if not parameters.get("use_weights", True):
            # if parameters.get("use_counts", True):
            #     # (term, count, weight)
            #     terms2 = [(term[0], term[2], 1) for term in terms]
            # else:
            terms2 = [(term[0], 1) for term in terms]
            terms = terms2

        use_kps = parameters.get("use_kps", False)
        if not use_kps:
            return [(term[0], tokenWeight(term)) for term in terms]

        kp_method = parameters.get("kp_method", "add")

        norm_term_scores = {term[0]: tokenWeight(term) for term in terms}

        if kp_method == "add":
            res = []
            for kp in precomputed_query.get("keyphrases", []):
                if any([term not in norm_term_scores for term in kp]):
                    # KP doesn't fully match, can't add it
                    continue

                score = getKPScore(parameters, norm_term_scores, kp)

                kp = " ".join(kp)
                res.append((kp, score))

            return terms + res
        elif kp_method == "sub":
            res = []
            term_counts = {term[0]: tokenWeight(term) for term in terms}
            for kp in precomputed_query.get("keyphrases", []):
                if any([term not in norm_term_scores for term in kp]):
                    # KP doesn't fully match, can't add it
                    continue

                score = getKPScore(parameters, norm_term_scores, kp)

                for term in kp:
                    term_counts[term] -= 1

                kp = " ".join(kp)
                res.append((kp, score))
            terms = [(term, norm_term_scores[term]) for term in term_counts
                     if term_counts[term] > 0]
            return terms + res
        else:
            raise ValueError("Unknown kp_method")
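The "add" branch in miniature: a keyphrase only joins the list if every member term has a score. getKPScore is not shown on this page; the sketch below simply assumes it averages the member-term scores:

norm_term_scores = {"neural": 0.6, "network": 0.4}
kp = ["neural", "network"]
if all(term in norm_term_scores for term in kp):
    score = sum(norm_term_scores[t] for t in kp) / len(kp)  # assumed scoring
    print((" ".join(kp), score))  # ('neural network', 0.5)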
Example 13
    def selectKeywords(self,
                       precomputed_query,
                       doc_list,
                       retrieval_model,
                       parameters,
                       cit,
                       weights,
                       norm_term_scores=None,
                       docFreq=None,
                       maxDocs=None,
                       rawScores=None):

        if parameters.get("use_all_original_text", False):
            original_query = precomputed_query["vis_text"]
            original_query = original_query.replace("__cit", " ")
            original_query = original_query.replace("__author", " ")
        else:
            original_query = precomputed_query["query_text"]

        # tokenize on word characters rather than splitting on whitespace
        all_tokens = re.findall(r"\w+", original_query.lower())
        terms = list(set(all_tokens))
        counts = Counter(all_tokens)

        if parameters.get("use_weights", True):
            norm_term_scores = {
                guid: {t: norm_term_scores[guid].get(t, 0.0)
                       for t in terms}
                for guid in precomputed_query["match_guids"]
            }
        else:
            norm_term_scores = {
                guid: {t: 1.0
                       for t in terms}
                for guid in precomputed_query["match_guids"]
            }

        if parameters.get("use_c3_stopword_list", False):
            norm_term_scores = filterC3Stopwords(norm_term_scores)

        if parameters.get("filter_stopwords", True):
            norm_term_scores = filterStopwords(retrieval_model,
                                               norm_term_scores, docFreq)

        all_term_scores = addUpAllTermScores(norm_term_scores)
        terms = sorted(six.iteritems(all_term_scores),
                       key=lambda x: x[1],
                       reverse=True)

        precomputed_query["structured_query"] = StructuredQuery([{
            "token":
            term,
            "boost":
            all_term_scores[term],
            "count":
            counts.get(term, 0)
        } for term in all_term_scores])

        terms = [(term[0], tokenWeight(term)) for term in terms]

        return terms
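The tokenization step in isolation, with a made-up query string containing the placeholder markers:

import re
from collections import Counter

original_query = "the __cit parser of __author for dependency parsing"
original_query = original_query.replace("__cit", " ").replace("__author", " ")
all_tokens = re.findall(r"\w+", original_query.lower())
print(Counter(all_tokens))
# Counter({'the': 1, 'parser': 1, 'of': 1, 'for': 1, 'dependency': 1, 'parsing': 1})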