Example #1
    def __init__(self,
                 exp_dir,
                 params={},
                 train_data_filename="feature_data.json.gz",
                 test_data_filename="feature_data_test.json.gz"):

        self.exp_dir = exp_dir
        self.features_data_filename = os.path.join(self.exp_dir,
                                                   train_data_filename)
        self.test_data_filename = os.path.join(self.exp_dir,
                                               test_data_filename)
        self.dtype = np.float32
        self.dict_vectorizer = DictVectorizer(dtype=self.dtype, sparse=False)
        self.all_context_features = []
        self.regression = params.get("regression", False)
        self.stopwords = getStopwords(self.exp_dir)
        self.features_to_filter_out = ["lemma", "dep", "pos", "text", "lemma_"]
        self.corpus_label = ""

        if "pmc" in self.exp_dir:
            self.corpus_label = "PMC"
            self.corpus = "pmc"
        elif "aac" in self.exp_dir:
            self.corpus = "aac"
            self.corpus_label = "AAC"
Example #2
    def selectKeywords(self,
                       precomputed_query,
                       doc_list,
                       retrieval_model,
                       parameters,
                       cit,
                       weights,
                       norm_term_scores=None,
                       docFreq=None,
                       maxDocs=None,
                       rawScores=None):

        stopwords = getStopwords(retrieval_model.index_name)
        stopwords = [t for t in list(stopwords) if len(t) > 1]
        norm_term_scores = filterTermScores(norm_term_scores,
                                            docFreq,
                                            min_docs_to_match=0,
                                            max_docs_to_match=None,
                                            min_term_len=2)

        this_pct_of_formulas = termsPercentOfFormulas(rawScores["formulas"],
                                                      stopwords)
        this_pct_of_match_formulas = termsPercentOfFormulas(
            rawScores["match_formulas"], stopwords)

        # normalised term scores are the scores of the matched terms (summed up) divided by the total term value
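        # e.g. a term whose matched occurrences add up to 0.2 + 0.1 against a
        # total term value of 0.6 would get a normalised score of 0.5
        # (illustrative figures only; the normalisation itself is assumed to
        # happen before norm_term_scores is passed into this method)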

        all_term_scores, terms, counts = getSortedTerms(norm_term_scores,
                                                        precomputed_query,
                                                        options=parameters)

        this_pct_of_norm_term_scores = termsPercentOfTermScores(
            all_term_scores, stopwords)

        if len(self.pct_of_formulas) == 0:
            for term2 in stopwords:
                self.pct_of_formulas[term2] = []
                self.pct_of_match_formulas[term2] = []
                self.pct_of_norm_term_scores[term2] = []

        for term in this_pct_of_formulas:
            self.pct_of_formulas[term].append(
                sum(this_pct_of_formulas[term]) /
                len(this_pct_of_formulas[term]))
            self.pct_of_match_formulas[term].append(
                sum(this_pct_of_match_formulas[term]) /
                len(this_pct_of_match_formulas[term]))
            self.pct_of_norm_term_scores[term].append(
                sum(this_pct_of_norm_term_scores[term]) /
                len(this_pct_of_norm_term_scores[term]))

        self.num_results += 1
        print("pct_of_formulas - the: %0.2f%%" %
              (sum(self.pct_of_formulas["the"]) /
               len(self.pct_of_formulas["the"]) * 100))
        print("pct_of_norm_term_scores - the: %0.2f%%" %
              (sum(self.pct_of_norm_term_scores["the"]) /
               len(self.pct_of_norm_term_scores["the"]) * 100))
        return []
Example #3
def filterStopwords(retrieval_model, term_scores, docFreq):
    stopwords = getStopwords(retrieval_model.index_name)
    term_scores = filterTermScores(term_scores,
                                   docFreq,
                                   min_docs_to_match=1,
                                   max_docs_to_match=None,
                                   min_term_len=3,
                                   stopword_list=stopwords)
    return term_scores
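
A call sketch for the helper above, assuming retrieval_model can be any object exposing an index_name attribute and that term_scores and docFreq are flat per-term dicts; the SimpleNamespace stand-in, the index name and the numbers are all invented:

from types import SimpleNamespace

# hypothetical inputs; only their shapes matter for the call
retrieval_model = SimpleNamespace(index_name="idx_pmc_paragraphs")
term_scores = {"the": 0.91, "retrieval": 0.44, "of": 0.72}
docFreq = {"the": 5210, "retrieval": 17, "of": 4903}

filtered = filterStopwords(retrieval_model, term_scores, docFreq)
# stopwords and terms shorter than 3 characters should no longer appear in filtered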
Example #4
def makeBaselineQueries(queries_filename):
    stopwords = getStopwords(getCorpusFromPath(queries_filename))
    queries = json.load(open(queries_filename, "r"))

    for precomputed_query in queries:
        original_query = precomputed_query["vis_text"]
        original_query = original_query.replace("__cit", " ")
        original_query = original_query.replace("__author", " ")
        original_query = original_query.replace("__ref", " ")

        all_tokens = re.findall(r"\w+", original_query.lower())
        terms = list(set(all_tokens))
        counts = Counter(all_tokens)
        terms_to_extract = [
            t for t in terms if len(t) > 1 and t not in stopwords
        ]
        precomputed_query["structured_query"] = [(t, counts[t], 1)
                                                 for t in terms_to_extract]

    json.dump(queries,
              open(queries_filename.replace(".json", "_baseline.json"), "w"))
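
To illustrate the output format, a hypothetical before/after for a single record (the citation text and the assumption that "we", "the" and "of" are in the stopword list are both invented):

# hypothetical record before the loop:
#   {"vis_text": "We follow the attention model of __cit"}
# after the loop, the same record gains:
#   "structured_query": [("follow", 1, 1), ("attention", 1, 1), ("model", 1, 1)]
# (tuple order may vary because terms is built from a set; each tuple is
#  (term, count in the query, weight 1))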
Example #5
def addManualKPsToQueries(queries_filename, annot_filename=None):
    queries = json.load(open(queries_filename, "r"))

    if not annot_filename:
        annot_filename = getAnnotFilename(queries_filename)
    all_kps, _ = readAnnotationFile(annot_filename)

    q_dir = os.path.dirname(queries_filename)
    q_name = os.path.splitext(os.path.basename(queries_filename))[0]
    annot_q_file = os.path.join(q_dir, q_name + "_annot.json")

    res = []
    stopwords = getStopwords(q_dir)

    for query in queries:
        query_id = query["file_guid"] + "_" + query["citation_id"]
        context_kw = []
        context_kp = []
        kws = []
        for kp in all_kps[query_id]:
            kws = re.split(r"\W+", kp.strip())
            kws = [
                kw.lower() for kw in kws if kw not in stopwords and len(kw) > 2
            ]
            context_kw.extend(kws)
            if " " in kp:
                context_kp.append(kp)

        if len(kws) == 0:
            continue

        print(context_kw)
        new_q = makeEvaluationQuery(query, context_kw, context_kp)
        res.append(new_q)

    json.dump(res, open(annot_q_file, "w"))
    print("Queries written:", len(res))
Example #6
def statsOnPredictionsFile(filename):
    with open(filename, "r") as f:
        queries = json.load(f)

    statsOnPredictions(queries, getStopwords(getCorpusFromPath(filename)))