示例#1
0
    def work(self, job_id):
        """Report candidate documents that are absent from a job's loaded docs.

        For each query id in ``self.query_group[job_id]`` the per-query
        documents are loaded, and the ids in ``self.candidate_docs_d[qid]``
        are compared against the ids of the loaded documents. When fewer
        candidates are found than listed, the shortfall is logged and the
        missing ids are printed. Nothing is returned.
        """
        # These tallies are updated by the callback below but are not read
        # anywhere in this method.
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            # Callback handed to load_per_query_docs.
            # NOTE(review): presumably invoked for documents without content
            # -- confirm against load_per_query_docs.
            nonlocal missing_rel_cnt, missing_nrel_cnt
            relevant = self.ms_reader.qrel[query_id]
            if doc_id not in relevant:
                missing_nrel_cnt += 1
            else:
                missing_rel_cnt += 1

        for qid in self.query_group[job_id]:
            # Documents are loaded before the candidate check, so loading
            # happens for every query id of the job.
            loaded: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            if qid not in self.candidate_docs_d:
                continue

            target_docs = self.candidate_docs_d[qid]
            # Only the keys matter here; every found candidate maps to [].
            tokens_d = {
                doc.doc_id: [] for doc in loaded if doc.doc_id in target_docs
            }
            if len(tokens_d) >= len(target_docs):
                continue

            log_variables(job_id, qid, tokens_d, target_docs)
            absent = [doc_id for doc_id in target_docs if doc_id not in tokens_d]
            message = "{} of {} not found: {}".format(len(absent),
                                                      len(target_docs),
                                                      absent)
            print(message)
示例#2
0
def main():
    """Call ``log_variables`` with ``g_variable`` alone, then with ``my_variable`` too."""
    my_variable = 10
    # Disabled calls that printed varname() results directly:
    # print(varname(g_variable))
    # print(varname(my_variable))
    #
    avkai = 0  # not read anywhere in this function
    # NOTE(review): g_variable is not assigned in this function; presumably a
    # module-level name defined elsewhere -- confirm.
    log_variables(g_variable)


    log_variables(g_variable, my_variable)
示例#3
0
    def work(self, job_id):
        """Report candidate documents that have no entry in the token dict.

        For each query id in ``self.resource.query_group[job_id]`` that also
        appears in ``self.resource.candidate_doc_d``, the token dict returned
        by ``self.resource.get_doc_tokens_d(qid)`` is checked for every
        candidate document id. Each missing id is logged and printed.
        Nothing is returned.
        """
        qid_list = self.resource.query_group[job_id]
        for qid in qid_list:
            # Queries without a candidate list are skipped entirely.
            if qid not in self.resource.candidate_doc_d:
                continue

            target_docs = self.resource.candidate_doc_d[qid]
            tokens_d = self.resource.get_doc_tokens_d(qid)

            for doc_id in target_docs:
                if doc_id not in tokens_d:
                    # Logged once per missing document, not once per query.
                    log_variables(qid, target_docs)
                    print("Not found: ", doc_id)
示例#4
0
    def work(self, job_id):
        """Tokenize the candidate documents of a job's queries and pickle them.

        For each query id in ``self.query_group[job_id]`` that has an entry in
        ``self.candidate_docs_d``, the per-query documents are loaded and every
        document whose id is among the candidates is converted to a
        ``(title_tokens, body_tf_list)`` pair: the result of
        ``self.tokenizer.tokenize_stem`` on the title, and one ``Counter`` of
        ``tokenize_stem`` tokens per body sentence. The resulting
        ``{doc_id: pair}`` dict is pickled to ``<self.out_dir>/<qid>``.
        Nothing is returned.
        """
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        # These tallies are updated by the callback below but are not read
        # anywhere in this method.
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            # Callback handed to load_per_query_docs.
            # NOTE(review): presumably invoked for documents without content
            # -- confirm against load_per_query_docs.
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        def get_tf(text):
            # Term counts for one piece of text.
            tokens = self.tokenizer.tokenize_stem(text)
            return Counter(tokens)

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    title_tokens = self.tokenizer.tokenize_stem(d.title)
                    body_sents = sent_tokenize(d.body)
                    body_tf_list = lmap(get_tf, body_sents)
                    tokens_d[d.doc_id] = (title_tokens, body_tf_list)

            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid)
                # Report how many candidates are missing; len(tokens_d) is the
                # number that WERE found, so it must not be printed here.
                not_found_docs = [
                    doc_id for doc_id in target_docs if doc_id not in tokens_d
                ]
                print("{} of {} not found".format(len(not_found_docs),
                                                  len(target_docs)))

            save_path = os.path.join(self.out_dir, str(qid))
            # Close the file deterministically instead of leaking the handle.
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)
示例#5
0
def main():
    """Compare two query files given as the first two command-line arguments.

    Both files are read with ``read_queries_at`` as sequences of
    ``(query_id, query_text)`` pairs. For every query of the first file whose
    id also occurs in the second, the texts are compared case-insensitively;
    mismatching pairs are printed, and the match counters are logged at the
    end.
    """
    queries_a = read_queries_at(sys.argv[1])
    queries_b = read_queries_at(sys.argv[2])

    print("len(q1)", len(queries_a))
    print("len(q2)", len(queries_b))

    text_by_id = dict(queries_b)

    perfect_match = 0
    qid_match = 0
    for query_id, query_text in queries_a:
        if query_id not in text_by_id:
            continue

        qid_match += 1
        other_text = text_by_id[query_id]
        if query_text.lower() != other_text.lower():
            # Same id, different text: show the id and both versions.
            for line in (query_id, query_text, other_text):
                print(line)
            continue

        perfect_match += 1

    log_variables(perfect_match, qid_match)