def calculate_overall_ranking(self, raw_queries, settings):
    """Compare whole-document ranking with section-aware ranking.

    Each query obtained from ``raw_queries`` is searched twice: once against
    the ``"whole-document"`` field using
    ``Mode.without_importance_to_sections`` and once against the query's own
    ``"imrad"`` field using ``Mode.importance_to_sections``. The average
    precision of both result lists is measured against the papers listed in
    the query's ``"references"``. One ``name & count & mean`` row is printed
    per mode, with the mean rounded to four decimals.

    Args:
        raw_queries: Raw query data, converted via
            ``self.__raw_queries_to_queries``.
        settings: Ranking settings mapping. It is deep-copied before the
            ``"mode"`` entry is set, so the caller's mapping is not modified.
    """
    api = API()
    queries = self.__raw_queries_to_queries(raw_queries)

    # Work on private copies so neither run leaks its mode into the other
    # or into the settings object owned by the caller.
    settings_whole = copy.deepcopy(settings)
    settings_whole["mode"] = Mode.without_importance_to_sections
    settings_sec = copy.deepcopy(settings)
    settings_sec["mode"] = Mode.importance_to_sections

    aps_whole = []
    aps_sec = []
    total = len(queries)
    for i, query in enumerate(queries):
        progressBar(i, total)
        ranked_papers_whole = api.get_papers(
            {"whole-document": query["search_query"]}, settings_whole)
        ranked_papers_sec = api.get_papers(
            {query["imrad"]: query["search_query"]}, settings_sec)
        relevant_paper = [
            api.get_paper(reference["paper_id"])
            for reference in query["references"]
        ]
        aps_whole.append(
            self.average_precision(ranked_papers_whole, relevant_paper))
        aps_sec.append(
            self.average_precision(ranked_papers_sec, relevant_paper))

    print()
    rows = (
        (Mode.without_importance_to_sections, aps_whole),
        (Mode.importance_to_sections, aps_sec),
    )
    for mode, aps in rows:
        # With no queries the mean is undefined; report NaN instead of
        # failing with ZeroDivisionError.
        mean_ap = sum(aps) / len(aps) if aps else float("nan")
        print("{} & {} & {}".format(
            mode.name.replace("_", " "), len(aps), round(mean_ap, 4)))
def compute_ranking_with_settings(self, settings, num_of_papers=0):
    """Print the mean average precision of paper-as-query ranking.

    Every paper returned by ``self.api.get_all_paper()`` is used as a query
    through ``self.api.get_papers_with_paper``. The resulting ranking is
    scored against the papers that the query paper's references link to.
    Papers without any linked reference are skipped. A single
    ``name & count & mean`` row is printed, where ``name`` comes from
    ``settings["mode"]`` and the mean is rounded to four decimals.

    Args:
        settings: Ranking settings mapping; ``settings["mode"]`` must expose
            a ``name`` attribute.
        num_of_papers: Currently ignored; all papers are always evaluated.
            Kept so existing callers continue to work.
    """
    papers = self.api.get_all_paper()
    total = len(papers)

    aps = []
    for i, paper in enumerate(papers):
        progressBar(i, total)
        relevant_papers = [
            self.api.get_paper(ref.get_paper_id())
            for ref in paper.references
            if ref.paper_id
        ]
        if not relevant_papers:
            # Nothing to score against, so this paper cannot contribute.
            continue

        # The second element of the returned pair is not needed here.
        ranked_papers, _ = self.api.get_papers_with_paper(
            paper.filename, settings)
        aps.append(self.average_precision(ranked_papers, relevant_papers))

    # Every paper may have been skipped above; report NaN rather than
    # raising ZeroDivisionError in that case.
    mean_ap = sum(aps) / len(aps) if aps else float("nan")
    print()
    print("{} & {} & {}".format(
        settings["mode"].name.replace("_", " "), len(aps), round(mean_ap, 4)))
def remove_duplicates_from_cited_by():
    """Drop repeated ids from every paper's ``cited_by`` list and save it.

    The first occurrence of each id is kept and the original order is
    preserved. Every paper is written back through
    ``api.client.update_paper``, whether or not its list changed.
    """
    print("\nRemove Duplicates")
    api = API()
    all_papers = api.get_all_paper()
    total = len(all_papers)
    for position, entry in enumerate(all_papers):
        progressBar(position, total)
        # dict keys are unique and keep insertion order.
        unique_citers = dict.fromkeys(entry.cited_by)
        entry.cited_by = list(unique_citers)
        api.client.update_paper(entry)
def check_references():
    """Verify that every linked reference is mirrored in ``cited_by``.

    For each reference of each paper that carries a paper id, the referenced
    paper must exist and list the citing paper's id in its ``cited_by``.
    When that is not the case the reference's ``paper_id`` is reset to an
    empty list, the citing paper is saved, and
    ``repair_corrupt_reference`` is called with all other papers as
    candidates.
    """
    print("\nCheck References")
    api = API()
    papers = api.get_all_paper()
    total = len(papers)
    for i, paper in enumerate(papers):
        progressBar(i, total)
        # Built lazily: only needed when a corrupt reference is found, and
        # building it for every paper costs O(n) per paper.
        other_papers = None
        for reference in paper.references:
            ref_id = reference.get_paper_id()
            if not ref_id:
                continue

            # Same existence check that check_cited_by performs before
            # fetching; a dangling id is handled like a missing back-link.
            if api.contains_paper(ref_id):
                ref_paper = api.get_paper(ref_id)
                if paper.id in ref_paper.cited_by:
                    continue

            print()
            reference.paper_id = []
            api.client.update_paper(paper)
            if other_papers is None:
                other_papers = [p for p in papers if p.id != paper.id]
            repair_corrupt_reference(reference, paper, other_papers, api)
def check_cited_by():
    """Verify that every ``cited_by`` entry is backed by a real reference.

    For each id in a paper's ``cited_by``:

    * if no paper with that id exists, the id is removed and the paper is
      saved;
    * if the citing paper exists but none of its references point back to
      this paper, the id is removed, the paper is saved, and
      ``link_references_to_paper`` is called for the pair.
    """
    print("\nCheck Cited by")
    api = API()
    papers = api.get_all_paper()
    total = len(papers)
    for i, paper in enumerate(papers):
        progressBar(i, total)
        # Iterate over a snapshot: entries are removed from paper.cited_by
        # inside the loop, and mutating a list while iterating it makes the
        # iterator skip the element following each removal.
        for cited_paper_id in list(paper.cited_by):
            if not api.contains_paper(cited_paper_id):
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                continue

            cited_paper = api.get_paper(cited_paper_id)
            cited_paper_refs = [
                ref.get_paper_id()
                for ref in cited_paper.references
                if ref.get_paper_id()
            ]
            if paper.id not in cited_paper_refs:
                print()
                paper.cited_by.remove(cited_paper_id)
                api.client.update_paper(paper)
                link_references_to_paper(cited_paper, paper, api)
def calculate_ranking_sections(self, raw_queries, settings):
    """Print the mean average precision per IMRaD section.

    Each query obtained from ``raw_queries`` is searched once per section
    (introduction, background, methods, results, discussion), using the
    section's ``IMRaDType`` name as the search field. Every result list is
    scored against the papers listed in the query's ``"references"``. One
    ``name & count & mean`` row is printed per section, labelled with the
    matching ``Mode.only_*`` name.

    Args:
        raw_queries: Raw query data, converted via
            ``self.__raw_queries_to_queries``.
        settings: Ranking settings passed unchanged to ``api.get_papers``.
    """
    api = API()
    # Search field paired with the mode whose name labels the output row.
    sections = (
        (IMRaDType.INTRODUCTION, Mode.only_introduction),
        (IMRaDType.BACKGROUND, Mode.only_background),
        (IMRaDType.METHODS, Mode.only_methods),
        (IMRaDType.RESULTS, Mode.only_results),
        (IMRaDType.DISCUSSION, Mode.only_discussion),
    )
    aps_by_section = [[] for _ in sections]

    queries = self.__raw_queries_to_queries(raw_queries)
    total = len(queries)
    for i, query in enumerate(queries):
        progressBar(i, total)
        relevant_paper = [
            api.get_paper(reference["paper_id"])
            for reference in query["references"]
        ]
        for (imrad_type, _), aps in zip(sections, aps_by_section):
            ranked_papers = api.get_papers(
                {imrad_type.name: query["search_query"]}, settings)
            aps.append(self.average_precision(ranked_papers, relevant_paper))

    print()
    for (_, mode), aps in zip(sections, aps_by_section):
        # With no queries the mean is undefined; report NaN instead of
        # failing with ZeroDivisionError.
        mean_ap = sum(aps) / len(aps) if aps else float("nan")
        print("{} & {} & {}".format(
            mode.name.replace("_", " "), len(aps), mean_ap))