def remove_frequent_terms(frequency=100):
    """Move every term whose doc_frequency exceeds *frequency* out of its
    letter index and into a dedicated per-term index file.

    Args:
        frequency: threshold above which a term gets its own index file.
    """
    for index_file in tqdm(os.listdir(INDEX_PATH), ascii=True,
                           desc="Giving most frequent terms their own index"):
        index_name = "indexes/" + index_file.split(".")[0]
        index = utils.load_index(index_name)
        # list() snapshot: we delete keys while walking the dict
        for term in list(index.keys()):
            if index[term]["doc_frequency"] > frequency:
                utils.save_index({term: index[term]},
                                 "indexes/inverted_index_" + term)
                del index[term]
        utils.save_index(index, filename=index_name)
def remove_single_docs():
    """Prune postings with a per-document count of at most 1 from every
    letter index, and drop terms whose doc_frequency falls below 1.
    """
    for index_file in tqdm(os.listdir(INDEX_PATH), ascii=True,
                           desc="Removing terms with single doc_frequency"):
        path = "indexes/" + index_file.split(".")[0]
        index = utils.load_index(path)
        # list() snapshots: both dicts are mutated during iteration
        for term in list(index.keys()):
            entry = index[term]
            for doc_id, count in list(entry["doc_ids"].items()):
                if count <= 1:
                    del entry["doc_ids"][doc_id]
                    entry["doc_frequency"] -= 1
            if entry["doc_frequency"] < 1:
                del index[term]
        utils.save_index(index, filename=path)
def build_papers_index(filename, save=False):
    """Build a ``paper_id -> paper`` mapping from *filename*, which holds one
    JSON-encoded paper per line.

    Side effect: records each paper's citation count in the module-level
    CITATION_COUNTS dict.

    Args:
        filename: path to the line-delimited JSON file.
        save: when True, also persist the index as "papers_index".

    Returns:
        The paper index dict.
    """
    paper_index = {}
    with open(filename, "rb") as f:
        # iterate lazily; each raw line is one JSON paper
        for line in f:
            paper, paper_id = get_paper_from_json(line)
            paper_index[paper_id] = paper
            CITATION_COUNTS[paper_id] = paper["citations"]
    if save:
        utils.save_index(paper_index, filename="papers_index")
    return paper_index
def main():
    """Build, merge, sort and prune the inverted indexes for every file in
    ARXIV_PATH, then persist the collected citation counts.
    """
    # FIX: the original used a bare `try: os.makedirs(...) except: pass`,
    # which silently swallowed *every* error (permissions, disk full, ...),
    # not just "directory already exists". exist_ok=True only tolerates the
    # pre-existing-directory case and lets real failures propagate.
    os.makedirs("indexes", exist_ok=True)
    for filename in os.listdir(ARXIV_PATH):
        papers_index = build_papers_index(ARXIV_PATH + filename)
        inverted_index = build_inverted_index(papers_index, debug=False,
                                              desc=filename)
        split_and_save(inverted_index)
    # finalization passes over the merged on-disk indexes
    sort_indexes()
    remove_single_docs()  # 2.25GB before, 1.19GB after
    remove_frequent_terms()
    utils.save_index(CITATION_COUNTS, filename="citations")
def split_and_save(index):
    """Partition *index* by first letter and merge each partition into the
    corresponding on-disk letter index ("indexes/inverted_index_<letter>").

    Args:
        index: inverted index mapping term -> {"doc_frequency", "doc_ids"}.
    """
    for letter in tqdm(ALPHABET, ascii=True, desc="Processing letters"):
        letter_file = "indexes/inverted_index_" + letter
        try:
            merged = utils.load_index(filename=letter_file)
        except FileNotFoundError:
            # first time we see this letter: start from an empty index
            merged = dict()
        for term, data in index.items():
            if not term.startswith(letter):
                continue
            if term in merged:
                # known term: accumulate frequency and merge postings
                merged[term]["doc_frequency"] += data["doc_frequency"]
                merged[term]["doc_ids"].update(data["doc_ids"])
            else:
                # new term: adopt the entry as-is
                merged[term] = data
        utils.save_index(merged, filename=letter_file)
def sort_indexes():
    """Rewrite every letter index with terms ordered by descending
    doc_frequency and each posting list ordered by descending count.

    NOTE: reversed(sorted(...)) is kept (rather than sorted(reverse=True))
    so that the relative order of tied keys matches the original exactly.
    """
    for index_file in tqdm(os.listdir(INDEX_PATH), ascii=True,
                           desc="Sorting indexes"):
        path = "indexes/" + index_file.split(".")[0]
        index = utils.load_index(path)
        index = dict(
            reversed(sorted(index.items(),
                            key=lambda item: item[1]["doc_frequency"])))
        for entry in index.values():
            entry["doc_ids"] = dict(
                reversed(sorted(entry["doc_ids"].items(),
                                key=lambda item: item[1])))
        utils.save_index(index, filename=path)