import json
import os
import pickle
import time

from tqdm import tqdm

# Project-local modules. NOTE: the module names below are assumptions based
# on the class names used here; adjust them to the repo's actual file names.
import utils
from configuration import ConfigClass
from reader import ReadFile
from parser_module import Parse
from indexer import Indexer


def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Build the inverted index over the corpus: read every corpus file, parse
    each document, feed it to the indexer, merge the posting files, and
    persist the index and its companion dictionaries to disk.
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()
    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx], read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            # Flag the very last document of the corpus so the indexer can
            # flush its remaining buffers.
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
        documents_list = []  # release the file's documents before the next read

    # Persist the spelling dictionary and the per-document dictionary.
    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)
    with open("docs_dict_and_extras", "wb") as pickle_out:
        pickle.dump(indexer.docs_dict, pickle_out)

    # Merge the partial posting files into the final index and time it.
    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")

    # Append corpus-level statistics to the same pickle file.
    with open("docs_dict_and_extras", "ab") as pickle_out:
        pickle.dump(number_of_documents, pickle_out)
        pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
        pickle.dump(indexer.dump_path, pickle_out)
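# A minimal invocation sketch for the variant above; the paths and
# parameters are placeholders, not values taken from the source:
#
#     run_engine(corpus_path="data/corpus", output_path="posting_files",
#                stemming=True, queries="queries.txt", num_docs_to_retrieve=10)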
def run_engine(self):
    """
    Build the inverted index: read the corpus, parse and index every
    document, post-process entity and upper/lower-case terms, compute idf
    values, and save the finished index to disk.
    """
    r = ReadFile(corpus_path=self._config.get__corpusPath())
    number_of_files = 0
    for i, file in enumerate(r.read_corpus()):
        number_of_files += 1
        # Iterate over every document in the file and parse it.
        for idx, document in enumerate(file):
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)

    self._indexer.entities_and_small_big()
    self._indexer.calculate_idf(self._parser.number_of_documents)
    # avg_doc_len = self._parser.total_len_docs / self._parser.number_of_documents
    # self._indexer.save_index("inverted_idx")  # TODO: check the name of inverted_idx
    self._indexer.save_index("idx_bench.pkl")
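# A minimal invocation sketch, assuming the method above belongs to an
# engine class (here called SearchEngine, a hypothetical name) whose
# constructor wires up self._config, self._parser and self._indexer:
#
#     engine = SearchEngine(config)
#     engine.run_engine()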
def run_engine(config):
    """
    Build the inverted index: parse and index every document, merge the
    posting files in parallel, compute idf values and the average document
    length, and save the resulting structures to disk.
    """
    parser = Parse(config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    indexer = Indexer(config)

    number_of_files = 0
    for i, file in enumerate(r.read_corpus()):
        number_of_files += 1
        # Iterate over every document in the file and parse it.
        for idx, document in enumerate(file):
            parsed_document = parser.parse_doc(document)
            indexer.add_new_doc(parsed_document)

    indexer.check_last()
    # Merge the partial posting files using three parallel workers.
    indexer.merge_sort_parallel(3)
    indexer.calculate_idf(parser.number_of_documents)

    # Save the average document length and the index structures; os.path.join
    # keeps the paths portable instead of hard-coding "\\" separators.
    avg_doc_len = parser.total_len_docs / parser.number_of_documents
    out_dir = config.get_savedFileMainFolder()
    utils.save_obj(avg_doc_len, os.path.join(out_dir, "data"))
    utils.save_obj(indexer.inverted_idx, os.path.join(out_dir, "inverted_idx"))
    utils.save_obj(indexer.docs_inverted, os.path.join(out_dir, "docs_inverted"))
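# A minimal end-to-end sketch for the variant above, assuming ConfigClass
# takes the corpus path, output path and stemming flag as in the first
# variant (an assumption; the actual constructor may differ):
if __name__ == "__main__":
    cfg = ConfigClass("data/corpus", "output", False)  # placeholder paths
    run_engine(cfg)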