def run_engine(config):
    """Build the index for the corpus described by *config* and persist it.

    Every file returned by ``ReadFile.read_folder`` is read, each document is
    parsed and handed to the indexer, and the resulting postings and
    dictionaries are saved under ``config.get_out_path()``. Finally a small
    ``details`` dictionary holding the number of indexed documents and their
    average length is saved as well.

    Relies on the module-level names ``ReadFile``, ``Parse``, ``Indexer``,
    ``utils`` and ``glove_dict``, which are defined outside this function.

    :param config: configuration object; this function uses
        ``get__corpusPath()``, ``get_out_path()`` and ``toStem``.
    :return: None
    """
    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)

    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file.
        for document in documents_list:
            parsed_document = p.parse_doc(document)
            # The parser signals "nothing to index" with None; such documents
            # are not counted in the corpus statistics either.
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            indexer.add_new_doc(parsed_document)

    # Save the last posting file after the indexer is done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()

    indexer.delete_dict_after_saving()

    # Merge the posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    # Guard the division: an empty corpus (or one where every document was
    # skipped) would otherwise raise ZeroDivisionError after all the indexing
    # output has already been written, leaving the run without a details file.
    if number_of_documents > 0:
        avg_length_per_doc = sum_of_doc_lengths / number_of_documents
    else:
        avg_length_per_doc = 0.0

    details = {
        'number_of_documents': number_of_documents,
        "avg_length_per_doc": avg_length_per_doc,
    }
    utils.save_dict(details, 'details', config.get_out_path())