def run_engine(config): """ :return: """ number_of_documents = 0 output_path = config.savedFileMainFolder r = ReadFile(corpus_path=config.get__corpusPath()) p = Parse(config.toStem) m_Indexer = Indexer(output_path) parquetPaths = [] for (dirPath, dirNames, fileNames) in os.walk(config.get__corpusPath()): for fileName in fileNames: parquetPaths.append((dirPath + '\\' + fileName)) for i in range(len(parquetPaths)): parquetPaths[i] = parquetPaths[i][parquetPaths[i].find('\\') + 1:] if ".DS_Store" in parquetPaths[i]: continue parquet = r.read_file(file_name=parquetPaths[i]) for document in parquet: number_of_documents += 1 parsed_document = p.parse_doc(document) # index the document data m_Indexer.add_new_doc(parsed_document) # if there's more postings to flush, do it. if len(m_Indexer.postingDictionary) > 0: utils.save_obj(m_Indexer.postingDictionary, m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key)) # Clear single terms and entities, updated inverted index to disk. clearSingleEntities(m_Indexer.inverted_idx, p, output_path, m_Indexer.num_of_docs_in_corpus) utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx') m_Indexer.inverted_idx.clear() utils.save_obj(number_of_documents, output_path + '/PostingFiles/num_of_docs_in_corpus')
def run_engine(corpus_path='', output_path='', stemming=False): """ :return: """ # Create PostingFile directory if it doesn't exist number_of_documents = 0 config = ConfigClass() r = ReadFile(corpus_path=corpus_path) p = Parse(stemming) indexer = Indexer(config, output_path) # Get all parquet files from corpus path parquets = [] for root, dirs, files in os.walk(corpus_path): for name in files: if name.endswith((".parquet", ".htm")): parquets.append((root, name)) for index in range(len(parquets)): r.corpus_path = parquets[index][0] documents_list = r.read_file(file_name=parquets[index][1]) # Create a new process for each document with Pool(CPUCOUNT) as _p: for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list): number_of_documents += 1 indexer.add_new_doc(parsed_doc) _p.close() _p.join() p.entities.clear() indexer.finish_index() save_obj(indexer.term_dict, output_path + '/' + "inverted_idx") save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary") indexer.document_dict.clear() indexer.term_dict.clear()
def run_engine(corpus_path, output_path, stemming=False):
    """
    :param corpus_path: path for parquet files
    :param output_path: path to write pickle files
    :param stemming: boolean to use stemming or not
    :return:
    """
    ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    indexer = Indexer(output_path, stemming)
    if corpus_path.endswith('parquet'):
        documents_list = r.read_file(corpus_path)
        parseAndIndexDocuments(documents_list, p, indexer)
    else:
        documents_list = r.read_dir()
        while documents_list:
            parseAndIndexDocuments(documents_list, p, indexer)
            documents_list = r.read_dir()
    documents_list.clear()
    indexer.merge_posting_files()
    lda = LDA(output_path, indexer.dictdoc, stemming)
    lda.build_ldaModel()
def run_engine(config): """ :param config: :return: """ number_of_documents = 0 r = ReadFile(corpus_path=config.get__corpusPath()) p = Parse(config.toStem) indexer = Indexer(config) paruet_list = r.read_all_parquet() for list in paruet_list: #for i in tqdm(range(0,len(list))): # for every doc for i in range(0, len(list)): # for every doc # parse the document parsed_document = p.parse_doc(list[i]) if parsed_document is None: continue number_of_documents += 1 # index the document data indexer.add_new_doc(parsed_document) #print('Finished parsing and indexing. Starting to export files') indexer.save_postings() # saves the remaining posting file . PostingsMerge(indexer).chunks_merging() utils.save_dict_as_pickle(indexer.inverted_idx, "inverted_idx", config.get_out_path())
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None): """ This function build the inverted index over the corpus. send each tweet to parsing and indexing. if the stemming is True the parsing will use the stemmer on the tokens. :param glove_dict: Glove file including all word vectors :param corpus_path: root folder containing the raw tweet files :param output_path for the inverted index, posting files and tweets dictionary :param stemming if True use stemmer on terms """ config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path) r = ReadFile(corpus_path=config.get_corpusPath()) p = Parse(stemming) indexer = Indexer(config) all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet") all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths] start_time = time.time() file_counter = 0 for file_name in all_files_names: file_start_time = time.time() # print("start file :", file_counter) documents_list = [document for document in r.read_file(file_name=file_name)] # Iterate over every document in the file for idx, document in enumerate(documents_list): parsed_document = p.parse_doc(document) indexer.add_new_doc(parsed_document, glove_dict) # print("end file number ", file_counter, " in: ", time.time() - file_start_time) file_counter += 1 total_time = time.time() - start_time indexer.finish_indexing()
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    # Iterate over every document in the file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of docs in the system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def test_add_new_doc(self):
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')
    # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
    # parse the first document from the sample file and check that the indexer accepts it
    parsed_document = p.parse_doc(documents_list[0])
    assert indexer.add_new_doc(parsed_document)
    text = 'i wad born in 2019'
def test_reader():
    global num_test_failed, results_summary
    num_test_failed = 0
    r = ReadFile(corpus_path)
    correct_answers = [x['len'] for x in reader_inputs]
    student_answers = [len(r.read_file(x['file'])) for x in reader_inputs]
    test_part(correct_answers, student_answers, error_str="read")
    if num_test_failed == 0:
        results_summary.append('All Reader tests passed')
def write_content_for_tweet_id():
    corpus_path = "C:\\Users\\ASUS\\Desktop\\Data"
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    names = r.get_files_names_in_dir()
    with open("text.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name in names:
            documents_list = r.read_file_by_name(file_name=str(name))
            for doc in documents_list:
                if doc[0] in tweet_ids:
                    writer.writerow([doc[0], doc[2]])
def __init__(self, config=None): """ init engine with the relevant model - Thesaurus_Searcher :param config: """ self._config = config try: self._reader = ReadFile(corpus_path=config.get__corpusPath()) except: self._reader = ReadFile("") self._parser = Parse() self._parser.STEMMER = config.toStem self._indexer = Indexer(config) self._model = Thesaurus_Searcher(self._indexer) self.last_parquet = False
def __init__(self, config=None):
    self._config = config
    self._parser = Parse(False)
    self.reader = ReadFile(corpus_path=config.get__corpusPath())
    self._indexer = Indexer(config)
    self.model = self.initialize_glove_dict()
    self._indexer.set_glove_dict(self.model)
def main():
    '''
    The main loop for the program
    '''
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    # parquet_file_path = r.get_all_path_of_parquet()[0][0] + r.get_all_path_of_parquet()[0][1]
    # se.build_index_from_parquet(parquet_file_path)
    se.load_index('idx_bench')
    g = GUI()
    # s.load_existing_index()  # load if exists, otherwise return empty list
    while True:
        event, values = g.window.read()
        if event is None:
            break
        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # print the results to output element
            index = 0
            for tweet_id in tweets_id:
                if index < 25:
                    print("%s. tweet id: %s" % (index + 1, tweet_id))
                index += 1
            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve, word2vec):
    """
    :return:
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')
    # TODO - handle all ~50 files (can do with from multiprocessing.pool import ThreadPool)
    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            indexer.add_new_doc(parsed_document, num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1
    # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    indexer.inverted_idx = {key: val for key, val in indexer.inverted_idx.items() if val != 1}
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))
    return num_of_writes
def run_engine(config, indexer):
    """
    :return:
    """
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config)
    doc = r.read_file('benchmark_data_train.snappy.parquet')
    for document in doc:
        parsed_document = p.parse_doc(document)
        indexer.add_new_doc(parsed_document)
        number_of_documents += 1
    capital_letters = p.caps_dict
    indexer.change_inverted_by_caps(capital_letters)
    indexer.save_index('idx_bench')
def run_engine(corpus_path_, output_path_, stemming_):
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    paths = r.get_all_path_of_parquet()
    length_of_array = len(paths)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        # each entry appears to hold (directory, file name); the original passed paths[i][0]
        # twice, passing the file name as the second argument is an assumed fix
        documents_list = r.get_documents(paths[i][0], paths[i][1])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for parsed_doc in parsed_doc_list:
                    indexer.add_new_doc(parsed_doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                # last document of the last file - flush whatever remains
                for parsed_doc in parsed_doc_list:
                    indexer.add_new_doc(parsed_doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
    indexer.merge_posting_file()
    indexer.merge_two_last_posting_file()
    indexer.split_posting_file_and_create_inverted_index()
    indexer.write_inverted_index_to_txt_file()
    number_of_documents = 0
def __init__(self, config=None):
    if not config:
        self._config = ConfigClass()
    else:
        self._config = config
    self._parser = Parse()
    self._indexer = Indexer(self._config)
    self._model = None
    self._reader = ReadFile(self._config.get__corpusPath())
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve): """ :return: """ config = ConfigClass(corpus_path, output_path, stemming) number_of_documents = 0 r = ReadFile(corpus_path=config.get__corpusPath()) p = Parse() indexer = Indexer(config) Parse.stemmer = stemming corpus_list = r.read_corpus() for idx in range(len(corpus_list)): documents_list = r.read_file(file_name=corpus_list[idx], read_corpus=True) for i in tqdm(range(len(documents_list))): parsed_document = p.parse_doc(documents_list[i]) if i == len(documents_list) - 1 and idx == len(corpus_list) - 1: indexer.is_last_doc = True indexer.add_new_doc(parsed_document) number_of_documents += 1 indexer.is_last_doc = False documents_list = [] with open('spell_dict.json', 'w') as f: json.dump(indexer.spell_dict, f) pickle_out = open("docs_dict_and_extras", "wb") pickle.dump(indexer.docs_dict, pickle_out) pickle_out.close() start = time.time() indexer.merge_files() end = time.time() print("merge time was: {}".format(end - start)) utils.save_obj(indexer.inverted_idx, "inverted_index") pickle_out = open("docs_dict_and_extras", "ab") pickle.dump(number_of_documents, pickle_out) pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out) pickle.dump(indexer.dump_path, pickle_out) pickle_out.close()
def run_engine(config): """ :return: """ number_of_documents = 0 sum_of_doc_lengths = 0 r = ReadFile(corpus_path=config.get__corpusPath()) p = Parse(config.toStem) indexer = Indexer(config, glove_dict) # documents_list = r.read_file(file_name=config.get__corpusPath()) parquet_documents_list = r.read_folder(config.get__corpusPath()) for parquet_file in parquet_documents_list: documents_list = r.read_file(file_name=parquet_file) # Iterate over every document in the file for idx, document in enumerate(documents_list): # parse the document parsed_document = p.parse_doc(document) if parsed_document is None: continue number_of_documents += 1 sum_of_doc_lengths += parsed_document.doc_length # index the document data indexer.add_new_doc(parsed_document) # saves last posting file after indexer has done adding documents. indexer.save_postings() if len(indexer.doc_posting_dict) > 0: indexer.save_doc_posting() utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path()) if len(indexer.document_posting_covid) > 0: indexer.save_doc_covid() indexer.delete_dict_after_saving() # merges posting files. indexer.merge_chunks() utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path()) dits = {'number_of_documents': number_of_documents, "avg_length_per_doc": sum_of_doc_lengths/number_of_documents } utils.save_dict(dits, 'details', config.get_out_path())
def build_index_from_parquet(self, fn): """ Reads parquet file and passes it to the parser, then indexer. Input: fn - path to parquet file Output: No output, just modifies the internal _indexer object. """ config = self._config indexer = self._indexer number_of_documents = 0 if (config.getoneFile()): df = pd.read_parquet(fn, engine="pyarrow") documents_list = df.values.tolist() # Iterate over every document in the file for idx, document in enumerate(documents_list): # parse the document parsed_document = self._parser.parse_doc(document) number_of_documents += 1 # index the document data self._indexer.add_new_doc(parsed_document) self._indexer.calculationSummerize() else: r = ReadFile(corpus_path=config.get__corpusPath()) for root, dirs, files in os.walk(config.get__corpusPath(), topdown=True): for name in files: ext = name.split('.')[-1] if ext == 'parquet': documents_list = r.read_folder(root, file_name=name) # Iterate over every document in the file for idx, document in enumerate(documents_list): # parse the document parsed_document = self._parser.parse_doc(document) number_of_documents += 1 # index the document data indexer.add_new_doc(parsed_document) # indexer.update_posting_files() # indexer.reset_cach() self._indexer.save_index('inverted_idx') print('Finished parsing and indexing.')
def build_index_from_parquet(self, fn): """ Reads parquet file and passes it to the parser, then indexer. Input: fn - path to parquet file Output: No output, just modifies the internal _indexer object. """ r = ReadFile() df = r.read_file(fn) documents_list = df # Iterate over every document in the file number_of_documents = 0 for idx, document in enumerate(documents_list): # parse the document parsed_document = self._parser.parse_doc(document) number_of_documents += 1 # index the document data self._indexer.add_new_doc(parsed_document) print('Finished parsing and indexing.')
def run_engine(corpus_path=None, output_path=None, stemming=False, lemma=False, queries=None,
               num_docs_to_retrieve=None):
    """
    :return:
    """
    global config, number_of_documents
    number_of_documents = 0
    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)
    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    # Finished parsing and indexing all files - need to clean all the used memory
    documents_list.clear()
    indexer.cleanup(number_of_documents)
def run_engine(self):
    """
    :return:
    """
    r = ReadFile(corpus_path=self._config.get__corpusPath())
    number_of_files = 0
    for i, file in enumerate(r.read_corpus()):
        # Iterate over every document in the file
        number_of_files += 1
        for idx, document in enumerate(file):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)
    self._indexer.entities_and_small_big()
    self._indexer.calculate_idf(self._parser.number_of_documents)
    # avg_doc_len = self._parser.total_len_docs / self._parser.number_of_documents
    # self._indexer.save_index("inverted_idx")  # TODO - check the name of inverted_idx
    self._indexer.save_index("idx_bench.pkl")
def build_index_from_parquet(self, fn): """ Reads parquet file and passes it to the parser, then indexer. Input: fn - path to parquet file Output: No output, just modifies the internal _indexer object. """ number_of_documents = 0 r = ReadFile(corpus_path=self._config.get__corpusPath()) doc = r.read_file(fn) for document in doc: parsed_document = self._parser.parse_doc(document) self._indexer.add_new_doc(parsed_document) number_of_documents += 1 capital_letters = self._parser.caps_dict self._indexer.change_inverted_by_caps(capital_letters) self._indexer.save_index('idx_bench') print('Finished parsing and indexing.')
def run_engine(): """ :return: """ number_of_documents = 0 timer = True config = ConfigClass() r = ReadFile(corpus_path=config.get__corpusPath()) p = Parse() #p = Parse(with_stemmer=True) indexer = Indexer(config) data_dir = 'Data' + os.sep + 'Data' npy_dirs = [root for root, dirs, files in os.walk(data_dir)] for dir_path in npy_dirs: files = [ os.path.join(dir_path, fname) for fname in os.listdir(dir_path) if fname.endswith('.parquet') ] for file in files: tweets = r.read_file(file_name=file) start_time = time.perf_counter() documents_list = multiprocessing.Pool(12).map(p.parse_doc, tweets) end_time = time.perf_counter() avg_time_per_tweet = (end_time - start_time) / len(tweets) print( f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, average per tweet: {avg_time_per_tweet:0.8f} seconds' ) start_time = time.perf_counter() for parsed_document in documents_list: indexer.add_new_doc(parsed_document) end_time = time.perf_counter() print( f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds' ) print('Finished parsing and indexing. Starting to export files') utils.save_obj(indexer.inverted_idx, "inverted_idx") utils.save_obj(indexer.postingDict, "posting")
def run_engine(corpus_path='', output_path='.', stemming=False):
    """
    Entry point for corpus parsing and indexing
    :param corpus_path:
    :param output_path:
    :param stemming: boolean that says if stemming should be applied
    :return: total number of tweets parsed
    """
    config = ConfigClass(corpus_path, stemming, output_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    tweets_parsed = parse_wrapper(r, p, config)
    return tweets_parsed
def build_index_from_parquet(self, fn): """ Reads parquet file and passes it to the parser, then indexer. Input: fn - path to parquet file Output: No output, just modifies the internal _indexer object. """ rd = ReadFile(fn) documents_list = rd.read_file() # Iterate over every document in the file number_of_documents = 0 for idx, document in enumerate(documents_list): # parse the document parsed_document = self._parser.parse_doc(document) number_of_documents += 1 # index the document data self._indexer.add_new_doc(parsed_document) self._indexer.thresh_hold = 100000 self._indexer.thresh_hold_handler() self._indexer.save_index("inverted_idx")
def run_engine(): """ :return: """ number_of_documents = 0 config = ConfigClass() r = ReadFile(corpus_path=config.get__corpusPath()) p = Parse() indexer = Indexer(config) documents_list = r.read_file(file_name='sample3.parquet') # Iterate over every document in the file for idx, document in enumerate(documents_list): # parse the document parsed_document = p.parse_doc(document) number_of_documents += 1 # index the document data indexer.add_new_doc(parsed_document) print('Finished parsing and indexing. Starting to export files') utils.save_obj(indexer.inverted_idx, "inverted_idx") utils.save_obj(indexer.postingDict, "posting")
def run_engine(config): """ :return: """ parser = Parse(config) r = ReadFile(corpus_path=config.get__corpusPath()) indexer = Indexer(config) number_of_files = 0 for i, file in enumerate(r.read_corpus()): # Iterate over every document in the file number_of_files += 1 for idx, document in enumerate(file): # parse the document parsed_document = parser.parse_doc(document) indexer.add_new_doc(parsed_document) indexer.check_last() indexer.merge_sort_parallel(3) indexer.calculate_idf(parser.number_of_documents) avg_doc_len = parser.total_len_docs / parser.number_of_documents utils.save_obj(avg_doc_len, config.get_savedFileMainFolder() + "\\data") utils.save_obj(indexer.inverted_idx, config.get_savedFileMainFolder() + "\\inverted_idx") utils.save_obj(indexer.docs_inverted, config.get_savedFileMainFolder() + "\\docs_inverted")
def create_table(stemming, corpus):
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["mydatabase"]
    mycol = mydb["global"]
    mycol.drop()
    r = ReadFile(corpus)
    p = Parse(stemming)
    counter = 0  # initialized here; the original incremented it without a definition
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = [t.text.lower() for t in p.parse_doc(document) if '$' not in t.text]
            for word_1 in parsed_list:
                query = {'term': word_1}
                row = mycol.find_one(query)
                if not row:
                    mycol.insert_one({**query, 'terms': {}})
                    row = mycol.find_one(query)
                for word_2 in parsed_list:
                    if word_2 not in row['terms'].keys():
                        row['terms'][word_2] = 0
                    row['terms'][word_2] += 1
                try:
                    mycol.update_one(query, {"$set": {'terms': row['terms']}})
                except:
                    print(row['terms'])
            r.progressbar.update(step)
            counter += 1
    global_table = {}
    for word_1 in mycol.find():
        top = []
        for word_2 in word_1['terms'].keys():
            s = word_1['terms'][word_2] / (word_1['terms'][word_1['term']]
                                           + mycol.find_one({'term': word_2})['terms'][word_1['term']]
                                           - word_1['terms'][word_2])
            if len(top) < 10:
                top.append((word_2, s))
                top.sort(key=lambda score: score[1])
            elif s > top[0][1]:
                top[0] = (word_2, s)
                top.sort(key=lambda score: score[1])
        global_table[word_1['term']] = top
    utils.save_obj(global_table, f'global_table_{stemming}')
def run_engine(corpus_path, stemming, output_path):
    """
    :return:
    """
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    m = BinaryMemoryPosting(os.path.join(output_path, PostingFile))
    indexer = Indexer()
    max_posting_size = 100000
    if os.path.exists(os.path.join(output_path, PostingFile)):
        os.remove(os.path.join(output_path, PostingFile))
    if os.path.exists(InvertedIndexFile + '.pkl'):
        os.remove(InvertedIndexFile + '.pkl')
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    # Iterate over every document in the file
    idx = 0
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = p.parse_doc(document)
            # index the document data
            indexer.add_new_doc(parsed_list, idx, document[0])
            idx += 1
            if idx % max_posting_size == 0:
                m.Save(p.word_dict)
            r.progressbar.update(step)
    r.progressbar.close()
    m.Save(p.word_dict)
    global_table = utils.load_obj(f'global_table_{stemming}')
    inv_index = indexer.CreatInvertedIndex(p.word_dict, idx, global_table)
    m.Merge(inv_index)
    utils.save_obj(inv_index, InvertedIndexFile)