def run_engine():
    """
    Run the full indexing pipeline over the corpus.

    Reads every dated folder of tweets from the corpus path, parses and
    indexes each tweet, persists the inverted index and tf-idf dictionary
    to disk, and returns the indexer's LDA object.

    :return: result of ``indexer.get__lda__()`` — the LDA model/state
        built by the indexer. (Exact type depends on Indexer — TODO confirm.)
    """
    corpus_path = config.get__corpusPath()
    reader = ReadFile(corpus_path)
    indexer = Indexer(config)
    parser = Parse(config)

    # Read the corpus folder-by-folder: each element of files_list is the
    # list of raw tweets contained in one dated folder.
    reader.create_files_name_list()
    files_list = [reader.read_file(file_name) for file_name in reader.dates_list]

    # Total tweet count across all folders; passed to the indexer for every
    # document (presumably used for tf-idf normalization — verify against
    # Indexer.add_new_doc).
    num_of_tweets = sum(len(folder_list) for folder_list in files_list)

    # Parse and index every tweet in every folder.
    for folder_list in files_list:
        for tweet in folder_list:
            parsed_document = parser.parse_doc(tweet)
            indexer.add_new_doc(parsed_document, num_of_tweets)

    # Persist the index structures, then hand back the LDA result.
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    return indexer.get__lda__()