def test_preprocessing():
    root_folder = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/plsa/test/'
    pclean.file_parts_number = 10
    pplsa.file_parts_number = 10
    pclean.file_dict = root_folder + 'dict/test_dict'
    pclean.source_texts = root_folder + 'extracted/*.txt'
    pclean.output_dir = root_folder + 'cleaned/'
    # Clean the data and turn it into a bag-of-words model
    pclean.pre_pro()
    # Train using PLSA
    pplsa.folder = pclean.output_dir[:-1]  # strip the trailing slash
    pplsa.dict_path = pclean.file_dict
    pplsa.main()
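# Invocation sketch (assumption: the fixture tree <repo_root>/appData/plsa/test/
# with dict/, extracted/ and cleaned/ subfolders exists two parents above the
# working directory, and extracted/ contains the *.txt inputs):
#
#   test_preprocessing()
#   # afterwards, cleaned/ holds the bag-of-words files consumed by pplsa.main()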
def generate_topics(self):
    start_time_1 = time.time()
    pplsa.file_parts_number = 10
    pclean.file_parts_number = 10
    pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
    pclean.source_texts = self.source_texts + self.unique_folder_naming + '*.txt'
    pclean.output_dir = self.output_dir + self.unique_folder_naming
    os.mkdir(pclean.output_dir)
    # Clean the data and turn it into a bag-of-words model
    pclean.pre_pro()
    # Train using PLSA
    pplsa.topic_divider = 0
    pplsa.num_topics = 2
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
    # Note: 'matirx' spelling kept as-is for compatibility with files the
    # pplsa module reads and writes elsewhere
    pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
    pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
    pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'
    # Folder paths to delete
    self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
    self.output_dir_stream = pclean.output_dir
    self.file_dict_stream = pclean.file_dict
    os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
    pplsa.main()
    end_time_1 = time.time()
    print('Total training time (minutes):', round((end_time_1 - start_time_1) / 60, 4))
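# Usage sketch (hypothetical instance `tm` of the surrounding class; file_dict,
# source_texts, output_dir and plsa_parameters_path must already be set, and
# unique_folder_naming must end with a slash, since [:-1] strips it):
#
#   tm.unique_folder_naming = 'run_001/'
#   tm.generate_topics()   # trains a fixed 2-topic PLSA model (topic_divider=0)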
def generate_topics_gensim(self, num_topics, passes, chunksize, update_every=0,
                           alpha='auto', eta='auto', decay=0.5, offset=1.0,
                           eval_every=1, iterations=50, gamma_threshold=0.001,
                           minimum_probability=0.01, random_state=None,
                           minimum_phi_value=0.01, per_word_topics=True,
                           callbacks=None):
    start_time_1 = time.time()
    pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
    pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
    pclean.output_dir = self.output_dir + self.unique_folder_naming
    os.mkdir(pclean.output_dir)
    # Clean the data and turn it into a bag-of-words model
    with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
        f.write('Preprocessing started.')
    pclean.pre_pro()
    with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
        f.write('Preprocessing finished. Topic analysis started.')
    with open(pclean.output_dir + 'cleaned.json', 'r') as read_file:
        ret = json.load(read_file)
    data_lemmatized = []
    for k in ret:
        data_lemmatized.append(ret[k].splitlines())
    # Create dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    # Create corpus: term-document frequency (bag-of-words)
    texts = data_lemmatized
    corpus = [id2word.doc2bow(text) for text in texts]
    # (debug) inspect the corpus: print(corpus[:1]); print(id2word[1])
    self.lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        update_every=update_every,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        eta=eta,
        per_word_topics=per_word_topics,
        decay=decay,
        offset=offset,
        eval_every=eval_every,
        iterations=iterations,
        gamma_threshold=gamma_threshold,
        minimum_probability=minimum_probability,
        minimum_phi_value=minimum_phi_value,
        callbacks=callbacks)
    port_dict = porter_dictionary.porter_dictionary()
    topics = self.lda_model.show_topics(num_topics=num_topics, num_words=300, formatted=False)
    extracted_topics = []
    for topic in topics:
        a_topic = []
        for item in topic[1]:
            a_topic.append(item[0])
        extracted_topics.append(a_topic)
    # Map stemmed tokens back to their original (de-stemmed) forms
    port_dict.load_dict(self.dict_path + self.unique_folder_naming[:-1] + '_dict')
    self.topics_destemmed = []
    for i in extracted_topics:
        destemmed = []
        for j in i:
            try:
                destemmed.append(port_dict.dictionary[j][0])
            except KeyError:
                logging.exception('Token %s not found in the de-stemming dictionary', j)
        self.topics_destemmed.append(destemmed)
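# Usage sketch (hypothetical values; `tm` is an instance of the surrounding
# class with unique_folder_naming, source_texts, output_dir, dict_path and
# lda_parameters_path already configured):
#
#   tm.generate_topics_gensim(num_topics=5, passes=10, chunksize=2000,
#                             random_state=100)
#   print(tm.topics_destemmed[0][:20])   # top de-stemmed words of topic 0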
def generate_topics_json(self):
    start_time_1 = time.time()
    pplsa.file_parts_number = 10
    pclean.file_parts_number = 10
    pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
    pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
    pclean.output_dir = self.output_dir + self.unique_folder_naming
    os.mkdir(pclean.output_dir)
    # Clean the data and turn it into a bag-of-words model
    with open(self.plsa_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
        f.write('Preprocessing started.')
    pclean.pre_pro()
    with open(self.plsa_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
        f.write('Preprocessing finished. Topic analysis started.')
    # Train using PLSA
    pplsa.topic_divider = self.topic_divider
    pplsa.num_topics = self.num_topics
    pplsa.maxiter2 = self.max_iter
    pplsa.beta = self.beta
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
    # Note: 'matirx' spelling kept as-is for compatibility with files the
    # pplsa module reads and writes elsewhere
    pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
    pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
    pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'
    # Folder paths to delete
    self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
    self.output_dir_stream = pclean.output_dir
    self.file_dict_stream = pclean.file_dict
    try:
        os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
    except FileExistsError:
        print('-----------------------Folder exists-------------------------')
    pplsa.main()
    end_time_1 = time.time()
    print('Total training time (minutes):', round((end_time_1 - start_time_1) / 60, 4))
    with open(self.plsa_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
        f.write('Topic analysis finished.\n')
        f.write(str(round((end_time_1 - start_time_1) / 60, 4)))
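# The status.txt writes above repeat the same open/write pattern in both
# generate_topics_json and generate_topics_gensim. A minimal helper sketch
# (hypothetical, not part of the original module) they could share:
def _write_status(parameters_path, unique_folder_naming, message):
    """Overwrite the per-run status file with the current pipeline stage."""
    with open(parameters_path + unique_folder_naming + 'status.txt', 'w') as f:
        f.write(message)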