import gc
import os

# LDA and Database are assumed to be provided by this project's own modules.


def test_lda(model_file, dict_file, dbs_dir):
    """
    Load saved databases, test the model against each, and print the results.

    Args:
        model_file(str): saved model file to load the trained model from
        dict_file(str): dict_file path to load dictionary from
        dbs_dir(str): dir path to load databases from
    """
    assert os.path.isdir(dbs_dir), "Invalid data directory path"

    lda = LDA()
    print('Loading existing dictionary...')
    lda.load_dict_from_disk(dict_file)
    test_results = list()

    # Iterate over all saved databases and test the model on each
    for root, dirs, files in os.walk(dbs_dir):
        for d in files:
            db = Database()
            # Load database object from saved file
            db.load_from_disk(os.path.join(root, d))
            # Add database to model
            lda.add_database(db)
            # Test model
            test_results.append(lda.test(model_file, db_name=db.get_name()))
            # Remove db to free memory
            lda.remove_database(db.get_name())
            del db
            gc.collect()

    # Print test results
    for idx, result in enumerate(test_results):
        print('Test results for database {}'.format(idx))
        for topic_id, prob in result[0]:
            print('Topic: {} has probability: {}'.format(topic_id, prob))
        for counter, coherence in enumerate(result[1]):
            print('Topic {} has topic-coherence score: {}'.format(
                counter, coherence[1]))
    print(lda.model.show_topics())
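
# Example: a minimal sketch of how test_lda might be invoked, assuming a
# model and dictionary were already produced by run_lda below and that the
# saved databases live under './databases'. All paths and the topic count
# are illustrative, not part of the project:
#
#   test_lda(model_file='./models/final20',
#            dict_file='./dict/dictionary',
#            dbs_dir='./databases')
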
def run_lda(data_dir, num_topics, use_mini_batches, batch_size, epochs,
            model_file, create_dict, dict_file, load_dbs):
    """
    Run training over all databases found under data_dir, checkpointing the
    model after each database.

    Args:
        data_dir(str): directory containing director(y/ies) of data
        num_topics(int): number of topics to train the model on
        use_mini_batches(bool): if true, train on mini batches of the data
        batch_size(int): size of mini batches used to train the model
        epochs(int): number of epochs to train for on the train set
        model_file(str): saved model file to continue training on
        create_dict(bool): create dictionary from data or load it from a file
        dict_file(str): dict_file path to load dictionary from
        load_dbs(bool): if true, load databases from saved pickle files
    """
    assert os.path.isdir(data_dir), "Invalid data directory path"

    use_model_file = bool(model_file)

    # Create model
    lda = LDA(num_topics=num_topics)
    if create_dict:
        print('Creating dictionary from data')
        # Create word-to-id mapping for all texts
        lda.create_dict(data_dir)
        lda.store_dict_to_disk('./dict/dictionary')
    else:
        print('Loading existing dictionary...')
        lda.load_dict_from_disk(dict_file)

    # Iterate over all data and train the model
    for root, dirs, files in os.walk(data_dir):
        if load_dbs:
            print('Training will be done on existing databases')
            entries = files
        else:
            print('Training will be done after creating databases from text files')
            entries = dirs

        # Iterate over sub-dirs (or saved database files)
        for d in entries:
            if not load_dbs:
                # Create database object from a sub-directory of text files
                db = Database(d, os.path.abspath(os.path.join(root, d)))
            else:
                db = Database()
                # Load database object from saved file
                db.load_from_disk(os.path.join(root, d))
            # Add database to model
            lda.add_database(db)

            if use_model_file:
                # Load model parameters from the model file and resume training
                lda.train(model_file, db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True, batch_size=batch_size,
                          num_epochs=epochs)
                # Only load the model once; subsequent databases continue
                # training the in-memory model on the rest of the dataset
                use_model_file = False
            else:
                # Call train on the model
                lda.train(db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True, batch_size=batch_size,
                          num_epochs=epochs)

            if not load_dbs:
                # Save the database so future runs can reuse it
                db.store_to_disk('./databases/' + d)
            # Remove db to free memory
            lda.remove_database(db.get_name())
            del db
            gc.collect()
            # Checkpoint the model after each database
            tmp_file = './models/' + d + str(num_topics)
            lda.save_model(tmp_file)

    # Save final model
    file_name = './models/final' + str(num_topics)
    lda.save_model(file_name)
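
# A minimal driver sketch, assuming the relative directories used above
# ('./dict', './models', './databases') already exist. Every argument value
# here is illustrative only; a real entry point would likely parse these
# from the command line instead.
if __name__ == '__main__':
    # Train from scratch on raw text, building the dictionary and databases
    run_lda(data_dir='./data', num_topics=20, use_mini_batches=True,
            batch_size=256, epochs=5, model_file=None, create_dict=True,
            dict_file='./dict/dictionary', load_dbs=False)
    # Evaluate the final saved model against the databases created above
    test_lda(model_file='./models/final20',
             dict_file='./dict/dictionary',
             dbs_dir='./databases')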