def load_lda_parameters(mdl_cfg): dictionary_file = mdl_cfg['CORPUS']['dict_file'] path_index_file = mdl_cfg['CORPUS']['path_index_file'] lda_mdl_file = mdl_cfg['LDA']['lda_model_file'] lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file'] if nexists(dictionary_file) and nexists(path_index_file): lda_file_path_index = load_file_paths_index(path_index_file) lda_dictionary = load_dictionary(dictionary_file) if nexists(lda_mdl_file) and nexists(lda_cos_index_file): lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file) lda_theta_file = mdl_cfg['LDA']['lda_theta_file'] lda_theta = np.loadtxt(lda_theta_file) # loads the LDA theta from the model theta file num_docs, num_topics = lda_theta.shape min_lda_theta = np.min(np.min(lda_theta)) print 'LDA-theta is loaded: # of documents:', num_docs, \ '# of topics:', num_topics, 'min(Theta):', min_lda_theta lda_beta_file = mdl_cfg['LDA']['lda_beta_file'] lda_beta = np.loadtxt(lda_beta_file) # loads the LDA theta from the model theta file num_topics, vocab_size = lda_beta.shape min_lda_beta = np.min(np.min(lda_beta)) print 'LDA-beta is loaded: # of topics:', num_topics, \ '# of terms in the vocabulary:', vocab_size, \ 'min(Bheta):', min_lda_beta print return lda_dictionary, lda_mdl, lda_index, lda_file_path_index, lda_theta, lda_beta
def load_lsi_parameters(mdl_cfg):
    """Load the LSI dictionary, model, cosine-similarity index, and the
    document file-path index named in the model configuration."""
    corpus_cfg = mdl_cfg['CORPUS']
    lsi_cfg = mdl_cfg['LSI']
    dict_path = corpus_cfg['dict_file']
    paths_idx_path = corpus_cfg['path_index_file']
    model_path = lsi_cfg['lsi_model_file']
    cos_idx_path = lsi_cfg['lsi_cos_index_file']

    # Only populated when the corresponding files exist on disk
    if nexists(dict_path) and nexists(paths_idx_path):
        lsi_file_path_index = load_file_paths_index(paths_idx_path)
        lsi_dictionary = load_dictionary(dict_path)
    if nexists(model_path) and nexists(cos_idx_path):
        lsi_mdl, lsi_index = load_lsi_variables(model_path, cos_idx_path)

    return lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index
def load_tm(mdl_cfg):
    """Load the LDA topic model: dictionary, model object,
    cosine-similarity index, and document file-path index."""
    corpus_section = mdl_cfg['CORPUS']
    lda_section = mdl_cfg['LDA']
    dict_file = corpus_section['dict_file']
    paths_file = corpus_section['path_index_file']
    model_file = lda_section['lda_model_file']
    cos_index_file = lda_section['lda_cos_index_file']

    # Only populated when the corresponding files exist on disk
    if nexists(dict_file) and nexists(paths_file):
        lda_file_path_index = load_file_paths_index(paths_file)
        lda_dictionary = load_dictionary(dict_file)
    if nexists(model_file) and nexists(cos_index_file):
        lda_mdl, lda_index = load_lda_variables(model_file, cos_index_file)

    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index
# --- Script: compute Mimno topic-coherence scores for a trained LDA model ---

# Hard-coded model configuration file (Windows path)
config_file = "E:\\E-Discovery\\edrmv2txt-a-b-index-t50-s\\edrmv2txt-a-b-index-t50-s.cfg"
M = 30  # number of terms used in coherence score
topic_words_file = "top%d-topics-words.txt" % M
# topic_similarites_file = "topics-sim-M%d.txt" % M

mdl_cfg = read_config(config_file)

# Loads the vocabulary: one token per line; line number becomes the token id
vocab_file = mdl_cfg['CORPUS']['vocab_file']
vocab = dict()
with open(vocab_file) as fp:
    for vocab_id, token in enumerate(fp):
        vocab[token.strip()] = vocab_id

# Loads the trained gensim LDA model, only if the model file exists
# NOTE(review): if nexists() is false, lda_mdl is never bound and the
# calc_Mimno_topic_coherence call below raises NameError
lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
if nexists(lda_mdl_file):
    lda_mdl = gensim.models.ldamodel.LdaModel.load(lda_mdl_file)

# Loads the corpus (Blei's LDA-C format)
ldac_file = mdl_cfg['CORPUS']['blei_corpus_file']
lda_corpus = gensim.corpora.BleiCorpus(ldac_file)

print 'Computing Mimno score...'
coherence_scores = calc_Mimno_topic_coherence(lda_corpus, lda_mdl, vocab, M)
sort_index = np.argsort(coherence_scores)[::-1]  # desc order of coherence scores

# print 'Computing topic entropy scores'
# topic_entropies = calc_topic_entropy(lda_mdl)