def test_variational_inference(voc = None, docs = None, max_files = None,
                               doc_num = None, n_topics = 20, dirich_param = .5,
                               log_word_proba_given_topic = None, **kwargs):
    """Run LDA variational inference on a small Reuters corpus and log the
    results through a Dirich_features_logger."""
    description = {'n_topics': n_topics, 'dirich_param': dirich_param}
    if voc is None or docs is None:
        # default corpus: the first ten Reuters-21578 archive files,
        # reut2-000.sgm through reut2-009.sgm
        files_list = [path.join(path_to_reuters, 'reut2-%03d.sgm' % i)
                      for i in range(10)]
        print files_list
        description['data_files_list'] = files_list
        voc, docs = dp.build_voc(files_list)
    print voc.keys()[:10]
    voc_size = len(voc)
    description['voc_size'] = voc_size
    if doc_num is None:
        # pick a document at random for the single-document test below
        doc_count = len(docs)
        print 'doc_count: %d' % doc_count
        doc_num = np.random.randint(doc_count)
    if log_word_proba_given_topic is None:
        # random, row-normalised topic-word distributions
        word_proba_given_topic = np.random.rand(n_topics * voc_size).reshape(
            (n_topics, voc_size))
        word_proba_given_topic /= np.sum(word_proba_given_topic,
                                         axis = 1).reshape((-1, 1))
        log_word_proba_given_topic = np.log(word_proba_given_topic)
    # # test for a document d
    # var_dirich, var_multinom, log_likelihoods = vi.variational_inference(
    #     docs[doc_num], dirich_param, log_word_proba_given_topic, **kwargs)
    # plt.figure(1)
    # plt.plot(log_likelihoods)
    # plt.xlabel('iterations')
    # plt.ylabel('expected log-likelihood')
    # plt.title('expected log-likelihood for a document d, k = '
    #           + str(n_topics))
    # test for a corpus
    logger = Dirich_features_logger(root_results_dir = results_dir(),
                                    description = description)
    (dirich_param, word_logproba_given_topic, corpus_log_likelihood) \
        = vi.latent_dirichlet_allocation(docs, n_topics, voc_size,
                                         max_iter = 200,
                                         var_inf_max_iter = 200,
                                         logger = logger)
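
# The test above relies on vi.variational_inference for the per-document step.
# As a point of reference only, the sketch below implements the standard
# mean-field updates for LDA from Blei, Ng & Jordan (2003); it is NOT the
# project's vi module. It assumes each document is an array of
# (term_id, count) rows, as produced by dp.build_voc, and uses
# scipy.special.digamma.
def _sketch_variational_inference(doc, dirich_param, log_word_proba_given_topic,
                                  max_iter = 100):
    from scipy.special import digamma
    n_topics = log_word_proba_given_topic.shape[0]
    word_ids = doc[:, 0]
    counts = doc[:, 1].astype(float)
    # gamma initialised as alpha + N / k, as in the original paper
    var_dirich = np.full(n_topics, dirich_param + counts.sum() / n_topics)
    for _ in range(max_iter):
        # phi update: phi_{n,i} proportional to beta_{i,w_n} * exp(digamma(gamma_i)),
        # normalised over topics for each word (done in log space)
        log_phi = (log_word_proba_given_topic[:, word_ids]
                   + digamma(var_dirich)[:, np.newaxis])
        log_phi -= np.logaddexp.reduce(log_phi, axis = 0)
        # gamma update: gamma_i = alpha + sum_n count_n * phi_{n,i}
        var_dirich = dirich_param + np.exp(log_phi).dot(counts)
    return var_dirich, np.exp(log_phi)
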
def prepare_data(reuters_files, data_file_name, voc_file_name):
    """Build the vocabulary from the given Reuters files, dump the corpus to
    data_file_name (one document per line, as
    '<nb_unique_terms> <term_id>:<count> ...') and the vocabulary to
    voc_file_name (one term per line, ordered by term index)."""
    voc, docs = dp.build_voc(reuters_files)
    with open(data_file_name, 'w') as data_file:
        for doc in docs:
            # first field: number of distinct terms in the document
            data_file.write('%d ' % np.size(doc, axis = 0))
            for word in doc:
                # remaining fields: <term_id>:<count>
                data_file.write('%d:%d ' % (word[0], word[1]))
            data_file.write('\n')
    # write the vocabulary, one term per line, sorted by its integer index
    sorted_voc = sorted(voc.items(), key = itemgetter(1))
    with open(voc_file_name, 'w') as voc_file:
        for item in sorted_voc:
            voc_file.write('%s\n' % item[0])
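
# Hypothetical helper, not part of the original module: reads back a corpus
# file written by prepare_data, returning each document as a numpy array of
# (term_id, count) rows. Shown only to document the on-disk format, e.g. a
# document with three occurrences of term 4 and one of term 17 is stored as
# the line '2 4:3 17:1'.
def _sketch_load_prepared_data(data_file_name):
    docs = []
    with open(data_file_name) as data_file:
        for line in data_file:
            fields = line.split()
            if not fields:
                continue
            # fields[0] is the number of distinct terms; the remaining fields
            # are 'term_id:count' pairs
            pairs = [f.split(':') for f in fields[1:]]
            docs.append(np.array([[int(t), int(c)] for t, c in pairs]))
    return docs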