def _dump_model(model, path):
    """Pickle ``model`` to ``path`` using protocol ``-1``."""
    # Protocol -1 selects a binary pickle format, so the file has to be opened
    # in binary mode; the ``with`` block closes the handle even if pickling
    # raises.
    with open(path, "wb") as model_file:
        cPickle.dump(model, model_file, -1)


def _score_fixed_test_set(c_test, lda_alpha, lda_beta):
    """Sum the per-document E-step results over ``c_test.docs``.

    Each document is passed to both ``hdp.lda_e_step`` and
    ``hdp.lda_e_step_split`` together with ``lda_alpha`` and ``lda_beta``.

    Returns:
        A ``(score, split_score, split_word_count)`` tuple holding the summed
        first return value of ``lda_e_step``, the summed first return value
        of ``lda_e_step_split`` and the summed count reported by
        ``lda_e_step_split``.
    """
    score = 0.0
    split_score = 0.0
    split_word_count = 0
    for doc in c_test.docs:
        doc_score, _ = hdp.lda_e_step(doc, lda_alpha, lda_beta)
        score += doc_score
        doc_split_score, doc_count, _ = hdp.lda_e_step_split(doc, lda_alpha, lda_beta)
        split_score += doc_split_score
        split_word_count += doc_count
    return score, split_score, split_word_count


def run_hdp():
    """Train an HDP model as configured on the command line and log progress.

    Everything is driven by the object returned by ``parse_args()``:

    1. Seed ``random``, read the fixed test corpus and, when
       ``test_data_path_in_folds`` is set, the fold files matching that glob.
    2. Create ``<directory>/corpus-<corpus_name>`` and write ``options.dat``.
    3. Call ``bhdp.em_on_large_data`` repeatedly until ``max_iter`` iterations
       have run (``-1`` disables the iteration limit) or the accumulated time
       reaches ``max_time``.  After every iteration a row is appended to
       ``log.dat``, the topics and the pickled model are saved under
       ``doc_count-<n>.*`` and a score row is appended to ``test-log.dat``.
    4. Save ``final.topics`` / ``final.model`` and append one more score row.
    """
    options = parse_args()
    random.seed(options.random_seed)

    # The training corpus is handed to the model by file name, not loaded here.
    c_train_filename = options.data_path

    test_data_path = options.test_data_path
    c_test = read_data(test_data_path)
    c_test_word_count = sum(doc.total for doc in c_test.docs)

    if options.test_data_path_in_folds is not None:
        fold_filenames = sorted(glob(options.test_data_path_in_folds))
        # Floor division keeps the fold count an integer on every Python
        # version; a trailing unpaired file is ignored.
        num_folds = len(fold_filenames) // 2
        # After sorting, even positions are the "test" half of a fold and odd
        # positions the "train" half.
        test_data_test_filenames = fold_filenames[0:2 * num_folds:2]
        test_data_train_filenames = fold_filenames[1:2 * num_folds:2]
        # NOTE: these corpora are only consumed by the fold evaluation that is
        # currently commented out inside the training loop below.
        c_test_train_folds = [read_data(filename) for filename in test_data_train_filenames]
        c_test_test_folds = [read_data(filename) for filename in test_data_test_filenames]

    result_directory = "%s/corpus-%s" % (options.directory, options.corpus_name)
    print("creating directory %s" % result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the full option set next to the results.
    with open("%s/options.dat" % result_directory, "w") as options_file:
        for opt, value in options.__dict__.items():
            options_file.write(str(opt) + " " + str(value) + "\n")

    print("creating hdp instance.")
    bhdp_hp = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                     options.gamma, options.gamma, False)
    bhdp = hdp.hdp(options.T, options.K, options.D, options.W, options.eta, bhdp_hp)
    # bhdp.seed_init(c_train)

    print("setting up counters and log files.")
    iteration = 0
    total_time = 0.0
    total_doc_count = 0
    likelihood = 0.0
    old_likelihood = 0.0
    converge = 1.0

    test_row_format = "%d %d %d %.5f %d %.5f %d\n"

    with open("%s/test-log.dat" % result_directory, "w") as test_log_file:
        test_log_file.write(
            "iteration time doc.count score word.count score.split word.count.split\n")

        with open("%s/log.dat" % result_directory, "w") as log_file:
            log_file.write("iteration time doc.count likelihood\n")

            while ((options.max_iter == -1 or iteration < options.max_iter)
                   and total_time < options.max_time):
                # NOTE: time.clock() exists on Python 2 (which this module
                # targets) but was removed in Python 3.8.
                t0 = time.clock()

                # Run one step iteration.
                likelihood = bhdp.em_on_large_data(c_train_filename,
                                                   options.var_converge,
                                                   fresh=(iteration == 0))
                if iteration > 0:
                    # Relative change of the likelihood since the last pass.
                    converge = (likelihood - old_likelihood) / abs(old_likelihood)
                old_likelihood = likelihood
                print("iter = %d, likelihood = %f, converge = %f"
                      % (iteration, likelihood, converge))
                if converge < 0:
                    print("warning, likelihood is decreasing!")

                # Only the EM step is timed; logging and scoring are excluded.
                total_time += time.clock() - t0

                iteration += 1  # increase the iter counter
                total_doc_count += options.D

                # total_time is written with %d, i.e. truncated to an integer.
                log_file.write("%d %d %d %.5f\n"
                               % (iteration, total_time, total_doc_count, likelihood))
                log_file.flush()

                bhdp.save_topics('%s/doc_count-%d.topics'
                                 % (result_directory, total_doc_count))
                _dump_model(bhdp, '%s/doc_count-%d.model'
                            % (result_directory, total_doc_count))

                print("\tworking on predictions.")
                (lda_alpha, lda_beta) = bhdp.hdp_to_lda()

                # prediction on the fixed test in folds
                print("\tworking on fixed test data.")
                test_score, test_score_split, c_test_word_count_split = \
                    _score_fixed_test_set(c_test, lda_alpha, lda_beta)
                test_log_file.write(test_row_format
                                    % (iteration, total_time, total_doc_count,
                                       test_score, c_test_word_count,
                                       test_score_split, c_test_word_count_split))
                test_log_file.flush()

                # prediction on the test set in the folds (currently disabled)
                # print "\tworking on test data in folds."
                # test_folds_log_file = file("%s/doc_count-%d.test.folds" % (result_directory, total_doc_count), "w")
                # test_folds_log_file.write("fold doc.id word count score\n")
                # for i in range(num_folds):
                #   train_data = c_test_train_folds[i]
                #   test_data = c_test_test_folds[i]
                #   for (doc_id, train_doc, test_doc) in izip(range(train_data.num_docs), train_data.docs, test_data.docs):
                #     if test_doc.total > 0:
                #       (likelihood, gamma) = hdp.lda_e_step(train_doc, lda_alpha, lda_beta)
                #       theta = gamma/np.sum(gamma)
                #       lda_betad = lda_beta[:, test_doc.words]
                #       log_predicts = np.log(np.dot(theta, lda_betad))
                #       log_info = "\n".join(["%d %d %d %d %.5f" % (i, doc_id, word, word_count, f) for (word, word_count, f) in izip(test_doc.words, test_doc.counts, log_predicts)])
                #       test_folds_log_file.write(log_info + "\n")
                # test_folds_log_file.close()

        print("Saving the final model and topics.")
        bhdp.save_topics('%s/final.topics' % result_directory)
        _dump_model(bhdp, '%s/final.model' % result_directory)

        (lda_alpha, lda_beta) = bhdp.hdp_to_lda()

        # prediction on the fixed test in folds
        print("\tworking on fixed test data.")
        test_score, test_score_split, c_test_word_count_split = \
            _score_fixed_test_set(c_test, lda_alpha, lda_beta)
        # The counters are unchanged since the last loop pass, so this row
        # carries the same iteration/time/doc.count values as the previous one.
        test_log_file.write(test_row_format
                            % (iteration, total_time, total_doc_count,
                               test_score, c_test_word_count,
                               test_score_split, c_test_word_count_split))
        test_log_file.flush()
def run_hdp():
    """Train the HDP model with labels, then classify the test documents.

    Driven by the object returned by ``parse_args()``:

    1. Seed ``random``; read the training and test corpora plus the label and
       size arrays from fixed file names in the working directory.
    2. Truncate ``localsticks_a.txt`` / ``localsticks_b.txt``, create
       ``<directory>/corpus-<corpus_name>`` and write ``options.dat``.
    3. Call ``bhdp.em`` until ``max_iter`` iterations have run, pausing for a
       line on stdin after each one.
    4. Run ``doc_inference`` for every test document with a second hdp
       instance, convert the resulting matrix with
       ``utils.converttoliblinear`` and print the accuracy returned by
       ``evaluations``.

    Raises:
        RuntimeError: if no training iteration ran, so there is no trained
            model to predict with (previously a late ``NameError``).
    """
    options = parse_args()
    random.seed(options.random_seed)

    # Read the training data.
    c_train = read_data(options.data_path)
    trlabels = utils.read_1D_array('trdataSLDALabels.txt')
    trsz = utils.read_1D_array('trdatasz.txt')[0]

    # Read the test data.
    c_test = read_data(options.test_data_path)
    # NOTE(review): the test labels and test size are read from the *training*
    # files ('trdata...'). This looks like a copy-paste slip - confirm the
    # intended test file names before trusting the reported accuracy.
    testlabels = utils.read_1D_array('trdataSLDALabels.txt')
    testsz = utils.read_1D_array('trdatasz.txt')[0]

    # Start from empty stick files (presumably written to elsewhere - verify).
    for sticks_path in ("localsticks_a.txt", "localsticks_b.txt"):
        open(sticks_path, "w").close()

    result_directory = "%s/corpus-%s" % (options.directory, options.corpus_name)
    print("creating directory %s" % result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the full option set next to the results; the ``with`` block
    # closes the file even if a write fails.
    with open("%s/options.dat" % result_directory, "w") as options_file:
        for opt, value in options.__dict__.items():
            options_file.write(str(opt) + " " + str(value) + "\n")

    print("creating hdp instance for training.")
    bhdp_hp = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                     options.gamma, options.gamma, True)
    bhdp = hdp.hdp(options.T, options.K, options.D, options.W, options.eta,
                   trsz, bhdp_hp)

    print("setting up counters and log files.")
    iteration = 0
    total_time = 0.0
    total_doc_count = 0
    likelihood = 0.0
    old_likelihood = 0.0
    converge = 1.0
    trmodel = None

    # NOTE(review): with max_iter == -1 this loop has no exit condition.
    while options.max_iter == -1 or iteration < options.max_iter:
        # NOTE: time.clock() exists on Python 2 (which this module targets)
        # but was removed in Python 3.8.
        t0 = time.clock()

        # Run one step iteration.
        print("EM iteration starts!")
        likelihood, trmodel = bhdp.em(c_train, trlabels, options.var_converge,
                                      trsz, fresh=(iteration == 0))
        if iteration > 0:
            # Relative change of the likelihood since the previous pass.
            converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood
        print("iter = %d, likelihood = %f, converge = %f"
              % (iteration, likelihood, converge))
        if converge < 0:
            print("warning, likelihood is decreasing!")

        total_time += time.clock() - t0
        iteration += 1  # increase the iter counter
        total_doc_count += options.D

        # Interactive pause: waits for a line on stdin before continuing.
        # NOTE(review): the source layout was lost, so whether this pause
        # belongs inside the loop or directly after it needs confirming.
        raw_input()

    if iteration == 0:
        # Without at least one EM pass there is no classifier to evaluate.
        raise RuntimeError("no training iteration was run (max_iter=%r); "
                           "cannot evaluate on the test data" % options.max_iter)

    # prediction on the fixed test data
    print("\tworking on fixed test data.")
    print("creating hdp instance for test.")
    bhdp_hp_test = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                          options.gamma, options.gamma, False)
    bhdp_test = hdp.hdp(options.T, options.K, options.D, options.W, options.eta,
                        testsz, bhdp_hp_test)

    # prediction on test data, reusing the quantities learned in training
    Elogbeta = dirichlet_expectation(bhdp.m_beta)  # the topics
    Elogsticks_1st = expect_log_sticks(bhdp.m_var_sticks)  # global sticks
    m_var_zeta = np.zeros((testsz, options.K))
    for docnum, doc in enumerate(c_test.docs):
        # The per-document likelihood is returned first but is not reported.
        _, m_var_zeta = bhdp_test.doc_inference(doc, docnum, Elogbeta,
                                                Elogsticks_1st,
                                                options.var_converge,
                                                m_var_zeta)

    x = utils.converttoliblinear(m_var_zeta)
    y = testlabels
    # presumably liblinear's predict/evaluations helpers - verify the import
    p_label, p_acc, p_val = predict(y, x, trmodel)
    ACC, MSE, SCC = evaluations(y, p_label)
    print("test accuracy: %f" % ACC)
def run_hdp():
    """Train the HDP model with labels, then classify the test documents.

    Driven by the object returned by ``parse_args()``:

    1. Seed ``random``; read the training and test corpora plus the label and
       size arrays from fixed file names in the working directory.
    2. Truncate ``localsticks_a.txt`` / ``localsticks_b.txt``, create
       ``<directory>/corpus-<corpus_name>`` and write ``options.dat``.
    3. Call ``bhdp.em`` until ``max_iter`` iterations have run, pausing for a
       line on stdin after each one.
    4. Run ``doc_inference`` for every test document with a second hdp
       instance, convert the resulting matrix with
       ``utils.converttoliblinear`` and print the accuracy returned by
       ``evaluations``.

    Raises:
        RuntimeError: if no training iteration ran, so there is no trained
            model to predict with (previously a late ``NameError``).
    """
    options = parse_args()
    random.seed(options.random_seed)

    # Read the training data.
    c_train = read_data(options.data_path)
    trlabels = utils.read_1D_array('trdataSLDALabels.txt')
    trsz = utils.read_1D_array('trdatasz.txt')[0]

    # Read the test data.
    c_test = read_data(options.test_data_path)
    # NOTE(review): the test labels and test size are read from the *training*
    # files ('trdata...'). This looks like a copy-paste slip - confirm the
    # intended test file names before trusting the reported accuracy.
    testlabels = utils.read_1D_array('trdataSLDALabels.txt')
    testsz = utils.read_1D_array('trdatasz.txt')[0]

    # Start from empty stick files (presumably written to elsewhere - verify).
    for sticks_path in ("localsticks_a.txt", "localsticks_b.txt"):
        open(sticks_path, "w").close()

    result_directory = "%s/corpus-%s" % (options.directory, options.corpus_name)
    print("creating directory %s" % result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the full option set next to the results; the ``with`` block
    # closes the file even if a write fails.
    with open("%s/options.dat" % result_directory, "w") as options_file:
        for opt, value in options.__dict__.items():
            options_file.write(str(opt) + " " + str(value) + "\n")

    print("creating hdp instance for training.")
    train_hyper = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                         options.gamma, options.gamma, True)
    bhdp = hdp.hdp(options.T, options.K, options.D, options.W, options.eta,
                   trsz, train_hyper)

    print("setting up counters and log files.")
    iteration = 0
    total_time = 0.0
    total_doc_count = 0
    likelihood = 0.0
    old_likelihood = 0.0
    converge = 1.0
    trmodel = None

    # NOTE(review): with max_iter == -1 this loop has no exit condition.
    while options.max_iter == -1 or iteration < options.max_iter:
        # NOTE: time.clock() exists on Python 2 (which this module targets)
        # but was removed in Python 3.8.
        started = time.clock()

        # Run one step iteration.
        print("EM iteration starts!")
        likelihood, trmodel = bhdp.em(c_train, trlabels, options.var_converge,
                                      trsz, fresh=(iteration == 0))
        if iteration > 0:
            # Relative change of the likelihood since the previous pass.
            converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood
        print("iter = %d, likelihood = %f, converge = %f"
              % (iteration, likelihood, converge))
        if converge < 0:
            print("warning, likelihood is decreasing!")

        total_time += time.clock() - started
        iteration += 1  # increase the iter counter
        total_doc_count += options.D

        # Interactive pause: waits for a line on stdin before continuing.
        # NOTE(review): the source layout was lost, so whether this pause
        # belongs inside the loop or directly after it needs confirming.
        raw_input()

    if iteration == 0:
        # Without at least one EM pass there is no classifier to evaluate.
        raise RuntimeError("no training iteration was run (max_iter=%r); "
                           "cannot evaluate on the test data" % options.max_iter)

    # prediction on the fixed test data
    print("\tworking on fixed test data.")
    print("creating hdp instance for test.")
    test_hyper = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                        options.gamma, options.gamma, False)
    bhdp_test = hdp.hdp(options.T, options.K, options.D, options.W, options.eta,
                        testsz, test_hyper)

    # prediction on test data, reusing the quantities learned in training
    Elogbeta = dirichlet_expectation(bhdp.m_beta)  # the topics
    Elogsticks_1st = expect_log_sticks(bhdp.m_var_sticks)  # global sticks
    m_var_zeta = np.zeros((testsz, options.K))
    for docnum, doc in enumerate(c_test.docs):
        # The per-document likelihood is returned first but is not reported.
        _, m_var_zeta = bhdp_test.doc_inference(doc, docnum, Elogbeta,
                                                Elogsticks_1st,
                                                options.var_converge,
                                                m_var_zeta)

    x = utils.converttoliblinear(m_var_zeta)
    y = testlabels
    # presumably liblinear's predict/evaluations helpers - verify the import
    p_label, p_acc, p_val = predict(y, x, trmodel)
    ACC, MSE, SCC = evaluations(y, p_label)
    print("test accuracy: %f" % ACC)