import os
import random
import time

# NOTE: WordModel and gibbs are assumed to live in this project's own modules;
# adjust these import paths to match the actual package layout.
from wordmodel import WordModel
from gibbs import gibbs

# Plotting is optional: degrade gracefully when matplotlib is unavailable.
try:
    import matplotlib.pyplot as plt
    has_mpl = True
except ImportError:
    has_mpl = False


def main(inputfile):
    # random tag so repeated runs do not overwrite each other's output files
    hashname = str(random.getrandbits(32))

    wd = WordModel(inputfile)
    data = wd.data
    V = wd.V  # vocabulary size
    M = wd.M  # number of documents

    # number of topics
    K = 100
    # alpha = 0.5
    alpha = 50 / float(K)
    # beta = 200/float(V)  # or 0.01
    beta = 0.01

    # Gibbs sampler settings
    iterations = 60
    burn_in = 20
    sample_lag = 10

    # run simulation
    start = time.time()

    # Gibbs sampling
    z, lkh, perplex, data_iter = gibbs(
        data, K, V, M, alpha, beta, iterations, burn_in, sample_lag)

    wordtopics = wd.map_wordtopics(z)
    wctable = wd.get_wordcount(wordtopics)
    top_results = wd.threshold_wordtable(wctable)

    # save the top words per topic as a plain-text table
    out_filename = 'results_' + hashname + '.txt'
    out_path = os.path.join("output", out_filename)
    with open(out_path, 'w') as savedfile:
        for i, item in enumerate(top_results):
            savedfile.write("Topic %d\n" % i)
            savedfile.write("word\t\tfreq\n---------- ----------\n")
            for k, v in item:
                # short words get an extra tab so the columns line up
                if len(k) > 7:
                    savedfile.write("%s\t%d\n" % (k, v))
                else:
                    savedfile.write("%s\t\t%d\n" % (k, v))
            savedfile.write("\n")

    def get_graph_log():
        """Graph log-likelihood and perplexity over the number of topics chosen."""
        new_iterations = 5
        n_topics = []
        loglkhs = []
        perplexities = []
        for k in range(20, 200, 40):
            z, lkh, perplex2, data_iter = gibbs(
                data, k, V, M, alpha, beta, new_iterations, burn_in, sample_lag)
            n_topics.append(k)
            loglkhs.append(lkh)
            perplexities.append(perplex2[-1])
        print(time.time() - start)

        # graph perplexity score vs. number of topics
        plt.plot(n_topics, perplexities, 'o-')
        plt.title("perplexity score vs number of topics")
        plt.ylabel("perplexity score")
        plt.xlabel("Number of topics (T)")
        plt.savefig('output/perplexity_vs_topics_' + hashname + '.png')
        plt.close()

        # plot log-likelihood vs. number of topics
        plt.plot(n_topics, loglkhs, 'o-')
        plt.title("log-likelihood")
        plt.ylabel("log P(w|T)")
        plt.xlabel("Number of topics (T)")
        plt.yscale('log')
        plt.savefig('output/loglk_' + hashname + '.png')
        plt.close()

    def get_graph_perplex():
        """Graph perplexity over the iterations of the main sampling run."""
        # print(time.time() - start)
        plt.plot(data_iter, perplex, 'o-')
        plt.title("perplexity score")
        plt.ylabel("Predicted perplexity")
        plt.xlabel("Number of iterations")
        plt.savefig('output/perplex_' + hashname + '.png')
        plt.close()

    if has_mpl:
        get_graph_perplex()
        get_graph_log()
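
# A minimal command-line entry point, sketched below, is not in the original
# source. It assumes the script is run as `python <this_script>.py <corpus_file>`,
# where <corpus_file> is whatever input format WordModel expects.
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 2:
        sys.exit("usage: python %s <corpus_file>" % sys.argv[0])
    main(sys.argv[1])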