Пример #1
0
    def get_graph_log():
        """Plot perplexity and log-likelihood against the number of topics.

        Re-runs ``gibbs`` once for every topic count in ``range(20, 200, 40)``
        using a short run of 5 iterations, then saves two figures under
        ``output/``:

        - ``perplexity_vs_topics_<hashname>.png``: last perplexity value of
          each run versus the topic count.
        - ``loglk_<hashname>.png``: the likelihood value returned by each run
          versus the topic count.

        Relies on names from the enclosing scope: ``data``, ``V``, ``M``,
        ``alpha``, ``beta``, ``burn_in``, ``sample_lag``, ``start`` and
        ``hashname``.
        """
        # NOTE(review): the sweep uses only 5 iterations while burn_in and
        # sample_lag are taken from the enclosing scope -- confirm gibbs still
        # returns a non-empty perplexity list for such a short run.
        new_iterations = 5
        n_topics = []
        loglkhs = []
        perplexities = []
        for k in range(20, 200, 40):
            # Only the likelihood and the perplexity trace are used here; the
            # other two return values are discarded so they cannot be confused
            # with the results of the main run.
            _, lkh, perplex2, _ = gibbs(
                data, k, V, M, alpha, beta,
                new_iterations, burn_in, sample_lag)
            n_topics.append(k)
            loglkhs.append(lkh)
            perplexities.append(perplex2[-1])

        # Seconds elapsed since `start`. The single-argument call form prints
        # the same text under both Python 2 and Python 3.
        print(time.time() - start)

        # graph perplexity score vs number of topics
        plt.plot(n_topics, perplexities, 'o-')
        plt.title("perplexity score vs number of topics")
        plt.ylabel("perplexity score")
        plt.xlabel("Number of topics(T)")
        plt.savefig('output/perplexity_vs_topics_'+hashname+'.png')
        plt.close()

        # plot log-likelihood
        plt.plot(n_topics, loglkhs, 'o-')
        plt.title("log-likelihood")
        plt.ylabel("log P(w|T)")
        plt.xlabel("Number of topics(T)")
        # Log-likelihood values are typically negative, and a plain 'log'
        # axis cannot display non-positive data. 'symlog' keeps the
        # logarithmic spacing while handling values of either sign.
        plt.yscale('symlog')
        plt.savefig('output/loglk_'+hashname+'.png')
        plt.close()
Пример #2
0
def main(inputfile):
    """Run the Gibbs-sampling topic model on ``inputfile`` and save results.

    Builds a ``WordModel`` from ``inputfile``, runs ``gibbs`` with K = 100
    topics, and writes the thresholded per-topic word counts to
    ``output/results_<hashname>.txt``. ``<hashname>`` is a random 32-bit
    number rendered as a string. When the module-level flag ``has_mpl`` is
    truthy, perplexity and log-likelihood plots are also saved to ``output``.

    Args:
        inputfile: passed unchanged to ``WordModel``.

    Returns:
        None. The function is used for its file output.
    """
    hashname = str(random.getrandbits(32))
    wd = WordModel(inputfile)
    data = wd.data

    V = wd.V
    M = wd.M
    # topics
    K = 100
    # alternative previously tried: alpha = 0.5
    alpha = 50/float(K)
    # alternative previously tried: beta = 200/float(V)
    beta = 0.01

    # gibbs settings
    iterations = 60
    burn_in = 20
    sample_lag = 10

    # run simulation
    start = time.time()
    # gibbs sampling; the second return value is not used by the main run
    z, _, perplex, data_iter = gibbs(
        data, K, V, M, alpha, beta, iterations, burn_in, sample_lag)

    wordtopics = wd.map_wordtopics(z)
    wctable = wd.get_wordcount(wordtopics)
    top_results = wd.threshold_wordtable(wctable)

    # Every output file goes to this directory. Create it up front so a
    # fresh checkout does not fail on the first open()/savefig().
    out_dir = "output"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # save results
    out_path = os.path.join(out_dir, 'results_'+hashname+'.txt')
    # The context manager closes the file even if a write raises.
    with open(out_path, 'w') as savedfile:
        for i, item in enumerate(top_results):
            savedfile.write("Topic %d\n" % i)
            savedfile.write("word\t\tfreq\n----------      ----------\n")
            for word, freq in item:
                # Words longer than 7 characters get one tab, shorter ones
                # two, to keep the frequency column roughly aligned.
                sep = "\t" if len(word) > 7 else "\t\t"
                savedfile.write("%s%s%d\n" % (word, sep, freq))
            savedfile.write("\n")

    def get_graph_log():
        """Plot perplexity and log-likelihood against the number of topics.

        Re-runs ``gibbs`` for every topic count in ``range(20, 200, 40)``
        with 5 iterations each and saves two PNG figures in ``out_dir``.
        """
        # NOTE(review): 5 iterations is below burn_in (20) -- confirm gibbs
        # still returns a non-empty perplexity list for such a short run.
        new_iterations = 5
        n_topics = []
        loglkhs = []
        perplexities = []
        for k in range(20, 200, 40):
            # Discard the assignments and iteration axis of the sweep runs;
            # only the likelihood and perplexity trace are plotted.
            _, lkh, perplex2, _ = gibbs(
                data, k, V, M, alpha, beta,
                new_iterations, burn_in, sample_lag)
            n_topics.append(k)
            loglkhs.append(lkh)
            perplexities.append(perplex2[-1])

        # Seconds elapsed since `start`. The call form prints the same text
        # under both Python 2 and Python 3.
        print(time.time() - start)

        # graph perplexity score vs number of topics
        plt.plot(n_topics, perplexities, 'o-')
        plt.title("perplexity score vs number of topics")
        plt.ylabel("perplexity score")
        plt.xlabel("Number of topics(T)")
        plt.savefig(os.path.join(
            out_dir, 'perplexity_vs_topics_'+hashname+'.png'))
        plt.close()

        # plot log-likelihood
        plt.plot(n_topics, loglkhs, 'o-')
        plt.title("log-likelihood")
        plt.ylabel("log P(w|T)")
        plt.xlabel("Number of topics(T)")
        # Log-likelihood values are typically negative, and a plain 'log'
        # axis cannot display non-positive data. 'symlog' keeps the
        # logarithmic spacing while handling values of either sign.
        plt.yscale('symlog')
        plt.savefig(os.path.join(out_dir, 'loglk_'+hashname+'.png'))
        plt.close()

    def get_graph_perplex():
        """Plot the main run's perplexity trace against ``data_iter``."""
        plt.plot(data_iter, perplex, 'o-')
        plt.title("perplexity score")
        plt.ylabel("Predicted perplexity")
        plt.xlabel("Number of iterations")
        plt.savefig(os.path.join(out_dir, 'perplex_'+hashname+'.png'))
        plt.close()

    # Plotting is optional: skipped when has_mpl is falsy.
    if has_mpl:
        get_graph_perplex()
        get_graph_log()