예제 #1
0
def infer(model_root, save_location, corpus):
    """Run variational LDA inference over every document in *corpus*.

    Loads a previously saved model from *model_root*, computes the
    variational likelihood of each document, writes the per-document
    likelihoods to "<save_location>-lda-likelihood.dat", and saves the
    final variational gamma matrix to "<save_location>-gamma.dat".

    Args:
        model_root: Path prefix of a saved LDAModel (see LDAModel.load_model).
        save_location: Path prefix for the two output files.
        corpus: Corpus exposing `num_docs` and `doc_list`; each document
            exposes `unique_word_count`.
    """
    # Create a placeholder model and populate it from the saved files.
    model = LDAModel(0, 0)
    model.load_model(model_root)

    # One gamma row per document, one column per topic.
    var_gamma = [[0] * model.num_topics for _ in range(corpus.num_docs)]

    # "with" guarantees the likelihood file is flushed and closed even if
    # lda_inference raises partway through the corpus (the original code
    # leaked the handle on error).
    filename = save_location + "-lda-likelihood.dat"
    with open(filename, "w") as file_pointer:

        for index in range(corpus.num_docs):

            # Progress indicator for long corpora.
            if index > 0 and index % 100 == 0:
                print("Document %d" % index)

            document = corpus.doc_list[index]
            # Fresh phi (per-word topic responsibilities) for each document.
            phi = [[0] * model.num_topics
                   for _ in range(document.unique_word_count)]

            # Determine the variational likelihood of this document.
            likelihood = util_functions.lda_inference(
                document, model, var_gamma[index], phi)

            # Write likelihood to file, one line per document.
            file_pointer.write("%5.5f\n" % likelihood)

    filename = save_location + "-gamma.dat"
    file_utils.save_gamma(filename, var_gamma, corpus.num_docs, model.num_topics)
예제 #2
0
def run_em(start, directory, corpus):
    """Fit an LDA model to *corpus* with variational EM and write results.

    Args:
        start: Initialization mode — "seeded" (sufficient statistics seeded
            from corpus documents), "random" (random initialization), or any
            other value, which is treated as the path prefix of a previously
            saved model to resume from.
        directory: Output directory. Receives the initial model ("000"),
            periodic snapshots every global_att.LAG iterations,
            "likelihood.dat", the final model, "final.gamma", and
            "word-assignments.dat".
        corpus: Corpus exposing `num_docs`, `num_terms`, `max_length()`,
            and `doc_list`.

    Side effects: doubles global_att.VAR_MAX_ITER whenever the EM
    likelihood decreases, and writes several files under *directory*.
    """
    # allocate variational parameters: one gamma row per document
    var_gamma = [[0 for x in range(global_att.NTOPICS)] \
                 for x in range(corpus.num_docs)]

    # phi is sized for the longest document and reused across all documents
    max_length = int(corpus.max_length())
    phi = [[0 for x in range(global_att.NTOPICS)] \
           for x in range(max_length)]

    # initialize model according to the requested start mode
    model = None

    if start == "seeded":

        model = ldamodel.LDAModel(corpus.num_terms, global_att.NTOPICS)
        ss = ldasuffstats.LDASuffStats(model)
        # seed topic statistics from corpus documents
        ss.corpus_initialize(model, corpus)
        model.mle(ss, 0)  # second arg 0: do not re-estimate alpha here
        model.alpha = global_att.INITIAL_ALPHA

    elif start == "random":
        model = ldamodel.LDAModel(corpus.num_terms, global_att.NTOPICS)
        ss = ldasuffstats.LDASuffStats(model)
        # seed topic statistics with random values
        ss.random_initialize(model)
        model.mle(ss, 0)  # second arg 0: do not re-estimate alpha here
        model.alpha = global_att.INITIAL_ALPHA

    else:
        # any other value of `start` is treated as a saved-model path prefix
        model = ldamodel.LDAModel(corpus.num_terms, global_att.NTOPICS)
        model.load_model(start)
        ss = ldasuffstats.LDASuffStats(model)

    # snapshot the initial model as iteration 000
    filename = directory + "/000"
    model.save_model(filename)

    # run expectation maximization

    index = 0
    converged = 1
    # NOTE(review): `likelihood` stays None if the while-loop below never
    # runs (e.g. EM_MAX_ITER < 0); the final e-step's `+=` would then fail.
    likelihood = None
    # small nonzero seed so the first convergence ratio is well-defined
    likelihood_old = 0.000001

    # NOTE(review): likelihood_file is not closed if an exception escapes
    # the loop below — consider a `with` block.
    filename = directory + "/likelihood.dat"
    likelihood_file = open(filename, "w")

    # iterate until the relative likelihood change drops below EM_CONVERGED
    # (with at least 2 iterations forced) or EM_MAX_ITER is exceeded
    while ((converged < 0) or (converged > global_att.EM_CONVERGED)
           or (index <= 2)) and (index <= global_att.EM_MAX_ITER):

        index += 1
        print("**** em iteration %d ****" % index)
        likelihood = 0
        # reset sufficient statistics accumulated during the e-step
        ss.zero_initialize(model)

        # e-step: infer per-document posteriors and accumulate statistics
        for doc_index in range(0, corpus.num_docs):

            # Show user script is still running
            if (doc_index % 100) == 0:
                print("Document %d" % doc_index)

            likelihood += util_functions.doc_e_step(corpus.doc_list[doc_index],
                                                    var_gamma[doc_index],
                                                    phi,
                                                    model,
                                                    ss)

        # m-step: re-estimate topics (and alpha, if ESTIMATE_ALPHA is set)
        model.mle(ss, global_att.ESTIMATE_ALPHA)

        # check for convergence: relative change in the likelihood bound
        converged = (likelihood_old - likelihood) / (likelihood_old)
        # a decrease in likelihood suggests the variational optimization
        # was cut short — allow more inner iterations next pass
        if converged < 0:
            global_att.VAR_MAX_ITER *= 2

        likelihood_old = likelihood

        # output model and likelihood
        likelihood_file.write("%10.10f\t%5.5e\n" % (likelihood, converged))

        # periodic snapshot every LAG iterations
        if (index % global_att.LAG) == 0:
            filename = "%s/%03d" % (directory, index)
            model.save_model(filename)
            filename = "%s/%03d.gamma" % (directory, index)
            file_utils.save_gamma(filename, var_gamma, corpus.num_docs, model.num_topics)

    # output the final model
    filename = "%s/final" % directory
    model.save_model(filename)
    filename = "%s/final.gamma" % directory
    file_utils.save_gamma(filename, var_gamma, corpus.num_docs, model.num_topics)

    # output the word assignments (for visualization)

    filename = "%s/word-assignments.dat" % directory
    w_assign_file = open(filename, "w")

    # final e-step at the fitted model; phi now holds the assignments
    # NOTE(review): `likelihood` still carries the last EM iteration's total
    # and is accumulated into here but never reported — confirm intended.
    for doc_index in range(0, corpus.num_docs):

        # Show user script is still processing
        if (doc_index % 100) == 0:
            print("final e step document %d" % doc_index)

        likelihood += util_functions.lda_inference(corpus.doc_list[doc_index], model, var_gamma[doc_index], phi)
        file_utils.write_word_assignment(w_assign_file, corpus.doc_list[doc_index], phi, model)

    w_assign_file.close()
    likelihood_file.close()