Example #1
import os

import numpy as np
import matplotlib.pyplot as plt

# Project-level helpers assumed to be imported from the surrounding package:
# dmg (parameter I/O), cperplexity, PLSIModel, and LDAModel.
def test_perplexity_vs_ntopics(corpus, model_dir, tp):
    """Plot held-out perplexity against the number of topics for every saved
    PLSI and LDA model in model_dir whose filename contains the tag tp."""
    plsi_perplexities = []
    plsi_mg_perplexities = []
    lda_perplexities = []

    model_files = os.listdir(model_dir)
    model_files = [model_file for model_file in model_files if tp in model_file]
    model_full_files = [model_dir + "/" + model_file for model_file in model_files]
    idxs = corpus["idxs"]

    # The total token count is a property of the corpus, not of any model,
    # so compute it once before looping over the saved models.
    total_words = 0
    for j in range(len(idxs)):
        total_words += idxs[j].shape[0]
    print 'total words', total_words

    for i in range(len(model_files)):
        model_fname = model_files[i]
        model_fullname = model_full_files[i]
        # Filenames are expected to look like "<type>_<num_topics>_<corpus>...".
        model_split = model_fname.split('_')
        mtype, mk = model_split[0], int(model_split[1])
        params = dmg.load_model_params(model_fullname)
        # TODO: perhaps clean by allowing model to be instantiated from params
        # TODO: perhaps clean by allowing model to be instantiated from params

        if mtype == 'plsi':
            beta, pi = params["beta"], params["pi"]
            model = PLSIModel(mk, beta=beta, pi=pi)
            fin_ll, mg_ll = cperplexity(model, corpus, 'plsi')
            # perplexity = exp(-log-likelihood / total word count)
            perplexity = np.exp(-1 * fin_ll / total_words)
            perplexity_mg = np.exp(-1 * mg_ll / total_words)
            plsi_perplexities.append((mk, perplexity))
            plsi_mg_perplexities.append((mk, perplexity_mg))
        elif mtype == 'lda':
            alpha, beta = params["alpha"], params["beta"]
            model = LDAModel(mk, beta=beta, alpha=alpha)
            var_ll = cperplexity(model, corpus, 'lda')
            # Exponentiate the per-word negative variational log-likelihood so
            # the LDA numbers are on the same scale as the PLSI perplexities.
            perplexity = np.exp(-1 * var_ll / total_words)
            lda_perplexities.append((mk, perplexity))
        else:
            print "Skipping unrecognized model type:", mtype
            continue

        if mtype == 'plsi':
            print "Marg PLSI Perplexity: {0}".format(perplexity_mg)
        print "Model={0}, K={1}, Perplexity={2}".format(mtype, mk, perplexity)

    # sort perplexities by num topics
    plsi_perplexities = sorted(plsi_perplexities, key=lambda x: x[0])
    lda_perplexities = sorted(lda_perplexities, key=lambda x: x[0])
    plsi_perplexities = [list(t) for t in zip(*plsi_perplexities)]
    lda_perplexities = [list(t) for t in zip(*lda_perplexities)]

    # plotting
    line_plsi, = plt.plot(plsi_perplexities[0], plsi_perplexities[1], 'b^', label='PLSI')
    plt.plot(plsi_perplexities[0], plsi_perplexities[1], 'k')
    line_lda, = plt.plot(lda_perplexities[0], lda_perplexities[1], 'rs', label='LDA')
    plt.plot(lda_perplexities[0], lda_perplexities[1], 'r--')
    plt.xlabel("Number of Topics")
    plt.ylabel("Perplexity")
    plt.title("Perplexity vs. Num Topics for Various Models")
    plt.legend(handles=[line_plsi, line_lda])
    plt.show()
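
A minimal invocation sketch (hypothetical paths and toy data; the corpus layout is inferred from how the function reads corpus["idxs"], and cperplexity may require additional corpus fields):

# Toy corpus: each entry of "idxs" is one document's array of word indices.
corpus = {"idxs": [np.array([0, 4, 7, 7, 2]), np.array([1, 3, 3])]}
# Plot perplexity curves for every model in "saved_models" whose
# filename contains the substring "train".
test_perplexity_vs_ntopics(corpus, "saved_models", "train")
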
Example #2
import os

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project-level helpers assumed to be imported from the surrounding package:
# dmg (parameter I/O) and load_vocab.
def test_word_feature_reps(model_dir, num_words=500):
    """Embed each model's per-word topic-probability vectors in 2-D with t-SNE
    and plot a random sample of high-probability words as text labels."""
    model_files = os.listdir(model_dir)
    model_files = [model_dir + "/" + model_file for model_file in model_files]
    for model_file in model_files:
        last_part = model_file.split("/")[-1]
        print "\n", last_part, 'word features'
        params = dmg.load_model_params(model_file)
        # Per-word generation probabilities: rows are vocabulary entries,
        # columns are topics.
        if 'plsi' in model_file:
            word_gens = params['pi']
        else:
            word_gens = params["beta"]
        vocab = load_vocab(last_part)
        index = {v: k for (k, v) in vocab.items()}
        # Candidate pool: the 500 highest-probability words in each topic.
        best_words = np.argpartition(word_gens, -500, axis=0)
        best_words = best_words[-500:, :]
        pool = np.unique(best_words)
        # Sample num_words vocabulary indices from the pool and embed their
        # topic-probability rows (t-SNE on the raw indices would be meaningless).
        rand_idx = np.random.choice(pool, num_words)
        rand_words = word_gens[rand_idx, :]
        tsne = TSNE(n_components=2, random_state=0)
        projected = tsne.fit_transform(rand_words)
        # plotting
        fig = plt.figure(figsize=(18, 12))
        ax = fig.add_subplot(111)
        ax.plot(projected[:, 0], projected[:, 1], linestyle='None', marker='o', markersize=1)
        for i in range(projected.shape[0]):
            feat1, feat2 = projected[i, 0], projected[i, 1]
            word = index[rand_idx[i]].decode('utf-8')
            # Keep only ASCII characters so matplotlib can render the label.
            word = ''.join([c for c in word if ord(c) < 128])
            print word
            ax.text(feat1, feat2, word, fontsize=15)
        plt.title(last_part)
        plt.show()
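
A short usage sketch (hypothetical directory; each file in it is expected to load via dmg.load_model_params and to have a matching vocabulary retrievable with load_vocab):

# Show a t-SNE plot for every model saved under "saved_models",
# labelling 200 randomly sampled high-probability words per model.
test_word_feature_reps("saved_models", num_words=200)
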
Example #3
import os

import numpy as np

# Project-level helpers assumed to be imported from the surrounding package:
# dmg (parameter I/O) and load_vocab.
def test_best_topic_words(model_dir, result_dir, num_words=10):
    """Write the num_words highest-probability words of each topic to a text
    file in result_dir, one file per saved model."""
    model_files = os.listdir(model_dir)
    model_files = [model_dir + "/" + model_file for model_file in model_files]
    for model_file in model_files:
        last_part = model_file.split("/")[-1]
        print "\n", last_part, 'words'
        params = dmg.load_model_params(model_file)
        if 'plsi' in model_file:
            word_gens = params['pi']
        else:
            word_gens = params["beta"]
        vocab = load_vocab(last_part)
        index = {v: k for (k, v) in vocab.items()}
        # Top num_words vocabulary indices per topic; note that argpartition
        # does not order the words within each topic.
        best_words = np.argpartition(word_gens, -num_words, axis=0)
        best_words = best_words[-num_words:, :].T
        best_words = [[index[idx] for idx in words] for words in best_words]
        best_strings = [",".join(words) for words in best_words]
        best_strings = "\n".join(best_strings)
        print "\n", best_strings
        # Drop the model file's four-character extension when naming the output.
        with open(result_dir + "/bw_" + last_part[:-4] + ".txt", 'w') as result_file:
            result_file.write(best_strings)
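
A short usage sketch (hypothetical directory names; the result directory must already exist, since the function only opens files inside it):

# Write the ten best words per topic for every model in "saved_models"
# into results/bw_<model>.txt files.
if not os.path.exists("results"):
    os.makedirs("results")
test_best_topic_words("saved_models", "results", num_words=10)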