Пример #1
0
def get_topic_doc(doc_topics, num_topics, proc_corpus):
    """Collect, per topic, the corpus segments whose weight exceeds the topic's cutoff.

    Args:
        doc_topics: One entry per document; each entry is an iterable of
            pairs whose first item is a topic id and whose second item is
            the weight of that topic for the document.
        num_topics: Topic ids ``0 .. num_topics - 1`` are tracked; pairs
            carrying any other topic id are ignored.
        proc_corpus: Sequence parallel to ``doc_topics``. ``proc_corpus[i][0]``
            is stored as the identifier of document ``i`` (the original
            comment calls it the ID of the segment in the corpus).

    Returns:
        A dict mapping each tracked topic id to a list of
        ``(segment id, weight)`` tuples, ordered by weight descending and
        restricted to weights strictly greater than the value ``threshold``
        returns for that topic's weights.

    Raises:
        AssertionError: If ``doc_topics`` and ``proc_corpus`` differ in length.
    """
    assert len(doc_topics) == len(proc_corpus), "output from mallet is of different length than input corpus"

    segments_by_topic = {topic_id: [] for topic_id in range(num_topics)}

    # Walk documents and their (topic id, weight) pairs, filing each pair
    # under its topic together with the document's segment id.
    for doc_index, topic_weights in enumerate(doc_topics):
        for entry in topic_weights:
            topic_id = entry[0]
            if topic_id not in segments_by_topic:
                continue
            segments_by_topic[topic_id].append((proc_corpus[doc_index][0], entry[1]))

    # Only values are reassigned below, so iterating the dict is safe.
    for topic_id in segments_by_topic:
        ranked = sorted(segments_by_topic[topic_id], key=itemgetter(1), reverse=True)
        # threshold() receives the weights in descending order.
        cutoff = threshold([weight for _, weight in ranked])
        segments_by_topic[topic_id] = [pair for pair in ranked if pair[1] > cutoff]

    return segments_by_topic
Пример #2
0
def run_elbow(model, feature_names):
    """Print and return one summary line per topic, limited by the threshold cutoff.

    Args:
        model: Object exposing ``components_``; iterating it yields one
            per-topic weight array that supports ``argsort()``. The original
            docstring names sklearn's LatentDirichletAllocation as the
            expected model.
        feature_names: Indexable collection mapping a feature index to its name.

    Returns:
        A list with one string per topic, of the form
        ``"Topic #<idx>: <feature>: <weight>, ..."``. Each string is also
        printed, after a header line and followed by a blank line.
    """
    print("Elbow Limited Topics:")
    summaries = []

    for topic_idx, weights in enumerate(model.components_):
        # Feature names ordered by descending weight: argsort() is ascending,
        # so the index array is reversed.
        ranked_names = [feature_names[k] for k in weights.argsort()[::-1]]

        # Weights in the same descending order. threshold() gets its own copy
        # so the list handed to limit_by_threshold() is never shared.
        descending = sorted(weights, reverse=True)
        cutoff = threshold(list(descending))
        # presumably keeps only the leading weights that pass the cutoff --
        # limit_by_threshold is defined elsewhere; confirm there.
        kept = limit_by_threshold(descending, cutoff)

        pieces = [
            "%s: %s, " % (str(ranked_names[pos]), str(kept[pos]))
            for pos in range(len(kept))
        ]
        summary = ("Topic #%d: " % (topic_idx)) + "".join(pieces)

        summaries.append(summary)
        print(summary)
    print()

    return summaries
Пример #3
0
def visualize(doc_topic_dist):
    """Plot each topic's sorted association curve together with its cutoff.

    For every topic index below the module-level ``NUM_TOPICS`` a separate
    figure is opened showing column ``i`` of ``doc_topic_dist`` sorted in
    descending order, plus a green marker at the value ``threshold`` returns
    for that same descending sequence.

    Args:
        doc_topic_dist: 2-D array-like indexed as ``doc_topic_dist[:, i]``,
            where ``i`` is the topic index. Rows are presumably documents
            (the x axis is labelled "Document") -- confirm against the caller.

    Returns:
        None. Blocks on ``input()`` after all figures are shown so the
        windows stay open until the user presses Enter.
    """
    # go through each topic
    for i in range(NUM_TOPICS):
        f = plt.figure(i)

        # Sort once, descending, and use the same sequence for the curve and
        # for the cutoff. The cutoff was previously computed from an
        # ascending sort, unlike the plotted curve.
        associations = sorted(doc_topic_dist[:, i], reverse=True)
        plt.plot(associations)
        cutoff = threshold(associations)

        # The marker is drawn at x=1; only its height carries the cutoff.
        plt.plot(1, cutoff, 'g', marker='o')
        plt.ylabel("Association")
        plt.xlabel("Document")
        plt.title("Topic %d" % (i))
        f.show()

    input()
Пример #4
0
def get_topic_word(topics):
    """Trim each topic's word list at the first word below the topic's cutoff.

    Args:
        topics: Iterable of entries where ``entry[0]`` identifies the topic
            and ``entry[1]`` is a sequence of items whose second element
            (``item[1]``) is the word's weight.

    Returns:
        A new list of ``[entry[0], words]`` pairs. ``words`` is the original
        sequence cut just before the first item whose weight is below the
        value ``threshold`` returns for that topic's weights; when no item
        falls below it, the original sequence object is kept as is.
    """
    trimmed = [[entry[0], entry[1]] for entry in topics]

    for pair in trimmed:
        word_weights = pair[1]
        cutoff = threshold([item[1] for item in word_weights])

        # Cut at the first weight under the cutoff; later items are dropped
        # without being inspected.
        for position, item in enumerate(word_weights):
            if item[1] < cutoff:
                pair[1] = word_weights[:position]
                break

    return trimmed