def get_topic_doc(doc_topics, num_topics, proc_corpus):
    """Group corpus segments by topic, keeping only those above the cutoff.

    Args:
        doc_topics: One entry per document; each entry is an iterable of
            pairs whose element 0 is a topic id and element 1 is that
            topic's relevance to the document.
        num_topics: Number of topics. Only topic ids in
            ``range(num_topics)`` are collected; pairs carrying any other
            id are ignored.
        proc_corpus: Corpus aligned index-by-index with ``doc_topics``.
            Element 0 of each entry is used as the segment ID.

    Returns:
        A dict mapping each topic id to a list of
        ``(segment_id, relevance)`` tuples, sorted by relevance in
        descending order and restricted to entries whose relevance is
        strictly greater than the value returned by ``threshold`` for that
        topic's relevance list.

    Raises:
        AssertionError: If ``doc_topics`` and ``proc_corpus`` differ in
            length.
    """
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(doc_topics) != len(proc_corpus):
        raise AssertionError(
            "output from mallet is of different length than input corpus"
        )

    topic_doc = {topic_id: [] for topic_id in range(num_topics)}

    # Walk documents and their corpus entries in lockstep.
    for segment, topics in zip(proc_corpus, doc_topics):
        for topic in topics:
            if topic[0] in topic_doc:
                # (ID of the segment in the corpus, topic relevance to it)
                topic_doc[topic[0]].append((segment[0], topic[1]))

    # Re-assigning existing keys while iterating does not resize the dict.
    for topic_id, segments in topic_doc.items():
        ranked = sorted(segments, key=itemgetter(1), reverse=True)
        thresh = threshold([seg[1] for seg in ranked])
        topic_doc[topic_id] = [seg for seg in ranked if seg[1] > thresh]

    return topic_doc
def run_elbow(model, feature_names):
    """Print and return each topic's top features, limited by the elbow cutoff.

    Args:
        model: Fitted topic model exposing ``components_``, an iterable of
            per-topic weight rows that support ``argsort()`` (for example
            ``sklearn.decomposition.LatentDirichletAllocation``).
        feature_names: Feature names, indexed by the positions that
            ``argsort()`` returns for a row of ``model.components_``.

    Returns:
        A list with one message string per topic, each of the form
        ``"Topic #<idx>: <feature>: <weight>, <feature>: <weight>, "``.
        The same messages are printed as a side effect.
    """
    print("Elbow Limited Topics:")
    message_list = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort() returns ascending indices; reverse for heaviest first.
        list_feat = [feature_names[i] for i in topic.argsort()[::-1]]

        # Sort the weights once. threshold() gets its own copy so that
        # limit_by_threshold() still receives an untouched list.
        feat_freq = sorted(topic, reverse=True)
        cutoff = threshold(list(feat_freq))
        limited_freq = limit_by_threshold(feat_freq, cutoff)

        # Pair each kept weight with the feature name at the same rank.
        pairs = "".join(
            "%s: %s, " % (list_feat[rank], freq)
            for rank, freq in enumerate(limited_freq)
        )
        message = "Topic #%d: " % topic_idx + pairs
        message_list.append(message)
        print(message)
    print()
    return message_list
def visualize(doc_topic_dist):
    """Plot each topic's sorted association curve together with its cutoff.

    One figure is opened per topic index in ``range(NUM_TOPICS)``. After all
    figures have been shown, the function blocks on ``input()`` until the
    user presses Enter.

    Args:
        doc_topic_dist: Two-dimensional array indexed as
            ``doc_topic_dist[:, topic]``. Presumably rows are documents,
            as the x-axis label suggests -- verify against the caller.
    """
    for topic_idx in range(NUM_TOPICS):
        figure = plt.figure(topic_idx)

        # Association values for this topic, largest first.
        descending = sorted(doc_topic_dist[:, topic_idx], reverse=True)
        plt.plot(descending)

        # NOTE(review): threshold() receives ascending values here, while
        # the curve above is descending -- confirm threshold() does not
        # depend on input order.
        ascending = sorted(doc_topic_dist[:, topic_idx])
        cutoff = threshold(ascending)

        # Mark the cutoff value with a green dot at x = 1.
        plt.plot(1, cutoff, 'g', marker='o')

        plt.ylabel("Association")
        plt.xlabel("Document")
        title = "Topic %d" % topic_idx
        plt.title(title)
        figure.show()

    # Wait for the user before returning.
    input()
def get_topic_word(topics):
    """Trim each topic's word list at the first word below the topic's cutoff.

    Args:
        topics: Iterable of topic entries. Element 0 of each entry is kept
            as-is (the topic identifier) and element 1 is a sliceable
            sequence of word entries whose element 1 is the word's weight.

    Returns:
        A new list of ``[topic_id, words]`` lists. For each topic, ``words``
        is cut just before the first entry whose weight is below the value
        returned by ``threshold`` for that topic's weights. If no entry is
        below the cutoff, the original sequence is kept unchanged. The
        input is not mutated.
    """
    trimmed = []
    for topic in topics:
        topic_id, words = topic[0], topic[1]
        thresh = threshold([word[1] for word in words])

        # Position of the first word that falls below the cutoff, if any.
        cut = next(
            (pos for pos, word in enumerate(words) if word[1] < thresh),
            None,
        )
        if cut is not None:
            words = words[:cut]

        trimmed.append([topic_id, words])
    return trimmed