class LdaMalletHandler:
    """Convenience wrapper around gensim's Mallet LDA wrapper.

    Manages a model directory under ./ldamodels/<model_name>/ that holds
    the trained model, its dictionary, and Mallet's intermediate files.
    """

    def __init__(self, mallet_path):
        # Path to the Mallet binary; forwarded to LdaMallet on run/load.
        self.mallet_path = mallet_path

    def run_model(self, model_name, corpus, **kwargs):
        """Train an LdaMallet model on ``corpus`` (a list of token lists).

        Extra keyword args (num_topics, iterations, ...) are passed straight
        through to LdaMallet. All artifacts go to ./ldamodels/<model_name>/.
        """
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path, corpus_bow,
                               id2word=self.dictionary,
                               prefix="./ldamodels/" + model_name + "/",
                               **kwargs)

    def save_model(self):
        """Persist the trained model and its dictionary to the model dir."""
        self.model.save("ldamodels/" + self.model_name + "/model.model")
        self.dictionary.save("ldamodels/" + self.model_name + "/dict.dict")

    def load_model(self, model_name):
        """Load a previously saved model and dictionary by name."""
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load(
            "ldamodels/" + self.model_name + "/dict.dict")
        self.model = LdaMallet.load(
            "ldamodels/" + self.model_name + "/model.model")
        # The saved model may carry a stale binary path from another machine;
        # repoint it at this handler's Mallet binary.
        self.model.mallet_path = self.mallet_path

    def _retriever(self):
        # Lazily build the DocumentRetriever over Mallet's doc-topics file.
        # Extracted: the same hasattr boilerplate was duplicated in three
        # public methods.
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever

    def doc_topics(self, doc_idx):
        """Topic distribution of training document ``doc_idx``."""
        return self._retriever().doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        """Topic distribution of an unseen tokenized document, sorted by
        probability, highest first."""
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        """The ``n`` training documents most similar to ``ext_doc`` under
        ``metric`` over topic-distribution space."""
        doc_bow = self.dictionary.doc2bow(ext_doc)
        topics = [prob for _, prob in self.model[doc_bow]]
        return self._retriever().n_most_similar(topics, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        """The ``n`` documents closest to a pure one-hot ``topic`` vector."""
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        return self._retriever().n_most_similar(topics, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        """Topics as space-joined word strings (num_topics=-1 means all)."""
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics,
                                             num_words=num_words):
            # print_topics yields '0.030*"word" + ...'; after splitting on
            # the double quotes the words sit at the odd positions.
            splitted = topic[1].split("\"")
            words = [splitted[2 * i + 1] for i in range(len(splitted) // 2)]
            string_topics.append(" ".join(words))
        return string_topics
def mallet_lda(self, num):
    """Fit a ``num``-topic Mallet LDA model on self.data['token'] and
    return the topics as (id, '0.030*"word" + ...') strings, 6 words each.

    Assumes a Windows Mallet install at C:\\Mallet.
    """
    id2word = corpora.Dictionary(self.data['token'])
    texts = self.data['token']
    corpus = [id2word.doc2bow(text) for text in texts]
    # Fix: Mallet reads the MALLET_HOME environment variable (all caps).
    # The previous 'Mallet_HOME' key only worked because Windows treats
    # environment variable names case-insensitively.
    os.environ['MALLET_HOME'] = 'C:\\Mallet'
    mallet_path = 'C:\\Mallet\\bin\\mallet'
    ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=num,
                          id2word=id2word)
    return ldamallet.print_topics(num, num_words=6)
def lda(bow, df, vocab):
    """Train a 5-topic Mallet LDA model and print both the topic/word
    strings and each document's topic distribution."""
    # Generate the corpus, then reload it from disk.
    # NOTE(review): the np.load result replaces the text_to_corpus value --
    # presumably text_to_corpus persists 'corpus.npy'; confirm.
    corpus = text_to_corpus(bow)
    corpus = np.load('corpus.npy')
    mallet_binary = './mallet-2.0.8/bin/mallet'
    lda_model = LdaMallet(
        mallet_binary,
        corpus=corpus,
        num_topics=5,
        workers=4,
        id2word=vocab,
    )
    # Dump every topic's top-50 words...
    topic_strings = lda_model.print_topics(num_topics=-1, num_words=50)
    for topic in topic_strings:
        print(topic)
    # ...then the per-document topic distributions.
    for doc_topics in lda_model[corpus]:
        print(doc_topics)
dictionary.filter_extremes(no_above=0.5) # Convert to document term matrix (corpus) doc_term_mat_train = [dictionary.doc2bow(doc) for doc in docs_train] doc_term_mat_test = [dictionary.doc2bow(doc) for doc in docs_test] path_to_mallet_binary = r'C:\mallet\bin\mallet' if __name__ == "__main__": model = LdaMallet(path_to_mallet_binary, corpus=doc_term_mat_train, alpha=5, num_topics=10, id2word=dictionary, optimize_interval=50) topics = model.print_topics() for topic in topics: print(topic) # Compute Coherence Score for base model coherence_model_lda = CoherenceModel(model=model, corpus=doc_term_mat_train, texts=docs_train, dictionary=dictionary, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() gensim_model = ldamallet.malletmodel2ldamodel(model) # Visualize the topics vis_prepared = pyLDAvis.gensim.prepare(gensim_model, doc_term_mat_train, dictionary) pyLDAvis.save_html(vis_prepared, "mallet.html")
# Create the vocabulary for ii in files: doc_scanner.scan(tokenize_file(ii)) # Initialize the documents docs = doc_scanner.docs dictionary = Dictionary(docs) corpus = [dictionary.doc2bow(doc) for doc in docs] # start = time.time() # gensim_lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=args.num_topics, iterations=args.num_iterations) # time_took = time.time() - start # report(gensim_lda.print_topics(num_topics=10, num_words=50), filename="gensim", limit=50) # print(("Total time it took: %0.5f seconds" % (time_took))) mallet_file = "/home/jihwangk/Desktop/GitDir/Mallet/bin/mallet" # start = time.time() mallet_lda = LdaMallet(mallet_file, corpus=corpus, num_topics=args.num_topics, id2word=dictionary, iterations=args.num_iterations) # time_took = time.time() - start mallet_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel( mallet_lda, iterations=args.num_iterations) report(mallet_lda.print_topics(num_topics=10, num_words=50), filename="mallet", limit=50) # print(("Total time it took: %0.5f seconds" % (time_took)))
# %% topic model estimation
"""
I focus on two models:
- 8 topics, ~ local optimum
- 30 topic, ~ global optimum
"""
# Estimate the 8-topic model (fixed seed for reproducibility).
lda_8 = LdaMallet(
    mallet_path, corpus=corpus, id2word=dictionary, num_topics=8,
    random_seed=123
)
# Show the 20 highest-probability words of every topic.
lda_8.print_topics(num_topics=8, num_words=20)
# Translate the Mallet wrapper into a native gensim LdaModel.
lda_8 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_8)
# Top-10 term/probability strings per topic.
top_terms_line = lda_8.show_topics(num_topics=8, num_words=10)
# Flatten '0.030*"term" + ...' strings into [topic, rank, weight, term]
# rows for a DataFrame.
top_terms_m = []
for topic_id, terms_str in top_terms_line:
    for rank, chunk in enumerate(terms_str.split("+")):
        pieces = chunk.split("*")
        top_terms_m.append(
            [topic_id, rank, float(pieces[0]), pieces[1].strip('"| ')])
df = pd.DataFrame(top_terms_m)
def _read_documents(main_path, CIKs):
    # Tokenize every 14D9 filing line for the given CIKs, keeping only
    # words longer than 2 characters. Unreadable files are reported and
    # skipped (same best-effort behavior as before).
    documents = []
    for CIK in CIKs:
        cik_dir = main_path + '/' + CIK
        files = [f for f in listdir(cik_dir) if isfile(join(cik_dir, f))]
        for file in files:
            try:
                with open(cik_dir + '/' + file, "r",
                          encoding="latin-1") as f:
                    for row in f:
                        document = [word for word in row.split(" ")
                                    if len(word) > 2]
                        documents.append(document)
            except IOError as e:
                print("Couldn't open file (%s)." % e)
    return documents


def _extract_ngrams(documents, ngram_num):
    # Three successive Phrases passes merge bigrams, then trigrams, then
    # quadgrams; afterwards keep only tokens whose underscore count matches
    # the requested n-gram size (e.g. ngram_num=2 keeps exactly bigrams).
    for _ in range(3):
        phrases = Phrases(documents)
        documents = [phrases[line] for line in documents]
    return [
        [word for word in document if word.count('_') == (ngram_num - 1)]
        for document in documents
    ]


def _format_topics(lda_mallet):
    # Convert Mallet's '0.030*"term" + ...' topic strings into lists of
    # {'weight': <float per-mille>, 'term': <str>} dicts for API export.
    formatted_topics = []
    for _, topic_str in lda_mallet.print_topics():
        current_topic = []
        for percent_topic in topic_str.split(' + '):
            percent, term = percent_topic.split('*')
            current_topic.append({
                'weight': float(percent) * 1000,
                'term': term[1:-1]  # strip the surrounding double quotes
            })
        formatted_topics.append(current_topic)
    return formatted_topics


def build_lda_model(CIKs, num_topics, ngram_num):
    """Build a Mallet LDA model over the 14D9 filings of the given CIKs.

    Parameters:
        CIKs: iterable of CIK directory names under data/14d9.
        num_topics: number of LDA topics to fit.
        ngram_num: n-gram size to keep (1=unigrams, 2=bigrams, ...).

    Returns:
        (formatted_topics, dominant_topic_records,
         representative_topic_records) -- the latter two are
        df.to_dict('records') exports from create_topic_analytics.
    """
    main_path = dirname(realpath(__file__)) + "/data/14d9"
    documents = _read_documents(main_path, CIKs)
    documents = _extract_ngrams(documents, ngram_num)

    # Dictionary and bag-of-words corpus.
    dct = corpora.Dictionary(documents)
    corpus = [dct.doc2bow(line) for line in documents]

    # Point the wrapper at the bundled Mallet install.
    environ['MALLET_HOME'] = dirname(realpath(__file__)) + '/mallet-2.0.8/'
    mallet_path = dirname(realpath(__file__)) + "/mallet-2.0.8/bin/mallet"
    lda_mallet = LdaMallet(mallet_path,
                           corpus=corpus,
                           num_topics=num_topics,
                           id2word=dct,
                           iterations=ITERATIONS)

    # Show Topics
    print("LDA Model MALLET")
    for idx in range(num_topics):
        print("Topic #%s-" % idx, lda_mallet.print_topic(idx, 10))

    formatted_topics = _format_topics(lda_mallet)

    # Create df for analytics over topics
    df_dominant_topic, df_representative_topic = create_topic_analytics(
        lda_mallet, corpus, documents)
    return formatted_topics, df_dominant_topic.to_dict(
        'records'), df_representative_topic.to_dict('records')
'''
I focus on two models:
- 8 topics, ~ local optimum
- 30 topic, ~ global optimum
'''
# Estimate the 9-topic Mallet model (fixed seed for reproducibility).
lda_9 = LdaMallet(mallet_path,
                  corpus=corpus,
                  id2word=dictionary,
                  num_topics=9,
                  random_seed=123)
# Dump the 20 highest-probability words for each of the 9 topics.
lda_9.print_topics(num_topics=9, num_words=20)
# Convert the Mallet wrapper into a native gensim LdaModel.
lda_9 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_9)
# Top-10 term/probability strings per topic.
top_terms_line = lda_9.show_topics(num_topics=9, num_words=10)
# Flatten '0.030*"term" + ...' strings into [topic, rank, weight, term]
# rows for a DataFrame.
top_terms_m = []
for topic_id, terms_str in top_terms_line:
    for rank, chunk in enumerate(terms_str.split('+')):
        pieces = chunk.split('*')
        top_terms_m.append(
            [topic_id, rank, float(pieces[0]), pieces[1].strip('"| ')])
df = pd.DataFrame(top_terms_m)