import csv

from gensim import corpora
from gensim.models import LdaMulticore


class LDA_Doc(object):

    def __init__(self, list_of_list_of_words):
        print('Initializing the LDA_Doc instance')
        self.document_word_list = list_of_list_of_words

    def create_dictionary(self):
        print('Creating dictionary of words in list of list of words')
        # Dictionary could be prepared after removing stop words and lemmatizing the words
        self.dict = corpora.Dictionary(self.document_word_list)

    def create_corpus(self):
        print('Creating corpus with id for each document')
        self.corpus = [self.dict.doc2bow(line) for line in self.document_word_list]

    def create_lda_model(self):
        print('Initializing lda model')
        self.model = LdaMulticore(corpus=self.corpus,
                                  id2word=self.dict,
                                  random_state=100,
                                  num_topics=20,
                                  passes=10,
                                  chunksize=1000,
                                  batch=False,
                                  alpha='asymmetric',
                                  decay=0.5,
                                  offset=64,
                                  eta=None,
                                  eval_every=0,
                                  iterations=100,
                                  gamma_threshold=0.001,
                                  per_word_topics=True)

    def compute_optimal_number_of_topic(self):
        pass

    def compute_coherence_score(self):
        pass

    def compute_complexity_perplexity(self):
        pass

    def saving_topicsKeywords_to_csv(self, path, collectionName, docFolderName):
        # Saving LDA model to disk
        # self.model.save(path+'/'+docFolderName+'/'+collectionName+'/lda_model')
        print('Saving LdaModel model topics with top 10 keywords')
        topics_list = []
        for t in range(self.model.num_topics):
            # show_topic returns (word, probability) pairs; keep only the words
            topics_list.append([' ' + x[0] for x in self.model.show_topic(t)])
        with open(path + '/' + docFolderName + '/' + collectionName + '/lda_text.csv',
                  'w', newline='') as out:
            csv_out = csv.writer(out)
            for row in topics_list:
                csv_out.writerow(row)
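A minimal usage sketch for the `LDA_Doc` class above; the tokenized documents and the output path, collection, and folder names are illustrative, not part of the original code, and the target directory is assumed to already exist.

# Hypothetical example data: three tiny pre-tokenized "documents".
tokenized_docs = [
    ['topic', 'modeling', 'with', 'gensim'],
    ['latent', 'dirichlet', 'allocation', 'example'],
    ['gensim', 'lda', 'multicore', 'example'],
]

doc = LDA_Doc(tokenized_docs)
doc.create_dictionary()
doc.create_corpus()
doc.create_lda_model()
# Writes output/my_docs/my_collection/lda_text.csv (path/docFolderName/collectionName must exist).
doc.saving_topicsKeywords_to_csv('output', 'my_collection', 'my_docs')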
import sys
import codecs
from multiprocessing import cpu_count

import jieba
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel, LdaMulticore
from gensim.models.word2vec import LineSentence


def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    # Segment each input line with jieba and append the result to wechat_seg.txt
    # (`stopwords` is assumed to be defined at module level)
    with open(sys.argv[1], 'r') as wx:
        for f in wx:
            seg = jieba.cut(f)
            seg = [word for word in seg if word not in stopwords]
            with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
                wx_seg.write(' '.join(seg))

    documents = open('wechat_seg.txt', 'r')
    dictionary = corpora.Dictionary(LineSentence(documents))
    corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]

    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])

    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')

    # Count how often each topic appears across the corpus and dump the top words
    # of the most frequent topic
    topics = []
    for doc in corpus:
        topics.append(lda_model[doc])
    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1
    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)  # UnicodeWriter is a helper defined elsewhere in the project
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
def _build_lda(self, name, corpus, num_topics=30, words_to_save=200, multicore=True):
    from gensim.models import LdaMulticore, LdaModel

    gdict = self.gensim_dictionary

    if multicore:
        lda = LdaMulticore(corpus=corpus,
                           num_topics=num_topics,
                           workers=3,
                           id2word=gdict)
    else:
        lda = LdaModel(corpus=corpus,
                       num_topics=num_topics,
                       id2word=gdict)

    model = TopicModel(name=name, dictionary=self)
    model.save()

    topics = []
    for i in range(num_topics):
        topic = lda.show_topic(i, topn=words_to_save)
        alpha = lda.alpha[i]

        topicm = Topic(model=model, name="?", alpha=alpha, index=i)
        topicm.save()
        topics.append(topicm)

        words = []
        # Note: unpacking as (prob, word) assumes the older gensim ordering of show_topic();
        # recent gensim releases return (word, prob) pairs instead.
        for prob, word_text in topic:
            word_index = gdict.token2id[word_text]
            word_id = self.get_word_id(word_index)
            tw = TopicWord(topic=topicm,
                           word_id=word_id, word_index=word_index,
                           probability=prob)
            words.append(tw)

        TopicWord.objects.bulk_create(words)

        # Name the topic after its three most probable words
        most_likely_word_scores = topicm.word_scores \
            .order_by('-probability') \
            .prefetch_related('word')
        topicm.name = ', '.join([score.word.text for score in most_likely_word_scores[:3]])
        topicm.save()

        if settings.DEBUG:
            # prevent memory leaks
            from django.db import connection
            connection.queries = []

    model.save_to_file(lda)
    return (model, lda)
def _build_lda(self, name, corpus, num_topics=30, words_to_save=200):
    from gensim.models import LdaMulticore

    gdict = self.gensim_dictionary

    lda = LdaMulticore(corpus=corpus,
                       num_topics=num_topics,
                       workers=3,
                       id2word=gdict)

    model = TopicModel(name=name, dictionary=self)
    model.save()

    topics = []
    for i in range(num_topics):
        topic = lda.show_topic(i, topn=words_to_save)
        alpha = lda.alpha[i]

        topicm = Topic(model=model, name="?", alpha=alpha, index=i)
        topicm.save()
        topics.append(topicm)

        words = []
        # Unpacking as (prob, word) assumes the older gensim ordering of show_topic();
        # recent gensim releases return (word, prob) pairs instead.
        for prob, word_text in topic:
            word_index = gdict.token2id[word_text]
            word_id = self.get_word_id(word_index)
            tw = TopicWord(topic=topicm,
                           word_id=word_id, word_index=word_index,
                           probability=prob)
            words.append(tw)

        TopicWord.objects.bulk_create(words)

        if settings.DEBUG:
            # prevent memory leaks
            from django.db import connection
            connection.queries = []

    model.save_to_file(lda)
    return (model, lda)
print("created corpus") print('Number of unique tokens: %d' % len(dictionary)) print('Number of documents: %d' % len(comments_corpus)) num_topics = 150 if args.load: model = LdaMulticore.load("topic_models/model_comments") else: model = LdaMulticore(comments_corpus, id2word=dictionary, num_topics=num_topics) print("model done") model.save("topic_models/model_comments") print(model.print_topics(20)) top_topics = model.top_topics(comments_corpus) #, num_words=20) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics. avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics print('Average topic coherence: %.4f.' % avg_topic_coherence) #from pprint import pprint #pprint(top_topics) for _ in range(10): idx = np.random.randint(0, len(comments_text)) print("comment: {} - topics: {}".format(comments_text[idx], [(model.show_topic(tid, topn=10), v) for tid, v in model[comments_corpus[idx]] if v > 0.15]))
import nltk
import spacy
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim
from pprint import pprint
from gensim import corpora
from gensim.models import LdaMulticore
from wordcloud import WordCloud

# sent_to_words, remove_stopwords, lemmatization, Convert and lda_data_dir
# are helpers/globals defined elsewhere in the original module.


def train_LDA_model(data, num_topics, CPUs):
    # Pre-processing
    sentences = [nltk.tokenize.sent_tokenize(doc) for doc in data]
    sentences = [val for sublist in sentences for val in sublist]
    data_words = list(sent_to_words(sentences))

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_nostops,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Train LDA Model
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             random_state=50,
                             chunksize=100,
                             passes=10,
                             per_word_topics=True,
                             workers=CPUs)

    model_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/lda_model_all_years.model'
    lda_model.save(model_dest)

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Visualize the topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    storage_dest_lda_html = lda_data_dir + 'LDA_model/all_years_2007_2017/all_years_2007_2017_local_lda.html'
    pyLDAvis.save_html(vis, storage_dest_lda_html)

    # Word cloud for each topic
    wordcloud_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/wordclouds/'
    for t in range(lda_model.num_topics):
        plt.figure()
        dictionary = {}
        plt.imshow(WordCloud().fit_words(Convert(lda_model.show_topic(t, 30), dictionary)))
        plt.axis("off")
        plt.title("Topic_" + str(t))
        # Save before showing; calling savefig() after show() can write an empty figure
        plt.savefig(wordcloud_dest + "Topic #" + str(t) + '.png')  # set location on server
        plt.show()

    return lda_model
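A hedged call example for `train_LDA_model` above; `raw_docs` is made-up illustrative data, and actually running it assumes the module-level helpers (`sent_to_words`, `remove_stopwords`, `lemmatization`, `Convert`) and the `lda_data_dir` output directories exist.

# Illustrative only: a handful of raw documents and a small topic count.
raw_docs = [
    "The company reported strong revenue growth in the second quarter.",
    "New regulations may increase compliance costs for the industry.",
    "Management expects capital expenditures to decline next year.",
]
lda = train_LDA_model(raw_docs, num_topics=5, CPUs=2)
print(lda.print_topics())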
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0] for topics in tm_results]


# In[ ]:


# Get top significant terms and their probabilities for each topic using ldamallet
topics = [[(term, round(wt, 3)) for term, wt in ldamallet.show_topic(n, topn=20)]
          for n in range(0, ldamallet.num_topics)]


# In[ ]:


# Get top significant terms and their probabilities for each topic using LDA multicore
topics_ldamulticore = [[(term, round(wt, 3)) for term, wt in ldamulticore.show_topic(n, topn=20)]
                       for n in range(0, ldamulticore.num_topics)]


# In[ ]:


import pickle
from gensim.models import CoherenceModel

ldamodel = pickle.load(
    open("\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamodel_100_QAT.pkl", "rb"))
ldamulticore = pickle.load(
    open("\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamulticore_100_QAT.pkl", "rb"))
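`CoherenceModel` is imported in the last cell above but not used in the visible fragment; a minimal sketch of how it could score the loaded multicore model, assuming `texts` (the tokenized documents) and `dictionary` were built earlier in the notebook:

# Assumes `texts` (list of token lists) and `dictionary` (gensim Dictionary) exist from earlier cells.
coherence_model = CoherenceModel(model=ldamulticore,
                                 texts=texts,
                                 dictionary=dictionary,
                                 coherence='c_v')
print('Coherence (c_v): %.4f' % coherence_model.get_coherence())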
def LDA_model_train_out_of_time(df, features, num_topics=30, subset=True, CPUs=6):
    dest_all_model = dict()
    X = df[features]

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df, validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)

    for k_index, (train_index, test_index) in enumerate(tbcv_folds):
        train_years_start = min(X.loc[train_index]['sec_filing_date']).year
        train_years_end = max(X.loc[train_index]['sec_filing_date']).year
        val_year = min(X.loc[test_index]['sec_filing_date']).year

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        data_val = X.loc[test_index].drop('sec_filing_date', axis=1)

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" % (k_index + 1, k_folds))
        print("=========================================")

        start = time.time()
        data_train = data_train.values.tolist()
        data_train = [val for sublist in data_train for val in sublist]
        id2word_train, texts_train, corpus_train = prepare_LDA_text(data_train, subset=subset)
        end = time.time()
        print("Preparing training text took: " + str(end - start))

        start = time.time()
        data_val = data_val.values.tolist()
        data_val = [val for sublist in data_val for val in sublist]
        id2word_val, texts_val, corpus_val = prepare_LDA_text(data_val, subset=subset)
        end = time.time()
        print("Preparing validation text took: " + str(end - start))

        # Train LDA on Training data
        start = time.time()
        lda_model_train = LdaMulticore(corpus=corpus_train,
                                       id2word=id2word_train,
                                       num_topics=num_topics,
                                       random_state=50,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True,
                                       workers=CPUs)
        doc_lda_train = lda_model_train[corpus_train]

        folder_train = str(train_years_start) + '_' + str(train_years_end)
        if not os.path.exists(lda_data_dir + 'LDA_model/' + folder_train + '/'):
            os.makedirs(lda_data_dir + 'LDA_model/' + folder_train + '/')
        dest_train = lda_data_dir + 'LDA_model/' + folder_train + '/' + 'lda_' + folder_train + '.model'
        lda_model_train.save(main_dir + dest_train)
        end = time.time()
        print("Train LDA on training data took: " + str(end - start))

        with open(lda_data_dir + 'LDA_model/' + folder_train + '/' + 'id2word.pkl', "wb") as fp:
            pickle.dump(id2word_train, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_train + '/' + "texts.txt", "wb") as fp:
            pickle.dump(texts_train, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_train + '/' + "corpus.txt", "wb") as fp:
            pickle.dump(corpus_train, fp)

        # Train LDA on Validation data
        start = time.time()
        lda_model_val = LdaMulticore(corpus=corpus_val,
                                     id2word=id2word_val,
                                     num_topics=num_topics,
                                     random_state=50,
                                     chunksize=100,
                                     passes=10,
                                     per_word_topics=True,
                                     workers=CPUs)
        doc_lda_val = lda_model_val[corpus_val]

        folder_val = str(val_year)
        if not os.path.exists(lda_data_dir + 'LDA_model/' + folder_val + '/'):
            os.makedirs(lda_data_dir + 'LDA_model/' + folder_val + '/')
        dest_val = lda_data_dir + 'LDA_model/' + folder_val + '/' + 'lda_' + folder_val + '.model'
        lda_model_val.save(main_dir + dest_val)
        end = time.time()
        print("Train LDA on validation data took: " + str(end - start))

        with open(lda_data_dir + 'LDA_model/' + folder_val + '/' + 'id2word.pkl', "wb") as fp:
            pickle.dump(id2word_val, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_val + '/' + "texts.txt", "wb") as fp:
            pickle.dump(texts_val, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_val + '/' + "corpus.txt", "wb") as fp:
            pickle.dump(corpus_val, fp)

        dest_all_model[str(k_index + 1)] = (dest_train, dest_val)

        # Create Visualization
        start = time.time()
        pyLDAvis.enable_notebook()
        vis_train = pyLDAvis.gensim.prepare(lda_model_train, corpus_train, id2word_train,
                                            sort_topics=False)
        dest_train_vs = lda_data_dir + 'LDA_model/' + folder_train + '/' + 'vis_' + folder_train + '.html'
        pyLDAvis.save_html(vis_train, dest_train_vs)
        end = time.time()
        print("Train LDA visualization took: " + str(end - start))

        start = time.time()
        pyLDAvis.enable_notebook()
        vis_val = pyLDAvis.gensim.prepare(lda_model_val, corpus_val, id2word_val,
                                          sort_topics=False)
        dest_train_val = lda_data_dir + 'LDA_model/' + folder_val + '/' + 'vis_' + folder_val + '.html'
        pyLDAvis.save_html(vis_val, dest_train_val)
        end = time.time()
        print("Validation LDA visualization took: " + str(end - start))

        # Create Word Clouds
        # Train
        for t in range(lda_model_train.num_topics):
            plt.figure()
            dictionary = {}
            plt.imshow(WordCloud().fit_words(Convert(lda_model_train.show_topic(t, 30), dictionary)))
            plt.axis("off")
            plt.title("Topic_" + str(t + 1))
            plt.savefig("wordclouds/Topic #" + str(t + 1) + '.png')  # set location on server
            plt.close()

        dest_train_zip = lda_data_dir + 'LDA_model/' + folder_train + '/' + 'wordclouds_' + folder_train + '.zip'
        zipf = zipfile.ZipFile(dest_train_zip, 'w', zipfile.ZIP_DEFLATED)
        zipdir('wordclouds/', zipf)
        zipf.close()

        # Val
        for t in range(lda_model_val.num_topics):
            plt.figure()
            dictionary = {}
            plt.imshow(WordCloud().fit_words(Convert(lda_model_val.show_topic(t, 30), dictionary)))
            plt.axis("off")
            plt.title("Topic_" + str(t + 1))
            plt.savefig("wordclouds/Topic #" + str(t + 1) + '.png')  # set location on server
            plt.close()

        dest_val_zip = lda_data_dir + 'LDA_model/' + folder_val + '/' + 'wordclouds_' + folder_val + '.zip'
        zipf = zipfile.ZipFile(dest_val_zip, 'w', zipfile.ZIP_DEFLATED)
        zipdir('wordclouds/', zipf)
        zipf.close()

        # Matching topics
        start = time.time()
        # Using the Cosine distance requires a customization of the 'LdaMulticore.diff' method in
        # the gensim package. To avoid errors, the code uses the Jaccard distance, but this can be
        # changed to Cosine if needed.
        array_distance = create_dist_matrix(dest_train, dest_val,
                                            distance='jaccard', num_words=300, normed=True)
        title = ('Matching Topics (' + str(train_years_start) + '-' + str(train_years_end) +
                 ' vs. ' + str(val_year) + ')' + '\n')
        location = lda_data_dir + 'LDA_model/matching_' + str(val_year) + '.png'
        create_h_clustering(array_distance, n_topics=30, title=title, location=location)
        end = time.time()
        print("Matching topics took: " + str(end - start))

    return dest_all_model