import re

from gensim import matutils
from gensim.models import LdaMulticore


def topics_by_lda(self, tokenized_corpus_path, num_topics=20, num_words=10,
                  max_lines=10000, split=r"\s+", max_df=100):
    """
    Read a pre-tokenized corpus file and train an LDA model on it.

    Arguments:
        tokenized_corpus_path -> string -- path to the tokenized corpus file
        num_topics -> integer -- number of topics
        num_words -> integer -- number of words shown per topic
        max_lines -> integer -- maximum number of lines to read in
        split -> string -- separator between the words of a document
        max_df -> integer -- to avoid overly common words, filter out words
                             appearing in more than this many documents
    """
    # Holds the whole corpus
    corpus = []
    with open(tokenized_corpus_path, 'r', encoding='utf-8') as tokenized_corpus:
        flag = 0
        for document in tokenized_corpus:
            # Stop once enough lines have been read
            if flag >= max_lines:
                break
            # Add the tokenized document to the corpus
            corpus.append(re.split(split, document.strip()))
            flag += 1

    # Build a bag-of-words (document-term matrix) representation of the corpus
    (vocab, DTM) = self.corpus2dtm(corpus, max_df=max_df)

    # Train the LDA model
    lda = LdaMulticore(matutils.Sparse2Corpus(DTM, documents_columns=False),
                       num_topics=num_topics,
                       id2word=dict(enumerate(vocab)),
                       workers=4)

    # Print and return the topic data
    topics = lda.show_topics(num_topics=num_topics, num_words=num_words,
                             formatted=False, log=False)
    for ti, topic in enumerate(topics):
        print("Topic", ti, ":", " ".join(word[0] for word in topic[1]))
    return topics
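The method above relies on a corpus2dtm helper that is not shown. A minimal sketch of what it might look like, assuming it builds the document-term matrix with scikit-learn's CountVectorizer (only the name, arguments, and return shape are taken from the call above; the body is an assumption):

from sklearn.feature_extraction.text import CountVectorizer

def corpus2dtm(self, corpus, max_df=100):
    # corpus is a list of token lists, so an identity analyzer avoids
    # re-tokenizing; an integer max_df drops words that appear in more
    # than max_df documents
    vectorizer = CountVectorizer(analyzer=lambda doc: doc, max_df=max_df)
    dtm = vectorizer.fit_transform(corpus)  # documents x vocabulary, sparse
    vocab = vectorizer.get_feature_names_out()
    return vocab, dtm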
import numpy as np
from gensim.matutils import Scipy2Corpus
from gensim.models import LdaMulticore


class LDA(Pipe):  # Pipe is the project's pipeline base class
    """
    LDA (Latent Dirichlet Allocation) model for unsupervised topic modeling.

    Takes vectors and returns topic vectors, which can be used for clustering.
    """
    input = Pipe.type.vecs
    output = Pipe.type.vecs

    def __init__(self, n_topics=5):
        self.n_topics = n_topics
        self.trained = False

    def __call__(self, vecs):
        """
        Return topic vectors.
        """
        if not self.trained:
            self.train(vecs)
            self.trained = True

        distribs = []
        for distrib in self.m[Scipy2Corpus(vecs)]:
            distribs.append([t[1] for t in distrib])
        return np.array(distribs)

    def train(self, vecs):
        """
        Build the topic model.
        """
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics,
                              iterations=1000, workers=3)

    def print_topics(self, vectorizer):
        # The model is trained without an id2word mapping, so gensim reports
        # term ids as strings; map them back through the vectorizer vocabulary
        vocab = vectorizer.vocabulary
        for topic_id, terms in self.m.show_topics(num_topics=self.n_topics,
                                                  num_words=10,
                                                  formatted=False):
            print([vocab[int(ix)] for ix, prob in terms])
import time

import numpy as np
from gensim.models import LdaMulticore


def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters,
                  workers, chunksize, logfilename, save=True):
    """
    Args:
        num_topics_list: list of topic counts; a model is fitted for each
        save: indicates whether each model should be saved

    Returns:
        topics_dict: a dictionary of topic lists, where the key is
                     the number of topics
    """
    topics_dict = {}
    logfile = open(logfilename, 'w')
    for num_topics in num_topics_list:
        print('training', num_topics)
        np.random.seed(NUM)  # NUM is a module-level seed constant
        start_time = time.time()
        model = LdaMulticore(corpus=train_corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             iterations=iters,
                             eval_every=None,
                             workers=workers,
                             chunksize=chunksize)
        end_time = time.time()

        if save:
            fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
            model.save(fname)

        # log_perplexity returns a per-word likelihood bound;
        # convert it to perplexity
        per_word_bound = model.log_perplexity(test_corpus)
        perplexity = np.exp2(-1.0 * per_word_bound)

        logfile.write('\nnum_topics: ' + str(num_topics) + '\n')
        logfile.write('perplexity: ' + str(perplexity) + '\n')
        logfile.write('train_time: ' + str(end_time - start_time) +
                      '\nTopics: \n')
        # show_topics returns (topic number, topic string) pairs
        topics = model.show_topics(num_topics=num_topics, num_words=20)
        topics_dict[str(num_topics)] = topics
        for topic_id, topic_str in topics:
            logfile.write('\n\t' + topic_str + '\n')

    logfile.close()
    return topics_dict
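A hypothetical driver for fit_numtopics; the corpora, dictionary, and parameter values below are placeholders, not from the original:

# Sketch only: train_corpus/test_corpus are bag-of-words corpora and
# dictionary is a gensim Dictionary built elsewhere
topics_dict = fit_numtopics(train_corpus, test_corpus, id2word=dictionary,
                            num_topics_list=[10, 25, 50], iters=500,
                            workers=3, chunksize=2000,
                            logfilename='lda_fit.log', save=False)
for k, topics in topics_dict.items():
    print('fitted model with', k, 'topics')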
# Notebook-style fragment: `ls` is assumed to be a per-document topic-weight
# DataFrame, `model` a fitted LDA model, and `texts` the tokenized corpus.
import gensim
import pandas as pd
import matplotlib.pyplot as plt

for i in ls.columns:
    print(f'{ls[i].mean(): .4f}')

# Average topic distributions as sparse (topic_id, weight) vectors
jv_avg = [(0, 0.1793), (1, 0.0007), (2, 0.0792), (3, 0.0382), (4, 0.1750),
          (5, 0.0628), (6, 0.0770), (7, 0.0147), (8, 0.3506), (9, 0.0226)]
rk_avg = [(0, 0.0544), (1, 0.0014), (2, 0.0610), (3, 0.0123), (4, 0.2093),
          (5, 0.0467), (6, 0.1689), (7, 0.0021), (8, 0.4197), (9, 0.0242)]
ls_avg = [(0, 0.1349), (1, 0.0009), (2, 0.1084), (3, 0.0072), (4, 0.1119),
          (5, 0.0581), (6, 0.1487), (7, 0.0282), (8, 0.3402), (9, 0.0616)]

# Cosine similarity between two average topic distributions
gensim.matutils.cossim(ls_avg, rk_avg)

from collections import Counter

topics = model.show_topics(formatted=False)
data_flat = [w for w_list in texts for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i, weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

import matplotlib.colors as mcolors

# Plot word count and weights of topic keywords
fig, axes = plt.subplots(5, 2, figsize=(16, 10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
# Loop body truncated in the source; a minimal completion that draws a bar
# chart of keyword counts and weights for each topic:
for i, ax in enumerate(axes.flatten()):
    topic_df = df.loc[df.topic_id == i, :]
    ax.bar(topic_df['word'], topic_df['word_count'],
           color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(topic_df['word'], topic_df['importance'],
                color=cols[i], width=0.2, label='Weights')
    ax.set_title('Topic ' + str(i), color=cols[i])
    ax.tick_params(axis='x', rotation=30)
fig.tight_layout()
plt.show()
# Script excerpt: `data`, `topn_concepts`, `n_topics`, `passes`, `iterations`,
# `eval_every`, `workers`, `random_state`, `starttime`, `duration_list` and
# load_model() come from earlier context.
data_dir = './%s_data' % data
dictionary = Dictionary.load(os.path.join(data_dir, 'ne_nedf_weighting.dict'))
bow_news = load_model(os.path.join(data_dir, 'ne8_nedf_%s_weighting.bow') % topn_concepts)
dict_id2token = dict(dictionary.items())
lda = LdaMulticore(bow_news, id2word=dict_id2token, num_topics=n_topics,
                   passes=passes, iterations=iterations,
                   eval_every=eval_every, workers=workers,
                   random_state=random_state)
name = 'ne8_nedf_%s_topic%s_passes%s_iteration%s_random%s' % (
    topn_concepts, n_topics, passes, iterations, random_state)
result_dir = os.path.join(data_dir, name)
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
lda.save(os.path.join(result_dir, 'lda_model'))

topics = lda.show_topics(num_topics=n_topics, num_words=20,
                         log=False, formatted=False)
with open(os.path.join(result_dir, 'topics.txt'), 'w', encoding='utf-8') as f:
    for topic in topics:
        f.write('topic ' + str(topic[0]) + ':\n')
        for t in topic[1]:
            f.write(t[0] + ': ' + str(t[1]) + '\n')
        f.write('\n')

endtime = datetime.datetime.now()
duration = (endtime - starttime).seconds
duration_list.append(duration)
print('Total running for', duration, 'seconds.')
print(sum(duration_list) / len(duration_list))
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
wiki.save(outp + '_corpus.pkl.bz2')
# Load back the id->word mapping directly from file; this seems to save
# more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# Build tf-idf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    # tfidf.save(outp + '.tfidf_model')

    # Save tf-idf vectors in matrix market format
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)

logger.info("finished pre-processing, starting LDA %s", program)

lda = LdaMulticore(mm, id2word=dictionary, workers=10, num_topics=ntopics)
lda.save(model_name)
topics = lda.show_topics(num_topics=ntopics, num_words=30)
print(topics)
logger.info("finished LDA %s", program)

toptopics = lda.top_topics(corpus=wiki, dictionary=lda.id2word,
                           coherence='u_mass')
logger.info("top topics: %s", 'u_mass')
print(toptopics)
neg = carReviews.loc[carReviews.Vader_Rating <= 2.5, ['EntireReview']].sample(3).values
for n in neg:
    print('------>', n[0])

# LDA topic modelling
# Approach 1
reviews = carReviews["ReviewTokens"]
dictionary = corpora.Dictionary(reviews)
# Term-document frequency
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews]
# Perform LDA
ldamodel = LdaMulticore(corpus=doc_term_matrix, num_topics=8,
                        id2word=dictionary, chunksize=2000, passes=20,
                        per_word_topics=True)
# Get highlighted topics
topics = ldamodel.show_topics()
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary,
                                      sort_topics=False)
# Show HTML view
pyLDAvis.save_html(lda_display, open("lda_8_topics.html", "w"))
pprint(ldamodel.show_topics(formatted=False))

# Calculate coherence score
def compute_coherence_score(lda_model, reviews):
    coherence = CoherenceModel(lda_model, texts=reviews,
                               dictionary=dictionary, coherence="c_v")
    return coherence.get_coherence(), coherence.get_coherence_per_topic()

coh_score, coh_by_topic = compute_coherence_score(ldamodel, reviews)
print(coh_by_topic, coh_score)
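compute_coherence_score evaluates a single model, so a natural next step is to sweep over candidate topic counts and keep the most coherent model; a sketch reusing the snippet's own variable names (the sweep itself and the candidate values are assumptions, not from the original):

# Sketch: pick the number of topics that maximizes c_v coherence
best_k, best_score = None, float('-inf')
for k in [4, 6, 8, 10, 12]:
    candidate = LdaMulticore(corpus=doc_term_matrix, num_topics=k,
                             id2word=dictionary, chunksize=2000, passes=20)
    score, _ = compute_coherence_score(candidate, reviews)
    if score > best_score:
        best_k, best_score = k, score
print('best num_topics:', best_k, 'coherence:', best_score)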
import numpy as np
from gensim.matutils import Scipy2Corpus
from gensim.models import LdaMulticore


class Model():
    """
    LDA (Latent Dirichlet Allocation) model for unsupervised topic modeling.

    TO DO:
        - this model has to be rebuilt for each comment section as new
          comments come in - what's the best way to manage that?

    Notes:
        - tried LDA on individual sentences, doesn't work as well.
    """
    def __init__(self, n_topics=5, verbose=False):
        self.verbose = verbose
        self.n_topics = n_topics
        self.vectr = Vectorizer()  # project-local vectorizer

    def train(self, comments):
        """
        Build the topic model from a list of documents (strings).
        Assumes documents have been pre-processed (e.g. stripped of HTML, etc).
        """
        docs = [c.body for c in comments]
        vecs = self.vectr.vectorize(docs, train=True)
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics,
                              iterations=1000, workers=3)
        if self.verbose:
            self.print_topics()

    def featurize(self, docs):
        """
        Return topic vectors for documents.
        """
        vecs = self.vectr.vectorize(docs)
        dists = []
        for dist in self.m[Scipy2Corpus(vecs)]:
            dists.append([t[1] for t in dist])
        return np.array(dists)

    def cluster(self, comments):
        """
        Build clusters out of most likely topics.
        """
        # If no model exists, train it.
        if not hasattr(self, 'm'):
            self.train(comments)

        clusters = [[] for _ in range(self.n_topics)]
        dists = self.featurize([c.body for c in comments])
        for i, comment in enumerate(comments):
            topic = dists[i].argmax()
            clusters[topic].append(comment)
        return clusters

    def identify(self, docs):
        """
        Labels a list of documents with their topic
        and probability for that topic.
        """
        dists = self.featurize(docs)
        for i, doc in enumerate(docs):
            topic = dists[i].argmax()
            proba = dists[i][topic]
            yield doc, topic, proba

    def print_topics(self):
        # The model is trained without an id2word mapping, so gensim reports
        # term ids as strings; map them back through the vectorizer vocabulary
        vocab = self.vectr.vocabulary
        for topic_id, terms in self.m.show_topics(num_topics=self.n_topics,
                                                  num_words=10,
                                                  formatted=False):
            print([vocab[int(ix)] for ix, prob in terms])
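A hypothetical usage of the Model class above. The class only assumes comments expose a .body string, so a namedtuple stands in for the real comment objects (raw_texts is a placeholder for a list of document strings):

from collections import namedtuple

Comment = namedtuple('Comment', ['body'])  # stand-in comment container
comments = [Comment(body=text) for text in raw_texts]

model = Model(n_topics=5, verbose=True)
clusters = model.cluster(comments)  # trains the model on first use
for topic_id, members in enumerate(clusters):
    print(topic_id, len(members), 'comments')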
dictionary.id2token[uid] = token
print(type(dictionary), type(corpus))

# Path where the dtm binary is installed
dtm_path = "/home/ankit081190/NLP/dtm/dtm/dtm"
# model = DtmModel(dtm_path, corpus, time_seq, num_topics=1,
#                  id2word=corpus.dictionary, initialize_lda=True)
model = LdaMulticore(corpus, num_topics=10, id2word=dictionary)
model.save("DTModelMultiCore_" + files + ".model")

# Requests up to 25 topics; the model only has 10, so at most 10 are returned
tp = model.show_topics(num_topics=25, log=False, formatted=True)
print(model.print_topics(num_topics=25))

data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.save_html(data, 'index_lda_' + files + '.html')

# show_topics already returns (topic number, topic string) pairs,
# so they can be written out directly
with codecs.open("topicsMultiLDA" + files + ".txt", "w", "utf-8") as f:
    for i, j in tp:
        print(i, j)
        f.write("\nFor Topic Number " + str(i) + ":\n" + str(j) + "\n")
# Tail of a run-id construction call, truncated in the source:
                     beta=beta, iter=num_iterations)
print(run_id)
output_file = output_file_template.format(run_id=run_id)

# Train and save
print('Training...')
model = LdaMulticore(corpus, alpha=alpha, eta=beta, passes=50,
                     id2word=dictionary, num_topics=num_topics,
                     iterations=num_iterations)
# model.save(output_file)
print('Done training')

# Print top 10 words in topics, if desired
if print_topics:
    topics = model.show_topics(num_topics=4, formatted=False)
    for topic in topics:
        for tup in topic[1]:
            print(tup[0] + ": " + str(tup[1]))
        print('\n')

# Evaluate perplexity on the held-out test corpus
ll = model.log_perplexity(test_corpus)
print("LL: " + str(ll))
print("Perp: " + str(np.exp2(-ll)))
# The corpus and dictionary come from earlier context; safe_mkdirs() is a
# project-local helper.
tfidf = TfidfModel(corpus, id2word=dictionary, dictionary=dictionary)
filtered_corpus = []
for doc in corpus:
    res = tfidf[doc]
    res.sort(key=lambda tup: tup[1], reverse=True)
    # Keep the top quarter of words by tf-idf weight as meaningful
    meaningful = [t[0] for t in res[0:len(res) // 4]]
    filtered_corpus.append([t for t in doc if t[0] in meaningful])
corpus = filtered_corpus
print("TF-IDF finished!")

# Train a new model
print("Training model...")
lda = LdaMulticore(corpus, num_topics=100, id2word=dictionary,
                   passes=1000, iterations=100000)
print("Model trained!")

# Save the trained model
print("Saving model...")
safe_mkdirs('model')
lda.save('model/lda_model')
print("Model saved!")

# print('\nDocuments and their topics:')
# for doc in corpus:
#     print(lda[doc])

topics = lda.show_topics(num_topics=-1, formatted=False)
print('Topics and their related words:')
for topic in topics:
    print(topic)
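The saved model can be reloaded later with gensim's standard load mechanism; a brief sketch (the query tokens below are hypothetical):

from gensim.models import LdaMulticore

# Reload the persisted model and query a single bag-of-words document
lda = LdaMulticore.load('model/lda_model')
doc_bow = dictionary.doc2bow(['some', 'tokenized', 'document'])
print(lda[doc_bow])  # (topic_id, probability) pairs for this document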