def runlda(filetopicwords, fileinput, NUMTOPICS=30, NUMPASSES=10, NUMITERATIONS=10):
    print('runlda...')
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    import numpy as np

    docs, word2freqtopics = [], {}
    fr = open(fileinput, 'r')
    for line in fr:
        words = line.strip('\r\n').split(' ')
        docs.append(words)
        for word in words:
            if word not in word2freqtopics:
                word2freqtopics[word] = [0, [0. for i in range(NUMTOPICS)]]
            word2freqtopics[word][0] += 1
    fr.close()
    V = len(word2freqtopics)

    dct = Dictionary(docs)
    model = LdaModel(corpus=[dct.doc2bow(doc) for doc in docs], id2word=dct,
                     num_topics=NUMTOPICS, passes=NUMPASSES,
                     iterations=NUMITERATIONS)

    fw = open(filetopicwords, 'w')
    for topicid in range(NUMTOPICS):
        s = 'topic ' + str(topicid)
        wordscores = []
        for (wordid, score) in model.get_topic_terms(topicid, topn=V):
            if score < 1e-6:
                break
            wordscores.append([dct[wordid], score])
        scoresum = sum([x[1] for x in wordscores])
        for [word, score] in wordscores:
            s += ',' + word + ':' + str(np.round(score / scoresum, 6))
            word2freqtopics[word][1][topicid] = score
        fw.write(s + '\n')
    fw.close()
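# A minimal usage sketch of runlda(); 'docs.txt' (one whitespace-tokenized
# document per line) and 'topic_words.txt' are hypothetical file names.
runlda('topic_words.txt', 'docs.txt', NUMTOPICS=10, NUMPASSES=5)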
def plottopicpop():
    internet = [0 for i in range(10)]
    developing = [0 for i in range(10)]
    habr = [0 for i in range(10)]
    n = 0
    for year in range(2006, 2016):
        articles, numberofarticles = getarticlesbyyear(year)
        print("Got articles for", str(year))
        # Normalize texts
        i = 0
        for article in articles:
            article = replacesymbols(article)
            articles[i] = normalaisestr(article.lower())
            i += 1
        print('Normalized')
        # Remove unnecessary words
        texts = [[word for word in article if word not in stoplist]
                 for article in articles]
        print('Deleted stopwords')
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('Starting training')
        # Train in chunks of 100 documents to spare RAM
        for i in range(numberofarticles // 100):
            begin = 100 * i
            end = 100 * (i + 1)
            if end > numberofarticles:
                end = numberofarticles
            lda = LdaModel(corpus[begin:end], id2word=dictionary,
                           num_topics=end - begin)
            for j in range(lda.num_topics):
                # Check whether the tracked words appear among the topic's top terms
                for word_id, _ in lda.get_topic_terms(j, 15):
                    top = dictionary.get(word_id)
                    if "интернет" == top:    # "internet"
                        internet[n] += 1
                    if "разработка" == top:  # "development"
                        developing[n] += 1
                    if "хабра" == top:       # "habr"
                        habr[n] += 1
            del lda
        n += 1
    print(internet, '\n', developing, '\n', habr)
    plt.title('Population of 3 topics.')
    plt.xlabel('Year 2006 - 2015')
    plt.ylabel('Number of articles')
    plt.plot(internet, label="Internet")
    plt.plot(developing, label="Development")
    plt.plot(habr, label="Habr")
    plt.legend()
    plt.show()
def ldaforhabr():
    articles, numberofarticles = getarticles()
    print("Got articles")
    # Normalize texts
    i = 0
    for article in articles:
        article = replacesymbols(article)
        articles[i] = normalaisestr(article.lower())
        i += 1
    print('Normalized')
    # Remove unnecessary words
    texts = [[word for word in article if word not in stoplist]
             for article in articles]
    print('Deleted stopwords')
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Starting training')
    f = open('lda.log', 'w')
    # Train in chunks of 100 documents to spare RAM
    for i in range(numberofarticles // 100):
        begin = 100 * i
        end = 100 * (i + 1)
        if end > numberofarticles:
            end = numberofarticles
        lda = LdaModel(corpus[begin:end], id2word=dictionary,
                       num_topics=end - begin)
        for j in range(lda.num_topics):
            f.write(str(begin + j) + ": ")
            for word_id, _ in lda.get_topic_terms(j, 15):
                top = dictionary.get(word_id)
                if top is not None:
                    f.write(top + '\n')
            f.write('-----------\n')
        del lda
    f.close()
def lda_score(text, sub_sen_vec):
    plain_text = list(
        map(lambda x: cut(x).split(),
            filter(lambda x: x != '', split_sentences(text)[::2])))
    common_dict = Dictionary(plain_text)
    common_corpus = [common_dict.doc2bow(t) for t in plain_text]
    no_topics = int(len(plain_text) / 2)
    no_words = int(len(text) / no_topics / 2)
    lda = LdaModel(common_corpus, num_topics=no_topics)
    topic_list = []
    for i in range(no_topics):
        topic_list.append(
            [common_dict[t[0]] for t in lda.get_topic_terms(i, no_words)])
    score_list = []
    for topic in topic_list:
        score_list.append(get_corr_sif(topic, sub_sen_vec))
    return max(score_list)
# Imports assumed by this snippet:
from itertools import chain
from math import log

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel


def saliency_index(lda: LdaModel, corpus, words: Dictionary):
    full_corpus = list(chain(*corpus))
    N = len(words)
    total = sum(words.cfs[i] for i in range(N))
    frequencies = [words.cfs[i] / total for i in range(N)]

    relative_likelihood = [0. for _ in range(N)]
    for topic_id, topic_prob in lda.get_document_topics(
            full_corpus, minimum_probability=0.):
        for term, cond_prob in lda.get_topic_terms(topic_id, topn=None):
            relative_likelihood[term] += cond_prob * log(
                cond_prob / topic_prob)

    saliencies = [f * l for f, l in zip(frequencies, relative_likelihood)]
    return {words[i]: s for i, s in enumerate(saliencies)}
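# A small self-contained sketch of calling saliency_index() on gensim's
# bundled toy corpus; the names below are local to this example.
from gensim.test.utils import common_texts

words = Dictionary(common_texts)
corpus = [words.doc2bow(text) for text in common_texts]
lda = LdaModel(corpus, id2word=words, num_topics=2, random_state=1)
saliencies = saliency_index(lda, corpus, words)
# Highest-saliency terms first
print(sorted(saliencies.items(), key=lambda kv: kv[1], reverse=True)[:5])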
# The snippet begins mid-call; corpus and dic are assumed from the
# surrounding script, and frequencies() is an external Counter-like helper.
lda = LdaModel(corpus, id2word=dic,
               num_topics=topic_num, alpha=alpha, random_state=1)

doc_topics = [lda[c] for c in corpus]
avg_doc_topics = mean([len(t) for t in doc_topics])
print(f"topics num of doc = {avg_doc_topics}")

topic_freq = frequencies([t[0] for dt in doc_topics for t in dt])
print('----------')
for i in range(topic_num):
    items = [(dic[t[0]], t[1]) for t in lda.get_topic_terms(i, topn=5)]
    freq = topic_freq[i] if i in topic_freq else 0
    print(f"topic_id = {i}, freq = {freq}, items = {items}")
print('----------')
for i in range(len(corpus)):
    # with per_word_topics=True the result is (doc_topics, word_topics, phi_values)
    dts = lda.get_document_topics(corpus[i], per_word_topics=True)
    for dt in dts[2]:
        item = dic[dt[0]]
        print(f"corpus = {i}, item = {item}, topic_id = {dt[1]}")

vis = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False)
# Imports assumed by this snippet:
import sys
import warnings
from collections import Counter
from statistics import mean

from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models.ldamodel import LdaModel

data_file = sys.argv[1]
topic_num = int(sys.argv[2])
limit_topics = 3

sentences = list(word2vec.LineSentence(data_file))
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences]

lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num)

doc_topics = [lda[c] for c in corpus]
avg_doc_topics = mean([len(t) for t in doc_topics])
if avg_doc_topics > limit_topics:
    warnings.warn(f'topic_num is small. topics num of doc = {avg_doc_topics}')

flatten = lambda x: sum(x, [])
topic_freq = Counter(flatten([[x[0] for x in t] for t in doc_topics]))

print('topic,freq,item,prob')
for i in range(topic_num):
    for t in lda.get_topic_terms(i):
        item = dic[t[0]]
        print(f'{i},{topic_freq[i]},{item},{t[1]}')
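# Invocation sketch for the script above (hypothetical file names); the data
# file is expected to hold one whitespace-tokenized sentence per line:
#   python lda_topic_freq.py sentences.txt 5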
class CustomLda(object):

    def __init__(self, data=None, dictionary=None):
        """Initialize. `data` should be provided; only when unpickling a
        class object is it not needed!"""
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text)
                           for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        self.distributed = None
        self.chunksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self, num_topics, iterations=1500, random_state=1,
              distributed=False, chunksize=2000, passes=1, update_every=1,
              alpha='symmetric', eta=None, decay=0.5, offset=1.0,
              eval_every=10, gamma_threshold=0.001, minimum_probability=0.01,
              ns_conf=None, minimum_phi_value=0.01, per_word_topics=False,
              workers=1):
        """Train the LDA model. If workers > 1, goes multicore."""
        self.distributed = distributed
        self.chunksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers
        if self.workers > 1:
            self.model = LdaMulticore(
                workers=self.workers,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,
                # distributed, update_every and ns_conf are not supported
                # by LdaMulticore
                chunksize=self.chunksize,
                passes=self.passes,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,
                distributed=self.distributed,
                chunksize=self.chunksize,
                passes=self.passes,
                update_every=self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """Built on top of the trained model to get coherence;
        type can be 'u_mass' or 'c_v'."""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()
        self.coherence_type = coherence_type  # cache, so repeat calls skip retraining

    def get_coherence(self, coherence_type='u_mass'):
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        return self.model.get_topic_terms(num, topn=topn)

    def get_perplexity(self):
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        return self.model.show_topics(num)

    def _make_visualization(self):
        """Prepare visualisation for display/saving."""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """Display LDAvis in a notebook."""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """Save LDAvis to .html."""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """Save the lda model only."""
        self.model.save(filename)

    def pickle(self, filename):
        """Save the class instance to a file."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def unpickle(filename):
        """Read a class instance from a file."""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """Predict the topic of a document list (consisting of strings)."""
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
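# A hedged usage sketch of CustomLda on gensim's bundled toy corpus
# (assumes Dictionary, LdaModel, LdaMulticore, CoherenceModel, pyLDAvis
# and pickle are imported at module level, as the class above requires).
from gensim.test.utils import common_texts

custom = CustomLda(data=common_texts)
custom.train(num_topics=2, iterations=100, workers=1)
print(custom.get_topic_terms(0, topn=5))   # [(token_id, prob), ...]
print(custom.get_coherence('u_mass'))      # a single float score
print(custom.predict_topic(['human computer interaction']))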
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

num_topics = 4

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print(list(common_dictionary.items()))

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=num_topics)
temp_file = datapath(
    "/Users/wanghaoxian/Documents/GitHub/recommend/dataContest/model")
lda.save(temp_file)

doc_topics = lda.get_document_topics(common_corpus)  # avoid shadowing built-in `list`
for topic in doc_topics:
    print(topic)
for i in range(0, num_topics, 1):
    print(i, lda.get_topic_terms(i, 3))
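# The saved model can be reloaded later from the same path:
lda_loaded = LdaModel.load(temp_file)
print(lda_loaded.get_topic_terms(0, 3))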
for w in list(jieba.cut(line, cut_all=True)):
    if len(w) > 1 and w not in stoplist:
        doc.append(w)
segtexts.append(doc)

dictionary = Dictionary(segtexts)
dictionary.filter_extremes(2, 1.0, keep_n=1000)  # filter the dictionary, keeping 1000 words
corpus = [dictionary.doc2bow(text) for text in segtexts]
lda = LdaModel(corpus, id2word=dictionary,
               num_topics=num_topics)  # with id2word set, words are shown instead of their ids
topics = lda.print_topics(num_topics=num_topics,
                          num_words=10)  # list of (topic_id, [(word, value), ...])
print(topics)

# Visualization
font = r'C:\Windows\Fonts\simfang.ttf'
wc = WordCloud(collocations=False, font_path=font, width=2800, height=2800,
               max_words=20, margin=2)
for topicid in range(0, num_topics):
    tlist = lda.get_topic_terms(topicid, topn=1000)  # number of words in the word cloud, p(w|z)
    wdict = {}  # e.g. {'word_a': 100, 'word_b': 90, 'word_c': 80}
    for wv in tlist:
        wdict[dictionary[wv[0]]] = wv[1]
    print(wdict)
    wordcloud = wc.generate_from_frequencies(wdict)
    wordcloud.to_file('topic_' + str(topicid) + '.png')  # save the image
# return [(i, model_lda.print_topic(i)) for i in top_k_topics]

# In[116]:

# `get_document_topics()` returns the topic probability distribution for a given document
topic_dist_675_a = model_lda.get_document_topics(corpus_train[15])
pprint(sorted(topic_dist_675_a))

# In[117]:

topicid = 3
model_lda.get_topic_terms(topicid, topn=10)

# In[118]:

doc_id = 15
text_train[doc_id]

# In[119]:

topic_dist_15_b = sorted(get_topics(corpus_train[doc_id], k=10)), text_train[doc_id]
pprint(topic_dist_15_b)
for line in file:
    # seg_list = jieba.cut(line, cut_all=True)
    seg_list = jieba.analyse.extract_tags(line, topK=40, withWeight=True)
    words = []
    for word, w in seg_list:
        if len(word) < 2:
            continue
        words.append(word)  # Python 3 strings need no .encode('utf-8')
    texts.append(words)

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=num_topics)
temp_file = datapath(
    "/Users/wanghaoxian/Documents/GitHub/recommend/dataContest/model")
lda.save(temp_file)

documentTopics = lda.get_document_topics(common_corpus)
for doc in documentTopics:
    print(doc)
for i in range(0, num_topics, 1):
    print("topic", i)
    terms = lda.get_topic_terms(i, num_topic_terms)
    for term in terms:
        print(common_dictionary[term[0]], term[1])
def get_most_common(title_list, dic, num=COMMON_TOPIC_WORDS_NUM,
                    random_state=None):
    '''Get the set of `num` words from the most frequent topic.'''
    bow = [dic.doc2bow(title) for title in title_list]
    # TODO: determine and set an appropriate number of topics
    if LOG_LEVEL == 'DEBUG':
        random_state = 123
    model = LdaModel(bow, id2word=dic, num_topics=TOPIC_NUM,
                     random_state=random_state)
    # Classify each title
    topic_id_list = []
    for idx, title in enumerate(title_list):
        logger.debug('title')
        logger.debug(title)
        doc_topics_tuple = model.get_document_topics(dic.doc2bow(title),
                                                     minimum_probability=0.0)
        doc_topic_dist = [[val[0], val[1]] for val in doc_topics_tuple]
        doc_topic_dist = np.array(doc_topic_dist)
        if idx == 0:
            topic_dist_arr = doc_topic_dist
        else:
            topic_dist_arr = np.vstack([topic_dist_arr, doc_topic_dist])
        topic_id = int(
            sorted(doc_topic_dist, key=lambda x: x[1], reverse=True)[0][0])
        topic_id_list.append(topic_id)
    if LOG_LEVEL == 'DEBUG':
        # Topic distribution per title
        df_topic_dist = pd.DataFrame({
            'title': title_list,
            'topic_id': topic_id_list
        })
        # Word distribution per topic
        cols = ['{}_{}'.format(word_no, elem)
                for word_no in range(10)
                for elem in range(2)]
        df_word_dist = pd.DataFrame()
        arr_dist = topic_dist_arr.reshape(-1, model.get_topics().shape[0], 2)
        for topic_id in range(model.get_topics().shape[0]):
            df_topic_dist['topic_{}'.format(topic_id)] = arr_dist[:, topic_id, 1]
            topic_terms = model.get_topic_terms(topic_id,
                                                topn=int(len(cols) / 2))
            topic_terms_2 = []
            for term in topic_terms:
                topic_terms_2 = topic_terms_2 + [
                    dic.id2token[term[0]], term[1]
                ]
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            df_word_dist = pd.concat([
                df_word_dist,
                pd.Series(topic_terms_2,
                          name='topic_{}'.format(topic_id)).to_frame().T,
            ])
        df_topic_dist.to_csv(
            os.path.join('test',
                         'classified_topic_{}.csv'
                         .format(datetime.today().strftime(format='%Y%m%d'))),
            index=False,
            encoding='cp932'
        )
        df_word_dist.columns = cols
        df_word_dist.to_csv(
            os.path.join('test',
                         'word_distribution_per_topic_{}.csv'
                         .format(datetime.today().strftime(format='%Y%m%d'))),
            encoding='cp932'
        )
    # Get the most frequent topic
    topic_id_counter = Counter(topic_id_list)
    most_common_topic_id = topic_id_counter.most_common(1)[0][0]
    topic_terms = model.get_topic_terms(most_common_topic_id)
    logger.debug('')
    logger.debug('topic_id_counter: ' + str(topic_id_counter))
    logger.debug('most_common_topic_id: ' + str(most_common_topic_id))
    logger.debug(topic_terms)
    # Get the `num` most important words of the most frequent topic
    important_word_list = [
        dic.id2token[topic_tuple[0]] for topic_tuple in topic_terms[:num]
    ]
    logger.debug(important_word_list)
    return important_word_list
class GensimLDA:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]

        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        '''Fit an LDA model with k_topics topics to the corpus.'''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary,
                              num_topics=k_topics, iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities
        of a topic in a given document.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]
        n_docs = len(X)
        V = np.zeros((n_docs, self.k_topics))
        # Extract assignments
        some_iterable = self.model.get_document_topics(X)  # equiv: self.model[X]
        for i, doc_topic in enumerate(some_iterable):
            for topic_id, prob in doc_topic:
                V[i, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns a k_topics x m_words array of probabilities
        of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in each topic.'''
        for row in self.get_topic_term_matrix():
            ranking = np.argsort(row)
            ids = np.arange(len(ranking))[ranking]
            for k in ids[:-top_n:-1]:
                weight = row[k]
                word = self.dictionary.id2token[k]
                print(k, word, weight)
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic
        in descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            word = self.dictionary.id2token[tok_id]
            print(word, prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (for each topic) containing a list of the
        top num_words.'''
        q = self.model.show_topics(num_topics=self.k_topics,
                                   num_words=num_words, formatted=False)
        topics = []
        for id, topic in q:
            words = []
            for w, p in topic:
                words.append(w)
            topics.append(words)
        return topics
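# A minimal usage sketch of GensimLDA on gensim's bundled toy corpus.
from gensim.test.utils import common_texts

glda = GensimLDA(common_texts)
glda.fit(k_topics=2, iterations=50)
_ = glda.dictionary[0]  # Dictionary.id2token is built lazily; touch it once
print(glda.get_document_topic_matrix().shape)  # (n_docs, k_topics)
glda.print_topic_words(0, topn=5)
print(glda.get_topic_bows(num_words=3))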
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())
        self.dictionary = corpora.Dictionary(documents)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]
        self.corpus = doc_term_matrix
        # Running and training the LDA model on the document-term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(
            self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(
                    bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0],
                                        str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]),
                              str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)

    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]),
                                       'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
def print_topic_terms(model: LdaModel):
    # idx2token is assumed to be a module-level id -> token mapping
    # (e.g. built from a Dictionary's token2id)
    for topic_id in range(model.num_topics):
        top_list = model.get_topic_terms(topic_id)
        print(topic_id, [idx2token[idx] for idx, value in top_list])
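# Sketch: build a tiny model plus the module-level idx2token mapping that
# print_topic_terms() assumes, using gensim's bundled toy corpus.
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
model = LdaModel(corpus, id2word=dictionary, num_topics=2)
idx2token = {idx: token for token, idx in dictionary.token2id.items()}
print_topic_terms(model)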