def _run(self, info):
    nbprint('Running LDA')
    vocab = data.load_vocab(info)
    id2word = {e['id']: e['token'] for e in vocab}
    # Sparse2Corpus treats columns as documents by default, matching the
    # term-by-document layout of self.input_mat.
    corpus = Sparse2Corpus(self.input_mat)
    lda = LdaModel(corpus, id2word=id2word, num_topics=info["num_topics"])
    # W: term-topic matrix (num_terms x num_topics)
    self.W = lda.get_topics().T
    # H: topic-document matrix (num_topics x num_docs)
    self.H = np.zeros((info["num_topics"], self.input_mat.shape[1]))
    for idx, doc in enumerate(corpus):
        weights = lda[doc]
        for topic, value in weights:
            self.H[topic, idx] = value
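# A minimal standalone sketch of the same W/H extraction on a toy
# term-by-document matrix; the tokens and counts below are illustrative
# assumptions, not data from the snippet above.
import numpy as np
from scipy.sparse import csc_matrix
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

tokens = ['apple', 'banana', 'cherry', 'date']
# 4 terms x 3 documents, terms as rows (Sparse2Corpus default).
X = csc_matrix(np.array([[2, 0, 1],
                         [0, 3, 0],
                         [1, 0, 2],
                         [0, 1, 0]]))
corpus = Sparse2Corpus(X)
lda = LdaModel(corpus, id2word=dict(enumerate(tokens)), num_topics=2)
W = lda.get_topics().T           # term-topic matrix (terms x topics)
H = np.zeros((2, X.shape[1]))    # topic-document matrix (topics x docs)
for idx, doc in enumerate(corpus):
    for topic, value in lda[doc]:
        H[topic, idx] = value
print(W.shape, H.shape)          # (4, 2) (2, 3)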
def make_model(self, texts, dictionary, corpus, num_topics,
               chunksize=1000, iterations=400, passes=40):
    model = LdaModel(corpus=corpus, id2word=dictionary,
                     num_topics=num_topics, random_state=0,
                     chunksize=chunksize, iterations=iterations,
                     passes=passes, alpha='asymmetric',
                     per_word_topics=True)
    coherencemodel = CoherenceModel(model=model, texts=texts,
                                    dictionary=dictionary, coherence='c_v')
    coh = coherencemodel.get_coherence()
    return model, coh
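# Hedged usage sketch: pick num_topics by sweeping candidate values and
# keeping the best c_v coherence, the metric make_model returns. The toy
# corpus and the candidate range are assumptions.
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

texts = [['cat', 'dog', 'pet'], ['dog', 'leash', 'walk'],
         ['stock', 'market', 'trade'], ['market', 'price', 'stock']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

best_k, best_coh = None, float('-inf')
for k in (2, 3, 4):
    model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k,
                     random_state=0, passes=10, alpha='asymmetric')
    coh = CoherenceModel(model=model, texts=texts, dictionary=dictionary,
                         coherence='c_v').get_coherence()
    if coh > best_coh:
        best_k, best_coh = k, coh
print('best num_topics:', best_k)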
def gen_ldamodel(self):
    mdf = MyDataFrame()
    df = mdf.new_DataFrame()
    df2 = mdf.m_cut(df)
    filelist = []
    for i in range(len(df2)):
        filelist.append(df2['fenci'][i])
    # Build the dictionary and the bow sparse representation of the documents
    dictionary = corpora.Dictionary(filelist)
    corpus = [dictionary.doc2bow(text) for text in filelist]  # still a list of lists
    tfidf_model = models.TfidfModel(corpus)  # build the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]  # TF-IDF weights of the documents
    # Fit the LDA model
    from gensim.models.ldamodel import LdaModel
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=10, passes=10)
    # Print the most important topics
    ldamodel.print_topics(num_topics=20, num_words=10)
    # Compute the LDA representation of each document; this must use the same
    # kind of matrix the model was trained on (bow here, not TF-IDF)
    corpus_lda = ldamodel[corpus]
    for doc in corpus_lda:
        print(doc)
    ldamodel.get_topics()  # per-topic word probability matrix (list of lists)
    # Find the topic closest to a given text
    # (here: the topic closest to 0.txt)
    query_bow = dictionary.doc2bow(df2['fenci'][0])  # term-frequency vector
    query_tfidf = tfidf_model[query_bow]  # TF-IDF vector
    print('transformed:', query_tfidf[:10])
    ldamodel.get_document_topics(query_bow)  # expects the document's bow vector
    # The topics closest to the text content (again using the bow vector,
    # since the model was trained on bow)
    ldamodel[query_bow]  # list of the closest topics
class Lda:
    def __init__(self):
        self.model = None
        self.common_dictionary = None

    def train(self, common_texts, num_topics):
        self.common_dictionary = Dictionary(common_texts)
        common_corpus = [self.common_dictionary.doc2bow(text)
                         for text in common_texts]
        self.model = LdaModel(common_corpus, num_topics=num_topics,
                              alpha='auto', eval_every=5)

    def get_topics(self, words=None):
        # Term-topic matrix: one row per token id, one column per topic.
        s = self.model.get_topics().T
        if words is not None:
            token_ids = self.common_dictionary.doc2idx(words)
            # doc2idx returns -1 for out-of-vocabulary words; drop those
            # instead of silently indexing the last row.
            token_ids = [i for i in token_ids if i != -1]
            s = s[token_ids]
        return s
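# Hedged usage sketch for the wrapper above; assumes
# `from gensim.corpora import Dictionary` and `from gensim.models import
# LdaModel`, which the class itself relies on. The toy texts are assumptions.
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system'],
         ['graph', 'trees', 'minors']]
wrapper = Lda()
wrapper.train(texts, num_topics=2)
print(wrapper.get_topics().shape)              # (vocab_size, num_topics)
print(wrapper.get_topics(words=['computer']))  # rows for in-vocabulary words only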
def lda(fname, indF, nTopics=20, passes=1, iterations=50, fmax=math.inf,
        fmin=0, head='cancer_py_gen_gvLDA_'):
    cts = pd.read_csv(fname + '.csv', header=0, index_col=0, dtype={0: str})
    # cts = pd.read_csv(fname + '.csv', header=None, index_col=None)
    ind = pd.read_csv(indF + '.csv', header=None)
    patID = cts.index
    gvID = cts.columns
    rows = np.where(ind > 0)[0]
    splits = int(np.max(np.array(ind)))
    patID = patID[rows]
    phi = cts.iloc[rows]
    ind = ind.iloc[rows, 0]
    for i in range(1, splits + 1):
        # initialize log (assumes a module-level `logger`)
        ofname = head + str(nTopics) + '_' + str(i)
        ch = logging.FileHandler('logs/' + ofname + '.log', mode='w')
        ch.setLevel(logging.INFO)
        formatter = logging.Formatter('%(levelname)s : %(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        # training set; np.where returns a tuple, so take [0] for the indices
        rowsT = np.where(ind != i)[0]
        X = np.asarray(phi.iloc[rowsT])
        cols = np.logical_and(fmin < X.sum(axis=0), X.sum(axis=0) < fmax)
        X = X[:, cols]
        X_corp = Dense2Corpus(np.array(X), documents_columns=False)
        # validation set
        rowsV = np.where(ind == i)[0]
        X_test = np.asarray(phi.iloc[rowsV])
        X_test = X_test[:, cols]
        X_testcorp = Dense2Corpus(np.array(X_test), documents_columns=False)
        lda = LdaModel(X_corp, nTopics, alpha='auto', passes=passes,
                       iterations=iterations)
        ofname = 'data/' + ofname
        lda.save(ofname + '_model')
        gvTop = pd.DataFrame(lda.get_topics())
        gvTop.columns = np.asarray(gvID)[cols]
        gvTop.to_csv(ofname + '_genes.csv')
        pd.DataFrame(lda.alpha).to_csv(ofname + '_alpha.csv')
        patTop = pd.DataFrame(get_doc_topic(X_corp, lda))
        patTop.index = patID[rowsT]
        patTop.to_csv(ofname + '_train.csv')
        patTop = pd.DataFrame(get_doc_topic(X_testcorp, lda))
        patTop.index = patID[rowsV]
        patTop.to_csv(ofname + '_valid.csv')
        logger.removeHandler(ch)  # stop log
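# `get_doc_topic` is referenced above but not defined in this snippet. A
# minimal sketch of what it plausibly does (dense per-document topic
# probabilities); its exact behavior here is an assumption.
import numpy as np

def get_doc_topic(corpus, model):
    # One row per document, one column per topic, zeros filled in.
    rows = []
    for doc in corpus:
        dense = np.zeros(model.num_topics)
        for topic, prob in model.get_document_topics(doc,
                                                     minimum_probability=0.0):
            dense[topic] = prob
        rows.append(dense)
    return np.vstack(rows)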
def get_most_common(title_list, dic, num=COMMON_TOPIC_WORDS_NUM,
                    random_state=None):
    '''Get the num words of the most frequent topic.'''
    bow = [dic.doc2bow(title) for title in title_list]
    # TODO: determine and set an appropriate number of topics
    if LOG_LEVEL == 'DEBUG':
        random_state = 123
    model = LdaModel(bow, id2word=dic, num_topics=TOPIC_NUM,
                     random_state=random_state)
    # Classify each title
    topic_id_list = []
    for idx, title in enumerate(title_list):
        logger.debug('title')
        logger.debug(title)
        doc_topics_tuple = model.get_document_topics(
            dic.doc2bow(title), minimum_probability=0.0)
        doc_topic_dist = np.array([[val[0], val[1]]
                                   for val in doc_topics_tuple])
        if idx == 0:
            topic_dist_arr = doc_topic_dist
        else:
            topic_dist_arr = np.vstack([topic_dist_arr, doc_topic_dist])
        topic_id = int(sorted(doc_topic_dist, key=lambda x: x[1],
                              reverse=True)[0][0])
        topic_id_list.append(topic_id)
    if LOG_LEVEL == 'DEBUG':
        # Topic distribution per title
        df_topic_dist = pd.DataFrame({'title': title_list,
                                      'topic_id': topic_id_list})
        # Word distribution per topic
        cols = ['{}_{}'.format(word_no, elem)
                for word_no in range(10)
                for elem in range(2)]
        word_dist_rows = []
        arr_dist = topic_dist_arr.reshape(-1, model.get_topics().shape[0], 2)
        for topic_id in range(model.get_topics().shape[0]):
            df_topic_dist['topic_{}'.format(topic_id)] = \
                arr_dist[:, topic_id, 1]
            topic_terms = model.get_topic_terms(topic_id,
                                                topn=int(len(cols) / 2))
            topic_terms_2 = []
            for term in topic_terms:
                # Use dic[id] rather than dic.id2token[id] so the lazily
                # built id2token mapping is populated before use.
                topic_terms_2 = topic_terms_2 + [dic[term[0]], term[1]]
            word_dist_rows.append(
                pd.Series(topic_terms_2, name='topic_{}'.format(topic_id)))
        # DataFrame.append was removed in pandas 2.0; build the frame from
        # the collected rows instead.
        df_word_dist = pd.DataFrame(word_dist_rows)
        df_topic_dist.to_csv(
            os.path.join('test', 'classified_topic_{}.csv'.format(
                datetime.today().strftime(format='%Y%m%d'))),
            index=False, encoding='cp932')
        df_word_dist.columns = cols
        df_word_dist.to_csv(
            os.path.join('test', 'word_distribution_per_topic_{}.csv'.format(
                datetime.today().strftime(format='%Y%m%d'))),
            encoding='cp932')
    # Get the most frequent topic
    topic_id_counter = Counter(topic_id_list)
    most_common_topic_id = topic_id_counter.most_common(1)[0][0]
    topic_terms = model.get_topic_terms(most_common_topic_id)
    logger.debug('')
    logger.debug('topic_id_counter: ' + str(topic_id_counter))
    logger.debug('most_common_topic_id: ' + str(most_common_topic_id))
    logger.debug(topic_terms)
    # Get the num most important words of the most frequent topic
    important_word_list = [dic[topic_tuple[0]]
                           for topic_tuple in topic_terms[:num]]
    logger.debug(important_word_list)
    return important_word_list
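# Hedged usage sketch for get_most_common; the module-level names it relies
# on (COMMON_TOPIC_WORDS_NUM, TOPIC_NUM, LOG_LEVEL, logger) and the toy
# titles are assumptions.
from gensim.corpora import Dictionary

titles = [['market', 'rally', 'stocks'],
          ['stocks', 'fall', 'market'],
          ['team', 'wins', 'final']]
dic = Dictionary(titles)
print(get_most_common(titles, dic, num=3, random_state=0))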
class GensimLDA:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        '''Fits an LDA model with k_topics topics on the stored corpus.'''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary,
                              num_topics=k_topics, iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities
        of a topic in a given document.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]
        n_docs = len(X)
        V = np.zeros((n_docs, self.k_topics))
        # Extract assignments; on a whole corpus this is equivalent
        # to self.model[X].
        some_iterable = self.model.get_document_topics(X)
        for i, doc_topic in enumerate(some_iterable):
            for topic_id, prob in doc_topic:
                V[i, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns a k_topics x m_words array of probabilities
        of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in each topic.'''
        for row in self.get_topic_term_matrix():
            ids = np.argsort(row)
            # Slice to -top_n - 1 so exactly top_n words are printed.
            for k in ids[:-top_n - 1:-1]:
                weight = row[k]
                word = self.dictionary[k]  # populates id2token lazily
                print(k, word, weight)
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic
        in descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            word = self.dictionary[tok_id]
            print(word, prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (one per topic) of the top num_words words.'''
        q = self.model.show_topics(num_topics=self.k_topics,
                                   num_words=num_words, formatted=False)
        topics = []
        for id, topic in q:
            words = [w for w, p in topic]
            topics.append(words)
        return topics
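# Hedged usage sketch for GensimLDA; the toy texts are assumptions.
texts = [['cat', 'dog', 'pet'], ['dog', 'leash', 'walk'],
         ['stock', 'market', 'trade'], ['market', 'price', 'stock']]
glda = GensimLDA(texts)
glda.fit(k_topics=2)
V = glda.get_document_topic_matrix()   # (4 docs x 2 topics)
H = glda.get_topic_term_matrix()       # (2 topics x vocab size)
print(V.shape, H.shape)
print(glda.get_topic_bows(num_words=3))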
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())
        self.dictionary = corpora.Dictionary(documents)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]
        self.corpus = doc_term_matrix
        # Running and training the LDA model on the document-term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix, num_topics=200, id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(
            self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        # Use the instance's connection (was: `with mysql as db`, which
        # referenced the module instead of the injected connection).
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(
                    bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0],
                                        str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]),
                              str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)

    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]),
                                       'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
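# Hedged usage sketch for LDAWDF; `mysql.MySQL()` mirrors the type annotation
# above, and its constructor arguments are unknown here (assumption).
store = LDAWDF(mysql.MySQL())
if store.canLoad():
    store.load()
else:
    store.trainFromStart()
    store.save()
store.printTest()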
class Lda():
    def __init__(self, corpora=None, savedModel=None, numTopics=10,
                 seed=None, autoAproach=False):
        '''
        corpora: Corpora
            structured text data
        savedModel: str = None
            path of a saved model to load
        numTopics: int = 10
            number of topics to generate
        seed: int = None
            use a specific random seed
        autoAproach = False
            whether to adjust the number of topics automatically
            to find a suitable value
        '''
        self.corpora = corpora
        self.numTopics = numTopics
        self.seed = seed
        if savedModel is None:
            self.__trainingModel()
        else:
            self.ldaModel = LdaModel.load(savedModel)
        if autoAproach:
            wellLastTime = False
            while self.__isWellClassify() or not wellLastTime:
                if self.__isWellClassify():
                    wellLastTime = True
                    # Was `savedModle(name="temp")`, which called a string;
                    # save the current model instead.
                    self.saveModel(name="temp")
                    self.numTopics -= 1
                    self.__trainingModel()
                elif not wellLastTime:
                    self.numTopics += 2
                    self.__trainingModel()
            self.numTopics += 1
            # Reload the last model that classified well (saved as "temp").
            self.ldaModel = LdaModel.load("temp")

    def __trainingModel(self):
        if self.seed is not None:
            self.ldaModel = LdaModel(
                corpus=self.corpora.TfidfPair,
                id2word=self.corpora.Dictionary,
                num_topics=self.numTopics,
                random_state=np.random.RandomState(self.seed))
        else:
            self.ldaModel = LdaModel(
                corpus=self.corpora.TfidfPair,
                id2word=self.corpora.Dictionary,
                num_topics=self.numTopics)

    def __isWellClassify(self, threshold=0.8, test=None):
        '''
        Check that every document matches at least one topic with a
        probability above the threshold.
        threshold = 0.8: minimum acceptable topic probability
        (test: dummy distribution for testing)
        '''
        if test is None:
            test = self.topicsDistribution()
        for tdb in test:
            ambiguous = True
            for prob in tdb:
                if prob[1] >= threshold:
                    ambiguous = False
                    break
            if ambiguous:
                return False
        return True

    def saveModel(self, name="my_model"):
        '''
        Save the trained model.
        name: str = "my_model"  save path
        '''
        self.ldaModel.save(fname=name)

    def showTopicsStr(self, topn=10):
        '''
        Show the trained LDA topics as strings.
        topn: number of words to show
        '''
        return self.ldaModel.show_topics(num_topics=self.numTopics,
                                         num_words=topn)

    def showTopicsList(self, topn=10):
        '''
        Show the topics as lists of tuples.
        topn: number of words to show
        '''
        return self.ldaModel.show_topics(num_topics=self.numTopics,
                                         num_words=topn, formatted=False)

    def topicsDistribution(self, tfidf=None):
        '''
        Analyze structured documents with this model.
        Input:
            tfidf: 2d_list: TF-IDF matrix
        Output:
            2d_list: probability of each document belonging to each topic
        '''
        if tfidf is None:
            tfidf = self.corpora.TfidfPair
        return [self.ldaModel[article] for article in tfidf]

    def classifyTopic(self, topicsDistr=None):
        '''Return a list mapping each document to its best topic.'''
        if topicsDistr is None:
            topicsDistr = self.topicsDistribution()
        result = []
        for article in topicsDistr:  # check each article in turn
            # Track the most probable topic directly; the original indexed
            # the distribution by topic id, which breaks when low-probability
            # topics are filtered out.
            topicID, best = 0, 0.0
            for topic, prob in article:
                if prob > best:
                    topicID, best = topic, prob
            result.append(topicID)
        return result

    def findArticleMatched(self, classifiedTopic=None):
        '''Group the documents by their assigned topic and return the groups.'''
        if classifiedTopic is None:
            classifiedTopic = self.classifyTopic()
        numOfTopic = max(classifiedTopic) + 1
        result = [[] for num in range(0, numOfTopic)]
        counter = 0
        while counter < len(classifiedTopic):
            # drop each article into the bucket of its topic
            result[classifiedTopic[counter]].append(counter)
            counter += 1
        return result

    def __relativeEntropy(self, p, q):
        # extra bits needed to encode p using q
        '''sum(p*log(p/q))'''
        if 0 in q:
            return math.inf  # infinity
        return reduce(operator.add,
                      map(lambda x, y: x * math.log(x / y), p, q))

    def showRelativeEntropy(self, topicId, dtMatrix):
        '''Compute the relative entropy between the given document-term
        matrix and this model's topic.'''
        klMeans = list()
        p = self.ldaModel.get_topics()[topicId]
        # the documents classified under the given topic supply q
        candidatesIds = self.findArticleMatched()[topicId]
        for id in candidatesIds:
            dtm = dtMatrix[id]
            # total word count, to turn term frequencies into probabilities
            totalWordCount = sum(dtm)
            q = list()
            for prob in dtm:
                if prob == 0:
                    q.append(1e-20)
                else:
                    q.append(prob / totalWordCount)
            klMeans.append((id, self.__relativeEntropy(p, q)))
        return klMeans

    def showAuthenticArticle(self, topicId, num=1):
        '''Return the most representative articles for a topic.'''
        entropy = self.showRelativeEntropy(topicId, self.corpora.DtMatrix)
        sortedEntropy = sorted(entropy, key=lambda x: x[1])
        return [t[0] for t in sortedEntropy[:num]]
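# Hedged usage sketch for the Lda class above; `Corpora` is assumed to be an
# object exposing TfidfPair (per-document TF-IDF vectors), Dictionary, and
# DtMatrix (dense document-term counts), as the class expects. Its
# construction here is hypothetical.
corp = Corpora(documents)          # hypothetical Corpora construction
lda = Lda(corpora=corp, numTopics=10, seed=42)
dist = lda.topicsDistribution()    # per-document topic probabilities
labels = lda.classifyTopic(dist)   # best topic per document
print(lda.showAuthenticArticle(topicId=labels[0], num=3))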