def _mean_topic_weights(lda, dictionary, documents):
    """Average the topic probabilities ``lda`` assigns to ``documents``.

    Each document is encoded with ``dictionary`` (the one the model was
    trained with), the probabilities reported per topic are summed over all
    documents and then divided by the number of documents.  Topics never
    reported for any document are absent from the result.
    """
    bows = [dictionary.doc2bow(text) for text in documents]
    totals = {}
    for doc_topics in lda.get_document_topics(bows):
        for topic_id, prob in doc_topics:
            totals[topic_id] = totals.get(topic_id, 0) + prob
    n_docs = len(documents)
    return {topic_id: total / n_docs for topic_id, total in totals.items()}


def LDA_model(corpus_Quran, corpus_NT, corpus_OT):
    """Train one LDA model on all verses and summarise each corpus by topic.

    Args:
        corpus_Quran, corpus_NT, corpus_OT: lists of tokenised verses
            (each verse is a list of tokens).

    Returns:
        ``(lda, topic_dic_Quran, topic_dic_NT, topic_dic_OT)`` where each
        dictionary maps a topic id to the mean probability of that topic over
        the verses of the corresponding corpus.
    """
    # run LDA on the entire set of verses from all corpora
    total_corpus = corpus_Quran + corpus_NT + corpus_OT
    dictionary = Dictionary(total_corpus)
    dictionary.filter_extremes(no_below=50, no_above=0.1)
    corpus = [dictionary.doc2bow(text) for text in total_corpus]
    lda = LdaModel(corpus, num_topics=20, id2word=dictionary, random_state=1)

    # The per-corpus documents must be encoded with the SAME dictionary the
    # model was trained on: a dictionary rebuilt from a single corpus assigns
    # different integer ids to the same words, so the model would score
    # unrelated terms.
    topic_dic_Quran = _mean_topic_weights(lda, dictionary, corpus_Quran)
    topic_dic_OT = _mean_topic_weights(lda, dictionary, corpus_OT)
    topic_dic_NT = _mean_topic_weights(lda, dictionary, corpus_NT)

    return lda, topic_dic_Quran, topic_dic_NT, topic_dic_OT
def getLdaFeature(documents, topicNum):
    '''
    Function: generate LDA features by training an LDA model
    Input:
        documents: list of preprocessed sentences (space-separated tokens)
        topicNum: output vector dimension (number of topics)
    Output:
        LDA features (DataFrame format), one row per sentence
    '''
    # Tokenise and encode the sentences as bag-of-words vectors.
    tokenized = [sentence.split(' ') for sentence in documents]
    vocabulary = corpora.Dictionary(tokenized)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized]

    # The LDA model is trained on the TF-IDF weighted corpus.
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]
    ldaModel = LdaModel(weighted_corpus,
                        num_topics=topicNum,
                        chunksize=8000,
                        passes=10,
                        random_state=12)

    # One row per sentence; topics below the 0.01 threshold stay at zero.
    LogInfo(' Generate LDA features...')
    feature_matrix = np.zeros((len(tokenized), topicNum))
    for row, doc in enumerate(weighted_corpus):
        doc_topics = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for topic_id, prob in doc_topics:
            feature_matrix[row, topic_id] = round(prob, 5)

    return pd.DataFrame(feature_matrix, columns=getColName(topicNum, "qlda"))
class LMDL_LDA():
    """LDA topic model fitted on the word lists provided by ``LMDL_Corpus``."""

    def __init__(self):
        """Load the corpus texts, build the dictionary and train the model."""
        self.lmdl = LMDL_Corpus()
        self.texts = self.lmdl.get_corpus_texts_words()
        self.dictionary = Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(words) for words in self.texts]
        self.lda = LdaModel(self.corpus,
                            num_topics=LDA_NUM_TOPICS,
                            id2word=self.dictionary)

    def print_topics(self):
        """Return the model's ``print_topics`` output for all topics."""
        return self.lda.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        """Return the top-10 terms of every topic reported for a document.

        The document is looked up by name through the corpus helper, encoded
        with the model dictionary, and for each topic the model reports for
        it the ``show_topic(..., topn=10)`` list is collected.
        """
        bow = self.dictionary.doc2bow(
            self.lmdl.token_list_processed(document_name))
        reported = self.lda.get_document_topics(bow,
                                                minimum_probability=None,
                                                minimum_phi_value=None,
                                                per_word_topics=False)
        return [self.lda.show_topic(entry[0], topn=10) for entry in reported]

    def top_topics(self):
        """Return the model's ``top_topics`` result (u_mass, 20 terms)."""
        options = dict(corpus=self.corpus,
                       texts=self.texts,
                       dictionary=self.dictionary,
                       window_size=None,
                       coherence='u_mass',
                       topn=20,
                       processes=-1)
        return self.lda.top_topics(**options)
def get_lda_feature():
    """Train a 60-topic LDA model on the 'content' column and save features.

    Reads the CSV at ``id_content_path``, trains LDA on the TF-IDF weighted
    corpus and writes one row of topic probabilities per document to
    ``id_content_lda_path``.
    """
    frame = pd.read_csv(id_content_path)
    documents = frame['content'].apply(lambda text: text.split(' '))

    # Word <-> id mapping, then bag-of-words (id, count) vectors per document.
    dictionary = corpora.Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in documents]

    # TF-IDF model fitted on the corpus, applied to the same corpus.
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]

    # Number of topics.
    n = 60
    # The LDA model is trained on the TF-IDF vectors.
    lda_model = LdaModel(weighted_corpus, num_topics=n, passes=10, random_state=12)

    # Topics below the 0.01 threshold are left at zero.
    lda_feature = np.zeros((len(documents), n))
    for row, doc in enumerate(weighted_corpus):
        for topic_id, prob in lda_model.get_document_topics(
                doc, minimum_probability=0.01):
            lda_feature[row, topic_id] = round(prob, 5)

    columns = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=columns).to_csv(id_content_lda_path, index=0)
def cluster(doc_term_matrix, num, word_dict):
    """Group documents by their most probable LDA topic.

    Args:
        doc_term_matrix: bag-of-words corpus, one entry per document.
        num: number of topics (and therefore of returned groups).
        word_dict: id-to-word mapping passed to the model as ``id2word``.

    Returns:
        A list of ``num`` lists; group ``t`` holds the titles (looked up in
        the module-level ``titles``) of the documents whose strongest topic
        is ``t``.  Documents with no topic above the threshold are omitted.
    """
    ldamodel = Lda(doc_term_matrix, num_topics=num, id2word=word_dict)
    doc_topics = ldamodel.get_document_topics(
        doc_term_matrix, minimum_probability=0.20)  # needs tuning

    result = [[] for _ in range(num)]
    for doc_index, topics in enumerate(doc_topics):
        # Some articles do not have a topic above the threshold.
        if topics:
            best_topic, _ = max(topics, key=itemgetter(1))
            result[best_topic].append(doc_index)

    # The original iterated over ``len(result)`` (an int), which raises
    # TypeError; build concrete lists of titles per topic instead.
    return [[titles[doc_index] for doc_index in group] for group in result]
def gen_ldamodel(self):
    """Fit a 10-topic LDA model on the segmented texts and print its output.

    Nothing is returned or stored; the method prints the per-document topic
    vectors and the TF-IDF vector of the first document.
    """
    frame_builder = MyDataFrame()
    segmented = frame_builder.m_cut(frame_builder.new_DataFrame())
    token_lists = [segmented['fenci'][row] for row in range(len(segmented))]

    # Build the dictionary and the sparse bag-of-words corpus.
    dictionary = corpora.Dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]  # still a list of lists
    tfidf_model = models.TfidfModel(corpus)  # TF-IDF model
    corpus_tfidf = tfidf_model[corpus]  # TF-IDF vectors of the documents

    # Fit the LDA model.
    from gensim.models.ldamodel import LdaModel
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=10, passes=10)

    # List the most important topics.
    ldamodel.print_topics(num_topics=20, num_words=10)

    # Topic vectors of every document.
    # NOTE(review): the model is trained on the raw BoW corpus but queried
    # with TF-IDF vectors here; the original comment says the same matrix
    # type as in training should be used — confirm which one is intended.
    for doc_topics in ldamodel[corpus_tfidf]:
        print(doc_topics)

    ldamodel.get_topics()  # topic-by-term probability matrix

    # Find the topics closest to the first document.
    query_bow = dictionary.doc2bow(segmented['fenci'][0])  # count vector
    query_tfidf = tfidf_model[query_bow]  # TF-IDF vector
    print("转换后:", query_tfidf[:10])
    ldamodel.get_document_topics(query_bow)  # expects the document's BoW vector
    ldamodel[query_tfidf]  # list of the closest topics
def saliency_index(lda: LdaModel, corpus, words: Dictionary):
    """Compute a saliency score for every term of ``words``.

    Args:
        lda: trained topic model.
        corpus: iterable of bag-of-words documents; they are concatenated
            into a single document to obtain overall topic probabilities.
        words: dictionary providing collection frequencies (``cfs``) and the
            id-to-token lookup.

    Returns:
        dict mapping each token to ``frequency * relative_likelihood`` where
        ``relative_likelihood`` sums ``p(term|topic) * log(p(term|topic) /
        p(topic))`` over all topics.
    """
    # All documents merged into one bag so the model reports a single
    # topic distribution for the whole collection.
    full_corpus = list(chain.from_iterable(corpus))

    N = len(words)
    total = sum(words.cfs[i] for i in range(N))
    frequencies = [words.cfs[i] / total for i in range(N)]

    relative_likelihood = [0.] * N
    for topic_id, topic_prob in lda.get_document_topics(
            full_corpus, minimum_probability=0.):
        # topn=None asks for every term of the topic.
        for term, cond_prob in lda.get_topic_terms(topic_id, topn=None):
            relative_likelihood[term] += cond_prob * log(cond_prob / topic_prob)

    return {
        words[i]: frequency * likelihood
        for i, (frequency, likelihood) in enumerate(
            zip(frequencies, relative_likelihood))
    }
# In[115]: # # extracts topics for given document from Gensim # def get_topics(doc, k=5, model_lda=model_lda): # topic_id = sorted(model_lda[doc][0], key=lambda x: -x[1]) # top_k_topics = [x[0] for x in topic_id[:k]] # return [(i, model_lda.print_topic(i)) for i in top_k_topics] # In[116]: # `get_document_topics()` returns topic probability distribution for given document topic_dist_675_a = model_lda.get_document_topics(corpus_train[15]) pprint(sorted(topic_dist_675_a)) # In[117]: topicid = 3 model_lda.get_topic_terms(topicid, topn=10) # In[118]: text_train[doc_id]
class LDA(object):
    """Thin wrapper around a gensim ``LdaModel`` and its ``Dictionary``."""

    def __init__(self,
                 topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        Initialise LDA training.
        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually number of cores minus
                      one (stored only; not used by this class)
            pretrained_model -- path of a previously trained model; because
                                online updates are supported, the model from
                                the last run can be loaded
            dictionary -- path of the word-to-id dictionary that belongs to
                          the model
        Example:
            >>> lda = LDA(topics = 20, worker = 2,
                          pretrained_model = model_file,
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # Both files are needed: the model is unusable without its id mapping.
        # (The original tested the undefined name ``common_dictionary`` here.)
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        Save the trained model together with its dictionary.
        Args:
            model_file -- model file
            dictionary_file -- dictionary file
        Returns:
            None
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=None):
        """
        Online update: train a new model, or update the existing one.
        Args:
            corpus -- list of documents (each a list of words); an empty or
                      missing corpus is a no-op
        """
        if corpus is None or len(corpus) == 0:
            return
        if not self._model:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model = LdaModel(corpus_data, self._topics)
        else:
            # NOTE(review): add_documents may assign ids beyond the
            # vocabulary the model was created with — confirm that
            # LdaModel.update accepts such ids.
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [
                self._common_dictionary.doc2bow(sentence)
                for sentence in corpus
            ]
            self._model.update(new_corpus_data)

    def inference(self, document=None):
        """
        Infer the topic distribution of a new document.
        Args:
            document -- the document, i.e. a list of words
        Returns:
            the model's topic output for the one-document corpus, or an
            empty list when no model has been trained or loaded
        """
        if not self._model:
            return []
        tokens = [] if document is None else document
        doc = [self._common_dictionary.doc2bow(tokens)]
        return self._model.get_document_topics(doc)

    @property
    def model(self):
        """The underlying model, or None."""
        return self._model

    @property
    def dictionary(self):
        """The dictionary paired with the model, or None."""
        return self._common_dictionary
class LDA(object):
    """Online-updatable LDA model bundled with its word-id dictionary."""

    def __init__(self, topics=10, worker=3, pretrained_model=None, dictionary=None):
        """
        Initialise LDA training.
        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually number of cores minus
                      one (stored only; not used by this class)
            pretrained_model -- path of a previously trained model; because
                                online updates are supported, the model from
                                the last run can be loaded
            dictionary -- path of the word-to-id dictionary that belongs to
                          the model
        Example:
            >>> lda = LDA(topics = 20, worker = 2,
                          pretrained_model = model_file,
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # The original condition referenced ``common_dictionary``, a name
        # that does not exist in this scope; the ``dictionary`` argument is
        # what must accompany the model path.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        Save the trained model together with its dictionary.
        Args:
            model_file -- model file
            dictionary_file -- dictionary file
        Returns:
            None
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=None):
        """
        Online update: train a new model, or update the existing one.
        Args:
            corpus -- list of documents (each a list of words); an empty or
                      missing corpus is a no-op
        """
        if corpus is None or len(corpus) == 0:
            return
        if self._model:
            # NOTE(review): new words get ids the existing model has never
            # seen — confirm LdaModel.update tolerates a grown vocabulary.
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)
        else:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, self._topics)

    def inference(self, document=None):
        """
        Infer the topic distribution of a new document.
        Args:
            document -- the document, i.e. a list of words
        Returns:
            the model's topic output for the one-document corpus, or an
            empty list when no model has been trained or loaded
        """
        if not self._model:
            return []
        words = document if document is not None else []
        doc = [self._common_dictionary.doc2bow(words)]
        return self._model.get_document_topics(doc)

    @property
    def model(self):
        """The underlying model, or None."""
        return self._model

    @property
    def dictionary(self):
        """The dictionary paired with the model, or None."""
        return self._common_dictionary
def main():
    """Train an LDA model on the stored articles and print topic estimates.

    Sentences are read from the datastore, merged into pseudo-documents of
    20 sentences, used to train a 10-topic model, and finally the topic
    distribution of every stored article is printed.
    """
    # Connect to SQLite
    datastore.connect()
    try:
        # For each doc_id, collect per sentence the lemmas of its tokens
        # (only tokens whose "NE" annotation is "O").
        sentences = []
        for doc_id in datastore.get_all_ids(limit=-1):
            all_tokens = datastore.get_annotation(doc_id, "token")
            for sentence in datastore.get_annotation(doc_id, "sentence"):
                tokens = find_xs_in_y(all_tokens, sentence)
                sentences.append(
                    [token["lemma"] for token in tokens
                     if token.get("NE") == "O"])

        # Few articles are available, so 20 sentences are merged into one
        # document.
        n_sent = 20
        docs = [
            list(itertools.chain.from_iterable(sentences[i:i + n_sent]))
            for i in range(0, len(sentences), n_sent)
        ]

        # Select the words used for LDA:
        # - drop words that appear in fewer than 2 documents (no_below=2)
        # - drop words that appear in more than 30% of documents (no_above=0.3)
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(no_below=2, no_above=0.3)

        # Convert the word lists to bag-of-words vectors.
        corpus = [dictionary.doc2bow(doc) for doc in docs]

        # Build the LDA model.
        lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

        # Print every topic with its 10 most probable words.
        for topic in lda.show_topics(num_topics=-1, num_words=10):
            print(f"Topic id: {topic[0]} Word: {topic[1]}")

        # Estimate the topic distribution of each article and print the
        # topics in decreasing order of probability.
        for doc_id in datastore.get_all_ids(limit=-1):
            meta_info = json.loads(
                datastore.get(doc_id=doc_id, fl=["meta_info"])["meta_info"])
            print(meta_info["title"])
            doc = [
                token["lemma"]
                for token in datastore.get_annotation(doc_id, "token")
                if token.get("NE") == "O"
            ]
            topics = sorted(lda.get_document_topics(dictionary.doc2bow(doc)),
                            key=lambda x: x[1],
                            reverse=True)
            for topic in topics:
                print(f" Topic id: {topic[0]} Prob: {topic[1]}")
    finally:
        # Release the connection even when reading or training fails.
        datastore.close()
class LDA_parser():
    """
    This class implements a wrapper pipeline for text preprocessing and LDA parsing of an input corpus
    in the form ['str','str','str', ... ].
    """

    def __init__(self,
                 corpus='',
                 language='english',
                 preprocessor_type="spacy",
                 tags=("DET", "PUNCT", "NUM", "SYM", "SPACE"),
                 custom_filter=(),
                 lemmatize=False,
                 stem=False,
                 min_len=2,
                 num_topics=10,
                 passes=100):
        """
        Parses the input text into a suitable format, then performs all LDA extraction tasks.
        It expects the input corpus to be a list of texts. If the input is a single string it is
        handed to the preprocessor's raw-string path. An empty string skips fitting altogether.

        @ params:
            @ corpus: Input corpus in ['str','str','str', ... ] format, where each entry is a
                      document of type str. Alternatively, a str format input (not recommended).
            @ preprocessor_type: "nltk" or "spacy"; any other value leaves the preprocessor unset
            @ language: language to use in the preprocessor
            @ tags: accepted for compatibility; not used by this class
            @ custom_filter: accepted for compatibility; not used by this class
            @ lemmatize: use lemmatization in the preprocessing (raw-string input only)
            @ stem: use stemming in the preprocessing (raw-string input only)
            @ min_len: minimum word length kept by the preprocessor
            @ num_topics: number of topics in the LDA algorithm
            @ passes: number of training epochs in the LDA
        """
        print("Initializing model...\n")
        self.preprocessor = None
        if preprocessor_type == "nltk":
            print("NLTK preprocessor selected.")
            self.preprocessor = nltk_preprocessor(language=language)
        elif preprocessor_type == "spacy":
            print("spaCy preprocessor selected.")
            self.preprocessor = spacy_preprocessor(language=language)

        self.language = language  # input language
        self.raw_corpus = ""  # simply stores the input if in str type
        self.clean_corpus = []  # [doc, doc, ..., doc] = [[sent, sent, ...], ... ,[sent, sent, ...]]
        self.dictionary = None  # holds a corpora.Dictionary representation of corpus
        self.doc2bow_corpus = None  # doc2bow vector representation of each document
        self.lda_model = None  # LDA model trained on the input corpus
        self.topic_mixtures = []  # (topic id, topic string) pairs from show_topics
        self.topics = {}  # topic id -> [(word, probability)], filled by extract_topics
        self.topic_words = {}  # topic id -> [word], filled by extract_topic_words

        # The empty-corpus check must come first: '' is itself a str, so
        # testing isinstance(corpus, str) first made this branch unreachable
        # and the default constructor tried to fit an empty string.
        if isinstance(corpus, str) and corpus == '':
            print("***WARNING***\nNull Corpus")
        elif isinstance(corpus, str):
            print(
                "***WARNING***\nRaw input (str) received. Text will be sentence-tokenized and parsed accordingly."
            )
            print("Make sure this is intended. \n")
            self.raw_corpus = str(corpus)  # transform input to string
            self.fit(corpus,
                     raw=True,
                     language=language,
                     stem=stem,
                     lemmatize=lemmatize,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)  # fit corpus as raw
        else:
            # assume input corpus is in the right format
            self.fit(corpus,
                     language=language,
                     stem=stem,
                     lemmatize=lemmatize,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)

    def fit(self,
            corpus,
            raw=False,
            language='english',
            stem=False,
            lemmatize=False,
            num_topics=10,
            passes=100,
            min_len=2,
            echo_corpus=False):
        """
        Assumes input corpus is in the right format.
        @args:
            @ corpus = input corpus
            @ raw = treat ``corpus`` as one raw string instead of a list of texts
            @ language = input language
            @ stem/lemmatize = if true, stem or lemmatize input corpus (raw input only)
            @ num_topics = number of topics to choose in the algorithm
            @ passes = number of epochs of the LDA
            @ min_len = minimum length of words to consider when preprocessing words
            @ echo_corpus = print the corpus before fitting
        """
        if echo_corpus:
            print("CORPUS: {}".format(corpus))

        t0 = time.time()
        print("Fitting LDA topic modelling...")
        self.raw_corpus = corpus  # input corpus as is
        self.language = language  # in case initial language changed

        if raw:
            print("Preprocessing corpus...(raw)")
            self.clean_corpus = self.preprocessor.preprocess_str_corpus(
                corpus, stem=stem, lemmatize=lemmatize, min_len=min_len)
        else:
            print("Preprocessing corpus...")
            # honour the caller's min_len (it used to be hard-coded to 2)
            self.clean_corpus = self.preprocessor.preprocess_texts(
                self.raw_corpus, min_len=min_len)  # preprocess text list

        print("Creating corpora dictionary...")
        self.dictionary = corpora.Dictionary(self.clean_corpus)

        print("Translating doc2bow corpus...")
        self.doc2bow_corpus = [
            self.dictionary.doc2bow(text) for text in self.clean_corpus
        ]

        print("Running LDA...")
        self.lda_model = LdaModel(self.doc2bow_corpus,
                                  num_topics=num_topics,
                                  id2word=self.dictionary,
                                  passes=passes)
        # string representation of all topic mixtures
        self.topic_mixtures = self.lda_model.show_topics(num_topics=-1,
                                                         num_words=10)
        t1 = time.time()
        print("\nDone in {:.3f} seconds.".format(t1 - t0))

    def print_topics(self, words_per_topic=5):
        """
        Displays the topics in string format
        """
        for topic in self.lda_model.print_topics(num_words=words_per_topic):
            print(topic)

    def _topic_terms(self, max_words_per_topic, threshold):
        """Return {topic id: [(word, probability)]} keeping probabilities >= threshold."""
        return {
            topic_id: [
                pair for pair in self.lda_model.show_topic(
                    topic_id, topn=max_words_per_topic)
                if pair[1] >= threshold
            ]
            for topic_id in (mixture[0] for mixture in self.topic_mixtures)
        }

    def extract_topics(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary whose key is the topic number and whose value is a list of
        (word, probability) tuples: at most max_words_per_topic words, each with probability at least
        as high as threshold. The result is also stored in ``self.topics``.

        @params:
            @ max_words_per_topic: Maximum topic mixture component words to consider.
            @ threshold: select words whose density is at least this value
        """
        topics = self._topic_terms(max_words_per_topic, threshold)
        self.topics = topics  # update attribute
        return topics

    def extract_topic_words(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary whose key is the topic number and whose value is the list
        of words (at most max_words_per_topic) with probability at least as high as threshold.
        The result is also stored in ``self.topic_words``.
        """
        topics = {
            topic_id: [word for word, _ in pairs]
            for topic_id, pairs in self._topic_terms(max_words_per_topic,
                                                     threshold).items()
        }
        self.topic_words = topics  # update attribute
        return topics

    def parse_new(self,
                  new_text,
                  top_n_topics=100,
                  top_n_w=30,
                  max_words_per_topic=50,
                  threshold=0.005,
                  verbose=True):
        """
        Parses a new text by obtaining the most likely topics for the new input,
        as well as the respective words. This function should be used only after
        the LDA parser has been fitted.

        @params:
            @ new_text: new input text
            @ top_n_topics: top n topics with largest densities p(topic)
            @ top_n_w: top n words with largest densities p(word) = p(word|topic)*p(topic)
            @ verbose: display information
            @ max_words_per_topic: maximum words per topic
            @ threshold: only consider words with density greater than threshold
        @returns (in this order):
            @ n_most_likely_words: the top_n_w words ranked by p(word|topic)*p(topic)
            @ top_n_topics: the most likely (topic, probability) pairs for the document
            @ top_n_words: distinct words associated with those topics
            @ doc_topics: all topics reported for the document
            @ doc_topic_words: all words from all topics associated with the document
        """
        # extract topics to ensure self.topic_words is populated
        self.extract_topic_words(max_words_per_topic, threshold)
        new_text_clean = self.preprocessor.preprocess_sentence(new_text)
        new_doc_bow = self.dictionary.doc2bow(new_text_clean)
        doc_topics = self.lda_model.get_document_topics(new_doc_bow)

        # all words from every topic reported for the document
        doc_topic_words = [
            word for topic_id, _ in doc_topics
            for word in self.topic_words[topic_id]
        ]

        best_topics = nlargest(top_n_topics,
                               list(doc_topics),
                               key=lambda x: x[1])
        top_n_words = list({
            word
            for topic_id, _ in best_topics
            for word in self.topic_words[topic_id]
        })

        # Weight each topic's 10 strongest words by the topic probability:
        # p(w) = p(w|topic) * p(topic)
        words_with_probs = [
            (word, word_prob * topic_prob)
            for topic_id, topic_prob in doc_topics
            for word, word_prob in self.lda_model.show_topic(topic_id, topn=10)
        ]
        n_most_likely_words = [
            pair[0]
            for pair in nlargest(top_n_w, words_with_probs, key=lambda x: x[1])
        ]

        if verbose:
            print("\nLOGS: \n")
            print("*** Most likely topic: ***\n", best_topics)
            print("*** Words for most likely topic: ***\n", top_n_words)
            print("*** All topics: ***\n", doc_topics)
            print("*** All topics words: ***\n", doc_topic_words)

        return n_most_likely_words, best_topics, top_n_words, doc_topics, doc_topic_words

    def pickle_save(self, savename="full_LDA_parser.pkl"):
        """ Saves the full model object in pkl format """
        with open(savename, 'wb') as handle:
            pickle.dump(self, handle)

    def save_model(self, name="LDA_model"):
        """
        Saves the LDA model, doc2bow_corpus and dictionary.
        These parameters can be used to instantiate a gensim
        model, so there is no load in this class.
        """
        dictionary_name = name + "_dictionary.gensim"
        corpus_name = name + "_doc2bow_corpus.pkl"
        model_name = name + ".gensim"

        with open(corpus_name, 'wb') as handle:
            pickle.dump(self.doc2bow_corpus, handle)  # save the doc2bow_corpus
        self.dictionary.save(dictionary_name)  # save corpus dictionary mapping
        self.lda_model.save(model_name)  # save the full model
# NOTE(review): this script uses Python 2 print statements and will not parse
# under Python 3.
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

num_topics = 4
# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print common_dictionary.items()
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=num_topics)
# NOTE(review): a machine-specific absolute path is passed to datapath() —
# confirm the resulting save location is intended.
temp_file = datapath(
    "/Users/wanghaoxian/Documents/GitHub/recommend/dataContest/model")
lda.save(temp_file)
# NOTE(review): the name `list` shadows the builtin for the rest of the module.
list = lda.get_document_topics(common_corpus)
# Print the topic output of every document in the corpus.
for topic in list:
    print topic
# Print, for each topic id, the result of get_topic_terms(i, 3).
for i in range(0, num_topics, 1):
    print i, lda.get_topic_terms(i, 3)
class CustomLda(object):
    """Convenience wrapper around gensim LDA training, coherence and LDAvis."""

    def __init__(self, data=None, dictionary=None):
        """Initialise the wrapper.

        ``data`` (tokenised documents) should be provided; it may be omitted
        only when the instance is about to be restored by unpickling.  When
        ``data`` is given and ``dictionary`` is None a dictionary is built
        from the data.  Without ``data`` both dictionary and corpus are None.
        """
        self.data = data
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None

        # Training parameters, filled in by train().
        self.distributed = None
        self.chuncksize = None  # (sic) attribute name kept for compatibility
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.workers = None
        self.model = None

        # Coherence cache, filled in by get_coherence().
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self,
              num_topics,
              iterations=1500,
              random_state=1,
              distributed=False,
              chunksize=2000,
              passes=1,
              update_every=1,
              alpha='symmetric',
              eta=None,
              decay=0.5,
              offset=1.0,
              eval_every=10,
              gamma_threshold=0.001,
              minimum_probability=0.01,
              ns_conf=None,
              minimum_phi_value=0.01,
              per_word_topics=False,
              workers=1):
        """Train the LDA model and store it in ``self.model``.

        If ``workers`` > 1 the multicore implementation is used with that
        many workers; ``distributed``, ``update_every`` and ``ns_conf`` are
        only passed to the single-core model.
        """
        self.distributed = distributed
        self.chuncksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers

        # A new model invalidates any coherence computed for the old one.
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

        if self.workers > 1:
            self.model = LdaMulticore(
                # use the requested worker count (it used to be fixed at 3)
                workers=self.workers,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.random_state,
                chunksize=self.chuncksize,
                passes=self.passes,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.dictionary,
                                  iterations=self.iterations,
                                  num_topics=self.num_topics,
                                  random_state=self.random_state,
                                  distributed=self.distributed,
                                  chunksize=self.chuncksize,
                                  passes=self.passes,
                                  update_every=self.update_every,
                                  alpha=self.alpha,
                                  eta=self.eta,
                                  decay=self.decay,
                                  offset=self.offset,
                                  eval_every=self.eval_every,
                                  gamma_threshold=self.gamma_threshold,
                                  minimum_probability=self.minimum_probability,
                                  ns_conf=self.ns_conf,
                                  minimum_phi_value=self.minimum_phi_value,
                                  per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """Build the coherence model; type could be 'u_mass' or 'c_v'."""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        """Compute the coherence and remember which type it was computed for."""
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()
        # Without recording the type the cache check in get_coherence could
        # never succeed and the coherence was recomputed on every call.
        self.coherence_type = coherence_type

    def get_coherence(self, coherence_type='u_mass'):
        """Return the coherence, recomputing only when the type changes."""
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        """Return the model's ``get_topic_terms`` result for topic ``num``."""
        return self.model.get_topic_terms(num, topn=topn)

    def get_preplexity(self):
        """Return ``log_perplexity`` of the model on the training corpus."""
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        """Return the model's ``show_topics(num)`` result."""
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        return pyLDAvis.display(self._make_visualization())

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        pyLDAvis.save_html(self._make_visualization(), filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        with open(filename, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def unpickle(filename):
        """read class instance from file

        NOTE(review): pickle.load executes code from the file; only use it on
        files this application wrote itself.
        """
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """Predict topics for a list of documents (whitespace-separated strings).

        Returns one list per document with (topic, probability) pairs sorted
        by decreasing probability.
        """
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
# NOTE(review): this excerpt begins inside a list comprehension whose opening
# bracket lies above the visible source; the enclosing scope and real
# indentation cannot be determined from here.
lda_voc.doc2bow(token_list) for token_list in token_list_list
]
# One normalized-mutual-information score is collected per repetition.
nmi_vec = []
for _ in tqdm(range(n_test)):
    # LDA: a fresh model per repetition, optionally with the given alpha prior
    if prior_distrib:
        lda = LdaModel(lda_corpus, num_topics=n_group, alpha=topic_distrib)
    else:
        lda = LdaModel(lda_corpus, num_topics=n_group)
    # Id doc: build one predicted topic per token, document by document
    algo_group_vec = []
    for id_doc in range(len(token_list_list)):
        # Element 1 of the per_word_topics result — presumably the list of
        # (word id, [topic ids]) pairs; confirm against the gensim docs.
        topic_per_type = lda.get_document_topics(lda_corpus[id_doc],
                                                 per_word_topics=True)[1]
        type_list = []
        topic_list = []
        for type_topic_elem in topic_per_type:
            type_list.append(lda_voc.get(type_topic_elem[0]))
            # first topic listed for this word type
            topic_list.append(type_topic_elem[1][0])
        # Each token receives the topic recorded for its word type.
        algo_group_vec.extend([
            topic_list[type_list.index(token)]
            for token in token_list_list[id_doc]
        ])
    nmi_vec.append(
        normalized_mutual_info_score(real_group_vec, algo_group_vec))
# Writing results
# Load the documents and build the vocabulary.
documents_list = import_docs()
dictionary = corpora.Dictionary(documents_list)
# Converting list of documents (corpus) into Document Term Matrix using the dictionary
doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents_list]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)
nbTopics = 33
nbPasses = 5
# LDA model by Gensim with prior parameter (eta)
# TO BE DONE ####################
ldamodel = LdaModel(doc_term_matrix,
                    num_topics=nbTopics,
                    id2word=dictionary,
                    passes=nbPasses)
# to have the topic of a document : ldamodel[doc]
topics = ldamodel.get_document_topics(doc_term_matrix, per_word_topics=True)
# Materialise the lazy result as (doc_topics, word_topics, word_phis) triples.
all_topics = [(doc_topics, word_topics, word_phis)
              for doc_topics, word_topics, word_phis in topics]
# Triple of the second document (requires at least two documents).
doc_topics, word_topics, word_phis = all_topics[1]
# One line per document: repeated "doc_index;topic_id;probability;" fields.
# The with-block guarantees the file is closed even if a write fails.
with open("resultsComs.csv", "w") as f_res:
    count = 0
    for d in all_topics:
        for topic_id, topic_prob in d[0]:
            f_res.write("%d;%d;%f;" % (count, topic_id, topic_prob))
        f_res.write("\n")
        count += 1
# NOTE(review): this excerpt starts mid-definition (the first statements look
# like the tail of the CSV-writing helper called at the bottom); the real
# nesting of the statements cannot be determined from the visible source.
csvData.append(row)
# Write all collected rows to `filename` as UTF-8 CSV.
with open(filename, 'w', encoding='utf-8') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(csvData)
print("Write to database successfully")
print("corpus length:", len(pubs_corpus))
print("dict length:", len(pubs_dictionary.keys()))
# Maps each publication id to a short list of topic words.
topic_word_list_result = dict()
topic_dict = topic_to_lemmatized_word_list(lda)
for id in pubs_eids:
    cur_corpus = id_to_corpus.get(id, None)
    if cur_corpus != None:
        candidate_topics = lda.get_document_topics(
            cur_corpus)  # list all topic indices
        best_topic_index = select_highest_prob_topic(
            candidate_topics)  # select the index with the highest probability
        # -1 is treated as "no topic found" for this document
        if best_topic_index == -1:
            print("no topic document:", id)
            topic_word_list_result[id] = ["unknown"]
        else:
            topic_word_list_result[id] = topic_dict[
                best_topic_index][:
                                  3]  # get corresponding topic word list, and store top 3 words
    else:
        # ids without a corpus entry also get the placeholder
        topic_word_list_result[id] = ["unknown"]
write_to_table_file('pubs_metadata_by_scopus.csv')
####################### other tried method #######################
# FIXME: coherence was not useful here because it always dropped
class GensimLDA:
    """Small convenience layer over a gensim ``LdaModel``."""

    def __init__(self, texts):
        """Build the dictionary and bag-of-words corpus from tokenised texts."""
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        '''Trains the LDA model with k_topics topics on the stored corpus.'''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              num_topics=k_topics,
                              iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities of a topic in a given document.

        X is a list of tokenised texts; when omitted the training corpus is
        used.  Topics the model does not report for a document stay at 0.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]
        V = np.zeros((len(X), self.k_topics))
        # Extract assignments (equiv: self.model[X])
        for row, doc_topics in enumerate(self.model.get_document_topics(X)):
            for topic_id, prob in doc_topics:
                V[row, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns an k_topics x m_words array of probabilities of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in a topic'''
        for row in self.get_topic_term_matrix():
            # Highest weights first; the previous slice ``[:-top_n:-1]``
            # yielded only top_n - 1 words.
            for k in np.argsort(row)[::-1][:top_n]:
                # Index the dictionary itself: ``id2token`` is filled lazily
                # and may still be empty here.
                print(k, self.dictionary[k], row[k])
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic in descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            print(self.dictionary[tok_id], prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (for each topic) containing a list of the top num_words'''
        shown = self.model.show_topics(num_topics=self.k_topics,
                                       num_words=num_words,
                                       formatted=False)
        return [[word for word, _ in terms] for _, terms in shown]
def train_model(documents, onehot_enc, labels, num_topics=400):
    """Train an LDA model and a logistic-regression classifier on its topics.

    The LDA model is saved to ``trained_ldamodel.model`` and the classifier
    is pickled to ``trained_logreg_model.pkl`` in the working directory.

    :param documents: tokenised documents (list of token lists).
    :param onehot_enc: fitted encoder; only its ``transform`` is called.
    :param labels: one label per document, used to fit the classifier.
    :param num_topics: number of topics to extract from the documents.
    :return: ``(model, topic_vecs)`` - the fitted classifier and the dense
        per-document topic vectors it was trained on.
    """
    # Start
    print('number of documents: ', len(documents))
    id2word = corpora.Dictionary(documents)
    corpus = [id2word.doc2bow(doc) for doc in documents]
    # NOTE(review): the transformed labels were never used; the call is kept
    # only in case callers rely on it raising for labels the encoder rejects.
    onehot_enc.transform(labels)

    print("starting LDA model")
    # plug into LDA model.
    # this can take a while with larger number of documents
    lda = LdaModel(num_topics=num_topics,
                   id2word=id2word,
                   corpus=corpus,
                   passes=50,
                   eval_every=1)

    print("topics:")
    for topic in lda.show_topics(num_topics=num_topics,
                                 num_words=20):  # print_topics():
        print(topic)
    lda.save("trained_ldamodel.model")

    print("")
    print(
        "starting setup to train a classifier based on LDA topics for each document"
    )

    # Build one dense vector per document, indexed by topic id. gensim clamps
    # minimum_probability to a small positive value, so a topic can be
    # missing from the returned list; positional indexing would then raise
    # IndexError or shift the remaining features.
    topic_vecs = []
    for bow in corpus:
        topic_vec = [0.0] * num_topics
        for topic_id, prob in lda.get_document_topics(bow,
                                                      minimum_probability=0):
            topic_vec[topic_id] = prob
        topic_vecs.append(topic_vec)

    # train basic logistic regression
    model = LogisticRegression(class_weight='balanced').fit(topic_vecs, labels)

    with open('trained_logreg_model.pkl', 'wb') as f:
        pickle.dump(model, f)

    return model, topic_vecs
print (i, topic) # visualization of topics #vis_data = gensimvis.prepare(speeches_topics, lda_corpus, dct) #pyLDAvis.display(vis_data) # extract all document-topic distritbutions to dictionnary document_key = list(speeches.index) document_topic = {} for doc_id in range(len(lda_corpus)): docbok = lda_corpus[doc_id] doc_topics = speeches_topics.get_document_topics(docbok, 0) tmp = [] for topic_id, topic_prob in doc_topics: tmp.append(topic_prob) document_topic[document_key[doc_id]] = tmp column_names = ['topic_'+str(i) for i in range(1, num_topics + 1)] # Topic Distribution over time topic_df = pd.DataFrame(document_topic) topic_df_T = topic_df.transpose() topic_df_T["Date"] = transcript_date sorted_df = topic_df_T.sort_values(by = "Date") sorted_df['datetime'] = pd.to_datetime(sorted_df['Date'])
class LDAWDF:
    """LDA topic model over the text contents served by the MySQL wrapper.

    Trains (or loads) a gensim ``LdaModel`` together with its dictionary and
    writes per-URL topics and per-topic terms back through the wrapper.
    """
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        """Store the database wrapper and the on-disk locations of the model.

        :param mysql: database wrapper; used as a context manager that
            yields the object the queries are issued on.
        """
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        """Train a new 200-topic model on every content text in the database."""
        with self.mysql as db:
            content = db.getContentsText()
            documents = [item['content'].split() for item in content]

        self.dictionary = corpora.Dictionary(documents)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]
        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        """Print ten topics with five words each."""
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        """Save the model and the dictionary into ``dataFolder``."""
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        """Return True when both the model file and the dictionary file exist."""
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        """Update the model with an additional bag-of-words corpus."""
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        """Load the model and the dictionary from ``dataFolder``.

        :param subfolder: optional sub-directory of ``dataFolder`` to load
            from.
        """
        sf = subfolder + '/' if subfolder else ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(
            self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        """Recompute topics for all stored contents and write them to the DB.

        Stores (url, topic id, probability) rows for every topic a content
        reaches with probability >= 0.05, then the top five terms of every
        topic.
        """
        topics = {}
        result = []
        result2 = []

        nbTopics = self.ldamodel.get_topics().shape[0]

        # "Old": label each topic with its three strongest terms.
        for topicId in range(0, nbTopics):
            topicTerms = sorted(self.ldamodel.get_topic_terms(topicId, 3),
                                key=lambda x: x[1], reverse=True)
            words = [self.dictionary.get(termId) for termId, _ in topicTerms]
            topics[topicId] = ' '.join(words)

        # Use the wrapper instance; the bare name `mysql` is the imported
        # module and cannot be used as a context manager here.
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(
                    bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    # NOTE(review): `result` is collected but never written
                    # to the database in this method - confirm whether a
                    # setter call is missing.
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append(
                            (element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()

        # "New": store the five strongest terms of every topic.
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = sorted(self.ldamodel.get_topic_terms(topicId, 5),
                                key=lambda x: x[1], reverse=True)
            for termId, weight in topicTerms:
                terms.append(
                    (topicId, self.dictionary.get(termId), str(weight)))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)

    def get_terms_topics(self, keywords):
        """Look up the topics related to the first 30 of ``keywords``.

        :param keywords: list of tokens.
        :return: dict with ``'topics'`` (topic id -> ``show_topic`` output)
            and ``'keywords'`` (word id -> word and its topics with
            probability >= 0.05).
        """
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for wordId, _ in bow:
            wordTopics = self.ldamodel.get_term_topics(wordId, 0.05)
            keywordsResult[wordId] = {
                'word': self.dictionary.get(wordId),
                'topics': wordTopics,
            }
            for wordTopicId, _ in wordTopics:
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
for i in range(0, len(sentences), n_sent) ] dictionary = Dictionary(docs) dictionary.filter_extremes(no_below=2, no_above=0.3) corpus = [dictionary.doc2bow(doc) for doc in docs] lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10) # 主題の確認 for topic in lda.show_topics(num_topics=-1, num_words=10): print('topic id:{0[0]:d}, words={0[1]:s}'.format(topic)) # 記事の主題分布の推定 for doc_id in datastore.get_all_ids(limit=-1): meta_info = json.loads( datastore.get(doc_id, ['meta_info'])['meta_info']) title = meta_info['title'] print(title) doc = [ token['lemma'] for token in datastore.get_annotation(doc_id, 'token') if token.get('NE') == 'O' ] for topic in sorted(lda.get_document_topics(dictionary.doc2bow(doc)), key=lambda x: x[1], reverse=True): print('\ttopic id:{0[0]:d}, prob={0[1]:f}'.format(topic)) datastore.close()
# Load the records and clean column 'B', the only column used below.
df = pd.read_csv("./total_info.txt",
                 sep=',',
                 header=0,
                 names=['A', 'B', 'C', 'D', 'E', 'F'])
data = df['B']
data = data.apply(lambda s: clean_text(s))
datalist = data.values
print(datalist)

# Tokenise: lower-case each document and split on whitespace.
texts = [[word for word in doc.lower().split()] for doc in datalist]
print(texts[0])

# Build the dictionary and bag-of-words corpus, then train a 20-topic model.
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=20)
print(lda.print_topic(10, topn=5))

# Round-trip the model through disk.
lda.save('lda.model')
lda = LdaModel.load('lda.model')

# Infer the topic distribution of one unseen sentence.
tryTxt = "while i be suffer i be able to press and go in subscribe but when i press the video it keep show no connection ."
trylist = [word for word in tryTxt.lower().split()]
bow = common_dictionary.doc2bow(trylist)
print(lda.get_document_topics(bow))

import pyLDAvis.gensim
# Open http://127.0.0.1:8888/ in a browser to view the visualisation.
vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
pyLDAvis.show(vis)
# Train a small LDA model on the tokenised documents in `words_list`.
num_topics = 3
dictionary = corpora.Dictionary(words_list)
corpus = [dictionary.doc2bow(words) for words in words_list]
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

###output1: topics and corresponding words
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=10))

###output2: 2 ways of showing one topic and corresponding words
# NOTE(review): the results of the next two calls and of `sorted(...)` below
# are discarded; they are presumably meant to be shown as notebook cell output.
lda.print_topic(topicno=0)
lda.show_topic(1)

### output3: show topic of one user (even new user)
# Topics of the document at index 100, most probable first.
sorted(lda.get_document_topics(corpus[100],
                               minimum_probability=0,
                               per_word_topics=False),
       key=lambda x: x[1],
       reverse=True)

### output4: visualize LDA
lda_display = pyLDAvis.gensim.prepare(lda,
                                      corpus,
                                      dictionary,
                                      R=15,
                                      sort_topics=False)
pyLDAvis.display(lda_display)

##### Text Similarities
# Take the third entry of `tweets` (in key order) and strip the "|||" markers.
doc = tweets[list(tweets.keys())[2]].replace("|||", "")
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=3)
def get_document_topic_weights(lda_model: LdaModel, bow) -> list:
    """Return the topic weights of one document as a dense list.

    :param lda_model: trained LDA model.
    :param bow: bag-of-words vector of a single document.
    :return: list of length ``lda_model.num_topics`` where position ``k``
        holds the weight of topic ``k`` (0.0 when the model omits it).
    """
    # gensim clamps minimum_probability to a small positive value, so a topic
    # may be absent from the result; index by topic id instead of relying on
    # the position in the returned list.
    weights = [0.0] * lda_model.num_topics
    for topic_id, weight in lda_model.get_document_topics(
            bow, minimum_probability=0):
        weights[topic_id] = weight
    return weights
# NOTE(review): `topic_id` and `temp_keyword_weight_map` are not defined in
# this chunk; this assignment is presumably the tail of a loop whose header
# lies outside the view.
topics_keywords_weight_map_equalnum[topic_id] = temp_keyword_weight_map

# Reduce every topic's keyword -> weight map to just its list of keywords.
topics_words = {}
for topic_id, topic_keywords in topics_keywords_weight_map_equalnum.items():
    topic_words = []
    for keyword, weight in topic_keywords.items():
        topic_words.append(keyword)
    topics_words[topic_id] = topic_words
timer(global_time,local_time)

################################################################################################################
# Calculate the LDA topic scores
# Match each company to only one cluster using the highest topic score
################################################################################################################
print('Calculating LDA topic scores, and matching each company to one cluster using the highest topic score...')
local_time = time.time()
content_scores = lda.get_document_topics(word_corpus)
contentidx_topicidx_map = {}
contentidx_topicsocre_map = {}
for contentidx, scores in enumerate(content_scores):
    topicid_topicscore_dict = {}
    for topicid, topicscore in scores:
        topicid_topicscore_dict[topicid] = topicscore
    # Re-order by score, highest first, so the first key/value is the best topic.
    topicid_topicscore_dict_sorted = {k: v for k, v in sorted(topicid_topicscore_dict.items(), key=lambda item: item[1], reverse=True)}
    # print('topicid_topicscore_dict_sorted:', topicid_topicscore_dict_sorted, list(topicid_topicscore_dict_sorted.keys())[0], list(topicid_topicscore_dict_sorted.values())[0])
    # NOTE(review): indexing [0] raises IndexError when a document gets no
    # topic at all - confirm that cannot happen for this corpus.
    contentidx_topicidx_map[contentidx] = list(topicid_topicscore_dict_sorted.keys())[0]
    score = list(topicid_topicscore_dict_sorted.values())[0]
    contentidx_topicsocre_map[contentidx] = score
timer(global_time,local_time)

################################################################################################################
# Store all the results of every corpus as
# ldamodel.save('topic_articles.model') #print(ldamodel.print_topics(num_topics=2, num_words=4)) ii = ldamodel.print_topics(num_topics=50, num_words=30) df = pd.DataFrame(ii, columns=['id_topics', 'words']).set_index('id_topics') df1 = df.to_csv('50_topics_on_articles.csv') df2 = df.to_excel('50_topics_on_articles.xlsx') #MAIN PLOT viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary) pyLDAvis.save_html(viz, '50t_articles.html') #Load a model #ldamodel.load('topic_articles.model') yo = ldamodel.get_document_topics( doc_term_matrix) #get topics on all 10339 articles li = [] for i in range(len(comments)): new = pd.DataFrame(yo[i], columns=['id', 'prob' ]).sort_values('prob', ascending=False).drop(['prob'], axis=1) p = new.head(1).values.T.flatten().tolist() k = li.append(p) df_topic_id = pd.DataFrame(li, columns=['topics_id']) df_topic_id.index.name = 'article_text_id' #df_topic_id.head(5) #topic_ids for all authors' articles #len(df_topic_id.index)
#print("getting topics for testing document")
#topic_prediction = lda.get_document_topics(bow=corpus[0])
#print(testing_text_raw)
#print(topic_prediction)

print("")
print(
    "starting setup to train a classifier based on LDA topics for each document"
)

topic_vecs = []

# Build one dense vector per document, indexed by topic id. gensim clamps
# minimum_probability to a small positive value, so a topic can be missing
# from the returned list; positional indexing (and a hard-coded topic count)
# would then raise IndexError or shift the remaining features.
for i in range(len(documents)):
    top_topics = lda.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [0.0] * lda.num_topics
    for topic_id, prob in top_topics:
        topic_vec[topic_id] = prob
    topic_vecs.append(topic_vec)

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import f1_score

# train basic logistic regression (the name `sgd_model` is historical)
sgd_model = LogisticRegression(class_weight='balanced').fit(topic_vecs, labels)

pred_labels = sgd_model.predict(topic_vecs)

# Score on the training data, just to look at whether this even seems
# feasible. The printed value is the weighted F1 score, although the message
# calls it accuracy.
# 0.3 f1 score on the training, using 12123 documents. not great results for now.
print("accuracy on training data: ",
      f1_score(labels, pred_labels, average='weighted'))
# # count = 0 # for i in range(len(abs_topics_prob)): # if get_doc_topic_id(i, abs_topics_prob) == get_doc_topic_id(i, title_topics_prob): # count += 1 # count = 27514 # count / len(abs_topics_prob) = 0.5229406621811685 def get_doc_topic_id(doc_id, docs_topics_prob): result = np.where( docs_topics_prob[doc_id] == docs_topics_prob[doc_id].max()) return result[0][0] doc_topics = lda.get_document_topics(corpus, 0) probs = [[entry[1] for entry in doc] for doc in doc_topics] docs_topics_prob = np.array(probs) topic_doc_year_num = np.zeros((50, 10)) for i in range(len(docs_topics_prob)): y = int(nat_data.year[i] - 1971) t = get_doc_topic_id(i, docs_topics_prob) topic_doc_year_num[y][t] += 1 colors = [ 'rosybrown', 'lightcoral', 'indianred', 'brown', 'peru', 'darkorange', 'gold', 'yellow', 'green', 'darkgoldenrod' ] fig = plt.figure(figsize=(30, 15))
random_state=1) doc_topics = [lda[c] for c in corpus] avg_doc_topics = mean([len(t) for t in doc_topics]) print(f"topics num of doc = {avg_doc_topics}") topic_freq = frequencies([t[0] for dt in doc_topics for t in dt]) print('----------') for i in range(topic_num): items = [(dic[t[0]], t[1]) for t in lda.get_topic_terms(i, topn=5)] freq = topic_freq[i] if i in topic_freq else 0 print(f"topic_id = {i}, freq = {freq}, items = {items}") print('----------') for i in range(len(corpus)): dts = lda.get_document_topics(corpus[i], per_word_topics=True) for dt in dts[2]: item = dic[dt[0]] print(f"corpus = {i}, item = {item}, topic_id = {dt[1]}") vis = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, sort_topics=False) pyLDAvis.save_html(vis, dest_file)
def get_most_common(title_list,
                    dic,
                    num=COMMON_TOPIC_WORDS_NUM,
                    random_state=None):
    """Return the ``num`` most important words of the most frequent topic.

    An LDA model with ``TOPIC_NUM`` topics is trained on the titles, every
    title is assigned its most probable topic, and the top words of the
    topic assigned most often are returned. When ``LOG_LEVEL`` is
    ``'DEBUG'`` the random state is fixed to 123 and two CSV files with the
    per-title topic distribution and the per-topic word distribution are
    written to the ``test`` directory.

    :param title_list: sequence of tokenised titles (lists of tokens).
    :param dic: gensim ``Dictionary`` covering the tokens of the titles.
    :param num: maximum number of words to return; the words are taken from
        ``get_topic_terms`` with its default ``topn``.
    :param random_state: forwarded to ``LdaModel`` (overridden in DEBUG mode).
    :return: list of words; empty when ``title_list`` is empty.
    """
    # Nothing to classify; previously this ended in an IndexError below.
    if len(title_list) == 0:
        return []

    bow = [dic.doc2bow(title) for title in title_list]
    # TODO: determine and set an appropriate number of topics
    if LOG_LEVEL == 'DEBUG':
        random_state = 123
    model = LdaModel(bow,
                     id2word=dic,
                     num_topics=TOPIC_NUM,
                     random_state=random_state)
    topic_count = model.get_topics().shape[0]

    # Classify each title. Probabilities are stored by topic id so a topic
    # omitted by gensim (minimum_probability is clamped to a small positive
    # value) simply stays at 0 instead of breaking the array shape.
    topic_dist = np.zeros((len(title_list), topic_count))
    topic_id_list = []
    for idx, (title, title_bow) in enumerate(zip(title_list, bow)):
        logger.debug('title')
        logger.debug(title)
        for topic_id, prob in model.get_document_topics(
                title_bow, minimum_probability=0.0):
            topic_dist[idx, topic_id] = prob
        # argmax returns the first maximum, i.e. the lowest topic id on ties.
        topic_id_list.append(int(np.argmax(topic_dist[idx])))

    if LOG_LEVEL == 'DEBUG':
        # Topic distribution per title
        df_topic_dist = pd.DataFrame({
            'title': title_list,
            'topic_id': topic_id_list
        })
        # Word distribution per topic
        cols = ['{}_{}'.format(word_no, elem)
                for word_no in range(10)
                for elem in range(2)]
        word_rows = []
        row_names = []
        for topic_id in range(topic_count):
            df_topic_dist['topic_{}'.format(topic_id)] = topic_dist[:, topic_id]
            topic_terms = model.get_topic_terms(topic_id,
                                                topn=int(len(cols) / 2))
            flat_terms = []
            for term_id, weight in topic_terms:
                # dic[...] fills the lazily built id2token map on demand.
                flat_terms.extend([dic[term_id], weight])
            word_rows.append(flat_terms)
            row_names.append('topic_{}'.format(topic_id))
        # Built in one go; DataFrame.append no longer exists in pandas 2.
        # dtype=object keeps every cell as the original scalar.
        df_word_dist = pd.DataFrame(word_rows, index=row_names, dtype=object)

        stamp = datetime.today().strftime('%Y%m%d')
        df_topic_dist.to_csv(
            os.path.join('test', 'classified_topic_{}.csv'.format(stamp)),
            index=False,
            encoding='cp932')
        df_word_dist.columns = cols
        df_word_dist.to_csv(
            os.path.join('test',
                         'word_distribution_per_topic_{}.csv'.format(stamp)),
            encoding='cp932')

    # Find the most frequent topic
    topic_id_counter = Counter(topic_id_list)
    most_common_topic_id = topic_id_counter.most_common(1)[0][0]
    topic_terms = model.get_topic_terms(most_common_topic_id)

    logger.debug('')
    logger.debug('topic_id_counter: ' + str(topic_id_counter))
    logger.debug('most_common_topic_id: ' + str(most_common_topic_id))
    logger.debug(topic_terms)

    # Take the `num` most important words of the most frequent topic
    important_word_list = [dic[term_id] for term_id, _ in topic_terms[:num]]
    logger.debug(important_word_list)

    return important_word_list