import os

import numpy as np
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet


class LdaMalletHandler:
    """Wraps gensim's LdaMallet: training, persistence, and topic queries."""

    def __init__(self, mallet_path):
        self.mallet_path = mallet_path

    def run_model(self, model_name, corpus, **kwargs):
        """Train an LdaMallet model on a corpus of tokenized documents."""
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path, corpus_bow,
                               id2word=self.dictionary,
                               prefix="./ldamodels/" + model_name + "/",
                               **kwargs)

    def save_model(self):
        self.model.save("ldamodels/" + self.model_name + "/model.model")
        self.dictionary.save("ldamodels/" + self.model_name + "/dict.dict")

    def load_model(self, model_name):
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load(
            "ldamodels/" + self.model_name + "/dict.dict")
        self.model = LdaMallet.load(
            "ldamodels/" + self.model_name + "/model.model")
        self.model.mallet_path = self.mallet_path

    def doc_topics(self, doc_idx):
        """Topic distribution of a training document (from the doc-topics file)."""
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever.doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        """Infer topics for an external (unseen) tokenized document."""
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        """Training documents most similar to an external document."""
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        topics = [topic[1] for topic in doc_topics]
        return self.doc_retriever.n_most_similar(topics, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        """Training documents closest to a single pure topic."""
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        return self.doc_retriever.n_most_similar(topics, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        """Return each topic as a space-separated string of its top words."""
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics,
                                             num_words=num_words):
            # print_topics yields strings like '0.042*"word" + 0.017*"other"';
            # after splitting on the quotes, the words sit at the odd indices
            splitted = topic[1].split("\"")
            result = [splitted[2 * i + 1] for i in range(len(splitted) // 2)]
            string_topics.append(" ".join(result))
        return string_topics
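# DocumentRetriever is a project-local helper that the class above depends on
# but that is not included in this snippet. Below is a minimal sketch of what
# it plausibly looks like, assuming MALLET's doc-topics file format
# "<doc_id> <source> <p_topic0> <p_topic1> ..."; the attribute and method
# names are guesses constrained only by the call sites above.
from scipy.spatial.distance import cdist


class DocumentRetriever:
    def __init__(self, fdoctopics):
        rows = []
        with open(fdoctopics) as fin:
            for line in fin:
                parts = line.split()
                rows.append([float(p) for p in parts[2:]])
        # one row per training document, one column per topic
        self._doc_topics = np.array(rows)

    def doc_topics(self, doc_idx):
        # topic distribution of one training document
        return self._doc_topics[doc_idx]

    def n_most_similar(self, topics, n=5, metric='cosine'):
        # indices of the n training documents closest to the given distribution
        dists = cdist(np.atleast_2d(topics), self._doc_topics, metric=metric)[0]
        return np.argsort(dists)[:n]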
# Method of a larger feature-extraction class (note the self.mallet_path,
# self.num_topics, and self.lda_max_iter attributes and the private
# __learn_vocab / __get_bag_of_words helpers); os, tempfile, numpy (as np),
# and gensim's LdaMallet are expected to be imported at module level.
def lda(self, column, method='mallet', save_model=None, load_model=None):
    if method == 'mallet':
        print("Mallet LDA")
    else:
        raise ValueError("Invalid parameter for LDA.method: {}".format(method))

    tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if not hasattr(self, "vocab"):
        self.__learn_vocab(column)

    if len(self.__bag_of_words) != 0:
        docs, id2word = self.__bag_of_words[column]
    else:
        docs, id2word = self.__get_bag_of_words(column)

    model = LdaMallet(mallet_path=self.mallet_path,
                      id2word=id2word,
                      prefix=tmp_dir,
                      num_topics=self.num_topics,
                      iterations=self.lda_max_iter,
                      optimize_interval=20)
    model.train(docs)

    # collect per-document topic weights into an (n_docs, num_topics) array
    doc_topics = list()
    for doc_vec in model.read_doctopics(model.fdoctopics()):
        topic_ids, vecs = zip(*doc_vec)
        doc_topics.append(np.array(vecs))
    self.features["lda"] = np.array(doc_topics)
    self.feature_names["lda"] = model.get_topics()
    return
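# Standalone illustration of the doc-topic extraction loop above (assumption:
# `model` is any trained LdaMallet instance). read_doctopics yields one
# [(topic_id, weight), ...] list per document; note that topics whose weight
# falls below its eps cutoff (1e-6 by default) are omitted, so rows can come
# back shorter than num_topics for strongly peaked documents.
def doctopics_matrix(model):
    rows = []
    for doc_vec in model.read_doctopics(model.fdoctopics()):
        topic_ids, weights = zip(*doc_vec)
        rows.append(np.array(weights))
    return np.array(rows)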
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from sklearn.feature_extraction.text import CountVectorizer

# load_from_csv, load_stop_words, remove_short_segs, TopicWord, TopicDoc and
# the CORPUS_PATH / MAX_DF / MIN_DF constants are defined elsewhere in the
# project.


def main():
    print("\n-----LDA CONCEPT DETECTION-----")
    corpus = load_from_csv(CORPUS_PATH)

    # Create a CountVectorizer to build the document-term matrix
    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    # Drop segments that are too short, then tokenize the rest
    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]

    # Remove stop words and tokens shorter than three characters
    proc_stop_words = []
    for i in range(len(proc_corpus_text_only)):
        proc_stop_words.append([])
        for j in range(len(proc_corpus_text_only[i])):
            token = proc_corpus_text_only[i][j]
            if token not in stop_words and len(token) >= 3:
                proc_stop_words[i].append(token)

    # Build the gensim dictionary and bag-of-words corpus
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]

    # Initialize and train the model
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=14,
                             id2word=id2word,
                             optimize_interval=1,
                             random_seed=9,
                             iterations=5)
    doc_topics = list(
        mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))

    # Export topic-word and topic-document tables to CSV
    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    topic_word.write_to_csv("../output/topic_" + str(mallet_model.random_seed) +
                            "_" + str(mallet_model.iterations) + "_" +
                            str(mallet_model.num_topics) + ".csv")

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + str(mallet_model.random_seed) +
                           "_" + str(mallet_model.iterations) + "_" +
                           str(mallet_model.num_topics) + ".csv",
                           num_docs=50)
    return 0
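# remove_short_segs is one of the helpers referenced above without a
# definition. The sketch below is a guess constrained only by the call site:
# it assumes each corpus row carries its text in the last field and that
# "short" means fewer than some minimum number of vectorizer tokens; the
# threshold is hypothetical.
def remove_short_segs(corpus, vectorizer, min_tokens=5):
    analyzer = vectorizer.build_analyzer()  # tokenizer honoring token_pattern
    kept_rows, kept_texts = [], []
    for row in corpus:
        text = row[-1] if isinstance(row, (list, tuple)) else row
        if len(analyzer(text)) >= min_tokens:
            kept_rows.append(row)
            kept_texts.append(text)
    return kept_rows, kept_texts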
import numpy as np
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models.wrappers import LdaMallet
from sklearn.base import BaseEstimator, TransformerMixin


class LdaMalletHandler(TransformerMixin, BaseEstimator):
    """scikit-learn style wrapper: fit/transform over a CountVectorizer matrix."""

    def __init__(self, n_components=100, mallet_path=None, prefix=None,
                 iterations=1000, vectorizer=None):
        self.n_components = n_components
        self.mallet_path = mallet_path
        self.prefix = prefix
        self.iterations = iterations
        self.vectorizer = vectorizer

    def vect2gensim(self, vectorizer, dtmatrix):
        # transform a sparse document-term matrix into a gensim corpus/dictionary
        corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
        dictionary = Dictionary.from_corpus(
            corpus_vect_gensim,
            id2word={id: word for word, id in vectorizer.vocabulary_.items()})
        return corpus_vect_gensim, dictionary

    def fit(self, X, y=None):
        print('vect2gensim')
        corpus, dictionary = self.vect2gensim(self.vectorizer, X)
        self.model = LdaMallet(self.mallet_path,
                               iterations=self.iterations,
                               corpus=corpus,
                               num_topics=self.n_components,
                               id2word=dictionary)
        return self

    def transform(self, X):
        # infer a dense (n_docs, n_components) topic matrix for new documents
        corpus = Sparse2Corpus(X, documents_columns=False)
        doc_topic = self.model[corpus]
        mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
        for did, doc in enumerate(doc_topic):
            for topic in doc:
                mat[did][topic[0]] = topic[1]
        return mat

    def get_doc_topic_matrix(self):
        # parse MALLET's doc-topics file: "<doc_id> <source> <p0> <p1> ..."
        arr = []
        with open(self.model.fdoctopics(), "r") as fin:
            for line in fin:
                arr.append(line.split()[2:])
        return np.array(arr, dtype=np.float64)

    def get_topic_words_matrix(self):
        # (num_topics, vocab_size) matrix of per-topic word probabilities
        return self.model.get_topics()
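# Usage sketch for the wrapper above. The two-document corpus is a toy
# placeholder and the mallet path is an assumption; the vectorizer handed to
# the constructor must be the same fitted one that produced X.
from sklearn.feature_extraction.text import CountVectorizer


def demo_lda_mallet_handler():
    texts = ["the cat sat on the mat", "dogs and cats make good friends"]
    vec = CountVectorizer()
    X = vec.fit_transform(texts)
    handler = LdaMalletHandler(n_components=2,
                               mallet_path="/path/to/mallet/bin/mallet",
                               vectorizer=vec)
    handler.fit(X)
    return handler.transform(X)  # shape: (n_docs, n_components)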
import os

from gensim import corpora
from gensim.models.wrappers import LdaMallet

# wenzhang_Lemmatizer1 (the lemmatized corpus), txt_to_numpy, and the two
# Excel writers are project-local helpers defined elsewhere.


def main():
    num_topics = 10
    # doc_topics_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"

    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    # dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it back
    # print(os.path.abspath('corpus.mm'))

    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    # mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')

    # top 20 words per topic, written to Excel
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls')

    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))

    # per-document topic distributions parsed from MALLET's doc-topics file
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  # doc_topics_path
    # print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3')
    return texts, word_id, topic_words, doc_topics, num_topics
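# A possible implementation of txt_to_numpy, which main() above calls but does
# not define. Assumption: MALLET 2.0.8's doc-topics layout, one line per
# document reading "<doc_id> <source> <p_topic0> <p_topic1> ...".
import numpy as np


def txt_to_numpy(doc_topics_path):
    rows = []
    with open(doc_topics_path, "r", encoding="utf-8") as fin:
        for line in fin:
            parts = line.strip().split()
            if not parts or parts[0].startswith("#"):
                continue  # skip the optional header line
            rows.append([float(p) for p in parts[2:]])
    return np.array(rows)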