def exec_lda():
    client = MongoClient()
    db = client.epistemonikos_files
    num_topics = range(2, 51)
    files = range(12)  # number of possible preprocessing combinations
    # ALL documents
    for f in files:
        for t in num_topics:
            data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f))
            corpus_cursor = CorpusCursor(data_cursor)
            lda = LdaMulticore(corpus=corpus_cursor,
                               id2word=corpus_cursor.dictionary,
                               num_topics=t)
            lda.save(
                'processing_data/lda/all_docs/preprocess_{0}_topics_{1}'.format(f, t))
    # For each document TYPE
    config_list = utils.create_config_list(
        'processing_data/lda/types/config_list.json')
    for i, config in enumerate(config_list):
        for f in files:
            for t in num_topics:
                data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f), **config)
                corpus_cursor = CorpusCursor(data_cursor, config=i)
                lda = LdaMulticore(corpus=corpus_cursor,
                                   id2word=corpus_cursor.dictionary,
                                   num_topics=t)
                lda.save(
                    'processing_data/lda/types/preprocess_{0}-topics_{1}-config_{2}'.format(f, t, i))
def main(self):
    from pprint import pprint
    print('Loading data')
    data = pd.read_csv('../../resources/abcnews-date-text.csv', error_bad_lines=False)
    data_text = data[['headline_text']]
    data_text['index'] = data_text.index
    documents = data_text
    np.random.seed(2018)

    print('Preprocessing text')
    preprocessed_docs = documents['headline_text'].map(self.preprocess)

    print('Building bag of words corpus')
    dictionary = Dictionary(preprocessed_docs)  # maps token_id <-> token
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]  # lists of (token_id, token_count)
    print(documents[documents['index'] == 4310].values[0][0])
    print(bow_corpus[4310])
    print(bow_corpus[:100])

    print('Building lda model from bag of words')
    lda_model_bow = LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                 workers=self.workers)
    for idx, topic in lda_model_bow.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    for index, score in sorted(lda_model_bow[bow_corpus[4310]], key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lda_model_bow.print_topic(index, 10)))

    print('Building tfidf corpus from bag of words corpus')
    tfidf = TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]
    for doc in tfidf_corpus:
        pprint(doc)
        break

    print('Building lda model from tfidf')
    lda_model_tfidf = LdaMulticore(tfidf_corpus, num_topics=10, id2word=dictionary,
                                   workers=self.workers)
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

    print('Testing on unseen document')
    unseen_document = 'Facebook’s global lobbying against data privacy laws'
    bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))
    print('Bow:')
    for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_bow.print_topic(index, 5)))
    print('TfIdf:')
    for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
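# The snippet above calls self.preprocess, which is not shown. A minimal sketch
# of a typical preprocess step for a headline corpus like this one
# (simple_preprocess tokenization, stopword removal, lemmatize-then-stem);
# the function name and exact filters are assumptions, not the original code,
# and the NLTK wordnet data must be downloaded beforehand:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # keep tokens longer than 3 chars that are not stopwords,
    # then lemmatize (as verbs) and stem each one
    return [stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in simple_preprocess(text)
            if token not in STOPWORDS and len(token) > 3]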
def load_topic_model(self):
    if not hasattr(self, "word2id"):
        self.load_globel_vocab()
    self.vectorizer = CountVectorizer(vocabulary=self.word2id,
                                      tokenizer=lambda x: x,
                                      preprocessor=lambda x: x)
    file_path = "./preproc_data/topic_model.pkl"
    if os.path.exists(file_path):
        self.topic_model = LdaModel.load(file_path)
    else:
        texts = []
        if not hasattr(self, "domain2data"):
            self.load_domain2data()
        for domain in self.domain2data:
            texts.extend(self.domain2data[domain]["labeled"])
            texts.extend(self.domain2data[domain]["unlabeled"])
        corpus = self.vectorizer.fit_transform(texts)
        corpus = Sparse2Corpus(corpus, documents_columns=False)
        self.topic_model = LdaMulticore(
            corpus=corpus,
            num_topics=self.num_topics,
            id2word=self.id2word,
            iterations=self.num_topic_iterations,
            passes=self.num_topic_passes)
        self.topic_model.save(file_path)
def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10):
    # document lengths (total token count per document)
    doc_lengths = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    kl = []
    for n in range(min_topics, max_topics + step, step):
        print("starting multicore LDA for num_topics={}".format(n))
        st = time.perf_counter()
        lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=n,
                           passes=20, workers=mp.cpu_count() - 1)
        el = time.perf_counter() - st
        print("multicore LDA finished in {:.2f}s!".format(el))
        # singular values of the topic-word matrix
        m1 = lda.expElogbeta
        _, cm1, _ = np.linalg.svd(m1)
        # document-topic distribution weighted by document lengths
        lda_topics = lda[corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = doc_lengths.dot(m2)
        cm2 = cm2 + 0.0001  # smooth away zeros
        cm2norm = np.linalg.norm(doc_lengths)
        cm2 = cm2 / cm2norm
        kl.append(sym_kl(cm1, cm2))
    return kl
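# The arun() snippet above depends on a sym_kl helper that is not shown. A
# minimal sketch under the usual reading of the Arun et al. (2010) measure:
# symmetric KL divergence between the two distributions, here via
# scipy.stats.entropy (which normalizes its inputs). The original definition
# may differ:
import numpy as np
from scipy import stats

def sym_kl(p, q):
    # KL(p||q) + KL(q||p) over the normalized distributions
    return stats.entropy(p, q) + stats.entropy(q, p)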
def multicore():
    # set of terms with high tf-idf weight
    words = pickle.loads(open('tmp/words.pkl', 'rb').read())
    documents = []
    for name in glob.glob('tmp/wakati/*'):
        terms = open(name).read().split()
        terms = [term for term in terms if term in words]
        documents.append(terms)
    term_index = {}
    for terms in documents:
        for term in terms:
            # skip terms that are not in the tf-idf set
            if term not in words:
                continue
            if term_index.get(term) is None:
                term_index[term] = len(term_index)
    open('tmp/pickles/term_index.pkl', 'wb').write(pickle.dumps(term_index))
    gensim_corpus = []
    for document in documents:
        tf = dict(Counter(document))
        doc = [(term_index[t], f) for t, f in tf.items()]
        gensim_corpus.append(doc)
    print('starting to fit multicore LDA...')
    model = LdaMulticore(gensim_corpus, workers=8, num_topics=TOPICN)
    open('tmp/pickles/model.pkl', 'wb').write(pickle.dumps(model))
    print('finished learning')
def train(corpus, dct, docs, ids, num_topics, field):
    model_dir = f'./models/{field}/k_{num_topics}/'
    os.makedirs(model_dir, exist_ok=True)
    model_file = model_dir + 'model'
    # The log file is created if missing; if it already exists,
    # new entries are appended rather than overwriting it.
    log_file = model_dir + 'model_callbacks.log'
    logging.basicConfig(filename=log_file,
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.NOTSET)
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dct,
        random_state=2020,
        num_topics=num_topics,
        # passes=100,
        chunksize=5000,
        # batch=False,
        alpha='asymmetric',
        decay=0.5,
        offset=64,
        eta='auto',
        eval_every=0,
        iterations=10,
        # gamma_threshold=0.001,
        per_word_topics=True)
    lda_model.save(model_file)
    return lda_model
def __createbasemodel(self):
    print('Creating base model')
    # Best known configuration:
    #   Topics  Alpha       Beta       Coherence
    #   6       asymmetric  symmetric  0.723863804
    self.__model = LdaMulticore(corpus=self.corpus_tfidf,
                                id2word=self.id2word,
                                num_topics=6,
                                alpha='asymmetric',
                                eta='symmetric',
                                workers=2,
                                random_state=100,
                                chunksize=100,
                                passes=10,
                                per_word_topics=True)
    if self.__config['Storemodel']:
        self.__savemodel()
    print(self.__model.print_topics())
    print(self.__model[self.gensim_bow])
    print('calculating coherence')
    # __cohe_model = CoherenceModel(model=self.__model, texts=self.processeddata,
    #                               dictionary=self.id2word, coherence='c_v')
    __cohe_model = CoherenceModel(model=self.__model,
                                  corpus=self.corpus_tfidf,
                                  coherence='u_mass')
    __cohe = __cohe_model.get_coherence()
    print('coherence :', __cohe)
    self._addMLflowMetric('BaseModel.Coherence', __cohe)
    return self.__model
def build_models(self):
    documents_tokenized = []
    for doc in self.__document_list:
        processed_document = self.__preprocess_text_document(doc)
        if len(processed_document) > 0:
            documents_tokenized.append(processed_document)
    # if the documents get filtered out completely (by the intersection with
    # the index), add some random word to prevent exceptions
    if len(documents_tokenized) <= 0:
        documents_tokenized.append(['None'])
    # turn tokenized documents into an id <-> term dictionary
    self.__dictionary = Dictionary(documents_tokenized)
    # convert tokenized documents into a document-term matrix
    self.__corpus = [
        self.__dictionary.doc2bow(document)
        for document in documents_tokenized
    ]
    # generate models
    self.__model_tfidf = TfidfModel(corpus=self.__corpus)
    self.__model_lsi = LsiModel(corpus=self.__corpus,
                                num_topics=self.topics_number)
    self.__model_lda = LdaMulticore(corpus=self.__corpus,
                                    num_topics=self.topics_number,
                                    id2word=self.__dictionary,
                                    workers=cpu_count() - 1,
                                    chunksize=2000,
                                    passes=1,
                                    batch=False)
def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    with open(sys.argv[1], 'r') as wx:
        for f in wx:
            seg = jieba.cut(f)
            seg = [word for word in seg if word not in stopwords]
            with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
                wx_seg.write(' '.join(seg) + '\n')  # one segmented document per line
    # pass the path so LineSentence can re-read the file for each iteration
    dictionary = corpora.Dictionary(LineSentence('wechat_seg.txt'))
    corpus = [dictionary.doc2bow(text) for text in LineSentence('wechat_seg.txt')]
    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,
                             workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')
    topics = []
    for doc in corpus:
        topics.append(lda_model[doc])
    # count how often each topic appears across documents
    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1
    # dump the top words of the most frequent topic
    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
def get_coherence_values(self):
    """
    Build LDA models over the configured grid of topic counts, pass counts,
    and corpus types, and store each model together with its c_v
    CoherenceModel in self.models, keyed by (num_topics, num_passes).
    """
    for i in range(*self.json['num_topics']):      # i = number of topics
        for j in range(*self.json['num_passes']):  # j = number of passes
            for corpus_type in self.corpuses['corpus_type']:
                sys.stdout.write(
                    '\r Building model: topic # {} - pass # {} - {} corpus'
                    .format(i, j, corpus_type))
                model = LdaMulticore(corpus=corpus_type,
                                     id2word=self.dictionary,
                                     num_topics=i,
                                     passes=j)
                self.models['model'][(i, j)] = model
                self.models['c_v'][(i, j)] = CoherenceModel(
                    model=model,
                    texts=corpus_type,
                    dictionary=self.dictionary,
                    coherence='c_v')
def load_model(self, phrase):
    processed_phrase = self.preprocessing(phrase)
    self.all_phrases.append(processed_phrase)
    # print(self.all_phrases)
    # dct = Dictionary(common_texts)
    dct = Dictionary(self.all_phrases)
    corpus = [dct.doc2bow(line) for line in self.all_phrases]
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dct,
                             random_state=100,
                             num_topics=3,
                             passes=10,
                             chunksize=1000,
                             batch=False,
                             alpha="asymmetric",
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)
    topic_keywords = []
    topics = lda_model.print_topics(-1)
    for topic in topics[:3]:
        topics_str = topic[1]
        # strip the weights and punctuation, leaving only the '+'-separated words
        pattern = r"[^a-zA-Z+]"
        topics_list = re.sub(pattern, "", topics_str).split("+")
        topic_keywords += topics_list[:5]
    return topic_keywords
def train(self, num_topics, chunksize=10000, passes=6, iterations=40, eval_every=40):
    fmodel = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"
    # logging.basicConfig(filename=fmodel + ".log",
    #                     format="%(asctime)s:%(levelname)s:%(message)s",
    #                     level=logging.INFO)
    temp = self.dictionary[0]  # force the dictionary to populate id2token
    id2word = self.dictionary.id2token
    model = LdaMulticore(
        corpus=self.corpus,
        id2word=id2word,
        chunksize=chunksize,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every)
    model.save(fmodel + ".pt")
    self.model = model
    # Optional: parse the training log for convergence diagnostics and plot them.
    # p = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
    # matches = [p.findall(l) for l in open(fmodel + '.log')]
    # matches = [m for m in matches if len(m) > 0]
    # tuples = [t[0] for t in matches]
    # perplexity = [float(t[1]) for t in tuples]
    # likelihood = [float(t[0]) for t in tuples]
    # iters = list(range(0, len(tuples) * 10, 10))
    # plt.plot(iters, likelihood, c="black")
    # plt.ylabel("log likelihood")
    # plt.xlabel("iteration")
    # plt.title("Topic Model Convergence")
    # plt.grid()
    # plt.savefig(fmodel + ".pdf")
    return model
def updateLDA():
    api_file = "./newsapi.key"
    categories = ['business', 'entertainment', 'general', 'health',
                  'science', 'sports', 'technology']
    with open(api_file, "r") as apikey:
        newsapi = NewsApiClient(api_key=apikey.read().strip())
    headlines = {cat: newsapi.get_top_headlines(category=cat,
                                                language='en',
                                                country='in')
                 for cat in categories}
    pp_docs = []
    for category in headlines:
        for article in headlines[category]['articles']:
            # print(lemma_pp(article['title']))
            pp_docs.append(lemma_pp(article['title']))
    if os.path.exists(MODEL_DIR + "corpus_dict.model"):
        corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
        corp_d.add_documents(pp_docs)
    else:
        corp_d = Dictionary(pp_docs)
    corp_d.filter_extremes(no_below=2, no_above=0.5)
    dtm = [corp_d.doc2bow(doc) for doc in pp_docs]
    tfidf = TfidfModel(dtm)
    corp_tfidf = tfidf[dtm]
    lda = LdaMulticore(corp_tfidf, num_topics=5, id2word=corp_d,
                       passes=60, workers=3)
    print(lda.print_topics(num_topics=5, num_words=5))
    checkdir(MODEL_DIR)
    corp_d.save(MODEL_DIR + "corpus_dict.model")
    # corp_tfidf.save(MODEL_DIR + "corpus_tfidf.model")
    lda.save(MODEL_DIR + "lda.model")
def createbasemodel(self):
    print('Creating base model')
    # Best known configuration:
    #   Topics  Alpha       Beta       Coherence
    #   6       asymmetric  symmetric  0.723863804
    self.__model = LdaMulticore(corpus=self.__data.corpus_tfidf,
                                id2word=self.__data.id2word,
                                num_topics=6,
                                alpha='asymmetric',
                                eta='symmetric',
                                workers=2,
                                random_state=100,
                                chunksize=100,
                                passes=10,
                                per_word_topics=True)
    print(self.__model.print_topics())
    print(self.__model[self.__data.gensim_bow])
    print('calculating coherence')
    __cohe_model = CoherenceModel(model=self.__model,
                                  texts=self.__data.processeddata,
                                  dictionary=self.__data.id2word,
                                  coherence='c_v')
    __cohe = __cohe_model.get_coherence()
    print('coherence :', __cohe)
    # print('hyper param tuning')
    # self.__hyperparamtunning()
    print('saving model')
    self.__savemodel()
def train_tfidf(self, num_topics=12):
    dictionary = corpora.Dictionary(self.df)
    corpus = [dictionary.doc2bow(doc) for doc in self.df]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda_model = LdaMulticore(corpus_tfidf, num_topics=num_topics,
                             id2word=dictionary, passes=2, workers=2)
    return dictionary, corpus_tfidf, lda_model, tfidf
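# A hypothetical usage sketch for train_tfidf above: because the model was
# trained on tf-idf vectors, unseen documents should go through the same
# tf-idf transform before scoring (the trainer instance and example tokens
# are assumptions, not part of the original code):
dictionary, corpus_tfidf, lda_model, tfidf = trainer.train_tfidf(num_topics=12)
bow = dictionary.doc2bow(['some', 'tokenized', 'document'])
print(lda_model[tfidf[bow]])  # (topic_id, probability) pairs for the new doc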
def compute_coherence_lda(corpus, dictionary, start=None, limit=None, step=None):
    """Compute c_v coherence for various numbers of topics."""
    topic_coherence = []
    model_list = []
    tokens_list = df.trigram_tokens.values.tolist()  # relies on a module-level df
    # flatten the per-document trigram tokens into a single reference text
    texts = [[token for sub_token in tokens_list for token in sub_token]]
    for num_topics in range(start, limit, step):
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            eta='auto',
            workers=4,
            passes=20,
            iterations=100,
            random_state=42,
            eval_every=None,
            alpha='asymmetric',  # shown to be better than symmetric in most cases
            decay=0.5,
            offset=64  # best params from the Hoffman paper
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        topic_coherence.append(coherencemodel.get_coherence())
    return model_list, topic_coherence
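# A hypothetical follow-up to compute_coherence_lda above: pick the model
# whose topic count maximizes c_v coherence (the grid values here are
# assumptions, not from the original code):
model_list, topic_coherence = compute_coherence_lda(corpus, dictionary,
                                                    start=5, limit=40, step=5)
best_idx = max(range(len(topic_coherence)), key=topic_coherence.__getitem__)
best_model = model_list[best_idx]
print('best num_topics:', 5 + best_idx * 5, 'c_v:', topic_coherence[best_idx])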
def train_lda(self, path):
    """
    https://www.cnblogs.com/Luv-GEM/p/10881838.html
    gensim expects each document to be a list of tokens, e.g.
    ['教授', '长江', '学者', '优秀成果', '集中', '呈现'].
    We build a dictionary over the whole corpus and use it to ID-encode each
    article; `corpus` holds the result. For the first article, corpus[0]
    looks like [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...], where each
    element is a token's dictionary ID and its frequency in the text.
    Finally we train the LDA model. LDA is unsupervised, so the number of
    topics can be chosen freely; here num_topics = 30.
    """
    print('train lda')
    corpus_data = get_corpus(path, w2v=True)
    id2word = gensim.corpora.Dictionary(corpus_data)
    corpus = [id2word.doc2bow(text) for text in corpus_data]
    # corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), ...]
    # each element is a token's dictionary ID and its frequency in the article
    LDAmodel = LdaMulticore(corpus=corpus,
                            id2word=id2word,
                            num_topics=30,
                            workers=4,
                            chunksize=4000,
                            passes=7,
                            alpha='asymmetric')
    return LDAmodel
def lda_matrix(matrix_id, preprocess, topics, data_path):
    data_cursor, corpus_cursor = get_file_cursor(matrix_id, preprocess, data_path)
    lda = LdaMulticore(corpus=corpus_cursor,
                       id2word=corpus_cursor.dictionary,
                       num_topics=topics)
    lda.save(os.path.join(
        data_path,
        '{2}-preprocess_{0}-topics_{1}.lda'.format(preprocess, topics, matrix_id)))
    return lda, corpus_cursor, data_cursor
def __buildLDA(self, num_topics, chunksize, passes):
    self.__model = LdaMulticore(self.__corpus,
                                id2word=self.__corpus.getDictionary(),
                                num_topics=num_topics,
                                chunksize=chunksize,
                                passes=passes,
                                eval_every=None,
                                workers=40,
                                random_state=10)