def create_evaluation_perplexity(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__
    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')
        return  # without the corpus there is nothing to evaluate
    held_out = list()
    training = list()
    target_len = int(0.1 * len(corpus))
    logger.info('Calculating perplexity with held-out %d of %d documents' %
                (target_len, len(corpus)))
    ids = set()
    while len(ids) < target_len:
        # randrange avoids the off-by-one of randint(0, len(corpus))
        ids.add(random.randrange(len(corpus)))
    for doc_id, doc in enumerate(corpus):
        if doc_id in ids:
            held_out.append(doc)
        else:
            training.append(doc)
    model = LdaModel(training,
                     id2word=corpus.id2word,
                     alpha=config.alpha,
                     passes=config.passes,
                     num_topics=config.num_topics)
    pwb = model.log_perplexity(held_out)  # per-word bound on the held-out set
    with open(config.path + 'evaluate-perplexity-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, pwb])
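# Note: gensim's LdaModel.log_perplexity() returns a per-word variational
# bound, not the perplexity itself; gensim's own log output converts it as
# perplexity = 2 ** (-bound). A minimal helper to do the same conversion
# (hypothetical name, not part of gensim):
def bound_to_perplexity(per_word_bound):
    """Convert gensim's per-word bound to a conventional perplexity."""
    return 2 ** (-per_word_bound)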
def Topic_Num_Decision(self, start, stop, size):
    model_list = []
    coherence_values = []
    topic_n_list = []
    perplexity_values = []
    for num_topics in range(start, stop, size):
        model = LdaModel(self.corpus, num_topics=num_topics, id2word=self.dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model,
            texts=self.news_doc,
            dictionary=self.dictionary,
            coherence="c_v",
        )
        coherence_values.append(coherencemodel.get_coherence())
        topic_n_list.append(num_topics)
        # log_perplexity() returns a per-word bound, not the perplexity itself
        perplexity_values.append(model.log_perplexity(self.corpus))
        print(num_topics)
    # topic_n_list mirrors range(start, stop, size); return it as well if
    # callers need to pair each score with its topic count
    return model_list, coherence_values, perplexity_values
def get_gensim_topics(num_topics_list, sentences, print_flag=False):
    """
    Gensim by default employs a version of count vectorization.
    input: sentences (pandas Series of documents; `retokenize` is applied to each)
    outputs coherence and perplexity lists
    prints topics if print_flag == True
    """
    texts = sentences.apply(retokenize).tolist()
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    perplexity_ls = []
    coherence_ls = []
    for num_topics in num_topics_list:
        lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=10)
        perplexity_ls.append(lda.log_perplexity(corpus))
        coherence_model_lda = CoherenceModel(model=lda, texts=texts,
                                             dictionary=dictionary, coherence='c_v')
        coherence_ls.append(coherence_model_lda.get_coherence())
        if print_flag:
            print('Num. Topics: ', num_topics)
            print('')
            for topic in lda.print_topics():
                # topic is (topic_id, 'weight*"word" + ...'); keep only the words
                words = [chunk.split('*')[1].replace('"', '')
                         for chunk in topic[1].split('+')]
                print(', '.join(words))
            print('')
    return perplexity_ls, coherence_ls
def get_model(dictionary, corpus, max_topics):
    # Here's the beast
    best_lda_model = None
    best_score = None
    best_num_topics = 1
    for current_num_topics in range(1, max_topics):
        lda_model = LdaModel(corpus=corpus, num_topics=current_num_topics,
                             id2word=dictionary, passes=1000)
        # NOTE: corpus_texts (the tokenized documents) must be in scope for c_v coherence
        coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus_texts,
                                             dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        current_perplexity_score = lda_model.log_perplexity(corpus)
        # New variable because I was playing around with using some function of the
        # coherence and perplexity, but just went with coherence
        current_score = coherence_lda
        print("Topics: ", current_num_topics,
              "Perplexity Score: ", current_perplexity_score,
              "Coherence Score: ", coherence_lda)
        # Saves the model with the highest score
        if best_score is None or current_score > best_score:
            best_score = current_score
            best_lda_model = lda_model
            best_num_topics = current_num_topics
    print("\nBest Num Topic: ", best_num_topics, best_score)
    return best_lda_model
def calculateLDA(dictionary, corpus, texts, list_num_topics, saveModelPath=[]):
    """
    Computes an LDA model for each entry in a list of topic counts,
    optionally saves each model to disk, and calculates coherence
    values for each model.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : Preprocessed texts
    list_num_topics : list with numbers of topics to fit LDA with
    saveModelPath : if empty, do nothing; otherwise save each model to disk

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    logPerplex_list : per-word log-perplexity bounds on the training corpus
    """
    c_v = []
    lm_list = []
    logPerplex_list = []
    for num_topics in list_num_topics:
        print("\tNumber of topics:", num_topics)
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                      random_state=0, chunksize=5000, passes=50, eval_every=None,
                      alpha='auto', eta='auto', iterations=50)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, corpus=corpus, dictionary=dictionary,
                            texts=texts, coherence='c_v', processes=-1)
        logPerplex_list.append(lm.log_perplexity(corpus))
        c_v.append(cm.get_coherence())
        if saveModelPath:
            lm.save(saveModelPath + "K_{}.model".format(num_topics))
    print("Number topics:", list_num_topics)
    print("Coherence scores:", c_v)
    print("LogPerplexity: ", logPerplex_list)
    return lm_list, c_v, logPerplex_list
def train_lda(self, cache_path):
    print(cache_path)
    trainBatchIter = BatchIterBert(self.trainDataIter, filling_last_batch=False,
                                   postProcessor=batchPostProcessor, batch_size=1)
    # collect bag-of-words vectors for every training document
    bow_list = []
    for item in trainBatchIter:
        bow = item[1].squeeze().detach().numpy().tolist()
        bow_list.append(self.bow_2_gensim(bow))
    print(len(bow_list))
    #print(self.dictProcess.common_dictionary.id2token)
    # pass the list of BoW documents directly; wrapping it in np.array()
    # only produces a ragged object array that gensim does not need
    lda = LdaModel(bow_list, num_topics=50, passes=200, chunksize=len(bow_list),
                   id2word=self.dictProcess.common_dictionary)
    #print(lda.show_topic(1, topn=10))
    output_topic_line = ''
    for topic_id in range(50):
        current_topic_list = []
        current_topic = lda.show_topic(topic_id, topn=10)
        for topic_tuple in current_topic:
            current_topic_list.append(topic_tuple[0])
        output_topic_line += ' '.join(current_topic_list) + '\n'
        #print(current_topic_list)
    topic_file = os.path.join(cache_path, 'ldatopic.txt')
    with open(topic_file, 'w') as fo:
        fo.write(output_topic_line)

    testBatchIter = BatchIterBert(self.testDataIter, filling_last_batch=False,
                                  postProcessor=batchPostProcessor, batch_size=1)
    test_bow_list = []
    word_count = 0
    for item in testBatchIter:
        bow = item[1].squeeze().detach().numpy().tolist()
        word_count += sum(bow)
        test_bow_list.append(self.bow_2_gensim(bow))
    print(word_count)
    ppl = lda.log_perplexity(test_bow_list, len(test_bow_list))
    print(ppl)
    bound = lda.bound(test_bow_list)
    print(bound / word_count)            # per-word bound on the test set
    print(np.exp2(-bound / word_count))  # perplexity = 2 ** (-per-word bound)
def best_lda_model(self):
    tuple_list = []
    for n in range(3, 50):
        test_model = LdaModel(
            corpus=self.corpus['content'].tolist(),
            id2word=self.dictionary,
            num_topics=n)  # try the distributed parameter
        tperplexity = test_model.log_perplexity(self.test.content.tolist(),
                                                total_docs=None)
        tuple_list.append((n, tperplexity))
        # if tperplexity < self.perplexity:
        #     self.model = test_model
        #     self.perplexity = tperplexity
        #     print("New lower log_perplexity with", n, "topics")
        if n % 10 == 0:
            print(n)
    plt.scatter(*zip(*tuple_list))
    plt.show()
def evaluate_perplexity(dictionary, corpus, texts, limit):
    perplex = np.zeros((1, limit), dtype=np.float16)
    lda_list = []
    for num_topics in range(1, limit + 1):
        print("Topic %d" % num_topics)
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lda_list.append(lm)
        perplex[0, num_topics - 1] = lm.log_perplexity(corpus)
    # Show graph
    x = range(1, limit + 1)
    plt.plot(x, perplex.T)
    plt.xlabel("k topics")
    plt.ylabel("Perplexity")
    plt.show()
    return lda_list, perplex
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             coherence_measure="c_v"):
    coherence_values = []
    perplexities = []
    for num_topics in range(start, limit, step):
        # pass id2word so the model's term ids line up with the dictionary
        # handed to CoherenceModel below
        model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
        # log_perplexity() returns a per-word bound; 2 ** (-bound) is the perplexity
        perplexities.append(np.exp2(-model.log_perplexity(corpus)))
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary,
                                        coherence=coherence_measure)
        coherence_values.append(coherencemodel.get_coherence())
    return (np.array(coherence_values, dtype=np.float32),
            np.array(perplexities, dtype=np.float32))
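# Usage sketch (hypothetical numbers): pick the topic count with the best c_v
# score from the grid scanned above.
# coherences, perplexities = compute_coherence_values(dictionary, corpus, texts,
#                                                     limit=40, start=2, step=3)
# best_k = list(range(2, 40, 3))[int(np.argmax(coherences))]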
class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows,
                              id2word=self.myDictionary.dictionary,
                              num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        terms = self.model.get_topic_terms(topic)
        return terms

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model,
                                                 corpus=self.myDictionary.doc2bows,
                                                 dictionary=self.myDictionary.dictionary,
                                                 coherence='u_mass')
        return self.coherenceModel.get_coherence()
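# Usage sketch (assumes `myDictionary` exposes .doc2bows and .dictionary,
# as the class above does):
# my_lda = MyLda(myDictionary, num_topics=50, topic_threshold=0.2)
# print(my_lda.get_perplexity())   # per-word bound
# print(my_lda.get_coherence())    # u_mass coherence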
def gridsearch_graph(dictionary, corpus, texts, list_num_topics):
    """
    Grid search over topic counts for LDA, scoring each model with c_v
    coherence (the num_topics-vs-coherence plotting code at the end is
    currently commented out).

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : preprocessed tweets
    list_num_topics : list with numbers of topics to calculate the LDA on

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    logPerplex_list : per-word log-perplexity bounds on the training corpus
    """
    c_v = []
    lm_list = []
    logPerplex_list = []
    for num_topics in list_num_topics:
        print("number of topics:", num_topics)
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                      random_state=0, chunksize=5000, passes=50, eval_every=None,
                      alpha='auto', eta='auto', iterations=50)
        lm_list.append(lm)
        logPerplex_list.append(lm.log_perplexity(corpus))
        cm = CoherenceModel(model=lm, corpus=corpus, dictionary=dictionary,
                            texts=texts, coherence='c_v', processes=-1)
        c_v.append(cm.get_coherence())
    # Show graph
    #x = list_num_topics  #range(1, limit)
    #plt.plot(x, c_v)
    #plt.xlabel("num_topics")
    #plt.ylabel("Coherence score")
    #plt.legend(("c_v"), loc='best')
    #plt.show()
    return lm_list, c_v, logPerplex_list
def cluster_questions(topic_num, res_path,
                      q_path='datasets/DialogQA/Qall.txt',
                      a_path='datasets/DialogQA/Aall.txt'):
    # forward slashes avoid the unescaped backslashes of the original defaults
    with open(a_path, 'r', encoding='utf-8') as f:
        common_texts = [text.split() for text in f.readlines()]
    with open(q_path, 'r', encoding='utf-8') as f:
        questions = [text for text in f.readlines()]
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=topic_num)
    questions_clusterd = [[] for i in range(topic_num)]
    print('Questions : ', len(questions))
    perp = lda.log_perplexity(common_corpus)
    for i, q in enumerate(questions):
        other_corpus = [common_dictionary.doc2bow(common_texts[i])]
        vector = lda[other_corpus]
        # print(vector[0])
        # assign the question to its highest-probability topic
        topic = 0
        max_prob = 0
        for (idx, prob) in vector[0]:
            if prob > max_prob:
                topic = idx
                max_prob = prob
        questions_clusterd[topic].append(q)
        # print(topic)
    # os._exists(res_path) in the original never fired; makedirs with exist_ok
    # covers both cases
    os.makedirs(res_path, exist_ok=True)
    for top in range(topic_num):
        with open(res_path + str(top) + '.txt', 'w', encoding='utf-8') as f:
            for quest in questions_clusterd[top]:
                f.write(quest)
                # f.write('\n')
    return perp
def lda_main(word_with_pos=WORD_WITH_POS, topic_num=LDA_TOPIC_NUM):
    LDA_MODEL = './models/lda_{}.model'.format(topic_num)
    stop_word = read_stopword()
    begin_t = time.time()
    perplexity_f = open('perplexity.txt', 'a')

    def func(line):
        '''Whether to bind each word to its part-of-speech tag.'''
        line = line.strip()
        json_data = json.loads(line)
        content = json_data['content']
        if word_with_pos:
            word_list = [j[0] + j[1] for j in content if j[0] not in stop_word]
        else:
            word_list = [j[0] for j in content if j[0] not in stop_word]
        return word_list

    with open(DATA_JSONLINE) as f:
        # words = [func(i) for i in f.readlines()]
        words = []
        for i in f.readlines():
            words.append(func(i))
    print('Data loaded! Took', time.time() - begin_t, 'sec.\nStarting LDA modeling')
    dic = corpora.Dictionary(words)
    corpus = [dic.doc2bow(text) for text in words]
    dic.save(DICTIONARY_PATH)
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num)
    lda.save(LDA_MODEL)
    print(topic_num, ',', lda.log_perplexity(corpus), file=perplexity_f)
    perplexity_f.close()  # the original never closed this handle
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dic)
    vis_html_path = 'ldavis_{}.html'.format(topic_num)
    pyLDAvis.save_html(vis_data, vis_html_path)
    print('LDA modeling done!\nTotal time:', time.time() - begin_t, 'sec.')
def LDAmodel(words, num_topics=5, num_words=5):
    """
    LDA describes each document through:
    1. its number of words
    2. a mixture of topics, e.g. 1/2 the topic "health" and 1/2 the topic "vegetables"
    3. topic probabilities that depend on each topic's dominance in the document
    """
    dictionary = corpora.Dictionary(words)
    # Term Document Frequency
    corpus = [dictionary.doc2bow(word) for word in words]
    # save it!
    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')
    # Train model
    ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    # lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100,
    #                      update_every=1, chunksize=100, passes=10, alpha='auto',
    #                      per_word_topics=True)
    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    # Validation
    # A measure of how good the model is. Lower is better.
    val_perplexity = ldamodel.log_perplexity(corpus)
    # coherence score
    coherence_ldamodel = CoherenceModel(model=ldamodel, texts=words,
                                        dictionary=dictionary, coherence='c_v')
    val_coherence = coherence_ldamodel.get_coherence()
    return topics, val_perplexity, val_coherence
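# Usage sketch (hypothetical data): `docs` is a list of token lists, matching
# the `words` argument expected above.
# docs = [['healthy', 'breakfast', 'vegetables'], ['doctor', 'health', 'visit']]
# topics, perplexity, coherence = LDAmodel(docs, num_topics=2, num_words=3)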
def topic_model_gensim_lda(col: str, prefix=None, min_topics=19, max_topics=19, step=2) -> None:

    def trigram_bow_generator(filepath: str):
        '''generator function to read docs from a file
           and yield a bag-of-words representation'''
        # note: trigram_dictionary is resolved at call time from the enclosing scope
        for doc in LineSentence(filepath):
            yield trigram_dictionary.doc2bow(doc)

    if prefix is None:
        prefix = ''

    # for topic modeling
    trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt'
    print(f'Loading input file {trigram_docs_filepath}')
    trigram_dictionary_filepath = data_dir_processed / f'{prefix}{col}_trigram_dict_all.dict'
    trigram_bow_filepath = data_dir_processed / f'{prefix}{col}_trigram_bow_corpus_all.mm'
    #resp_whytfa_trigram_transformed_docs_all.txt

    # turn to posix filepaths until gensim supports pathlib
    trigram_docs_filepath = trigram_docs_filepath.as_posix()
    trigram_dictionary_filepath = trigram_dictionary_filepath.as_posix()
    trigram_bow_filepath = trigram_bow_filepath.as_posix()

    # TODO - change 1 == 1 lines to overwrite_interim
    # this is a bit time consuming - make the if statement True
    # if you want to learn the dictionary yourself.
    if 1 == 1:
        trigram_docs = LineSentence(trigram_docs_filepath)
        # learn the dictionary by iterating over all of the docs
        trigram_dictionary = Dictionary(trigram_docs)
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print(f'{k}, {v}')
        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=min_absolute_frequency,
                                           no_above=max_relative_frequency,
                                           keep_n=max_features)
        trigram_dictionary.compactify()
        print(trigram_dictionary)
        if verbose:
            logger.info(f'Saving trigram dictionary: {trigram_dictionary_filepath} {len(trigram_dictionary)}')
        trigram_dictionary.save(trigram_dictionary_filepath)

    # load the finished dictionary from disk
    if verbose:
        logger.info(f'Loading trigram dictionary: {trigram_dictionary_filepath}')
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

    # this is a bit time consuming - make the if statement True
    # if you want to build the bag-of-words corpus yourself.
    if 1 == 1:
        # generate bag-of-words representations for all docs and save them as a matrix
        if verbose:
            print(f'Saving corpus: {trigram_bow_filepath}')
        MmCorpus.serialize(trigram_bow_filepath,
                           trigram_bow_generator(trigram_docs_filepath))

    # load the finished bag-of-words corpus from disk
    if verbose:
        print(f'Loading corpus: {trigram_bow_filepath}')
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

    num_topics_range = range(min_topics, max_topics + 1, step)
    #iterations = 2000
    #chunksize = 100  # more than the number of docs?
    passes = 10
    # iterations = 400
    iterations = 100
    # chunksize = len(trigram_bow_corpus)
    chunksize = 100  # more than the number of docs?
    eta = 'auto'
    #eval_every = None  # Don't evaluate model perplexity, takes too much time.
    workers = 1
    print(f'cpu_count:{cpu_count()}')
    if multicore:
        # for multicore; use one fewer than the number of cores
        workers = cpu_count() - 1
        alpha = 'symmetric'  # LdaMulticore cannot use alpha='auto'
        if verbose:
            print(f'Multiprocessing with {workers} cores (one fewer than the number of cores)')
    else:
        # for single core; 'auto' cannot be used with LdaMulticore
        alpha = 'auto'

    # now_str = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d-%H-%M-%S')
    now_str = ''  # datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    save_dir = data_dir_processed / f'{prefix}{col}_gensim_lda_models_{now_str}'
    if not save_dir.exists():
        save_dir.mkdir(parents=True, exist_ok=True)
    # save_dir_s3 = f'{data_dir_processed_s3}/{prefix}{col}_gensim_lda_models_{now_str}'

    # lm_list = []
    c_v = []
    u_mass = []
    perp = []
    #alg = 'LDA'
    alg = 'Mallet'
    for num_topics in num_topics_range:
        if alg == 'Mallet':
            logger.info('Using Mallet...')
            # try the Mallet implementation
            ldamallet = LdaMallet(mallet_path, corpus=trigram_bow_corpus,
                                  num_topics=num_topics, id2word=trigram_dictionary,
                                  workers=workers, iterations=iterations)
            ldamallet_filepath = (save_dir / f'gensim_ldamallet_{num_topics}_topics').as_posix()
            ldamallet.save(ldamallet_filepath)
            # Show Topics
            for t in ldamallet.show_topics(num_topics=-1, num_words=10, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))
            # Compute Coherence Score
            cm = CoherenceModel(model=ldamallet, texts=trigram_docs,
                                dictionary=trigram_dictionary, coherence='c_v')
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamallet, corpus=trigram_bow_corpus,
                                dictionary=trigram_dictionary, coherence='u_mass')  #, processes=workers)
            u_mass.append(cm.get_coherence())
            # the Mallet wrapper has no log_perplexity(); record a placeholder
            #perp_lower_bound = ldamallet.log_perplexity(trigram_bow_corpus)
            #perp.append(2**(-perp_lower_bound))
            perp.append(0)
        else:
            logger.info('Using LDA...')
            # TODO: try with and without alpha
            ldamodel = LdaModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
                                num_topics=num_topics, passes=passes, iterations=iterations,
                                chunksize=chunksize, eta=eta,  #eval_every=eval_every,
                                alpha=alpha,
                                random_state=np.random.RandomState(seed=10101010))
            #ldamodel = LdaMulticore(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
            #                        num_topics=num_topics, passes=passes, iterations=iterations,
            #                        chunksize=chunksize, eta=eta,  #eval_every=eval_every,
            #                        random_state=np.random.RandomState(seed=10101010),
            #                        workers=workers)
            ldamodel_filepath = (save_dir / f'gensim_lda_{num_topics}_topics').as_posix()
            ldamodel.save(ldamodel_filepath)
            for t in ldamodel.show_topics(num_topics=-1, num_words=50, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))
            cm = CoherenceModel(model=ldamodel, texts=trigram_docs,
                                dictionary=trigram_dictionary, coherence='c_v')  #, processes=workers)
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamodel, corpus=trigram_bow_corpus,
                                dictionary=trigram_dictionary, coherence='u_mass')  #, processes=workers)
            u_mass.append(cm.get_coherence())
            perp_lower_bound = ldamodel.log_perplexity(trigram_bow_corpus)
            perp.append(2**(-perp_lower_bound))

    coh_perp = pd.DataFrame(data=np.array([c_v, u_mass, perp]).T,
                            columns=['c_v', 'u_mass', 'perp'],
                            index=list(num_topics_range))
    coh_perp.index.name = 'num_topics'
    coh_perp_filepath = save_dir / 'coherence_perplexity.csv'
    coh_perp.to_csv(coh_perp_filepath)
    logger.info('coherence_docs={0}, coherence_corpus={1}, perplexity={2}'.format(c_v, u_mass, perp))
print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

# hold out the last 10% of documents for perplexity evaluation
size = len(bow_corpus) * 9 // 10
training = bow_corpus[:size]
testing = bow_corpus[size:]

t0 = time()
print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5)
print("done in %0.3fs." % (time() - t0))

print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(str(t) for t in lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)

for i in range(0, Num_Topics):
    temp = lda.show_topic(i, 10)
    terms = []
    for term in temp:
        terms.append(term[0])  # show_topic() yields (word, probability) pairs
    print("Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms))
print('Saving dictionary (%s)...' % DICT)
dictionary.save(DICT)

print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

# hold out the last 20% of documents for perplexity evaluation
size = len(bow_corpus) * 4 // 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations=1000)

print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(str(t) for t in lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)
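# The slice splits above assume the documents are already in random order. If
# the corpus is ordered (e.g. by date), a shuffled split avoids a biased
# held-out set -- a minimal sketch, assuming `bow_corpus` as built above:
import random

random.seed(42)  # hypothetical seed, for reproducibility
indices = list(range(len(bow_corpus)))
random.shuffle(indices)
cut = len(indices) * 4 // 5
training = [bow_corpus[i] for i in indices[:cut]]
testing = [bow_corpus[i] for i in indices[cut:]]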
def takeTokenList_ReturnModel(tokenList, dictionaryForLDA, corpus, baseFolder,
                              topicList, passList, loadTrainedLDAIfExists):
    winningModel_SavePath = os.path.join(baseFolder, 'Winning LDA Model')
    path_LDA_LTrainingOutput = os.path.join(baseFolder, "LDA_LTrainingOutput.csv")
    if (loadTrainedLDAIfExists and os.path.exists(winningModel_SavePath)
            and os.path.exists(path_LDA_LTrainingOutput)):
        print("Loading pre-trained LDA model from %s" % winningModel_SavePath)
        winningLDAModel = gensim.models.LdaModel.load(winningModel_SavePath)  # 'gensin_models' in the original was a typo
        ldaResultOutput_df = pd.read_csv(path_LDA_LTrainingOutput, header=0, index_col=0)
        # also recover Coherence and Perplexity here, so the shared return below
        # does not raise a NameError in this branch
        _, numberOfTopics, Coherence, Perplexity = (
            ldaResultOutput_df.sort_values(by=['Coherence'], ascending=False)
            .filter(items=['ActualModel', 'TopicNum', 'Coherence', 'Perplexity'])
            .head(1).values[0])
    else:
        if isinstance(topicList, int):
            topicList = [topicList]
        elif not isinstance(topicList, list):
            topicList = [7]
        print('LDA Topics to check: %s' % str(topicList))
        if isinstance(passList, int):
            passList = [passList]
        elif not isinstance(passList, list):
            passList = [10]
        print('LDA Passes to check: %s' % str(passList))
        ldaResultOutput = {}
        for top in topicList:
            for passN in passList:
                ldaModelTitle = '\nLDA_%s_Topics_%s_Passes' % (top, passN)
                start_time = time()
                print("Training LDA Model: %s - StartTime: %s" % (ldaModelTitle, start_time))
                ldaResultOutput[ldaModelTitle] = {'TopicNum': top, 'PassNum': passN}
                ldaTest = LdaModel(corpus=corpus, id2word=dictionaryForLDA,
                                   iterations=100, num_topics=top, passes=passN)
                Perplexity = ldaTest.log_perplexity(corpus)
                cohrM = CoherenceModel(model=ldaTest, texts=tokenList, corpus=corpus,
                                       dictionary=dictionaryForLDA, coherence='c_v',
                                       processes=1)
                cohrScore = cohrM.get_coherence()
                timeInSeconds = time() - start_time
                print("Coherence: %s" % round(cohrScore, 3))
                ldaResultOutput[ldaModelTitle]['Perplexity'] = round(Perplexity, 3)
                ldaResultOutput[ldaModelTitle]['Coherence'] = round(cohrScore, 3)
                ldaResultOutput[ldaModelTitle]['TimeInSec'] = round(timeInSeconds, 3)
                ldaResultOutput[ldaModelTitle]['ActualModel'] = ldaTest
        ldaResultOutput_df = pd.DataFrame(ldaResultOutput).T.sort_values(
            by=['Coherence'], ascending=False).copy()
        print(ldaResultOutput_df)
        winningLDAModel, numberOfTopics, Coherence, Perplexity = (
            ldaResultOutput_df
            .filter(items=['ActualModel', 'TopicNum', 'Coherence', 'Perplexity'])
            .head(1).values[0])
        # pickle.dump(winningLDAModel, open(winningModel_SavePath, "wb"))
        winningLDAModel.save(winningModel_SavePath)
        print("Winning Model Details:")
        print(ldaResultOutput_df.head(1).values)
    return winningLDAModel, ldaResultOutput_df, numberOfTopics, Coherence, Perplexity
# (The opening of this LdaModel call was cut off in the source; a plausible
#  reconstruction, assuming the `corpus` and `dictionary` used below --
#  the topic count is a hypothetical placeholder:)
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=10,  # hypothetical value, not in the original
                     passes=5,
                     chunksize=10000,
                     alpha='asymmetric',
                     decay=0.5,
                     offset=64,
                     eta=None,
                     eval_every=0,
                     iterations=100,
                     gamma_threshold=0.001,
                     per_word_topics=True)

## See the topics
lda_model.print_topics(-1)  # displays all the topics
lda_model.get_topic_terms(0, topn=10)  # the top 10 words in topic 0
lda_model.log_perplexity(corpus)  # computes the per-word log-perplexity bound

# Document topic distribution. Note that by default, when a document has a low
# probability on a topic, that topic is not displayed.
lda_model.get_document_topics(corpus[0])
# With minimum_probability=0, every topic and its probability is printed.
lda_model.get_document_topics(corpus[0], minimum_probability=0)

### Document topic ####
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)
def train(args):
    lda_model = None
    corpus = None
    perplexities = []
    coherence_values = []
    topics = (range(2, int(args['--num_topics']) + 1, 2) if args['--all']
              else range(int(args['--num_topics']), int(args['--num_topics']) + 1))
    if args['--all']:
        target_date = calculate_target_date(float(args['--proportion'])) + timedelta(days=-1)
    else:
        target_date = datetime.now().date() + timedelta(days=-int(args['--days']))
    for num_topics in topics:
        os.makedirs(name='./model/{}/{}'.format(num_topics, target_date), exist_ok=True)
        try:
            logger.info("loading model")
            lda_model = LdaModel.load('./model/{}/{}/topic_{}.model'.format(
                num_topics, target_date, num_topics))
            corpus, dictionary = load_corpus_dictionary(float(args['--proportion']))
        except Exception:
            logger.info("no saved model found")
            corpus, dictionary = load_corpus_dictionary(float(args['--proportion']))
            logger.info("training model")
            lda_model = LdaModel(
                corpus=corpus,
                num_topics=num_topics,
                id2word=dictionary,  # gensim Dictionary object
                chunksize=int(args['--chunk_size']),
                passes=int(args['--passes']),
                alpha='symmetric' if args['--alpha'] else 'auto',
                eta=None if args['--eta'] else 'auto',
                decay=float(args['--decay']),
                offset=float(args['--offset']),
                eval_every=int(args['--eval_every']),
                iterations=int(args['--iterations']),
                gamma_threshold=float(args['--gamma_threshold']),
                minimum_probability=float(args['--minimum_probability']),
                random_state=int(args['--random_state']),
                per_word_topics=True if args['--per_word_topics'] else False)
            logger.info("saving trained model")
            lda_model.save('./model/{}/{}/topic_{}.model'.format(
                num_topics, target_date, num_topics))
        finally:
            perplexities.append(np.exp2(-lda_model.log_perplexity(corpus)))
            # the lower the u_mass score, the better
            coherence_values.append(
                CoherenceModel(model=lda_model, corpus=corpus,
                               coherence='u_mass').get_coherence())
            lda_model.print_topics(5, 5)
    if args['--all']:
        os.makedirs(name='./pic/{}/'.format(target_date), exist_ok=True)
        draw_graph_perplexity(args, perplexities, topics, target_date)
        draw_graph_coherence(args, coherence_values, topics, target_date)
    else:
        save_ppl_coh(perplexities[0], coherence_values[0], int(args['--days']))
        logger.info("perplexity: {}; coherence: {}.".format(
            perplexities[0], coherence_values[0]))
# %%
## Search over the number of topics
start = 2
limit = 10
step = 1

coherence_vals = []
perplexity_vals = []

for n_topic in tqdm(range(start, limit, step)):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary,
                         num_topics=n_topic, random_state=0)
    # log_perplexity() returns a per-word bound; 2 ** (-bound) is the perplexity
    perplexity_vals.append(np.exp2(-lda_model.log_perplexity(corpus)))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['words'],
                                         dictionary=dictionary, coherence='c_v')
    coherence_vals.append(coherence_model_lda.get_coherence())

# %%
# evaluation
x = range(start, limit, step)
fig, ax1 = plt.subplots(figsize=(12, 5))

# coherence
c1 = 'darkturquoise'
ax1.plot(x, coherence_vals, 'o-', color=c1)
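# (The plot above is truncated in the source; a plausible completion, assuming
#  the usual coherence/perplexity twin-axis layout -- axis labels and the
#  second color are assumptions:)
ax1.set_xlabel('Num Topics')
ax1.set_ylabel('Coherence', color=c1)
c2 = 'slategray'
ax2 = ax1.twinx()  # second y-axis for perplexity
ax2.plot(x, perplexity_vals, 'o-', color=c2)
ax2.set_ylabel('Perplexity', color=c2)
plt.show()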
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=10,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# Print the keywords in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

# Compute Perplexity: a measure of how good the model is. Lower is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score
#coherence_model_lda = CoherenceModel(model=lda_model, texts=nps_comment_filtered,
#                                     dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
# (this WordCloud call was truncated in the source; a closing paren is added so
#  it parses -- `i` is expected to be bound by an enclosing loop over topics)
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i])
def gen_glda_model_sv(self, vectorizer, n_topics, finalCorpus, update_mat=False,
                      alpha='auto', eta="auto", ittrs=300):
    # vectorizer = gen_feature_vectorizer(self, t)
    # print(finalCorpus)
    if update_mat or self.tfMatrix is None:
        # don't want to re-create the matrix every time we generate an lda_model
        # (e.g. for different topic numbers) unless this is a new vectorizer or
        # a new corpus
        t0 = time()
        self.tfMatrix = vectorizer.fit_transform(finalCorpus)
        # transformer = TfidfTransformer()  # a substitute for TfidfVectorizer, but already using it...
        # self.tfMatrix = transformer.fit_transform(TermDocMatrix)
        print("[tffeature]: gen_lda_model: transform done in %0.3fs." % (time() - t0))
        print("[tffeature]: gen_lda_model: tfMatrix shape:", self.tfMatrix.shape)
        self.tfMatrix = normalize(self.tfMatrix, norm='l1', axis=1)
        print('[tffeature]: type after normalize: ', type(self.tfMatrix))
    # invert vocabulary: index -> term
    vocab_key = []
    if self.ind2vocab is None:
        inv_vocabulary = {}
        i = 0
        for w in sorted(vectorizer.vocabulary_):
            #inv_vocabulary[vectorizer.vocabulary_[w]] = w
            inv_vocabulary[i] = w
            i += 1
            vocab_key.append(vectorizer.vocabulary_[w])
            # if vectorizer.vocabulary_[w] == 0:
            #     print(w)
        self.ind2vocab = inv_vocabulary
        # reorder matrix columns to match the new ind2vocab indexing
        self.tfMatrix = self.tfMatrix[:, vocab_key]
    # print(self.tfMatrix[0].toarray().tolist())
    # print(self.tfMatrix[1].toarray().tolist())
    # print(self.tfMatrix[2].toarray().tolist())
    print(self.ind2vocab[0], self.ind2vocab[1], self.ind2vocab[2])
    # row_sums = scipy_sparse_matrix.sum(axis=1)
    # scipy_sparse_matrix = scipy_sparse_matrix / row_sums[:, np.newaxis]
    corpus = gensim.matutils.Sparse2Corpus(self.tfMatrix, documents_columns=False)
    # print(corpus)
    # print(max(inv_vocabulary.keys()))
    # print('scipy shape', self.tfMatrix.shape)
    # print('vocabs: ', len(vectorizer.vocabulary_), len(inv_vocabulary))
    np.random.seed(self.RSEED)
    random.seed(self.RSEED)
    # NOTE: the eta parameter of this function is ignored; 1.0 / n_topics is passed instead
    lda = LdaModel(corpus, num_topics=n_topics, id2word=self.ind2vocab,
                   alpha=alpha, eta=1.0 / n_topics,
                   random_state=np.random.RandomState(self.RSEED),
                   iterations=ittrs, minimum_probability=0.001,
                   minimum_phi_value=0.001)
    print('[tffeature]: lda perplexity:', lda.log_perplexity(corpus))
    return lda, vocab_key
for i, topics in enumerate(lda.get_document_topics(corpus)):
    doc_topics = pd.concat([
        doc_topics,
        pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
    ])
doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv', index=False)
model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
lda.save(model_file)

train_lda = LdaModel(corpus=train_corpus,
                     num_topics=n_topics,
                     id2word=pd.Series(train_tokens).to_dict())
# see https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.log_perplexity
test_perplexity = 2 ** (-train_lda.log_perplexity(test_corpus))

# https://markroxor.github.io/gensim/static/notebooks/topic_coherence_tutorial.html
u_mass = np.mean([
    c[1] for c in lda.top_topics(corpus=corpus, coherence='u_mass', topn=n_topics)
])
# extrinsic - need to provide external corpus
# cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_uci')
# uci = cm.get_coherence()
result_ = [vocab_size, test_vocab, max_features, n_topics, test_perplexity, u_mass]
def grid_lda(dictionary, corpus, texts, max_topics, min_topics=5, step=5,
             save=True, plot=True):
    np.random.seed(49)
    import time
    coherence_scores = []
    lda_list = []
    perplexity = []
    # training settings (currently unused; the corresponding kwargs are commented out below)
    passes = 20
    iterations = 100
    eval_every = 50
    with open('log_LDA.txt', 'w') as f:
        for num_topics in range(min_topics, max_topics + 1, step):
            print('#' * 100)
            print('Training LDA with {} Topics'.format(num_topics))
            print()
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            start = time.time()
            lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
            #, passes=passes, iterations=iterations, eval_every=eval_every)
            lda_list.append(lda)
            coherencemodel = CoherenceModel(model=lda, texts=texts,
                                            dictionary=dictionary, coherence='c_v')
            coherence_score = coherencemodel.get_coherence()
            print('Coherence Score: ', coherence_score)
            coherence_scores.append(coherence_score)
            perplexity.append(lda.log_perplexity(corpus))
            print('Perplexity: ', perplexity[-1])
            print('Trained in {:0.3f}s'.format(time.time() - start))
            f.write('#' * 100 + ' \n')
            f.write('Training LDA with {} Topics'.format(num_topics) + ' \n')
            f.write('Coherence Score: {}'.format(coherence_score) + ' \n')
            f.write('Perplexity: {}'.format(perplexity[-1]) + ' \n')
            f.write('Trained in {:0.3f}s'.format(time.time() - start) + ' \n')
            if save:
                lda.save('../Models/grid/{}_clusters_full_grid_active_score{:0.3f}.model'
                         .format(num_topics, coherence_score))
                print('Model Saved under : ../Models/grid/{}_clusters_full_grid_active_score{:0.3f}.model'
                      .format(num_topics, coherence_score))
            print()
        # no explicit f.close() needed; the with-block closes the file
    if plot:
        x = range(min_topics, max_topics + 1, step)
        plt.plot(x, coherence_scores)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        #plt.legend(("coherence_values"), loc='best')
        plt.savefig('Coherence.png')
        plt.show()
        plt.plot(x, perplexity)
        plt.xlabel("Num Topics")
        plt.ylabel("Log Perplexity")
        plt.savefig('Perplexity.png')
        #plt.legend(("coherence_values"), loc='best')
        plt.show()
    return lda_list, coherence_scores, perplexity