def coherence_umass():
    """Score every saved LDA model with UMass coherence and export the results.

    For each topic count in ``range(min, max + 1, step)`` the model stored at
    ``./Topic_Modeling/Models/Topic_Model_<count>`` is loaded and its
    ``u_mass`` coherence is computed over the top 10 and the top 20 words per
    topic.  Each series is written to its own CSV (``;`` separator, decimal
    comma), indexed by topic count.

    Reads the module-level names ``min``, ``max``, ``step``, ``corpus`` and
    ``dictionary``.  Returns nothing.
    """
    topic_counts = []
    scores_top10 = []
    scores_top20 = []

    for count in range(min, max + 1, step):
        topic_counts.append(count)
        lda = models.LdaModel.load(
            "./Topic_Modeling/Models/Topic_Model_%i" % count)

        # Same model, two cut-offs for the number of top words per topic.
        scorer_10 = models.CoherenceModel(corpus=corpus,
                                          model=lda,
                                          dictionary=dictionary,
                                          coherence='u_mass',
                                          topn=10)
        value_10 = scorer_10.get_coherence()
        scorer_20 = models.CoherenceModel(corpus=corpus,
                                          model=lda,
                                          dictionary=dictionary,
                                          coherence='u_mass',
                                          topn=20)
        value_20 = scorer_20.get_coherence()

        scores_top10.append(value_10)
        scores_top20.append(value_20)

    frame_10 = pandas.DataFrame(data=scores_top10, index=topic_counts)
    frame_20 = pandas.DataFrame(data=scores_top20, index=topic_counts)
    frame_10.to_csv("./Topic_Modeling/Evaluation/UMass_Score_10_words.csv",
                    sep=';',
                    decimal=',')
    frame_20.to_csv("./Topic_Modeling/Evaluation/UMass_Score_20_words.csv",
                    sep=';',
                    decimal=',')
def build_coherence_models(topic_model, **kwargs):
    """Evaluate one topic model under four coherence measures.

    Parameters
    ----------
    topic_model
        Trained model handed to ``models.CoherenceModel``; its ``num_topics``
        attribute is copied into the result.
    **kwargs
        Must contain ``corpus`` and ``dictionary`` (used by every measure) and
        ``texts`` (used by ``c_v``, ``c_uci`` and ``c_npmi``).

    Returns
    -------
    dict
        Keys ``num_topics``, ``u_mass``, ``c_v``, ``c_uci`` and ``c_npmi``.

    Raises
    ------
    KeyError
        If one of the required keyword arguments is missing.
    """
    corpus = kwargs['corpus']
    dictionary = kwargs['dictionary']

    # u_mass is built without texts, so it is constructed before 'texts' is read.
    scorers = [
        ('u_mass',
         models.CoherenceModel(model=topic_model,
                               corpus=corpus,
                               dictionary=dictionary,
                               coherence='u_mass')),
    ]
    texts = kwargs['texts']
    for measure in ('c_v', 'c_uci', 'c_npmi'):
        scorers.append(
            (measure,
             models.CoherenceModel(model=topic_model,
                                   texts=texts,
                                   corpus=corpus,
                                   dictionary=dictionary,
                                   coherence=measure)))

    report = {'num_topics': topic_model.num_topics}
    for measure, scorer in scorers:
        report[measure] = scorer.get_coherence()
    return report
def train(self, num_topics=None):
    """Train the LDA model, searching for a topic count when none is given.

    Parameters
    ----------
    num_topics : optional
        When not ``None`` it is passed straight to ``self._train`` and no
        search is performed.  When ``None``, models with 1, 2, 3, ... topics
        are trained in turn; after each ``self._train(num)`` call
        ``self.ldamodel`` is scored with c_v coherence against
        ``self.collection`` and ``self.dictionary``.  The search stops at the
        first non-improving candidate that is either more than 0.2 below the
        best score or has at least 20 topics.  ``self.ldamodel`` is then set
        to a deep copy of the best-scoring model.

    Raises
    ------
    ValueError
        If the search ends without any candidate producing a comparable
        coherence score (for example every score was NaN).
    """
    if num_topics is not None:
        self._train(num_topics)
        return

    best_model = None
    # Start below every real score so the first valid candidate is always
    # kept; seeding with 0 left no model to return when nothing scored > 0.
    best_coherence = float('-inf')

    for num in itertools.count(1):
        self._train(num)
        cm = models.CoherenceModel(
            model=self.ldamodel,
            texts=self.collection,
            dictionary=self.dictionary,
            coherence='c_v'
        )
        coherence = cm.get_coherence()

        if coherence > best_coherence:
            # Deep copy: the next _train() call replaces self.ldamodel.
            best_model = copy.deepcopy(self.ldamodel)
            best_coherence = coherence
        elif (best_coherence - coherence) > 0.2 or num >= 20:
            break

    if best_model is None:
        raise ValueError(
            "no topic count produced a valid c_v coherence score")
    self.ldamodel = best_model
def score(self, X, y=None):
    """Return a goodness-of-fit score for `X`; larger values are better.

    The measure is chosen by the `scorer` argument of
    :meth:`~gensim.sklearn_api.ldamodel.LdaTransformer`.

    Parameters
    ----------
    X : iterable of list of (int, number)
        Sequence of documents in BOW format.
    y : ignored
        Present only for signature compatibility.

    Returns
    -------
    float
        Negated perplexity for ``scorer == 'perplexity'``, or the ``u_mass``
        coherence (top 3 words per topic) for ``scorer == 'u_mass'``.

    Raises
    ------
    ValueError
        If `scorer` is neither ``'perplexity'`` nor ``'u_mass'``.

    """
    if self.scorer == 'perplexity':
        # Total token count over all documents.
        corpus_words = 0
        for document in X:
            for _, cnt in document:
                corpus_words += cnt

        subsample_ratio = 1.0
        bound = self.gensim_model.bound(X, subsample_ratio=subsample_ratio)
        perwordbound = bound / (subsample_ratio * corpus_words)
        # Negate the perplexity so that "higher is better" holds for model selection.
        return -1 * np.exp2(-perwordbound)

    if self.scorer == 'u_mass':
        coherence_model = models.CoherenceModel(model=self.gensim_model,
                                                corpus=X,
                                                coherence=self.scorer,
                                                topn=3)
        return coherence_model.get_coherence()

    raise ValueError(
        "Invalid value {} supplied for `scorer` param".format(self.scorer))
def compute_coherence_values(dictionary, tfidf_corpus, corpus, start, stop,
                             step):
    """
    Compute c_v coherence of LDA models for a range of topic counts.

    Input   : dictionary   : Gensim dictionary, used as id2word and for scoring
              tfidf_corpus : corpus the LDA models are trained on
              corpus       : not used by this function; kept so existing
                             callers keep working
              start, stop, step : topic counts are range(start, stop, step)
    Output  : coherence_values : list with one c_v score per topic count, in
                                 range order (empty when the range is empty)

    The texts handed to CoherenceModel come from the module-level name
    `formatted`, not from a parameter.
    """
    coherence_values = []
    for number_of_topics in range(start, stop, step):
        lda = models.LdaModel(tfidf_corpus,
                              num_topics=number_of_topics,
                              id2word=dictionary)
        coherencemodel = models.CoherenceModel(model=lda,
                                               texts=formatted,
                                               dictionary=dictionary,
                                               coherence='c_v')
        # Only the score is returned, so the model is not retained.
        coherence_values.append(coherencemodel.get_coherence())
    return coherence_values
def train(self, num_topics=None):
    """Train one LDA model per entry of ``self.collections``.

    Parameters
    ----------
    num_topics : indexable, optional
        When not ``None``, ``num_topics[i]`` is the topic count passed to
        ``self._train`` for the i-th collection and no search is performed.
        When ``None``, each collection is searched independently: models with
        1, 2, 3, ... topics are trained and scored with c_v coherence using
        the collection's ``'collection'`` and ``'dictionary'`` entries.  The
        search stops at the first non-improving candidate that is either more
        than 0.2 below the best score or has at least 20 topics.

    The resulting models are stored in ``self.ldamodels`` in collection
    order; in search mode each entry is a deep copy of the best candidate.

    Raises
    ------
    ValueError
        If the search for a collection ends without any candidate producing
        a comparable coherence score (for example every score was NaN).
    """
    if num_topics is not None:
        self.ldamodels = [
            self._train(num_topics[key], collection)
            for key, collection in enumerate(self.collections)
        ]
        return

    self.ldamodels = []
    for collection in self.collections:
        best_model = None
        # Start below every real score so the first valid candidate is
        # always kept; seeding with 0 left nothing to append when no
        # candidate scored above 0.
        best_coherence = float('-inf')

        for num in itertools.count(1):
            ldamodel = self._train(num, collection)
            cm = models.CoherenceModel(
                model=ldamodel,
                texts=collection['collection'],
                dictionary=collection['dictionary'],
                coherence='c_v')
            coherence = cm.get_coherence()

            if coherence > best_coherence:
                best_model = copy.deepcopy(ldamodel)
                best_coherence = coherence
            elif (best_coherence - coherence) > 0.2 or num >= 20:
                break

        if best_model is None:
            raise ValueError(
                "no topic count produced a valid c_v coherence score")
        self.ldamodels.append(best_model)
def coherence_cv(texts_file):
    """Score every saved LDA model with c_v coherence and export the results.

    `texts_file` is read as a CSV (``;`` delimiter, ``|`` quote character,
    UTF-8); every row is taken as one tokenized text.  For each topic count in
    ``range(min, max + 1, step)`` the model stored at
    ``./Topic_Modeling/Models/Topic_Model_<count>`` is loaded and its ``c_v``
    coherence is computed over the top 10 and top 20 words per topic with a
    single worker process.  Each series is written to its own CSV (``;``
    separator, decimal comma), indexed by topic count.

    Reads the module-level names ``min``, ``max``, ``step`` and
    ``dictionary``.  Returns nothing.
    """
    import csv

    with open(texts_file, newline='', encoding="utf-8") as csvfile:
        texts = list(csv.reader(csvfile, delimiter=';', quotechar='|'))

    topic_counts = []
    scores_top10 = []
    scores_top20 = []

    for count in range(min, max + 1, step):
        topic_counts.append(count)
        lda = models.LdaModel.load(
            "./Topic_Modeling/Models/Topic_Model_%i" % count)

        scorer_10 = models.CoherenceModel(texts=texts,
                                          model=lda,
                                          dictionary=dictionary,
                                          coherence='c_v',
                                          topn=10,
                                          processes=1)
        value_10 = scorer_10.get_coherence()
        scorer_20 = models.CoherenceModel(texts=texts,
                                          model=lda,
                                          dictionary=dictionary,
                                          coherence='c_v',
                                          topn=20,
                                          processes=1)
        value_20 = scorer_20.get_coherence()

        scores_top10.append(value_10)
        scores_top20.append(value_20)

    frame_10 = pandas.DataFrame(data=scores_top10, index=topic_counts)
    frame_20 = pandas.DataFrame(data=scores_top20, index=topic_counts)
    frame_10.to_csv("./Topic_Modeling/Evaluation/Cv_Score_10_words.csv",
                    sep=';',
                    decimal=',')
    frame_20.to_csv("./Topic_Modeling/Evaluation/Cv_Score_20_words.csv",
                    sep=';',
                    decimal=',')
def coherence_umass(model):
    """Return the ``u_mass`` coherence of `model` on the bigram input data.

    The corpus and dictionary are loaded from
    ``./Topic_Modeling_Bigram/Input_Data/`` on every call, and
    ``processes=-1`` is passed through to ``CoherenceModel``.
    """
    bigram_corpus = corpora.MmCorpus(
        "./Topic_Modeling_Bigram/Input_Data/corpus.mm")
    bigram_dictionary = corpora.dictionary.Dictionary.load_from_text(
        "./Topic_Modeling_Bigram/Input_Data/dictionary.dict")
    scorer = models.CoherenceModel(corpus=bigram_corpus,
                                   model=model,
                                   dictionary=bigram_dictionary,
                                   coherence='u_mass',
                                   processes=-1)
    return scorer.get_coherence()
def score(self, docs, model=None, coherence="c_v", return_per_topic=False):
    """Compute a coherence score (higher is better) for a topic model.

    Parameters
    ----------
    docs
        Passed to ``CoherenceModel`` as ``texts``.
    model : optional
        Model to evaluate; ``self.model`` is used when this is ``None``.
    coherence : str
        Name of the coherence measure, forwarded unchanged.
    return_per_topic : bool
        When true, the per-topic scores are returned as well.

    Returns
    -------
    The overall coherence, or a tuple ``(overall, per_topic)`` when
    `return_per_topic` is true.
    """
    # Identity check: `== None` would dispatch to the model's own __eq__.
    if model is None:
        model = self.model

    cm = models.CoherenceModel(
        model=model,
        texts=docs,
        dictionary=self.dictionary,
        coherence=coherence,
    )
    if return_per_topic:
        return cm.get_coherence(), cm.get_coherence_per_topic()
    return cm.get_coherence()
def score(self, X, y=None):
    """
    Return a goodness-of-fit score for `X`; larger values are better.

    With ``scorer == 'perplexity'`` the negated perplexity is returned, with
    ``scorer == 'u_mass'`` the ``u_mass`` coherence over the top 3 words per
    topic.  Any other `scorer` raises ``ValueError``.  `y` is ignored.
    """
    if self.scorer == 'perplexity':
        # Total token count over all documents.
        corpus_words = 0
        for document in X:
            for _, cnt in document:
                corpus_words += cnt

        subsample_ratio = 1.0
        bound = self.gensim_model.bound(X, subsample_ratio=subsample_ratio)
        perwordbound = bound / (subsample_ratio * corpus_words)
        # Negate the perplexity so that the model with minimum perplexity wins.
        return -1 * np.exp2(-perwordbound)

    if self.scorer == 'u_mass':
        coherence_model = models.CoherenceModel(model=self.gensim_model,
                                                corpus=X,
                                                coherence=self.scorer,
                                                topn=3)
        return coherence_model.get_coherence()

    raise ValueError("Invalid value of `scorer` param supplied")
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, start,
                             stop, step):
    """Train LSI models over a range of topic counts and score each with c_v.

    Topic counts are ``range(start, stop, step)``.  `doc_term_matrix` is the
    training corpus, `dictionary` is used as id2word and for scoring, and
    `doc_clean` is passed to ``CoherenceModel`` as ``texts``.

    Returns ``(model_list, coherence_values)``: two lists of equal length in
    range order (both empty when the range is empty).
    """
    scored_models = []
    for topic_count in range(start, stop, step):
        lsi = models.LsiModel(doc_term_matrix,
                              num_topics=topic_count,
                              id2word=dictionary)
        scorer = models.CoherenceModel(model=lsi,
                                       texts=doc_clean,
                                       dictionary=dictionary,
                                       coherence='c_v')
        scored_models.append((lsi, scorer.get_coherence()))

    model_list = [lsi for lsi, _ in scored_models]
    coherence_values = [value for _, value in scored_models]
    return model_list, coherence_values
def cv_score(corpus, dict_, k, alpha, eta):
    """Train an ``LdaMulticore`` model and return its ``c_v`` coherence.

    `k` is the number of topics; `alpha` and `eta` are forwarded as the
    model's priors.  The remaining training settings are fixed
    (``random_state=100``, ``chunksize=100``, ``passes=10``,
    ``per_word_topics=True``).  The texts used for scoring come from the
    module-level name ``texts``.
    """
    lda_settings = {
        'corpus': corpus,
        'id2word': dict_,
        'num_topics': k,
        'alpha': alpha,
        'eta': eta,
        'random_state': 100,
        'chunksize': 100,
        'passes': 10,
        'per_word_topics': True,
    }
    lda_model = models.LdaMulticore(**lda_settings)

    # Other measures accepted here: u_mass, c_uci, c_npmi.
    scorer = models.CoherenceModel(model=lda_model,
                                   texts=texts,
                                   corpus=corpus,
                                   dictionary=dict_,
                                   coherence="c_v")
    return scorer.get_coherence()
def lda_model_selection(corpus, id2word, r):
    """Train an LDA model per topic count in `r` and plot c_v coherence.

    Parameters
    ----------
    corpus
        Training corpus for ``models.LdaModel``.
    id2word
        Id-to-word mapping used for training.
    r : iterable of int
        Candidate topic counts; also used as the x values of the plot.

    Scoring reads the module-level names ``Text`` (texts) and ``dictionary``.
    Progress and every coherence value are printed, then a coherence-vs-topic
    plot is shown.  Returns nothing.
    """
    print('Selecting LDA models...')
    coherence_values = []
    for num_topics in r:
        print('Number of topics: %d' % num_topics)
        model = models.LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=id2word,
                                alpha='auto',
                                eta='auto',
                                minimum_probability=0.001,
                                passes=10)
        coherence_model = models.CoherenceModel(model=model,
                                                texts=Text,
                                                dictionary=dictionary,
                                                coherence='c_v')
        v = coherence_model.get_coherence()
        coherence_values.append(v)
        print('Coherence value: %f' % v)

    plt.plot(r, coherence_values)
    plt.xlabel('Num Topics')
    plt.ylabel('Coherence score')
    # Labels must be a sequence: a bare string is consumed one character per
    # line, which labelled the curve "c".
    plt.legend(['coherence_values'], loc='best')
    plt.show()
def get_optimal_ldamodel_by_coherence_values(corpus,
                                             texts,
                                             dictionary,
                                             stop=100,
                                             start=10,
                                             step=10):
    """
    Train one LDA model per candidate topic count and return the most coherent.

    Input   : corpus : Gensim corpus the models are trained on
              texts : List of input texts, used for c_v scoring
              dictionary : Gensim dictionary (id2word and scoring)
              stop, start, step : candidate topic counts are
                                  range(start, stop, step)
    purpose : Compute c_v coherence for various number of topics
    Output  : the trained LDA model with the highest c_v coherence (the first
              one when several share the maximum)
    Raises  : ValueError when range(start, stop, step) is empty
    """
    num_lists = range(start, stop, step)
    if not num_lists:
        # Fail before any work instead of surfacing numpy's argmax error.
        raise ValueError(
            "no candidate topic counts: range(%s, %s, %s) is empty" %
            (start, stop, step))

    coherence_values = []
    model_list = []
    for num_topics in num_lists:
        model = models.LdaModel(corpus=corpus,
                                num_topics=num_topics,
                                id2word=dictionary,
                                alpha='auto',
                                eta='auto',
                                eval_every=None)
        model_list.append(model)
        coherencemodel = models.CoherenceModel(model=model,
                                               texts=texts,
                                               dictionary=dictionary,
                                               coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    # Print the candidates themselves so they line up with the scores below.
    print("num_topics: %s" % str(list(num_lists)))
    print("coherence_values: %s" % str(coherence_values))
    max_ind = np.argmax(np.array(coherence_values))
    print("opt_num_topics: %s" % num_lists[max_ind])
    return model_list[max_ind]
def main():
    """Fit a 20-topic LDA model on the review CSVs and print diagnostics.

    Every file returned by ``searchFiles('./Reviews/IP/')`` is read as a
    UTF-8 CSV and the frames are concatenated.  The '내용' column of each row
    is multiplied by ``int(log2(2 + '공감수'))`` before vectorisation
    (presumably repeating the review text according to its vote count;
    verify against the data).  The vectoriser output is used to train the
    model, after which the sorted topics, the model output for the document
    at index 2, the ``log_perplexity`` value and the c_v coherence are
    printed.  Always returns ``None``.
    """
    frames = [
        pd.read_csv(path, encoding='utf-8')
        for path in searchFiles('./Reviews/IP/')
    ]
    docs = pd.concat(frames, ignore_index=True)

    def weighted_content(row):
        return row['내용'] * int(np.log2(2 + row['공감수']))

    docs['내용'] = docs.apply(weighted_content, axis=1)
    print('리뷰 읽기 끝')

    vect = GensimTfidfVectorizer(tokenizer=getNVM_lemma,
                                 n_gram=2,
                                 dir_path='.')
    texts = vect.fit_transform(docs['내용'])
    id2word = vect.get_id2word()
    data = vect.texts
    print('벡터화 끝')

    lda = models.LdaModel(corpus=texts,
                          id2word=id2word,
                          num_topics=20,
                          update_every=1,
                          chunksize=1000,
                          passes=10,
                          alpha='auto',
                          eta='auto',
                          per_word_topics=False)

    raw_topics = lda.show_topics(num_topics=20, num_words=20, formatted=False)
    topics = sorted(raw_topics, key=lambda entry: entry[0])
    pprint(topics)
    print('')

    sample_document = lda[texts][2]
    for row in sample_document:
        pprint(row)

    print(lda.log_perplexity(texts))
    cm = models.CoherenceModel(model=lda,
                               texts=data,
                               dictionary=id2word,
                               coherence='c_v')
    print(cm.get_coherence())
    return None
def LDA(self,
        tf_vector,
        K_range,
        PASS_range,
        ITER_range,
        alpha="auto",
        eta="auto",
        seed=7571):
    """Grid-search LDA hyper-parameters and keep the most coherent model.

    One ``models.LdaModel`` is trained on `tf_vector` for every combination
    of topic count (`K_range`), passes (`PASS_range`) and iterations
    (`ITER_range`); NumPy's global RNG is re-seeded with `seed` before each
    one.  Every model is scored with c_v coherence using ``self.split_list``
    and ``self.dic``.

    Sets ``self.LDA_MODEL_LIST``, ``self.LDA_EVAL_LIST`` (tuples of
    ``(K, passes, iterations, coherence)``), ``self.LDA_index``,
    ``self.LDA_param`` and ``self.BEST_MODEL``, and prints the winning
    parameters.  Raises ``ValueError`` when the grid is empty.
    """
    self.LDA_EVAL_LIST = []
    self.LDA_MODEL_LIST = []

    for K in K_range:
        for PASSES in PASS_range:
            for ITER in ITER_range:
                print("建模參數測試-主題數:{} PASS:{} Iter:{}".format(
                    K, PASSES, ITER))
                np.random.seed(seed)
                candidate = models.LdaModel(corpus=tf_vector,
                                            id2word=self.dic,
                                            alpha=alpha,
                                            eta=eta,
                                            num_topics=K,
                                            passes=PASSES,
                                            iterations=ITER)
                self.LDA_MODEL_LIST.append(candidate)

                evaluator = models.CoherenceModel(model=candidate,
                                                  texts=self.split_list,
                                                  dictionary=self.dic,
                                                  coherence='c_v')
                record = (K, PASSES, ITER, evaluator.get_coherence())
                self.LDA_EVAL_LIST.append(record)

    # Model comparison: pick the entry with the highest coherence (slot 3).
    indexed_records = enumerate(self.LDA_EVAL_LIST)
    self.LDA_index, self.LDA_param = max(indexed_records,
                                         key=lambda pair: pair[1][3])
    self.BEST_MODEL = self.LDA_MODEL_LIST[self.LDA_index]
    print("最佳參數-主題數:{} PASS:{} Iter:{} Eval:{}".format(*self.LDA_param))
def evaluate_graph(filename, dictionary, corpus, texts, limit):
    """Train LDA models for 1..limit-1 topics and plot their c_v coherence.

    Parameters
    ----------
    filename : str
        Prefix of the output file; the plot is saved as
        ``<filename>_coherence-topic.pdf``.
    dictionary, corpus, texts
        Passed to ``models.LdaModel`` / ``models.CoherenceModel``.
    limit : int
        Exclusive upper bound on the number of topics.

    Returns
    -------
    (dict, dict)
        ``lda_models`` and ``coherence_values``, both keyed by topic count.
    """
    coherence_values = {}
    lda_models = {}
    for num_topics in range(1, limit):
        lm = models.LdaModel(corpus=corpus,
                             num_topics=num_topics,
                             id2word=dictionary)
        lda_models[num_topics] = lm
        cm = models.CoherenceModel(model=lm,
                                   texts=texts,
                                   dictionary=dictionary,
                                   coherence='c_v')
        coherence_values[num_topics] = cm.get_coherence()

    x = list(coherence_values.keys())
    y = list(coherence_values.values())
    plt.plot(x, y)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # ("c_v") is just a string; a list is needed so the label is not split
    # into single characters.
    plt.legend(["c_v"], loc='best')
    # Save before show(): the figure may be gone once the window is closed,
    # which would write an empty PDF.
    plt.savefig(filename + "_coherence-topic.pdf", bbox_inches='tight')
    plt.show()
    return lda_models, coherence_values
def main():
    """Build a TF-IDF weighted LSI model and print the top documents per topic.

    Python 2 script entry point.  Documents come from
    ``get_input(input_file_path)`` and are processed in parallel with
    ``evaluate``.  The results feed a dictionary, a bag-of-words corpus, a
    TF-IDF model and an LSI model.  The per-topic u_mass coherence is printed,
    the number of topics to show is derived from the singular values, and for
    each shown topic the titles of the ten highest-weighted documents are
    printed.
    """
    st = time.time()
    print "Start Time: ", st
    documents = get_input(input_file_path)
    # NOTE(review): the pool is never closed or joined in this function.
    p = Pool(15)
    # Titles are looked up later by document position.
    urls = [row[0]['title'] for row in documents]
    individual_results = p.map(evaluate, documents)
    dictionary = corpora.Dictionary(individual_results)
    corpus = [dictionary.doc2bow(text) for text in individual_results]
    tfidf = gensim.models.TfidfModel(corpus)
    imp_corpus = tfidf[corpus]
    lsimodel = models.LsiModel(imp_corpus, id2word=dictionary)
    cohmodel = models.CoherenceModel(model=lsimodel, corpus=imp_corpus, coherence='u_mass')
    print 'Coherence:', cohmodel.get_coherence_per_topic()
    lsi_corpus = lsimodel[imp_corpus]
    # Use the singular values to choose how many components to use:
    # v holds each squared singular value divided by the sum of all of them.
    v = lsimodel.projection.s**2 / sum(lsimodel.projection.s**2)
    print v[:100]
    # argmin over the boolean mask is the index of the first entry of v that
    # is not above 0.005.
    k = np.argmin( v > 0.005 ) + 1  # Hard threshold, may be better to plot and find the knee
    topics = lsimodel.show_topics(num_topics=k, num_words=5)
    #topcis2 = ldamodel.get_topics()
    for i, topic in enumerate(topics):
        print topic
        # Rank (position, vector) pairs by the absolute weight on topic i,
        # treating a missing topic id as 0.0.
        tops = sorted(zip(range(len(lsi_corpus)), lsi_corpus), reverse=True, key=lambda doc: abs(dict(doc[1]).get(i, 0.0)))
        print 'Most relevant documents: '
        for top in tops[:10]:
            print urls[top[0]]
        print
    #print corpus[0]
    end = time.time()
    # Despite the label, this prints the elapsed time (end - st).
    print "End Time: ", end - st
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3,
                             mallet_path="../models/mallet-2.0.8/bin/mallet"):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between tried numbers of topics
    mallet_path : Path to the Mallet executable handed to the LdaMallet
        wrapper; defaults to the location this function previously
        hard-coded

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with
        respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = models.wrappers.LdaMallet(mallet_path,
                                          corpus=corpus,
                                          num_topics=num_topics,
                                          id2word=dictionary)
        model_list.append(model)
        coherencemodel = models.CoherenceModel(model=model,
                                               texts=texts,
                                               dictionary=dictionary,
                                               coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
def optimum_topics(corpus,list_topics,dictionary,iterations,processed_content):
    """Train an LDA model per topic count and pick the most coherent one.

    Each model is trained on deep copies of `corpus` and `dictionary` with
    the given number of `iterations`, then evaluated with
    ``log_perplexity(corpus)`` and c_v coherence against `processed_content`.
    On equal coherence the later candidate wins.

    Returns a 6-tuple: best model, its topic count, its coherence, the list
    of all coherences, the list of all ``log_perplexity`` values and the list
    of all trained models.  With an empty `list_topics` this is
    ``(None, 0, -10000, [], [], [])``.
    """
    best_coherence = -10000
    best_model = None
    best_topic_count = 0
    all_coherences = []
    all_perplexities = []
    trained_models = []

    for topic_count in list_topics:
        candidate = models.LdaModel(corpus=deepcopy(corpus),
                                    num_topics=topic_count,
                                    id2word=deepcopy(dictionary),
                                    iterations=iterations)
        trained_models.append(candidate)

        all_perplexities.append(candidate.log_perplexity(corpus))

        scorer = models.CoherenceModel(model=candidate,
                                       texts=processed_content,
                                       dictionary=dictionary,
                                       coherence='c_v')
        candidate_coherence = scorer.get_coherence()
        all_coherences.append(candidate_coherence)

        if candidate_coherence >= best_coherence:
            best_topic_count = topic_count
            best_coherence = candidate_coherence
            best_model = candidate

    return (best_model, best_topic_count, best_coherence, all_coherences,
            all_perplexities, trained_models)
def get_coherence_value(dictionary, doc_term_matrix, tokenized_list,
                        max_topics, processors):
    """Find the LSI topic count with the highest c_v coherence.

    Parameters
    ----------
    dictionary
        Gensim dictionary (id2word and scoring).
    doc_term_matrix
        Corpus the LSI models are trained on.
    tokenized_list
        Texts passed to ``CoherenceModel``.
    max_topics : int
        Exclusive upper bound; topic counts 2 .. max_topics - 1 are tried.
    processors : int
        Forwarded to ``CoherenceModel`` as ``processes``.

    Returns
    -------
    (best_model, coherence_value, best_num_topics)
        ``(None, 0, 0)`` when no candidate scores above 0 or the range is
        empty.  On equal scores the earlier candidate is kept.
    """
    coherence_value = 0
    best_model = None
    best_num_topics = 0
    for num_topics in range(2, max_topics, 1):
        model = models.LsiModel(doc_term_matrix,
                                num_topics=num_topics,
                                id2word=dictionary)
        coherencemodel = models.CoherenceModel(model=model,
                                               texts=tokenized_list,
                                               dictionary=dictionary,
                                               coherence='c_v',
                                               processes=processors)
        # Evaluate once per candidate; the score was previously recomputed
        # for every improving model.
        candidate_coherence = coherencemodel.get_coherence()
        if coherence_value < candidate_coherence:
            coherence_value = candidate_coherence
            best_model = model
            best_num_topics = num_topics
    return best_model, coherence_value, best_num_topics
# Second corpus file: one document per line, tokens separated by single spaces.
corpus2 = []
with open(path2, 'r', encoding='utf-8') as f:
    # Iterate the file lazily; the with-block closes it, so no explicit
    # close() is needed.
    for line in f:
        corpus2.append(line.strip().split(' '))

# Train a 20-topic LDA model on both corpora combined.
corpus0 = corpus1 + corpus2
id2word = corpora.Dictionary(corpus0)
corpus = [id2word.doc2bow(text) for text in corpus0]
ldamodel = models.ldamodel.LdaModel(iterations=200,
                                    corpus=corpus,
                                    num_topics=20,
                                    id2word=id2word)
for i in range(20):
    print('第{0}个主题的信息****************8'.format(i))
    print(ldamodel.show_topic(topicid=i, topn=20))

op = ldamodel.get_topics()
print(op)
coh = models.CoherenceModel(ldamodel,
                            corpus=corpus,
                            dictionary=id2word,
                            coherence='u_mass')
print(coh.get_coherence())

# Cluster the rows of the get_topics() output into 5 groups with k-means.
kmean = KMeans(n_clusters=5)
kmean.fit(op)
pre_kmean = kmean.predict(op)
data = pd.DataFrame(op)
# Scatter of columns 45 and 187 of that matrix, coloured by k-means cluster.
plt.scatter(data[45], data[187], c=pre_kmean)
plt.show()

# Hierarchical (Ward) clustering of the same rows, cut at distance 3.
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib import pyplot as plt
Z = linkage(data, 'ward')
f = fcluster(Z, 3, 'distance')
fig = plt.figure(figsize=(5, 3))
dn = dendrogram(Z)
def run_experiment(num_iterations):
    """Average perplexity and coherence over repeated runs and plot them.

    For every word count 1..10 and each of three input variants (name only,
    description only, both) a model is obtained from ``get_clusters`` for
    'Arts & Culture' events in 'San Francisco'.  Its ``log_perplexity`` value
    and c_v coherence are summed over `num_iterations` repetitions and then
    divided by `num_iterations`.  The averages are plotted against the number
    of words and saved as ``perplexity.png`` and ``coherence_score.png``.

    Parameters
    ----------
    num_iterations : int
        Number of repetitions to average over.

    Returns
    -------
    None

    Raises
    ------
    ZeroDivisionError
        If `num_iterations` is 0.
    """
    iterations = num_iterations
    max_number_words = 10
    min_word_length = 3
    city = 'San Francisco'
    category = 'Arts & Culture'

    # (plot label, use_name flag, use_description flag); models are fitted in
    # this order for every word count.
    variants = (
        ('Use name', True, False),
        ('Use description', False, True),
        ('Use both', True, True),
    )
    perplexity_sums = {
        label: [0] * max_number_words for label, _, _ in variants
    }
    coherence_sums = {
        label: [0] * max_number_words for label, _, _ in variants
    }

    for iteration in range(iterations):
        print("Iteration: " + str(iteration))
        for number_words in range(max_number_words):
            print("Num words: " + str(number_words))
            for label, use_name, use_description in variants:
                perplexity, coherence = _score_event_model(
                    city, category, number_words + 1, min_word_length,
                    use_name, use_description)
                # Aggregate scores
                perplexity_sums[label][number_words] += perplexity
                coherence_sums[label][number_words] += coherence

    # Average the scores
    perplexity_series = [
        (label, [total / iterations for total in perplexity_sums[label]])
        for label, _, _ in variants
    ]
    coherence_series = [
        (label, [total / iterations for total in coherence_sums[label]])
        for label, _, _ in variants
    ]
    num_words = [number_words + 1 for number_words in range(max_number_words)]

    # Save graphs containing results
    _plot_metric(num_words, perplexity_series, 'Perplexity',
                 'Perplexity vs Number of Words in Topic', 'perplexity.png')
    # Clear the figure so the second plot starts empty.
    plt.clf()
    _plot_metric(num_words, coherence_series, 'Coherence score',
                 'Coherence score vs Number of Words in Topic',
                 'coherence_score.png')
    return


def _score_event_model(city, category, number_words, min_word_length,
                       use_name, use_description):
    """Fit one model via get_clusters and return (log_perplexity, c_v coherence)."""
    [clusters, lda_model, corpus, tokenized_items,
     dictionary_LDA] = get_clusters(city, category, 'event', 10, number_words,
                                    min_word_length, use_name,
                                    use_description)
    perplexity = lda_model.log_perplexity(corpus)
    coherence_model_lda = models.CoherenceModel(model=lda_model,
                                                texts=tokenized_items,
                                                dictionary=dictionary_LDA,
                                                coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    return perplexity, coherence


def _plot_metric(num_words, labelled_series, ylabel, title, filename):
    """Plot one line per (label, values) pair against num_words and save it."""
    plt.xlabel('Number of words')
    plt.ylabel(ylabel)
    plt.title(title)
    for label, values in labelled_series:
        plt.plot(num_words, values, label=label)
    plt.legend(loc='upper right')
    plt.savefig(filename)
coherence_values = [] for num_topics in r: print('Number of topics: %d' % num_topics) model = models.LdaModel(corpus, num_topics=num_topics, id2word=id2word, alpha='auto', eta='auto', minimum_probability=0.001, passes=10) model_list.append(model) coherence_model = models.CoherenceModel(model=model, texts=Text, dictionary=dictionary, coherence='c_v') v = coherence_model.get_coherence() coherence_values.append(v) print('Coherence value: %f' % v) plt.plot(r, coherence_values) plt.xlabel('Num Topics') plt.ylabel('Coherence score') plt.legend('coherence_values', loc='best') plt.show() if __name__ == '__main__': Text = texts.Text dictionary = corpora.Dictionary(Text) corpus = [dictionary.doc2bow(text) for text in Text] corpus_tfidf = models.TfidfModel(corpus)[corpus] lda_model_selection(corpus_tfidf, dictionary, range(3, 30, 3)) print('Getting HDP model coherence...') hdp = models.HdpModel(corpus_tfidf, id2word=dictionary) coherence_model = models.CoherenceModel(model=hdp, texts=Text, dictionary=dictionary, coherence='c_v') v = coherence_model.get_coherence() print('Coherence value: %f' % v)
def get_umass(corpus, num_topics, dictionary):
    """Train an LDA model and return its ``u_mass`` topic-coherence score.

    The model is trained on `corpus` with `num_topics` topics, using
    `dictionary` as id2word, and is scored on the same corpus.
    """
    lda = models.LdaModel(corpus=corpus,
                          num_topics=num_topics,
                          id2word=dictionary)
    scorer = models.CoherenceModel(model=lda,
                                   corpus=corpus,
                                   dictionary=dictionary,
                                   coherence="u_mass")
    return scorer.get_coherence()
def main():
    """Build a TF-IDF weighted LSI model and export documents for three topics.

    Python 2 script entry point.  Documents come from
    ``get_input(input_file_path)`` and are processed in parallel with
    ``evaluate``.  The results feed a dictionary, a bag-of-words corpus, a
    TF-IDF model and an LSI model.  The first 15 topics are printed, and for
    each of them the documents are collected in descending order of absolute
    topic weight while that weight stays above 0.3.  The document lists for
    positions 0, 2 and 7 are passed to ``appendArticles``.
    """
    st = time.time()
    print "Start Time: ", st
    documents = get_input(input_file_path)
    # NOTE(review): the pool is never closed or joined in this function.
    p = Pool(15)
    #urls = [row[0]['URL_s'] for row in documents]
    individual_results = p.map(evaluate, documents)
    dictionary = corpora.Dictionary(individual_results)
    corpus = [dictionary.doc2bow(text) for text in individual_results]
    tfidf = gensim.models.TfidfModel(corpus)
    imp_corpus = tfidf[corpus]
    #LSA
    lsimodel = models.LsiModel(imp_corpus, id2word=dictionary)
    #LDA
    # lda_model = gensim.models.ldamodel.LdaModel(corpus=imp_corpus,
    #                                             id2word=dictionary,
    #                                             num_topics=6,
    #                                             random_state=100,
    #                                             update_every=1,
    #                                             chunksize=100,
    #                                             passes=10,
    #                                             alpha='auto',
    #                                             per_word_topics=True)
    #cohmodel = models.CoherenceModel(model=lda_model, corpus=imp_corpus, coherence='u_mass')
    # NOTE(review): cohmodel is built but not read again in this function.
    cohmodel = models.CoherenceModel(model=lsimodel, corpus=imp_corpus, coherence='u_mass')
    # print 'Coherence:', cohmodel.get_coherence_per_topic()
    lsi_corpus = lsimodel[imp_corpus]
    #lsi_corpus = lda_model[imp_corpus]
    # Use the singular values to choose how many components to use
    # NOTE(review): v is not read again now that the threshold below is
    # commented out.
    v = lsimodel.projection.s**2 / sum(lsimodel.projection.s**2)
    #print v[:100]
    #k = np.argmin(v>0.005)+1 # Hard threshold, may be better to plot and find the knee
    #At the moment just print out 15 topics
    topics = lsimodel.show_topics(num_topics=15, num_words=5)
    #topics = lda_model.show_topics(num_topics=15, num_words=5)
    #topcis2 = ldamodel.get_topics()
    for i, topic in enumerate(topics):
        print topic
    # One list of document positions per printed topic, in the same order.
    listOfDocsPerTopic = []
    for i, topic in enumerate(topics):
        articles = []
        # Rank (position, vector) pairs by the absolute weight on topic i,
        # treating a missing topic id as 0.0.
        tops = sorted(zip(range(len(lsi_corpus)), lsi_corpus), reverse=True, key=lambda doc: abs(dict(doc[1]).get(i, 0.0)))
        # NOTE(review): [i] indexes the vector by position, unlike the
        # dict lookup used for sorting -- confirm vectors are dense.
        curr = tops[0][1][i][1]
        j = 0
        # The document is appended before its weight is tested, so the first
        # document at or below 0.3 is still included.
        while (abs(curr) > 0.3 and j < len(tops)):
            top = tops[j]
            j += 1
            curr = top[1][i][1]
            articles.append(top[0])
        listOfDocsPerTopic.append(articles)
    appendArticles(listOfDocsPerTopic[0], "topic-big-0", documents)
    appendArticles(listOfDocsPerTopic[2], "topic-big-2", documents)
    # NOTE(review): index 7 is written under the label "topic-big-9" --
    # confirm this mismatch is intended.
    appendArticles(listOfDocsPerTopic[7], "topic-big-9", documents)
    end = time.time()
    # Despite the label, this prints the elapsed time (end - st).
    print "End Time: ", end - st
topicanzahl_liste.append(topicanzahl) dateiname_model1 = "DTM_%i_Topics.model" % (topicanzahl) model1 = utils.SaveLoad.load(dateiname_model1) dateiname_corpus1 = "Korpus_2000_bis_2013.mm" corpus1 = corpora.MmCorpus(dateiname_corpus1) dateiname_dictionary1 = "Dictionary_2000_bis_2013.dict" dictionary1 = corpora.dictionary.Dictionary.load_from_text( dateiname_dictionary1) coherence = [] coherence_model.append(coherence) for time1 in range(0, Endjahr - Anfangsjahr + 1, 1): topics_dtm = model1.dtm_coherence(time1) cm = models.CoherenceModel(topics=topics_dtm, dictionary=dictionary1, corpus=corpus1, coherence='u_mass').get_coherence() coherence.append(cm) jahressequenz = [] for i in range(Anfangsjahr, Endjahr + 1, 1): jahressequenz.append(i) x = pandas.DataFrame(data=coherence_model, columns=jahressequenz, index=topicanzahl_liste) a = pandas.DataFrame.transpose(x) dateiname_evaluation = "DTM_Bigramm_Evaluation_%i_%i.csv" % (start, end) pandas.DataFrame(data=a).to_csv(dateiname_evaluation, sep=';') print("Benötigte Zeit: %0.3fs." % (time() - t0))
t0 = time() min = 1 max = 100 step = 1 corpus = corpora.MmCorpus("./Topic_Modeling/Input_Data/corpus.mm") dictionary = corpora.dictionary.Dictionary.load_from_text( "./Topic_Modeling/Input_Data/dictionary.dict") topicanzahl = [] coherence10 = [] coherence20 = [] for i in range(min, max + 1, step): topicanzahl.append(i) model = models.LdaModel.load("./Topic_Modeling/Models/Topic_Model_%i" % i) u_mass_10 = models.CoherenceModel(corpus=corpus, model=model, dictionary=dictionary, coherence='u_mass', topn=10).get_coherence() u_mass_20 = models.CoherenceModel(corpus=corpus, model=model, dictionary=dictionary, coherence='u_mass', topn=20).get_coherence() coherence10.append(u_mass_10) coherence20.append(u_mass_20) top10a = pandas.DataFrame(data=coherence10, index=topicanzahl) top20a = pandas.DataFrame(data=coherence20, index=topicanzahl) pandas.DataFrame(top10a).to_csv( "./Topic_Modeling/Evaluation/UMass_Score_10_words.csv",
) print(res) print(f"Execution time: {(time.time() - start_time)/60} mins") #https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#12buildingthetopicmodel visualize_topics(lda_model, corpus, id2word, cv) # Compute Coherence Score #TODO: test why it returns nan d = corpora.Dictionary() word2id = dict((k, v) for k, v in cv.vocabulary_.items()) d.id2word = id2word d.token2id = word2id coherence_model_lda = models.CoherenceModel(model=lda_model, texts=list(df['text']), dictionary=d, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print(f"Coherence Score: {coherence_lda}") #trick: look at nouns only; by default it looks at all words as being the same # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html def get_nouns(text): ''' Given a string of text, tokenize the text and pull out only the nouns. ''' is_noun = lambda pos: pos[:2] == 'NN' tokenized_text = word_tokenize(text) all_nouns = [
# Look up the texts of the ten leading entries of sort_sims; the first
# element of each entry is used as an index into texts.
top10 = sort_sims[:10]
top10doc = [texts[j[0]] for j in top10]
print(top10doc)

############################
# Train lda model
lda = models.LdaModel(corpus_tfidf, id2word=dct, num_topics=100)

# Compute Perplexity
# NOTE: log_perplexity returns the per-word likelihood bound, not the
# perplexity itself, so a higher (less negative) value indicates a better
# fit; the perplexity would be 2 ** (-bound).
print('\nPerplexity: ', lda.log_perplexity(corpus_tfidf))

# Compute Coherence Score
coherence_model_lda = models.CoherenceModel(model=lda, texts=texts, dictionary=dct, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# =============================================================================
# # Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda, corpus_tfidf, dct)
# vis
# =============================================================================

###########################
# Cross-validation of LDA
sentiment = df['Recommended IND']
#