def testPersistenceCompressed(self):
    fname = testfile() + '.gz'
    model = CoherenceModel(
        topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
    model.save(fname)
    model2 = CoherenceModel.load(fname)
    self.assertTrue(model.get_coherence() == model2.get_coherence())
def EvaluateCoherence(self, model, coherence='c_v'):
    """Evaluate the coherence of the LDA model for one or more coherence measures."""
    logger = logging.getLogger(__name__)
    if isinstance(coherence, str):
        coherence = [coherence]
    elif not isinstance(coherence, list):
        raise ValueError("coherence must be a string or a list of strings")
    values = dict()
    supported = set(['u_mass', 'c_v', 'c_uci', 'c_npmi'])
    for ctype in coherence:
        if ctype not in supported:
            logger.warning(
                "Coherence evaluation for type %s is not supported, ignored. "
                "Only supported types: %s", ctype, str(supported))
            continue
        cm = CoherenceModel(model=model, texts=self.tokenizedDocs, corpus=self.corpus,
                            dictionary=self.id2token, coherence=ctype, topn=10)
        values[ctype] = cm.get_coherence()
        # Append to the running log of coherence values for this measure
        if ctype == 'u_mass':
            self.u_mass_list.append(values[ctype])
        elif ctype == 'c_v':
            self.c_v_list.append(values[ctype])
        elif ctype == 'c_uci':
            self.c_uci_list.append(values[ctype])
        else:
            self.c_npmi_list.append(values[ctype])
        logger.info("Coherence type: %s, coherence value = %.6f", ctype, values[ctype])
    return values
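# For reference, a minimal standalone sketch of the same multi-measure loop on toy
# data (illustrative only; the toy_* names below are not from the source).
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

toy_texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
]
toy_dictionary = Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(t) for t in toy_texts]
toy_lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2)

for ctype in ('u_mass', 'c_v'):
    # u_mass is estimated from the corpus; c_v from the tokenized texts
    cm = CoherenceModel(model=toy_lda, texts=toy_texts, corpus=toy_corpus,
                        dictionary=toy_dictionary, coherence=ctype, topn=5)
    print(ctype, cm.get_coherence())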
def testPersistence(self):
    fname = get_tmpfile('gensim_models_coherence.tst')
    model = CoherenceModel(
        topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
    )
    model.save(fname)
    model2 = CoherenceModel.load(fname)
    self.assertTrue(model.get_coherence() == model2.get_coherence())
def testAccumulatorCachingTopicSubsets(self):
    kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
    cm1 = CoherenceModel(topics=self.topics1, **kwargs)
    cm1.estimate_probabilities()
    accumulator = cm1._accumulator
    self.assertIsNotNone(accumulator)
    cm1.topics = [t[:2] for t in self.topics1]
    self.assertEqual(accumulator, cm1._accumulator)
    cm1.topics = self.topics1
    self.assertEqual(accumulator, cm1._accumulator)
def check_coherence_measure(self, coherence):
    """Check the provided topic coherence algorithm on the given topics."""
    if coherence in BOOLEAN_DOCUMENT_BASED:
        kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence=coherence)
    else:
        kwargs = dict(texts=self.texts, dictionary=self.dictionary, coherence=coherence)
    cm1 = CoherenceModel(topics=self.topics1, **kwargs)
    cm2 = CoherenceModel(topics=self.topics2, **kwargs)
    self.assertGreater(cm1.get_coherence(), cm2.get_coherence())
def testAccumulatorCachingWithModelSetting(self):
    kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
    cm1 = CoherenceModel(topics=self.topics1, **kwargs)
    cm1.estimate_probabilities()
    self.assertIsNotNone(cm1._accumulator)

    cm1.model = self.ldamodel
    topics = []
    for topic in self.ldamodel.state.get_lambda():
        bestn = argsort(topic, topn=cm1.topn, reverse=True)
        topics.append(bestn)
    self.assertTrue(np.array_equal(topics, cm1.topics))
    self.assertIsNone(cm1._accumulator)
def testAccumulatorCachingWithTopnSettingGivenModel(self):
    kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, topn=5, coherence='u_mass')
    cm1 = CoherenceModel(model=self.ldamodel, **kwargs)
    cm1.estimate_probabilities()
    self.assertIsNotNone(cm1._accumulator)
    accumulator = cm1._accumulator
    topics_before = cm1._topics

    cm1.topn = 3
    self.assertEqual(accumulator, cm1._accumulator)
    self.assertEqual(3, len(cm1.topics[0]))
    self.assertEqual(topics_before, cm1._topics)

    cm1.topn = 6  # should be able to expand given the model
    self.assertEqual(6, len(cm1.topics[0]))
def testPersistenceAfterProbabilityEstimationUsingTexts(self):
    fname = testfile()
    model = CoherenceModel(
        topics=self.topics1, texts=self.texts, dictionary=self.dictionary, coherence='c_v')
    model.estimate_probabilities()
    model.save(fname)
    model2 = CoherenceModel.load(fname)
    self.assertIsNotNone(model2._accumulator)
    self.assertTrue(model.get_coherence() == model2.get_coherence())
def testPersistenceAfterProbabilityEstimationUsingCorpus(self):
    fname = get_tmpfile('gensim_similarities.tst.pkl')
    model = CoherenceModel(
        topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
    )
    model.estimate_probabilities()
    model.save(fname)
    model2 = CoherenceModel.load(fname)
    self.assertIsNotNone(model2._accumulator)
    self.assertTrue(model.get_coherence() == model2.get_coherence())
def checkCoherenceMeasure(topics1, topics2, coherence):
    """Check the provided topic coherence algorithm on the given topics."""
    if coherence in boolean_document_based:
        cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence)
        cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence)
    else:
        cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
        cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence)
    return cm1.get_coherence() > cm2.get_coherence()
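# A hedged sketch of the module-level fixtures checkCoherenceMeasure relies on
# (the real test module defines its own texts and topics; the values here are
# assumptions for illustration only).
from gensim.corpora import Dictionary

boolean_document_based = {'u_mass'}
texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
topics1 = [['human', 'computer'], ['system', 'user']]
topics2 = [['time', 'eps'], ['response', 'survey']]
print(checkCoherenceMeasure(topics1, topics2, 'u_mass'))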
def testAccumulatorCachingSameSizeTopics(self):
    kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
    cm1 = CoherenceModel(topics=self.topics1, **kwargs)
    cm1.estimate_probabilities()
    accumulator = cm1._accumulator
    self.assertIsNotNone(accumulator)
    cm1.topics = self.topics1
    self.assertEqual(accumulator, cm1._accumulator)
    cm1.topics = self.topics2
    self.assertIsNone(cm1._accumulator)
def evaluation(randomint, num_topics, n_iter, n_partition, result_path, result_folder, data_path, par):
    tempdf = pd.read_csv(data_path)
    mydictionary = copy.deepcopy(par['word_token'])

    # Read the top words per topic
    result_file = (result_path + result_folder + str(num_topics) + 'topics'
                   + str(n_iter) + 'iteration_' + 'topic_important_words_sep.csv')
    wordtemp = []
    with open(result_file) as f:
        reader = csv.reader(f, delimiter='\n')
        for row in reader:
            wordtemp.append(row)

    # Read the corresponding word probabilities per topic
    result_file = (result_path + result_folder + '/' + str(num_topics) + 'topics'
                   + str(n_iter) + 'iteration_' + 'topic_important_words_probs_sep.csv')
    probtemp = []
    with open(result_file) as f:
        reader = csv.reader(f, delimiter='\n')
        for row in reader:
            probtemp.append(row)

    all_dic = []
    for i in range(len(wordtemp)):  # for each topic
        wordtemp[i] = str(wordtemp[i]).replace('[', '').replace(']', '').replace(
            '"', '').replace("'", '').replace(',', ' ').split()
        probtemp[i] = str(probtemp[i]).replace('[', '').replace(']', '').replace(
            '"', '').replace("'", '').replace(',', ' ').split()
        probtemp[i] = np.array(probtemp[i]).astype(float)
        all_dic.append(dict(zip(wordtemp[i], probtemp[i])))

    id2word = corpora.Dictionary([list(mydictionary)])
    my_data = tempdf.nofreq.apply(
        lambda x: x.replace('[', '').replace(']', '').replace('"', '').replace(
            "'", '').replace(' ', '').replace(',', ' ').split()).values
    my_topics_ls = [list(all_dic[i].keys()) for i in range(len(all_dic))]
    cm = CoherenceModel(topics=my_topics_ls, texts=my_data, dictionary=id2word, coherence='c_v')
    coherence = cm.get_coherence()
    print('Coherence Score: ', coherence)
def compute_coherence_values(dictionary, bow_corpus, texts, limit, start=2, step=3):
    """
    Train LDA models over a range of topic counts and track c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    bow_corpus : Gensim bag-of-words corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    best_model : LDA model with the highest coherence
    best_topic : Number of topics of the best model
    coherence_values : Coherence values corresponding to the LDA model with
        respective number of topics
    """
    coherence_values = []
    best_coherence = 0
    best_model = None
    best_topic = 0
    for num_topics in range(start, limit, step):
        print('Training with', num_topics, 'topics')
        lda_model = gensim.models.LdaMulticore(
            bow_corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=15,
            workers=8,
            minimum_probability=0.04,
            random_state=50,
            alpha=1e-2,
            chunksize=4000,
            eta=0.5e-2,
        )
        coherencemodel = CoherenceModel(model=lda_model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence = coherencemodel.get_coherence()
        print("Num Topics =", num_topics, " coherence =", round(coherence, 4))
        if best_coherence < round(coherence, 4):
            best_coherence = round(coherence, 4)
            best_model = lda_model
            best_topic = num_topics
        coherence_values.append(coherence)
    return best_model, best_topic, coherence_values
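# Possible driver for compute_coherence_values (a sketch; assumes `docs` is a list
# of token lists prepared elsewhere).
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
best_model, best_topic, coherence_values = compute_coherence_values(
    dictionary, bow_corpus, texts=docs, limit=40, start=2, step=6)
print('Best number of topics:', best_topic)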
def testCompareCoherenceForModels(self):
    models = [self.ldamodel, self.ldamodel]
    cm = CoherenceModel.for_models(
        models, dictionary=self.dictionary, texts=self.texts, coherence='c_v')
    self.assertIsNotNone(cm._accumulator)

    # Accumulator should have all relevant IDs.
    for model in models:
        cm.model = model
        self.assertIsNotNone(cm._accumulator)

    (coherence_topics1, coherence1), (coherence_topics2, coherence2) = \
        cm.compare_models(models)
    self.assertAlmostEqual(np.mean(coherence_topics1), coherence1, 4)
    self.assertAlmostEqual(np.mean(coherence_topics2), coherence2, 4)
    self.assertAlmostEqual(coherence1, coherence2, places=4)
def testCompareCoherenceForTopics(self):
    topics = [self.topics1, self.topics2]
    cm = CoherenceModel.for_topics(
        topics, dictionary=self.dictionary, texts=self.texts, coherence='c_v')
    self.assertIsNotNone(cm._accumulator)

    # Accumulator should have all relevant IDs.
    for topic_list in topics:
        cm.topics = topic_list
        self.assertIsNotNone(cm._accumulator)

    (coherence_topics1, coherence1), (coherence_topics2, coherence2) = \
        cm.compare_model_topics(topics)
    self.assertAlmostEqual(np.mean(coherence_topics1), coherence1, 4)
    self.assertAlmostEqual(np.mean(coherence_topics2), coherence2, 4)
    self.assertGreater(coherence1, coherence2)
def getCoherency(d, corp, topics=10, coherence='u_mass', varyTopics=False):
    if varyTopics:
        # Pick the best coherence over a range of topic counts
        coherencies = []
        for topic in range(5, 16):
            m = LdaModel(corp, topic, d)
            c = CoherenceModel(model=m, corpus=corp, coherence=coherence)
            coherencies.append(c.get_coherence())
        return np.max(coherencies)
    m1 = LdaModel(corp, topics, d)
    cm = CoherenceModel(model=m1, corpus=corp, coherence=coherence)
    return cm.get_coherence()
def fit_model(corpora, dictionary, topicNum, beta):
    corpus = [dictionary.doc2bow(text) for text in corpora]
    model = LdaTransformer(id2word=dictionary, num_topics=topicNum, alpha='auto',
                           eta=beta, iterations=100, random_state=2019)
    lda = model.fit(corpus)
    # docvecs = lda.transform(corpus)
    coherence = evaluateModel(lda.gensim_model, corpora)
    try:
        cm = CoherenceModel(model=lda.gensim_model, corpus=corpus,
                            dictionary=dictionary, coherence='u_mass')
        u_mass = cm.get_coherence()
        cm = CoherenceModel(model=lda.gensim_model, texts=corpora, coherence='c_uci')
        c_uci = cm.get_coherence()
        cm = CoherenceModel(model=lda.gensim_model, texts=corpora, coherence='c_npmi')
        c_npmi = cm.get_coherence()
        saveModelConfigs(lda, coherence, u_mass, c_uci, c_npmi, config_path)
    except Exception:
        saveModelConfigs(lda, coherence, "Invalid", "Invalid", "Invalid", config_path)
    # return lda.gensim_model, docvecs
    return lda.gensim_model
def find_best_coherence(tokens, range_num_topics):
    dct = corpora.Dictionary(tokens)
    coherences = []
    for i in range(range_num_topics):
        mod = train_LSIModel(tokens, i + 1)
        coherences.append(CoherenceModel(model=mod, texts=tokens, dictionary=dct,
                                         coherence='c_v').get_coherence())
    # Find the number of topics with the maximum coherence
    maximum = coherences[0]
    max_index = 1
    for i in range(len(coherences)):
        if coherences[i] > maximum:
            max_index = i + 1
            maximum = coherences[i]
    print(str(max_index) + " topics: coherence " + str(maximum))
    return max_index
def get_coherence(self, docs=None, dictionary=None, corpus=None, n_terms=10):
    topics = self.get_topics(n_terms=n_terms)
    if not dictionary and not corpus:
        dictionary = Dictionary(docs)
        corpus = [dictionary.doc2bow(t) for t in docs]
    # topn must match the number of terms per topic, not the number of topics
    return CoherenceModel(topn=n_terms, texts=docs, topics=topics.values,
                          corpus=corpus, dictionary=dictionary,
                          coherence='c_npmi').get_coherence()
def create_models(self):
    file_name = (self.folder_path + self.algorithm + '/' +
                 get_range_file_name() + ".csv")
    c_v_list = []
    for i in self.num_topics:
        print(i)
        model = self.get_model(i)
        c_v_list.append(
            CoherenceModel(model=model, texts=self.dataset,
                           corpus=self.corpus_tfidf,
                           coherence='c_v').get_coherence())
    coherence_scores_df = pd.DataFrame({
        'num_topics': self.num_topics,
        'c_v': c_v_list,
    })
    coherence_scores_df.to_csv(file_name)
    self.__plot_coherence_scores(c_v_list, "c_v")
    print("models created")
class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows,
                              id2word=self.myDictionary.dictionary,
                              num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        terms = self.model.get_topic_terms(topic)
        return terms

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model,
                                                 corpus=self.myDictionary.doc2bows,
                                                 dictionary=self.myDictionary.dictionary,
                                                 coherence='u_mass')
        return self.coherenceModel.get_coherence()
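# Usage sketch for MyLda, assuming the imports used by the class above are in
# scope and substituting a small hypothetical wrapper with the `dictionary` and
# `doc2bows` attributes the class expects.
from collections import namedtuple
from gensim.corpora import Dictionary

toy_texts = [['cat', 'dog', 'pet'], ['dog', 'bone', 'pet'], ['stock', 'market', 'price']]
toy_dictionary = Dictionary(toy_texts)
MyDictionary = namedtuple('MyDictionary', ['dictionary', 'doc2bows'])
my_dictionary = MyDictionary(toy_dictionary,
                             [toy_dictionary.doc2bow(t) for t in toy_texts])
my_lda = MyLda(my_dictionary, num_topics=2)
print(my_lda.get_coherence())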
def getCoherenceScores(nTopics):
    model = DtmModel(path_to_dtm_binary, corpus=corpus, num_topics=nTopics,
                     id2word=dictionary, time_slices=timeSlice)
    model.save(f'./Models/model{nTopics}Topics')
    wordRepresentationTopics = [
        model.dtm_coherence(time=time) for time in range(len(timeSlice))
    ]
    coherenceModels = [
        CoherenceModel(topics=wordRepresentationTopics[time], corpus=corpus,
                       dictionary=dictionary, coherence='u_mass')
        for time in range(len(timeSlice))
    ]
    coherenceScores = [
        coherenceModels[time].get_coherence() for time in range(len(timeSlice))
    ]
    return coherenceScores
def testAccumulatorCachingWithTopnSettingGivenTopics(self):
    kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, topn=5, coherence='u_mass')
    cm1 = CoherenceModel(topics=self.topics1, **kwargs)
    cm1.estimate_probabilities()
    self.assertIsNotNone(cm1._accumulator)
    accumulator = cm1._accumulator
    topics_before = cm1._topics

    cm1.topn = 3
    self.assertEqual(accumulator, cm1._accumulator)
    self.assertEqual(3, len(cm1.topics[0]))
    self.assertEqual(topics_before, cm1._topics)

    # Topics should not have been truncated, so topn settings below 5 should work
    cm1.topn = 4
    self.assertEqual(accumulator, cm1._accumulator)
    self.assertEqual(4, len(cm1.topics[0]))
    self.assertEqual(topics_before, cm1._topics)

    with self.assertRaises(ValueError):
        cm1.topn = 6  # can't expand topics any further without model
def evaluate_graph(dictionary, corpus, texts, begin, end, steps):
    """
    Train LDA models over a range of topic counts and write their u_mass and
    c_v coherence scores to 'u_mass.txt' and 'c_v.txt'.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    begin, end, steps : Range of topic counts to evaluate
    """
    u_mass = []
    c_v = []
    lm_list = []
    for num_topics in range(begin, end, steps):
        lm = LdaMulticore(corpus=corpus, num_topics=num_topics, workers=24,
                          id2word=dictionary, eval_every=10, eta='auto', passes=20)
        lm_list.append(lm)
        cm_umass = CoherenceModel(model=lm, corpus=corpus, dictionary=dictionary,
                                  coherence='u_mass')
        cm_cv = CoherenceModel(model=lm, texts=texts, dictionary=dictionary,
                               coherence='c_v')
        c_v.append(cm_cv.get_coherence())
        u_mass.append(cm_umass.get_coherence())

    print(c_v)
    with open('c_v.txt', 'w') as file_1:
        for item in c_v:
            file_1.write("%s\n" % item)
    print(u_mass)
    with open('u_mass.txt', 'w') as file_2:
        for item in u_mass:
            file_2.write("%s\n" % item)
def topic_coherence(self):
    if self.lda_model is None:
        self.fit()

    # Compute coherence score using c_v
    coherence_model_lda = CoherenceModel(model=self.lda_model, texts=self.docs,
                                         dictionary=self.dictionary, coherence='c_v')
    coherence_lda_CV = coherence_model_lda.get_coherence()
    log.info('Coherence score (c_v): %s', coherence_lda_CV)

    # Compute coherence score using u_mass
    coherence_model_lda = CoherenceModel(model=self.lda_model, texts=self.docs,
                                         dictionary=self.dictionary, coherence="u_mass")
    coherence_lda_umass = coherence_model_lda.get_coherence()
    log.info('Coherence score (u_mass): %s', coherence_lda_umass)
    return coherence_lda_CV, coherence_lda_umass
def check_coherence(listcorpus, vectorcorpus, model, numtopics, resultsfolder):
    print("check_coherence")
    # coherence for the entire model, using several measures
    measures = ["c_v", "c_npmi", "u_mass", "c_uci"]
    coherences = []
    for measure in measures:
        coherencemodel = CoherenceModel(texts=listcorpus, model=model,
                                        corpus=vectorcorpus, coherence=measure,
                                        processes=3)
        coherence = coherencemodel.get_coherence()
        coherences.append(coherence)
    coherences = dict(zip(measures, coherences))
    coherences = pd.DataFrame.from_dict(coherences, orient='index', columns=["score"])
    with open(join(resultsfolder, "coherences-model.csv"), "w", encoding="utf8") as outfile:
        coherences.to_csv(outfile, sep="\t")

    # coherence of each topic, using one measure only
    coherencemodel = CoherenceModel(texts=listcorpus, model=model, corpus=vectorcorpus,
                                    coherence="c_v", processes=3)
    coherences = list(zip(range(numtopics), coherencemodel.get_coherence_per_topic()))
    coherences = pd.DataFrame(coherences, columns=["topic", "score"]).sort_values(
        by="score", ascending=False)
    with open(join(resultsfolder, "coherences-topics.csv"), "w", encoding="utf8") as outfile:
        coherences.to_csv(outfile, sep="\t")
lda_model = gensim.models.LdaModel(rec_corpus, num_topics=5, id2word=rec_dict, passes=2)

# compare different numbers of topics
cm_score = []
for i in [3, 5, 10]:
    lda_model0 = gensim.models.LdaModel(rec_corpus, num_topics=i, iterations=100,
                                        id2word=rec_dict, passes=2)
    cm = CoherenceModel(model=lda_model0, corpus=rec_corpus, coherence='u_mass')
    cm_score.append(cm.get_coherence())

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


def lda_ana(rev_text, n, iterations=100):
    # rev_text[rev_text.isna()] = 'na'
    rev_text = rev_text.apply(lambda x: " ".join(ast.literal_eval(x)))
    rev_text = rev_text.apply(lambda x: x.split(' '))
    rec_dict = gensim.corpora.Dictionary(rev_text)
    rec_dict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    rec_corpus = [rec_dict.doc2bow(doc) for doc in rev_text]
# applying the functions to the documents -> list of stemmed tokens
# and their frequencies in each document (bag of words)
docs = list(map(preprocess, reviews))
dictionary = gensim.corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=10)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# choosing the optimal number of topics via coherence score
cs = []
for i in range(2, 21):
    model = gensim.models.LdaModel(bow_corpus, num_topics=i, id2word=dictionary, passes=10)
    cv = CoherenceModel(model=model, texts=list(docs), coherence='c_v').get_coherence()
    cs.append(cv)
    print(f'{i} topics, Coherence Score = {cv: .3f}')

# building the 6-topic model
model = gensim.models.LdaModel(bow_corpus, num_topics=6, id2word=dictionary, passes=50)

# saving the topics and the model
with open('topics.txt', 'w') as f:
    for l in range(6):
        f.write(f'TOPIC {l}:\n')
        for i in model.get_topic_terms(l):
            f.write(f'{dictionary[i[0]]}\n')
def coherence_score(model, tokens_lst, dictionary):
    coherence_model_lda = CoherenceModel(model=model, texts=tokens_lst,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
def evaluate(docs):
    # Preprocess the documents
    docs = docs_preprocessor(docs)

    # Create bigram & trigram models
    from gensim.models import Phrases

    # Add bigrams and trigrams to docs; min_count=10 keeps only phrases that
    # appear 10 times or more.
    bigram = Phrases(docs, min_count=10)
    trigram = Phrases(bigram[docs])

    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)
        for token in trigram[docs[idx]]:
            if '_' in token:
                # Token is a trigram, add to document.
                docs[idx].append(token)

    # Remove rare & common tokens
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)

    # Create the corpus required for topic modeling
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    print(corpus[:1])

    # Set parameters.
    num_topics = 20
    chunksize = 500
    passes = 20
    iterations = 400
    eval_every = 1

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # only to "load" the dictionary
    id2word = dictionary.id2token

    lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                         alpha='auto', eta='auto',
                         iterations=iterations, num_topics=num_topics,
                         passes=passes, eval_every=eval_every)

    # Print the keywords of the topics
    print(lda_model.print_topics())

    # Compute coherence score using c_v
    coherence_model_lda = CoherenceModel(model=lda_model, texts=docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Compute coherence score using u_mass
    coherence_model_lda = CoherenceModel(model=lda_model, texts=docs,
                                         dictionary=dictionary, coherence="u_mass")
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Compute c_v coherence for various numbers of topics
    model_list, coherence_values = compute_coherence_values(
        dictionary=dictionary, corpus=corpus, texts=docs, start=2, limit=40, step=6)

    # Show graph
    import matplotlib.pyplot as plt
    limit = 40
    start = 2
    step = 6
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(["coherence_values"], loc='best')
    plt.show()

    return coherence_lda
def model_creation(file):
    print("Importing Data")
    data = pd.read_csv(os.getcwd() + "\\Src Files\\" + file + ".csv",
                       error_bad_lines=False)
    data_text = data[['Body']]
    documents = data_text
    print("Data import is completed")

    print("Running pre-processing on documents")
    processed_body = documents['Body'].map(pp.pre_process)
    print("Pre-processing is completed")

    print("Creating dictionary from documents")
    dictionary = corpora.Dictionary(processed_body)
    dictionary.filter_extremes(no_below=100, no_above=0.5, keep_n=200000)

    print("Creating dictionary directory")
    dirName = os.getcwd() + "\\Dictionaries\\" + file
    dictPath = dirName + "\\dictionary.pkl"
    path = Path(dirName)
    path.mkdir(parents=True, exist_ok=True)

    print("Creating topic directories")
    dirName = os.getcwd() + "\\Topics\\" + file
    path = Path(dirName)
    path.mkdir(parents=True, exist_ok=True)
    dirName = os.getcwd() + "\\Named Topics\\" + file
    path = Path(dirName)
    path.mkdir(parents=True, exist_ok=True)

    print("Saving dictionary")
    with open(dictPath, 'wb') as d:
        pickle.dump(dictionary, d, protocol=pickle.HIGHEST_PROTOCOL)

    print("Creating corpus")
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_body]
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    print("Creating LDA model")
    lda = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=20,
                                          id2word=dictionary, passes=4)
    coherence = CoherenceModel(model=lda, corpus=corpus_tfidf,
                               dictionary=dictionary, coherence='u_mass')
    coherenceLda = coherence.get_coherence()
    print("Coherence: " + str(coherenceLda))

    print("Creating model directory")
    dirName = os.getcwd() + "\\Models\\" + file
    path = Path(dirName)
    path.mkdir(parents=True, exist_ok=True)
    print("Saving model")
    lda.save(dirName + "\\models")

    print("Saving topics to file")
    topicList = lda.show_topics(num_topics=20, num_words=15, log=False, formatted=True)
    topicsPath = os.getcwd() + "\\Topics\\"
    namedTopicsPath = os.getcwd() + "\\Named Topics\\"
    with open(topicsPath + file + "\\topic.txt", 'w') as f:
        for x, item in topicList:
            f.write(str(x) + ", " + item + "\n")
    with open(namedTopicsPath + file + "\\topic.csv", 'w') as f:
        f.write("index,Body,Topic\n")
        for x, item in topicList:
            f.write(str(x) + ", " + item + ",Replace Topic Name" + "\n")
def run_lda(corpus, dictionary, texts, num_topics=10, passes=20, iterations=100):
    eval_frame = pd.DataFrame(columns=[
        'Num_Topics',
        'Log_Perplexity_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(u_mass)_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(c_uci)_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(c_v)_P_{0}_I_{1}'.format(passes, iterations),
        'Topic_Coherence(c_npmi)_P_{0}_I_{1}'.format(passes, iterations)
    ])
    logging.debug('******* RUNNING LDA *************')
    lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                             passes=passes, iterations=iterations, chunksize=2500)
    coh_model_umass = CoherenceModel(model=lda_model, corpus=corpus,
                                     dictionary=dictionary, coherence='u_mass')
    coh_model_uci = CoherenceModel(model=lda_model, texts=texts, coherence='c_uci')
    coh_model_ucv = CoherenceModel(model=lda_model, texts=texts, coherence='c_v')
    coh_model_npmi = CoherenceModel(model=lda_model, texts=texts, coherence='c_npmi')
    eval_frame.loc[len(eval_frame)] = [
        num_topics,
        lda_model.log_perplexity(corpus),
        coh_model_umass.get_coherence(),
        coh_model_uci.get_coherence(),
        coh_model_ucv.get_coherence(),
        coh_model_npmi.get_coherence()
    ]
    model = namedtuple('model', ['lda_model', 'eval_frame'])
    return model(lda_model, eval_frame)
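# Possible call site for run_lda (a sketch; assumes `texts` is a list of token
# lists, as elsewhere in this collection).
from gensim.corpora import Dictionary

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
result = run_lda(corpus, dictionary, texts, num_topics=10, passes=20, iterations=100)
print(result.eval_frame)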
def build_array():
    start = time.time()
    print("start ---------------------------------------------------")
    # load test set
    test_year = dictload(2018)
    # load the rest
    intermediate_path = "../Data/Intermediate/"
    doc_set = pickle.load(
        open(os.path.join(intermediate_path + 'doc_set.p'), "rb"))
    label_set = pickle.load(
        open(os.path.join(intermediate_path + 'label_set.p'), "rb"))
    topic_superset = pickle.load(
        open(os.path.join(intermediate_path + 'topic_superset.p'), "rb"))
    time_load = time.time()
    print("It took", time_load - start, "seconds to load")

    print("training ------------------------------------------------")
    doc_texts = tokenize(doc_set)
    print("tokenized")

    # build individual LDA models
    lda_superset = []
    num_topics_list = []
    dictionary_set = []
    i = 0
    for topic_set in topic_superset:
        topic_texts = tokenize(topic_set)
        # turn our tokenized documents into an id <-> term dictionary
        dictionary = corpora.Dictionary(topic_texts)
        dictionary_set.append(dictionary)
        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in topic_texts]
        # generate LDA model
        # number of topics is logarithmic
        # num_topics = math.floor(math.log2(len(topic_set)))
        # number of topics is modified logarithmic: 15*round(log_2())-140
        num_topics = 15 * (round(math.log2(len(topic_set)))) - 140
        # num_topics = math.floor(len(topic_set)/1000)
        print(str(i) + ' ' + "number of topics: " + str(num_topics))
        num_topics_list.append(num_topics)
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                                   id2word=dictionary, passes=20)
        lda_superset.append(ldamodel)
        i += 1
        # print lda topics
        print(ldamodel.print_topics())
        # Compute perplexity: a measure of how good the model is; lower is better.
        print('\nPerplexity: ', ldamodel.log_perplexity(corpus))
        # Compute coherence score
        coherence_model_lda = CoherenceModel(model=ldamodel, texts=topic_texts,
                                             dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    print("all LDA built")

    # build training matrix
    prop_array_superset = []
    for i in range(len(num_topics_list)):
        num_topics = num_topics_list[i]
        topic_prop_array = np.zeros((len(doc_texts), num_topics))
        for j in range(len(doc_texts)):
            text = doc_texts[j]
            textProp = lda_superset[i][dictionary_set[i].doc2bow(text)]
            for pair in textProp:
                topicIdx = pair[0]
                weight = pair[1]
                topic_prop_array[j, topicIdx] = weight
        prop_array_superset.append(topic_prop_array)

    # concat full feature array
    training_array = prop_array_superset[0]
    for i in range(len(prop_array_superset)):
        if i != 0:
            training_array = np.concatenate(
                (training_array, prop_array_superset[i]), axis=1)
    print("training matrix built")
    time_train = time.time()
    print("It took", time_train - time_load, "seconds to train")

    print("---------------------------------------------------------")
    print("testing")
    # test on new data: 1000 documents split by proportion of training data
    test_set = test_year['astro'][0:144] + test_year['cond'][0:145] + \
        test_year['cs'][0:125] + test_year['hep'][0:113] + \
        test_year['math'][0:257] + test_year['physics'][0:134] + \
        test_year['qbio'][0:13] + test_year['qfin'][0:7] + \
        test_year['quant'][0:45] + test_year['stat'][0:17]
    test_label = [1]*144 + [2]*145 + [3]*125 + [4]*113 + [5]*257 + \
        [6]*134 + [7]*13 + [8]*7 + [9]*45 + [10]*17
    test_texts = tokenize(test_set)

    # build individual test prop array
    test_prop_array_superset = []
    for i in range(len(num_topics_list)):
        num_topics = num_topics_list[i]
        test_prop_array = np.zeros((len(test_label), num_topics))
        for j in range(len(test_texts)):
            test = test_texts[j]
            testProp = lda_superset[i][dictionary_set[i].doc2bow(test)]
            for pair in testProp:
                topicIdx = pair[0]
                weight = pair[1]
                test_prop_array[j, topicIdx] = weight
        test_prop_array_superset.append(test_prop_array)

    # concat full test array
    test_array = test_prop_array_superset[0]
    for i in range(len(test_prop_array_superset)):
        if i != 0:
            test_array = np.concatenate(
                (test_array, test_prop_array_superset[i]), axis=1)

    arraydump('modifiedlog_', training_array, test_array)
    x_train, x_test, y_train, y_test = training_array, test_array, label_set, test_label
    print("training_array length: " + str(len(topic_prop_array)))
    print("test_array length: " + str(len(test_prop_array)))
    print("training_label length: " + str(len(label_set)))
    print("test_label length: " + str(len(test_label)))
    print("---------------------------------------------------------")

    # choose model via a list
    model_names = ["knn3"]
    buildmodel(model_names, x_train, y_train, x_test, y_test)
    time_end = time.time()
    print("total time is ", time_end - start)
def lda_score(model, rec_corpus):
    cm = CoherenceModel(model=model, corpus=rec_corpus, coherence='u_mass')
    return cm.get_coherence()
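# Usage sketch for lda_score on toy data (illustrative only; the toy_* names are
# not from the source).
from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_texts = [['good', 'service', 'fast'], ['bad', 'service', 'slow'], ['fast', 'good']]
toy_dict = Dictionary(toy_texts)
toy_corpus = [toy_dict.doc2bow(t) for t in toy_texts]
print(lda_score(LdaModel(toy_corpus, num_topics=2, id2word=toy_dict), toy_corpus))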
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

#%% Set up two topic models
goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2)

#%% Using u_mass coherence
goodcm = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary,
                        coherence='u_mass')
badcm = CoherenceModel(model=badLdaModel, corpus=corpus, dictionary=dictionary,
                       coherence='u_mass')

# View the pipeline parameters for one coherence model
# print(goodcm)
print(goodcm.get_coherence())
# print(badcm)
print(badcm.get_coherence())

#%% check how many topics - coherence = 'u_mass'
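# A hedged sketch of what that topic-count sweep might look like, reusing
# `corpus` and `dictionary` from above (the range of topic counts is an assumption).
umass_scores = []
for k in range(2, 11):
    m = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=k)
    cm = CoherenceModel(model=m, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    umass_scores.append((k, cm.get_coherence()))
print(umass_scores)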
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [reverse_vocab[index] for index in range(len(reverse_vocab))]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        if verbose_mode:
            decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
            topics = [
                [reverse_vocab[item.item()] for item in topic]
                for topic in decoder_weight.topk(top_words, dim=0)[1].t()
            ]
            cm = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
                coherence="u_mass",
            )
            coherence = cm.get_coherence()
            coherences = cm.get_coherence_per_topic()
            for index, topic in enumerate(topics):
                print(str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
            print(coherence)
        else:
            coherence = 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab), hidden1_dimension=100,
                          hidden2_dimension=100, topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [
        [reverse_vocab[item.item()] for item in topic]
        for topic in decoder_weight.topk(top_words, dim=0)[1].t()
    ]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence="u_mass",
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
from gensim.models.coherencemodel import CoherenceModel
import pickle

lda = pickle.load(open('../output/lda_model', 'rb'))

# Compute coherence score using u_mass
coherence_model_lda = CoherenceModel(model=lda['model'], texts=lda['texts'],
                                     corpus=lda['corpus'],
                                     dictionary=lda['dictionary'],
                                     coherence='u_mass')
lda['coherence'] = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', lda['coherence'])
                                       random_state=10)  # tune parameters as needed

# Print the keywords of the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

################### perplexity and coherence metrics #####################
# perplexity: the lower, the better
print('\nPerplexity: ', lda_model.log_perplexity(corpus, total_docs=10000))

# Compute coherence score: the higher, the better
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

################### OPTIONAL: tuning with c_v #####################
## ran on VPN eng computer; took 7 hrs to get to 41% complete.
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=k,
                                           random_state=10,