for parameters in parameters_combinations:
    lda_model = gensim.models.LdaModel(corpus=tokens_tfidf,
                                       num_topics=parameters.get('num_topics'),
                                       id2word=id2token,
                                       chunksize=parameters.get('chunksize'),
                                       passes=parameters.get('passes'),
                                       iterations=parameters.get('iterations'),
                                       random_state=1332,
                                       alpha="auto",
                                       eta="auto")
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         corpus=tokens_tfidf,
                                         coherence='u_mass')
    coherence_score = coherence_model_lda.get_coherence()

    # Collect the parameter combination together with its coherence score
    coherence_score_out = dict()
    coherence_score_out['num_topics'] = parameters.get('num_topics')
    coherence_score_out['chunksize'] = parameters.get('chunksize')
    coherence_score_out['passes'] = parameters.get('passes')
    coherence_score_out['iterations'] = parameters.get('iterations')
    coherence_score_out['coherence_u_mass'] = coherence_score
    coherence_scores.append(coherence_score_out)

# Export coherence scores (all runs, not just the last one as in the original)
if not os.path.isfile(os.path.join(out_path, filename_out)):
    with open(os.path.join(out_path, filename_out), 'w', encoding='utf-8') as f:
        for score in coherence_scores:
            f.write(str(score) + "\n")
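# For context: `parameters_combinations` and `coherence_scores` must exist
# before the loop above. A minimal, hypothetical setup with itertools.product
# (the grid values below are illustrative, not from the original code):
import itertools

param_grid = {'num_topics': [10, 20, 30],
              'chunksize': [500, 1000],
              'passes': [10],
              'iterations': [100, 400]}
parameters_combinations = [dict(zip(param_grid, values))
                           for values in itertools.product(*param_grid.values())]
coherence_scores = []  # filled by the grid-search loop above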
texts.append(stemmed_tokens)

# Turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)
#print(dictionary)

# Convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
#print(corpus)

# Generate the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30,
                                           id2word=dictionary, passes=60)

topics = []
for topic in ldamodel.print_topics(num_topics=30, num_words=12):
    topics.append(topic)
print(topics)

df_topics = pd.DataFrame(topics)
with open(r"E:\Helen\FinalProject_INFO5731\ALL_OUTPUTS\LDAgensim_wholeDS.csv",
          'w', newline="", encoding='utf-8') as file:
    df_topics.to_csv(file)

# Compute model perplexity and coherence score to judge how well the model
# performed, especially via the coherence score.

# Compute Coherence Score
coherence_ldamodel = CoherenceModel(model=ldamodel, texts=texts,
                                    dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_ldamodel.get_coherence()
print('\nCoherence Score: ', coherence_lda)
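# The comment above also mentions perplexity, but the snippet only computes
# coherence. A minimal addition, assuming the same `ldamodel` and `corpus`
# (log_perplexity returns a per-word likelihood bound; perplexity is exp(-bound)):
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))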
def spatial_lda(adata, x_coordinate='X_centroid', y_coordinate='Y_centroid',
                phenotype='phenotype', method='radius', radius=30, knn=10,
                imageid='imageid', num_motifs=10, random_state=0, subset=None,
                label='spatial_lda', **kwargs):
    """
    Parameters:
        adata : AnnData object
        x_coordinate : float, required
            Column name containing the x-coordinate values.
        y_coordinate : float, required
            Column name containing the y-coordinate values.
        phenotype : string, required
            Column name of the column containing the phenotype information.
            It could also be any categorical assignment given to single cells.
        method : string, optional
            Two options are available: a) 'radius', b) 'knn'.
            a) radius - Identifies the neighbours within a given radius for every cell.
            b) knn - Identifies the K nearest neighbours for every cell.
        radius : int, optional
            The radius used to define a local neighbourhood.
        knn : int, optional
            Number of cells considered for defining the local neighbourhood.
        imageid : string, optional
            Column name of the column containing the image id.
        subset : string, optional
            imageid of a single image to be subsetted for analysis.
        num_motifs : int, optional
            The number of requested latent motifs to be extracted from the training corpus.
        random_state : int, optional
            Either a randomState object or a seed to generate one. Useful for reproducibility.
        label : string, optional
            Key for the returned data, stored in `adata.uns`.

    Returns:
        adata : AnnData object
            Updated AnnData object with the results stored under `adata.uns[label]`.

    Example:
    ```python
    # Running the radius method
    adata = sm.tl.spatial_lda(adata, num_motifs=10, radius=100)
    ```
    """

    # Helper that builds the neighbourhood "documents" for a single image
    def spatial_lda_internal(adata_subset, x_coordinate, y_coordinate,
                             phenotype, method, radius, knn, imageid):
        # Print which image is being processed
        print('Processing: ' + str(np.unique(adata_subset.obs[imageid])))

        # Create a DataFrame with the necessary information
        data = pd.DataFrame({
            'x': adata_subset.obs[x_coordinate],
            'y': adata_subset.obs[y_coordinate],
            'phenotype': adata_subset.obs[phenotype]
        })

        # Identify neighbourhoods based on the method used
        # a) KNN method
        if method == 'knn':
            print("Identifying the " + str(knn) + " nearest neighbours for every cell")
            tree = BallTree(data[['x', 'y']], leaf_size=2)
            ind = tree.query(data[['x', 'y']], k=knn, return_distance=False)
        # b) Local radius method
        if method == 'radius':
            print("Identifying neighbours within " + str(radius) + " pixels of every cell")
            kdt = BallTree(data[['x', 'y']], leaf_size=2)
            ind = kdt.query_radius(data[['x', 'y']], r=radius, return_distance=False)

        # Map neighbour indices back to phenotypes
        phenomap = dict(zip(list(range(len(ind))), data['phenotype']))
        for i in range(len(ind)):
            ind[i] = [phenomap[letter] for letter in ind[i]]

        return ind

    # Subset a particular image if needed
    if subset is not None:
        adata_list = [adata[adata.obs[imageid] == subset]]
    else:
        adata_list = [adata[adata.obs[imageid] == i]
                      for i in adata.obs[imageid].unique()]

    # Apply the function to all images
    r_spatial_lda_internal = lambda x: spatial_lda_internal(
        adata_subset=x, x_coordinate=x_coordinate, y_coordinate=y_coordinate,
        phenotype=phenotype, method=method, radius=radius, knn=knn,
        imageid=imageid)
    all_data = list(map(r_spatial_lda_internal, adata_list))

    # Combine all the data into one corpus of neighbourhood "documents"
    texts = np.concatenate(all_data, axis=0).tolist()

    # LDA pre-processing
    print('Pre-Processing Spatial LDA')
    # Create Dictionary
    id2word = corpora.Dictionary(texts)
    # Term-document frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Build LDA model (fall back to single-core LDA if multicore fails)
    print('Training Spatial LDA')
    try:
        lda_model = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus, id2word=id2word, num_topics=num_motifs,
            random_state=random_state, **kwargs)
    except Exception:
        lda_model = gensim.models.ldamodel.LdaModel(
            corpus=corpus, id2word=id2word, num_topics=num_motifs,
            random_state=random_state, **kwargs)

    # Compute Coherence Score
    print('Calculating the Coherence Score')
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Isolate the latent features
    print('Gathering the latent weights')
    topic_weights = []
    for row_list in lda_model[corpus]:
        tmp = np.zeros(num_motifs)
        for i, w in row_list:
            tmp[i] = w
        topic_weights.append(tmp)
    # Convert to DataFrame
    arr = pd.DataFrame(topic_weights, index=adata.obs.index).fillna(0)
    arr = arr.add_prefix('Motif_')

    # Isolate the weights of phenotypes (raw string avoids invalid-escape warnings)
    pattern = r"(\d\.\d+).\"(.*?)\""
    cell_weight = pd.DataFrame(index=np.unique(adata.obs[phenotype]))
    for i in range(0, len(lda_model.print_topics())):
        level1 = lda_model.print_topics()[i][1]
        tmp = pd.DataFrame(re.findall(pattern, level1))
        tmp.index = tmp[1]
        tmp = tmp.drop(columns=1)
        tmp.columns = ['Motif_' + str(i)]
        cell_weight = cell_weight.merge(tmp, how='outer',
                                        left_index=True, right_index=True)
    # Fill zeros
    cell_weight = cell_weight.fillna(0).astype(float)

    # Save the results in the AnnData object
    adata.uns[label] = arr  # weights for each cell
    adata.uns[str(label) + '_probability'] = cell_weight  # weights of each cell type
    adata.uns[str(label) + '_model'] = lda_model

    return adata
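# A brief, hypothetical follow-up showing how the stored results could be
# inspected after running spatial_lda (key names follow the defaults above):
adata = spatial_lda(adata, num_motifs=10, radius=100)
motif_weights = adata.uns['spatial_lda']                  # per-cell motif weights
motif_composition = adata.uns['spatial_lda_probability']  # phenotype weights per motif
print(motif_composition.head())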
def main():
    print('LSI model with gensim')
    print('1) 1 gram, 2) 2 gram, 3) 3 gram')
    op = int(input())

    # Get the information into a list of documents
    lsReturn = mlf.getRawTextToList()
    lsDocuments = lsReturn[0]
    lsSubject = lsReturn[1]

    # Remove stopwords
    lsDocuments_NoSW = [[word for word in simple_preprocess(str(doc)) if word not in sw]
                        for doc in lsDocuments]

    if op == 1:
        print('LSI model with gensim for 1 gram')
    if op == 2:
        print('LSI model with gensim for 2 gram')
        bigram = gensim.models.Phrases(lsDocuments_NoSW, min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        lsDocBiGram = [bigram_mod[doc] for doc in lsDocuments_NoSW]
        lsDocuments_NoSW = [[word for word in simple_preprocess(str(doc)) if word not in sw]
                            for doc in lsDocBiGram]
    if op == 3:
        print('LSI model with gensim for 3 gram')
        bigram = gensim.models.Phrases(lsDocuments_NoSW, min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram = gensim.models.Phrases(bigram[lsDocuments_NoSW], threshold=100)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        lsDocTrigram = [trigram_mod[doc] for doc in lsDocuments_NoSW]
        lsDocuments_NoSW = [[word for word in simple_preprocess(str(doc)) if word not in sw]
                            for doc in lsDocTrigram]

    """
    print('Getting bigrams list...')
    for doc in lsDocuments_NoSW:
        for word in doc:
            mlf.appendInfoToFile(pathtohere, '\\trigrams.txt', word + '\n')
    """

    print('LSI Model starting...')
    # Create Dictionary
    id2word = corpora.Dictionary(lsDocuments_NoSW)
    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in lsDocuments_NoSW]
    # Build LSI model
    lsi_model = gensim.models.LsiModel(corpus=corpus, id2word=id2word, num_topics=20)

    df = pd.DataFrame()
    #df = mlf.getDominantTopicDataFrame(lsi_model, corpus, lsDocuments_NoSW, lsSubject)
    #mlf.generateFileSeparatedBySemicolon(df, 'LSI_trigram_csv.txt')

    # CoherenceModel defaults to c_v when texts are supplied
    lsi_cm = CoherenceModel(model=lsi_model, corpus=corpus, dictionary=id2word,
                            texts=lsDocuments_NoSW)
    print('LSI Coherence:', lsi_cm.get_coherence())
def compute_coherence_values(self, limit=11, start=4, step=1):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    limit : Max num of topics
    start : Least num of topics
    step : Step-size

    Returns:
    -------
    model_list : List of LDA topic models (not now)
    coherence_values : Coherence values corresponding to the LDA model with
        respective number of topics (not now)
    optim_k : optimal number of topics
    """
    coherence_values = []
    perplexity_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamulticore.LdaMulticore(corpus=self.corpus,
                                                        id2word=self.dictionary,
                                                        num_topics=num_topics,
                                                        passes=10,
                                                        alpha='asymmetric',
                                                        eta='auto',
                                                        random_state=42,
                                                        iterations=500,
                                                        per_word_topics=True,
                                                        eval_every=None)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=self.texts,
                                        dictionary=self.dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        perplexity_values.append(model.log_perplexity(self.corpus))

    # Min-max scale the coherence scores to [0, 1]
    x = np.array(coherence_values)
    z = (x - min(x)) / (max(x) - min(x))
    scaled_coherence = z.tolist()

    # Keep candidates whose scaled coherence is within 70% of the best score
    scaled_subset = [i for i in scaled_coherence if i >= max(scaled_coherence) * 0.7]
    scaled_subset_index = [scaled_coherence.index(i) for i in scaled_subset]

    # Drop index 4 from the candidates unless it is the only one left
    try:
        scaled_subset_index.remove(4)
        if len(scaled_subset_index) == 0:
            scaled_subset_index = [4]
    except ValueError:
        pass
    finally:
        best_model = model_list[min(scaled_subset_index)]
        optim_k = best_model.get_topics().shape[0]

    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(["coherence_values"], loc='best')
    plt.show()

    x = range(start, limit, step)
    plt.plot(x, perplexity_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Perplexity score")
    plt.legend(["perplexity_values"], loc='best')
    plt.show()

    self.num_topics = optim_k
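# Hypothetical usage, assuming the surrounding class exposes `corpus`,
# `dictionary` and `texts` attributes (names taken from the method body;
# the class name below is illustrative):
# tm = TopicModeler(corpus=corpus, dictionary=dictionary, texts=texts)
# tm.compute_coherence_values(start=4, limit=11, step=1)
# print('Selected number of topics:', tm.num_topics)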
############ (7) Mallet Model ##############
# Use the Mallet model to improve the LDA results and choose the optimal number of topics
mallet_path = '/Users/elsayedissa/Desktop/Topic_Modeling/mallet-2.0.8/bin/mallet'  # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus,
                                             num_topics=100, id2word=dictionary)

# Show Topics
#pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=lemmatized_data,
                                           dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

############# (9) Finding the Optimal Number of Topics for the LDA Model ########################
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
                                         num_topics=40,
                                         id2word=dictionary,
                                         workers=3,
                                         alpha=0.2,
                                         eta=0.03)

# Measure performance of the LDA model
from gensim.models.coherencemodel import CoherenceModel

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenize,
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Compute Coherence Score using u_mass
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenize,
                                     dictionary=dictionary, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Save model to disk.
temp_file = datapath("maude_ventilator_lda_gensim")
lda_model.save(temp_file)

# Load a potentially pretrained model from disk.
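# The snippet ends at the load comment; a minimal completion, assuming the
# truncated call above constructed a gensim LdaMulticore model saved to `temp_file`:
lda_model = gensim.models.LdaMulticore.load(temp_file)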
def label_documents(documents: List, LVL, SET):
    OPTIMAL_TOPICS = 10
    PREPROCESSINGs = ["lemmed"]
    for PREPROCESSING in PREPROCESSINGs:
        # texts = clean_documents([str(doc) for doc in documents], PREPROCESSING)
        # pickle.dump(texts, open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "wb"))
        texts = pickle.load(open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "rb"))

        dictionary = Dictionary(texts)
        print('Number of unique tokens: %d' % len(dictionary))
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        print('Number of filtered unique tokens: %d' % len(dictionary))
        logger.info("Clean complete")
        logger.info("Dictionary complete")

        corpus = [dictionary.doc2bow(text) for text in texts]
        logger.info("Corpus complete")

        # model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=1000,
        #                      num_topics=OPTIMAL_TOPICS, workers=3)
        # model.save("{}_{}_1000_model_cv_coherence_{}".format(LVL, OPTIMAL_TOPICS, PREPROCESSING))
        model = LdaMulticore.load("{}_{}_1000_model_cv_coherence_{}".format(
            LVL, OPTIMAL_TOPICS, PREPROCESSING))

        coherence_model = CoherenceModel(model=model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
        print("Coherence {}".format(coherence_model.get_coherence()))

        # Assign the dominant topic to each document
        topics = []
        for document in corpus:
            topic = model.get_document_topics(document)
            topics.append(topic)

        with open("topics_results_{}_{}_{}.csv".format(SET, LVL, PREPROCESSING),
                  "wt", encoding="utf8", newline="") as outf:
            writer = csv.writer(outf)
            for document_topics in topics:
                sorted_topics = sorted(document_topics, key=lambda x: -x[1])
                if sorted_topics:
                    best = [sorted_topics[0][0]]
                else:
                    best = [-1]
                    print("NONE ERROR")
                writer.writerow(best)

        x = model.show_topics(num_topics=OPTIMAL_TOPICS, num_words=10, formatted=True)

        # Below code prints topics and words
        with open("topics_keywords_{}_{}_{}".format(SET, LVL, PREPROCESSING),
                  "wt", encoding="utf8", newline="") as outf:
            writer = csv.writer(outf)
            for t in x:
                topic = t[0]
                words = [(word_score.split("*")[1].strip()[1:-1],
                          float(word_score.split("*")[0].strip()))
                         for word_score in t[1].split("+")]
                sort_words = sorted(words, key=lambda z: z[1])
                print(str(topic) + " " + str(sort_words))
                words = [w for w, s in sort_words]
                score = [s for w, s in sort_words]
                topics_word_bar(words, score, t[0])
                writer.writerow([topic] + [sort_words])
    return
tfidf = gensim.models.TfidfModel(bowCorpus)
bowCorpus = tfidf[bowCorpus]
extraDict['tfidf'] = True
# End of data preparation

# Model training
## Initialises the training phase of the algorithm. The parameters should come
## from extraDict; otherwise the defaults are used.
lsiModel = gensim.models.LsiModel(bowCorpus,
                                  num_topics=numberTopics,
                                  id2word=dictionary,
                                  power_iters=extraDict['power_iters'])
# End of training

# Validation
## Computes the u_mass coherence score
cm = CoherenceModel(model=lsiModel, corpus=bowCorpus, coherence='u_mass')
coherence = cm.get_coherence()

## Builds a list and a string of the generated topics
topicString = ""
topicList = []
for idx, topic in lsiModel.print_topics(-1):
    topicList.append(topic)
    topicString += '\nTopic: {} \nWords: {}'.format(idx, topic)

## Stores the results and the model type in a dictionary
resultDict = {'modelType': modelType,
              'topicN': numberTopics,
              'coherence': coherence,
              'topicList': topicList}

## Computing and printing the c_v coherence score - time-consuming
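# The snippet stops at the announcement above; a minimal completion, assuming
# the tokenised documents are available in a variable such as `texts`
# (c_v needs the raw token lists, not just the bag-of-words corpus):
cv = CoherenceModel(model=lsiModel, texts=texts, dictionary=dictionary,
                    coherence='c_v')
resultDict['coherence_cv'] = cv.get_coherence()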
def ldaStemmed(corpus, NUM_TOPICS=4, fechaMin='1-1', fechaMax='1-1',
               random_state=100, update_every=1, chunksize=100, passes=10,
               alpha='auto', per_word_topics=True, iterations=50, eval_every=1):
    # Parse the 'day-month' date bounds
    fechaMin = fechaMin.split('-')
    fechaMax = fechaMax.split('-')
    diaMin = int(fechaMin[0])
    diaMax = int(fechaMax[0])
    fechaMin = datetime.datetime(2020, int(fechaMin[1]), diaMin)
    fechaMax = datetime.datetime(2020, int(fechaMax[1]), diaMax)

    data_words = []
    # Select the tweets within the date range
    if fechaMin.month == fechaMax.month:
        corpus = corpus[meses[fechaMin.month - 1]].query('dia >= @diaMin and dia <= @diaMax')
    else:
        p1 = corpus[meses[fechaMin.month - 1]].query('dia >= @diaMin')
        p2 = pd.DataFrame()
        for i in range(fechaMin.month, fechaMax.month - 1):
            p2 = pd.concat([p2, corpus[meses[i]]])  # DataFrame.append was removed in pandas 2.0
        p3 = corpus[meses[fechaMax.month - 1]].query('dia <= @diaMax')
        corpus = pd.concat([p1, p2, p3])

    # Drop rows whose 'lexemas' value is NaN (NaN != NaN)
    corpus = corpus[corpus.lexemas.values == corpus.lexemas.values]

    # Keep only adjectives, proper nouns and nouns
    arrayTokens = corpus.lexemas.values
    for tweet in arrayTokens:
        d = nlp(tweet)
        data_words.append([str(t) for t in d
                           if t.pos_ == 'ADJ' or t.pos_ == 'PROPN' or t.pos_ == 'NOUN'])

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    data_words_bigrams = make_bigrams(data_words)

    id2word = corpora.Dictionary(data_words_bigrams)
    corpus = [id2word.doc2bow(text) for text in data_words_bigrams]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=NUM_TOPICS,
                                                random_state=random_state,
                                                update_every=update_every,
                                                chunksize=chunksize,
                                                passes=passes,
                                                alpha=alpha,
                                                per_word_topics=per_word_topics,
                                                iterations=iterations)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score with ' + str(NUM_TOPICS) + ' topics and '
    #       + str(iterations) + ' iterations: ' + str(coherence_lda))
    return [lda_model, coherence_lda]
def ldaMulticore(corpus, workers=1, NUM_TOPICS=4, fechaMin='1-1', fechaMax='1-1',
                 random_state=100, chunksize=100, passes=10, alpha='symmetric',
                 per_word_topics=True, iterations=50):
    # Note: the default alpha was changed from 'auto' to 'symmetric' because
    # LdaMulticore does not support auto-tuned alpha (it raises an error)
    # Parse the 'day-month' date bounds
    fechaMin = fechaMin.split('-')
    fechaMax = fechaMax.split('-')
    diaMin = int(fechaMin[0])
    diaMax = int(fechaMax[0])
    fechaMin = datetime.datetime(2020, int(fechaMin[1]), diaMin)
    fechaMax = datetime.datetime(2020, int(fechaMax[1]), diaMax)

    data_words = []
    # Select the tweets within the date range
    if fechaMin.month == fechaMax.month:
        corpus = corpus[meses[fechaMin.month - 1]].query('dia >= @diaMin and dia <= @diaMax')
    else:
        p1 = corpus[meses[fechaMin.month - 1]].query('dia >= @diaMin')
        p2 = pd.DataFrame()
        for i in range(fechaMin.month, fechaMax.month - 1):
            p2 = pd.concat([p2, corpus[meses[i]]])  # DataFrame.append was removed in pandas 2.0
        p3 = corpus[meses[fechaMax.month - 1]].query('dia <= @diaMax')
        corpus = pd.concat([p1, p2, p3])

    # Keep only adjectives, proper nouns and nouns
    arrayTokens = corpus.tokens.values
    for tweet in arrayTokens:
        d = ' '.join(tweet)
        d = nlp(d)
        data_words.append([str(t) for t in d
                           if t.pos_ == 'ADJ' or t.pos_ == 'PROPN' or t.pos_ == 'NOUN'])

    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(text) for text in data_words]

    lda_model = gensim.models.ldamulticore.LdaMulticore(workers=workers,
                                                        corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=NUM_TOPICS,
                                                        random_state=random_state,
                                                        chunksize=chunksize,
                                                        passes=passes,
                                                        alpha=alpha,
                                                        per_word_topics=per_word_topics,
                                                        iterations=iterations)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score with ' + str(NUM_TOPICS) + ' topics: ' + str(coherence_lda))
    return lda_model
def lda(corpus, NUM_TOPICS=4, random_state=100, update_every=1, chunksize=100,
        passes=10, alpha='auto', per_word_topics=True, iterations=50, eval_every=1):
    # Keep only adjectives, proper nouns and nouns
    data_words = []
    arrayTokens = corpus.tokens.values
    for tweet in arrayTokens:
        d = ' '.join(tweet)
        d = nlp(d)
        data_words.append([str(t) for t in d
                           if t.pos_ == 'ADJ' or t.pos_ == 'PROPN' or t.pos_ == 'NOUN'])

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    data_words_bigrams = make_bigrams(data_words)

    id2word = corpora.Dictionary(data_words_bigrams)
    corpus = [id2word.doc2bow(text) for text in data_words_bigrams]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=NUM_TOPICS,
                                                random_state=random_state,
                                                update_every=update_every,
                                                chunksize=chunksize,
                                                passes=passes,
                                                alpha=alpha,
                                                per_word_topics=per_word_topics,
                                                iterations=iterations)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score with ' + str(NUM_TOPICS) + ' topics and '
    #       + str(iterations) + ' iterations: ' + str(coherence_lda))
    return [lda_model, corpus, id2word, coherence_lda]
def compute_all_scores(model_name, corpus, id2word, texts, measure, max_topics,
                       start=5, step=5, random_state=100, update_every=1,
                       chunksize=100, passes=10, alpha='auto', per_word_topics=True):
    """
    This module computes coherence scores for an increasing number of topics.

    Arguments:
        model_name {str} -- Name of the model
        corpus {list} -- List of corpus created from preprocessed texts
        id2word {Dictionary} -- Dictionary of ids with mapped words
        texts {list} -- List of preprocessed input words
        measure {str} -- Name of the coherence measure you want to use.
            Possible measures: c_v, c_uci, c_npmi, u_mass
        max_topics {int} -- Maximum number of topics used in computing various coherence scores

    Keyword Arguments:
        start {int} -- Start value for x-axis values (default: {5})
        step {int} -- Steps for values on x-axis (default: {5})

    Returns:
        list, list -- List of computed models, List of coherence scores for respective computed models
    """

    def compute_model(model_name, corpus, id2word, num_topics):
        """
        This module computes various topic models from the gensim library.

        Arguments:
            model_name {str} -- Name of the model
            corpus {list} -- List of corpus created from preprocessed texts
            id2word {Dictionary} -- Dictionary of ids with mapped words

        Keyword Arguments:
            num_topics {int} -- Number of topics as a parameter to gensim model computation (default: {5})

        Returns:
            gensim.models.MODEL -- Return the computed model
        """
        # LSI Model
        if model_name in ('LsiModel', 'lsimodel', 'lsi'):
            model = gensim.models.lsimodel.LsiModel(corpus=corpus,
                                                    num_topics=num_topics,
                                                    id2word=id2word,
                                                    chunksize=chunksize)
        # LDA Model
        elif model_name in ('LdaModel', 'ldamodel', 'lda'):
            model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=random_state,
                                                    update_every=update_every,
                                                    chunksize=chunksize,
                                                    passes=passes,
                                                    alpha=alpha,
                                                    per_word_topics=per_word_topics)
        # LdaMallet Model (the original condition listed 'ldamallet' twice)
        elif model_name in ('LdaMallet', 'ldamallet'):
            model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                     corpus=corpus,
                                                     num_topics=num_topics,
                                                     id2word=id2word)
        else:
            print('Invalid model!')
            return None
        return model

    # Models list and coherence scores list
    model_list, coherence_values = [], []
    # Iterator for max_topics
    iterator = range(start, max_topics + 1, step)

    for num_topics in iterator:
        # Compute a model for the current number of topics
        computed_model = compute_model(model_name=model_name, corpus=corpus,
                                       id2word=id2word, num_topics=num_topics)
        model_list.append(computed_model)
        # Compute the coherence score for the above model
        coherencemodel = CoherenceModel(model=computed_model, texts=texts,
                                        dictionary=id2word, coherence=measure)
        coherence_values.append(coherencemodel.get_coherence())

    # Print all scores to the console
    print('\nCalculating various coherence scores...\n')
    for number_of_topics, score in zip(iterator, coherence_values):
        print(model_name, ": Num Topics =", number_of_topics,
              " Coherence Value :", round(score, 4))

    return model_list, coherence_values
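# Hypothetical usage of compute_all_scores, assuming `corpus`, `id2word` and
# `texts` were prepared as in the other snippets, and that `mallet_path` is
# set when the Mallet variant is requested:
# models, scores = compute_all_scores(model_name='lda', corpus=corpus,
#                                     id2word=id2word, texts=texts,
#                                     measure='c_v', max_topics=30,
#                                     start=5, step=5)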
def most_similar_texts(self, X, num_examples, text_column_name,
                       num_topics=None, chosen_stopwords=set()):
    """
    Uses NMF clustering to create n topics based on adjusted word frequencies

    Parameters
    --------
    X: DataFrame
    num_examples: int
    text_column_name: str
    num_topics: int
        Optional - if None the algorithm will determine the best number

    Returns
    --------
    topic_words_df: DataFrame
        Top 15 words/phrases per topic
    combined_df: DataFrame
        Original text with topic number assigned to each
    """
    # Drop empty and placeholder texts
    X = X[~X[text_column_name].isna()]
    X = X[~X[text_column_name].isin(["", " ", "NA", "n/a", "N/A", "na"])]

    all_stop_words = (set(ENGLISH_STOP_WORDS)
                      | set(["-PRON-"])
                      | set(string.punctuation)
                      | set([" "])
                      | chosen_stopwords)

    ct = CleanText()
    vectorizer = TfidfVectorizer(tokenizer=ct.lematize,
                                 ngram_range=(1, 3),
                                 stop_words=all_stop_words,
                                 min_df=5,
                                 max_df=0.4)
    vectors = vectorizer.fit_transform(X[text_column_name]).todense()

    # Adding words/phrases used in text data frequencies back into the dataset
    # (so we can see feature importances later)
    vocab = vectorizer.get_feature_names()
    vector_df = pd.DataFrame(vectors, columns=vocab, index=X.index)

    if X.shape[0] < 20:
        return "Too few examples to categorize."

    if not num_topics:
        # In case 1, add 1 to get at least 2 topics.
        # The rest are based on eyeballing numbers.
        min_topics = ceil(X.shape[0] * 0.01) + 1
        max_topics = ceil(X.shape[0] * 0.2)
        step = ceil((max_topics - min_topics) / 5)

        topic_nums = list(np.arange(min_topics, max_topics, step))

        texts = X[text_column_name].apply(ct.lematize)

        # In gensim a dictionary is a mapping between words and their integer id
        dictionary = Dictionary(texts)

        # Filter out extremes to limit the number of features
        dictionary.filter_extremes(no_below=2, no_above=0.85, keep_n=5000)

        # Create the bag-of-words format (list of (token_id, token_count))
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Pick the topic count with the best u_mass coherence
        coherence_scores = []
        for num in topic_nums:
            model = nmf.Nmf(corpus=corpus, num_topics=num, id2word=dictionary,
                            chunksize=2000, passes=5, kappa=0.1,
                            minimum_probability=0.01, w_max_iter=300,
                            w_stop_condition=0.0001, h_max_iter=100,
                            h_stop_condition=0.001, eval_every=10,
                            normalize=True, random_state=42)
            cm = CoherenceModel(model=model, texts=texts,
                                dictionary=dictionary, coherence="u_mass")
            coherence_scores.append(round(cm.get_coherence(), 5))

        scores = list(zip(topic_nums, coherence_scores))
        chosen_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]
    else:
        chosen_num_topics = num_topics

    model = NMF(n_components=chosen_num_topics, random_state=42)
    model.fit(vectors)
    component_loadings = model.transform(vectors)

    top_topics = pd.DataFrame(np.argmax(component_loadings, axis=1),
                              columns=["top_topic_num"])
    top_topic_loading = pd.DataFrame(np.max(component_loadings, axis=1),
                                     columns=["top_topic_loading"])

    X.reset_index(inplace=True, drop=False)
    vector_df.reset_index(inplace=True, drop=True)

    # Fix for duplicate text_column_name
    vector_df.columns = [x + "_vector" for x in vector_df.columns]

    combined_df = pd.concat([X, vector_df, top_topics, top_topic_loading], axis=1)
    combined_df.sort_values(by="top_topic_loading", ascending=False, inplace=True)
    # Note: this re-assignment drops the loading column and the sort order
    # again, as in the original code
    combined_df = pd.concat([X, vector_df, top_topics], axis=1)

    topic_words = {}
    sample_texts_lst = []
    for topic, comp in enumerate(model.components_):
        word_idx = np.argsort(comp)[::-1][:num_examples]
        topic_words[topic] = [vocab[i] for i in word_idx]
        sample_texts_lst.append(
            list(combined_df[combined_df["top_topic_num"] == topic][
                text_column_name].values[:num_examples]))

    topic_words_df = pd.DataFrame(columns=["topic_num",
                                           "num_in_category",
                                           "top_words_and_phrases",
                                           "sample_texts"])
    topic_words_df["topic_num"] = [k for k, _ in topic_words.items()]
    topic_words_df["num_in_category"] = (
        combined_df.groupby("top_topic_num").count().iloc[:, 0])
    topic_words_df["top_words_and_phrases"] = [x for x in topic_words.values()]
    topic_words_df["sample_texts"] = sample_texts_lst

    topic_words_explode = pd.DataFrame(topic_words_df["sample_texts"].tolist(),
                                       index=topic_words_df.index)
    topic_words_explode.columns = [
        "example{}".format(num) for num in range(len(topic_words_explode.columns))]

    concated_topics = pd.concat(
        [topic_words_df[["topic_num", "num_in_category", "top_words_and_phrases"]],
         topic_words_explode],
        axis=1)

    print("Topics created with top words & example texts:")
    print(concated_topics)

    original_plus_topics = combined_df[list(X.columns) + ["index", "top_topic_num"]]
    original_with_keywords = pd.merge(
        original_plus_topics,
        concated_topics[["topic_num", "top_words_and_phrases"]],
        left_on="top_topic_num",
        right_on="topic_num",
        how="left",
    ).drop("top_topic_num", axis=1)

    return (
        concated_topics,
        original_with_keywords,
        model,
    )
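# Hypothetical usage, assuming this method lives on a fitted helper class
# (the class name and DataFrame below are illustrative, not from the original):
# analyzer = TextAnalyzer()
# topics_df, labeled_df, nmf_model = analyzer.most_similar_texts(
#     X=df, num_examples=5, text_column_name="comment", num_topics=None)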
def get_coherence(model, texts, dictionary):
    coherence_model_ldamallet = CoherenceModel(model=model, texts=texts,
                                               dictionary=dictionary,
                                               coherence='c_v')
    return coherence_model_ldamallet.get_coherence()
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

    # Print the keywords for the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
else:
    print("Topic number NOT given. Training multiple models. "
          "Optimal one will return at the end...")
    print("______________ \n \n ")

    # This may take a long time - every model in the list will be evaluated
    # based on coherence
    coherence_values = []
    model_list = []
    for num_topics in range(5, 20, 1):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
def evaluate_allmodels(self, file_extension, bool_allmodels):
    print('Number of topics: %d' % self.topics_n)
    print('Number of documents: %d' % len(self.corpus))
    print('Number of unique tokens: %d' % len(self.dictionary))

    topicmodel_list = ['JST', 'VODUM', 'TAM', 'LAM']
    coherence_list = []
    randindex_list = []
    output_list = []

    for topic_number in range(self.min_topics_n, self.topics_n + 1):
        print(topic_number)

        # Baseline LDA model
        lda_model = lda.fit_lda_model(topic_number, self.corpus, self.dictionary)
        lda_topics = lda_model.show_topics(formatted=False)
        lda_topics = [[word for word, prob in topic] for topicid, topic in lda_topics]
        lda_cm = CoherenceModel(topics=lda_topics, texts=self.tokenized_data,
                                corpus=self.corpus, dictionary=self.dictionary,
                                coherence='c_v')
        lda_coherence = lda_cm.get_coherence()
        results_lda_df = evalmodels.evaluate_LDA(ldafoldername=file_extension,
                                                 corpus=self.corpus,
                                                 ldamodel=lda_model,
                                                 top_topics_n=1,
                                                 data=self.data,
                                                 perspectives=self.perspectives)
        lda_randindex = evalmodels.rand_index(results_lda_df['perspective'],
                                              results_lda_df['sum_topic'])
        print('LDA : \n', 'Adjust. Rand index: ', lda_randindex,
              '\n Topic coherence: ', lda_coherence)
        print(lda_topics)

        if bool_allmodels:
            '''JST'''
            jstfile = '../data/models/JST/' + file_extension + '/final.twords'
            if os.path.isfile(jstfile):
                jst_topic = fm.read_jst_toptopics(jstfile)
                jst_results_model_df = evalmodels.evaluate_JST(
                    file_extension, self.data, self.perspectives)
                coherence_list.append(
                    CoherenceModel(topics=jst_topic, texts=self.tokenized_data,
                                   corpus=self.corpus, dictionary=self.dictionary,
                                   coherence='c_v').get_coherence())
                randindex_list.append(
                    evalmodels.rand_index(jst_results_model_df['perspective'],
                                          jst_results_model_df['sum_topic']))
                output_list.append(jst_topic)

            '''VODUM'''
            vodum_topic = fm.read_vodum_toptopics(file_extension)
            vodum_results_model_df = evalmodels.evaluate_VODUM(
                vodum_foldername=file_extension, data=self.data,
                perspectives=self.perspectives, topics_list=vodum_topic)
            coherence_list.append(
                CoherenceModel(topics=vodum_topic, texts=self.tokenized_data,
                               corpus=self.corpus, dictionary=self.dictionary,
                               coherence='c_v').get_coherence())
            randindex_list.append(
                evalmodels.rand_index(vodum_results_model_df['perspective'],
                                      vodum_results_model_df['sum_topic']))
            output_list.append(vodum_topic)

            '''TAM'''
            tamfile = '../data/models/TAM/' + file_extension + '/output_topwords_tokenized_data.txt'
            tam_topic = fm.read_tam_toptopics(tamfile)
            tam_results_model_df = evalmodels.evaluate_TAM(
                tam_foldername=file_extension, data=self.data,
                perspectives=self.perspectives, topics_list=tam_topic)
            coherence_list.append(
                CoherenceModel(topics=tam_topic, texts=self.tokenized_data,
                               corpus=self.corpus, dictionary=self.dictionary,
                               coherence='c_v').get_coherence())
            randindex_list.append(
                evalmodels.rand_index(tam_results_model_df['perspective'],
                                      tam_results_model_df['sum_topic']))
            output_list.append(tam_topic)

            '''LAM'''
            lamfile = '../data/models/LAM/' + file_extension + '/lam.out'
            lam_topic_dict, lam_topic = fm.read_lam_toptopics(lamfile)
            lam_results_model_df = evalmodels.evaluate_LAM(
                lam_foldername=file_extension, data=self.data,
                perspectives=self.perspectives, topics_dict=lam_topic_dict)
            coherence_list.append(
                CoherenceModel(topics=lam_topic, texts=self.tokenized_data,
                               corpus=self.corpus, dictionary=self.dictionary,
                               coherence='c_v').get_coherence())
            randindex_list.append(
                evalmodels.rand_index(lam_results_model_df['perspective'],
                                      lam_results_model_df['sum_topic']))
            output_list.append(lam_topic)

            '''RANDOM TF-IDF'''
            random_model, top_tokens = randommodel.fit_tfidf(self.tokenized_data)
            print('RANDOM MODEL results: \n', random_model)

    # Note: coherence_list/randindex_list are only filled when bool_allmodels
    # is True, so this summary assumes that branch ran
    for i in range(0, len(topicmodel_list)):
        print('\n ----------', topicmodel_list[i],
              ' ------------- \n Adjust. Rand index: ', randindex_list[i],
              '\n Topic coherence: ', coherence_list[i])
        print('Model output:')
        for topicelement in output_list[i]:
            print(topicelement)
def main():
    t = time()
    tree = ET.parse(r'WSD_Training_Corpora\SemCor\semcor.data.xml')  # raw string for the Windows path
    root = tree.getroot()
    """
    root          - corpus
    root[0]       - text (document)
    root[0][0]    - sentence
    root[0][0][0] - word (token)
    """
    documents = [0] * len(root)
    i = 0
    for text in root:
        document = ''
        result = []
        for sentence in text:
            for word in sentence:
                document = document + " " + word.text  # one document
        document = simple_preprocess(document)
        for token in document:
            if token not in STOPWORDS and len(token) > 3:
                result.append(PorterStemmer().stem(
                    WordNetLemmatizer().lemmatize(token, pos='v')))
        documents[i] = result
        print(i)
        i += 1

    """ ------------------------------- Bag of words ------------------------------- """
    dictionary = gensim.corpora.Dictionary(documents)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

    """ ------------------- Coherence Values and Num Topics Graph ------------------- """
    def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            print("Working on next model, num_topics =", num_topics, "...")
            model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                               num_topics=num_topics,
                                               id2word=dictionary,
                                               passes=10,
                                               workers=3)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=documents,
                                            dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
        return model_list, coherence_values

    # Can take a long time to run.
    print("Computing coherence values...")
    model_list, coherence_values = compute_coherence_values(dictionary=dictionary,
                                                            corpus=bow_corpus,
                                                            texts=documents,
                                                            start=2, limit=40, step=6)

    # Show graph
    limit = 40
    start = 2
    step = 6
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(["coherence_values"], loc='best')

    """ ------------------------------------ LDA ------------------------------------ """
    print("\nWorking on simple LDA num_topics=16, passes=10...")
    lda_model_bow = gensim.models.LdaModel(bow_corpus, num_topics=16,
                                           id2word=dictionary, passes=10)
    for idx, topic in lda_model_bow.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

    print('\nPerplexity: ', lda_model_bow.log_perplexity(bow_corpus))

    coherence_model_lda = CoherenceModel(model=lda_model_bow, texts=documents,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    print('\nWorking on topic visualization')
    vis = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
    pyLDAvis.save_html(vis, 'LDA_visualized.html')

    print('Time for this WHOLE thing: {} mins'.format(round((time() - t) / 60, 2)))
    plt.show()
def ldaModelGeneration(train_corpus, test_corpus, topic_number, value_save_model):
    '''
    LDA model training, visualisation and evaluation

    Inputs:
    - train_corpus: 80% of the wikipedia corpus, used for training the LDA model
    - test_corpus: 20% of the wikipedia corpus, used for the final evaluation of
      the trained model (perplexity + coherence)
    - topic_number: number of latent topics to be found by the model
    - value_save_model: if True, saves the model, the model dictionary and the
      pyLDAvis visualisation

    Outputs:
    - perplexity value: perplexity of the trained model over unseen documents
      (test_corpus) --> common LDA evaluation metric
    - coherence value: coherence score of the trained model over unseen documents
      (test_corpus) --> less reliable
    '''
    fileDir = os.path.dirname(os.path.abspath(__file__))
    parentDir = os.path.dirname(fileDir)  # directory of the module directory; needed for the save paths below

    # ------------------------------------------------------------------------------------------------------------
    # LDA MODEL - GENERATION/VISUALISATION WITH TRAINING CORPUS
    # ------------------------------------------------------------------------------------------------------------

    # Choose LDA model name for this iteration
    model_name = 'model_' + str(topic_number)

    # Create model dictionary
    dictionary = corpora.Dictionary(train_corpus)
    # Note: no_below expects an absolute document count; a float here keeps
    # nearly every token
    dictionary.filter_extremes(no_below=0.2)
    print('\n LDA Model Inputs:\n Dictionary Size:', dictionary)

    # Create Document-Term matrix
    corpus = [dictionary.doc2bow(tokens) for tokens in train_corpus]

    # Generate LDA model
    ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary,
                                        num_topics=topic_number, passes=300)

    if value_save_model:
        # Visualise topics: words and their weights
        print("LDA Topics:")
        for i in ldamodel.show_topics(formatted=False,
                                      num_topics=ldamodel.num_topics,
                                      num_words=20):
            print(i)

        # Save model
        dictionary.save(parentDir + '/TopicModeling/LDAmodels/new_unsupervised/dic_' + str(model_name) + '.dict')
        ldamodel.save(parentDir + '/TopicModeling/LDAModels/new_unsupervised/' + str(model_name))
        print('LDA model generated and saved')

        # Save pyLDAvis (usually takes a few minutes to generate)
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
        pyLDAvis.save_html(vis, parentDir + '/TopicModeling/LDAmodels/new_unsupervised/LDA_Visualization_' + str(topic_number) + '.html')

    # ------------------------------------------------------------------------------------------------------------
    # LDA MODEL - EVALUATION
    # ------------------------------------------------------------------------------------------------------------

    # Use the same dictionary the model was trained with to transform unseen
    # data into a Document-Term matrix
    corpusTest = [dictionary.doc2bow(tokens) for tokens in test_corpus]

    # Model perplexity - must be minimised
    perplexity = ldamodel.log_perplexity(corpusTest)
    perplexityExp = math.exp(perplexity)

    # Topic coherence
    cm = CoherenceModel(model=ldamodel, corpus=corpusTest, coherence='u_mass')
    coherence = cm.get_coherence()

    return perplexityExp, coherence
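# Hypothetical usage, assuming a tokenised Wikipedia corpus split 80/20
# (the split below is illustrative, not part of the original code):
# split = int(len(tokenized_docs) * 0.8)
# perplexity, coherence = ldaModelGeneration(tokenized_docs[:split],
#                                            tokenized_docs[split:],
#                                            topic_number=20,
#                                            value_save_model=False)
# print('Perplexity:', perplexity, 'Coherence (u_mass):', coherence)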
def get_coherence(lda_model, data_lemmatized, id2word):
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                         dictionary=id2word, coherence='c_v')
    return coherence_model_lda.get_coherence()
def calculate_cv(model, data, id2word):
    coherence_model_lda = CoherenceModel(model=model, texts=data,
                                         dictionary=id2word, coherence='c_v')
    return coherence_model_lda.get_coherence()
bag_of_words = [dictionary.doc2bow(abstract) for abstract in abstracts]
print('- Read and preprocessed the dataset!')

########################## DYNAMIC TOPIC MODELING ##########################
# Build the model
print('- Training the model')
start_time = time.time()  # start counting time
ldaseq = ldaseqmodel.LdaSeqModel(corpus=bag_of_words,
                                 id2word=dictionary,
                                 time_slice=time_slices_2years_interval,
                                 num_topics=8)
print('- Model finished running in', round((time.time() - start_time) / 60), 'min(s)')

# Save the model
path = datapath('dynamic_model_code')
ldaseq.save(path)

########################## EVALUATION ##########################
# dtm_coherence returns the top words per topic for the given time slice,
# not a score, so name the variable accordingly
dtm_topics = ldaseq.dtm_coherence(time=0)

temp = CoherenceModel(topics=dtm_topics, corpus=bag_of_words,
                      dictionary=dictionary, coherence='u_mass')
print("u_mass = ", temp.get_coherence())

temp = CoherenceModel(topics=dtm_topics, texts=abstracts,
                      dictionary=dictionary, coherence='c_v')
print("c_v = ", temp.get_coherence())
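# A brief, hypothetical follow-up: LdaSeqModel can also show how a topic's top
# words evolve across the time slices (the topic index is illustrative):
# evolution = ldaseq.print_topic_times(topic=0)  # one word list per time slice
# for t, words in enumerate(evolution):
#     print('slice', t, words[:10])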