def fit_transform(self, dataset: Dataset, name: str) -> TopicModel:
    """Train a multicore LDA model on *dataset* and package it as a TopicModel.

    The fitted gensim model is also kept on ``self.model`` for later reuse.
    See https://radimrehurek.com/gensim/models/ldamulticore.html#module-gensim.models.ldamulticore
    """
    lda = LdaMulticore(
        corpus=dataset.get_gensim_corpus(),
        id2word=dataset.get_gensim_vocab(),
        num_topics=self.n,
        random_state=get_seed(),
        **self.kwargs,
    )
    # Recover the topic-word and document-topic matrices the same way
    # pyLDAvis does:
    # https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/gensim_models.py
    topic_word = lda.get_topics()
    doc_topic = lda.inference(dataset.get_gensim_corpus())[0]
    self.model = lda
    return TopicModel.from_array(name, topic_word, doc_topic)
def create_LDA_model(coursesList):
    """Fit an LDA topic model over course descriptions and report quality metrics.

    Tokenises the ``description`` column, augments tokens with detected
    bi-/tri-grams, fits gensim LDA, prints the c_v coherence and the mean
    pairwise similarity between topic-term vectors, writes a pyLDAvis HTML
    report, and saves the model to ``./best_model.lda``.

    Returns:
        tuple: (lda_model, id2word, bigrams, trigrams)
    """
    warnings.filterwarnings('ignore')

    # Tokenise, then enrich token lists with detected n-grams.
    docs = [doc.split(' ') for doc in coursesList['description']]
    bigrams, trigrams = create_n_grams(docs)
    docs = add_n_grams(docs, bigrams, trigrams)

    # Vocabulary pruned of very rare (<5 docs) and very common (>45%) terms.
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=5, no_above=0.45)
    corpus = [id2word.doc2bow(text) for text in docs]

    num_topics = config.num_lda_topic
    lda_model = LDA(corpus=corpus, id2word=id2word, num_topics=num_topics,
                    random_state=42, alpha='asymmetric', passes=25)
    lda_model.save("./best_model.lda")

    # Topic coherence (c_v metric).
    coherence_model_c_v = CoherenceModel(model=lda_model, texts=docs,
                                         dictionary=id2word, coherence='c_v')
    c_v = coherence_model_c_v.get_coherence()

    # Mean similarity over all unordered topic pairs (i, j>i); lower means
    # the topics are more distinct from each other.
    term_topic_mat = lda_model.get_topics()
    aver_cosine_similarities = 0
    for idx in range(num_topics - 1):
        sims = linear_kernel(term_topic_mat[idx].reshape(1, -1),
                             term_topic_mat[idx + 1:]).flatten()
        aver_cosine_similarities += sum(sims)
    if num_topics != 1:
        aver_cosine_similarities /= num_topics * (num_topics - 1) / 2

    print(c_v)
    print(aver_cosine_similarities)

    create_vector_topics(lda_model, corpus, id2word, coursesList)

    # Interactive topic visualisation.
    visual_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(visual_data, 'topics.html')
    return lda_model, id2word, bigrams, trigrams
# Fit a Mallet (Gibbs-sampling) LDA model alongside the existing variational
# model, persist both, then reshape the variational model's topic-term matrix
# into a labelled "signatures" DataFrame.
# NOTE(review): `ldamodel`, `mallet_path`, `bagOfMutations`, `num_sigs`,
# `idToChannel`, `output_path`, `project`, `alpha_dict` and `classification`
# are defined outside this chunk — verify against the surrounding file.
print("Now Extracting Gibbs Signatures")
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bagOfMutations,
                                             num_topics=num_sigs,
                                             id2word=idToChannel,
                                             iterations=100,
                                             topic_threshold=0.0)
#hdpmodel = HdpModel(bagOfMutations, idToChannel, K=20, T=48)
# NOTE(review): these open() handles are never closed — consider `with open(...)`.
pickle.dump(ldamodel, open(output_path + project + '_lda_model.pickle', 'wb'))
pickle.dump(ldamallet, open(output_path + project + '_mallet_model.pickle', 'wb'))
#pickle.dump(hdpmodel, output_path + project + '_hdp_model.pickle')

# Rows = vocabulary channels, columns = one signature per topic.
bayes_signatures = pd.DataFrame(ldamodel.get_topics().transpose())
columns = []
for i in range(bayes_signatures.shape[1]):
    columns.append("Signature " + alpha_dict[i])
bayes_signatures.columns = columns

# Map each row index back to its channel name via the model's vocabulary.
channels = []
for c in bayes_signatures.index:
    channels.append(ldamodel.id2word[c])
if len(channels) != 48:  # one channel had 0 counts for all samples in dataset
    # Pad with the channels missing from the model's vocabulary, giving each
    # an all-zero signature row.
    # NOTE(review): nesting of the final assignment is inferred from the
    # collapsed source — confirm it belongs inside this inner branch.
    channel_set = set(channels)
    for c in classification:
        if c not in channel_set:
            channels.append(c)
            bayes_signatures.loc[len(bayes_signatures)] = 0