def get_doc_topic_dist(OUT_DIR=OUT_DIR):
    """Load the saved dictionary, corpus and LDA model and extract their data.

    The three artefacts are read from paths built as
    ``OUT_DIR + TV_SHOW + <extension>`` (``.dict``, ``.mm`` and ``.lda``).

    Args:
        OUT_DIR: Path prefix the artefacts were saved under. Defaults to the
            module-level ``OUT_DIR``.

    Returns:
        Whatever ``_extract_data`` returns for the loaded topic model,
        dictionary and corpus.
    """
    prefix = OUT_DIR + TV_SHOW
    dictionary = corpora.Dictionary.load(prefix + '.dict')
    corpus = corpora.MmCorpus(prefix + '.mm')
    topic_model = LdaMulticore.load(prefix + '.lda')
    return _extract_data(topic_model=topic_model,
                         dictionary=dictionary,
                         corpus=corpus)
def build_model(self, fname=None, save_to=None):
    """Train (or reuse) the LDA model file and load it into ``self.model``.

    When the model file is missing, or the user confirms a rebuild, the
    training parameters are prompted for interactively, the model is trained
    and saved. In every case the model is then loaded from disk.

    Args:
        fname: Model file name. When falsy the user is prompted for one
            (default ``'model.lda'``). The name is resolved via
            ``self.__dest``.
        save_to: Unused by this method; kept for interface compatibility.

    Returns:
        The model loaded from ``fname`` (also stored on ``self.model``).
    """
    id2word = self.id2word or self.build_id2word()
    corpus = self.corpus or self.build_corpus()

    # Ask for the model file name when the caller did not supply one.
    if not fname:
        fname = click.prompt('model file name', type=str, default='model.lda')
    fname = self.__dest(fname)

    # If there is no model file or the user wants to rebuild, build the model.
    if not os.path.isfile(fname) or click.confirm(
            'There already is %s. Do you want to re run lda?' % fname):
        num_procs = click.prompt('Number of processes to launch',
                                 type=int,
                                 default=multiprocessing.cpu_count())
        num_epochs = click.prompt('Number of epochs to run', type=int,
                                  default=20)
        num_topics = click.prompt('Number of topics', type=int, default=100)

        # Single-argument, parenthesized print behaves the same on
        # Python 2 and Python 3.
        print('start building model')
        start = time()
        model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics,
                             workers=num_procs, passes=num_epochs)
        model.save(fname)
        print('building model takes: %s' %
              LdaUtils.human_readable_time(time() - start))

    self.model = LdaMulticore.load(fname)
    return self.model
def _load(self):
    """Load the saved LDA model and dictionary from the work directory.

    The model is read from ``ldamodel_<name>`` and the dictionary from
    ``dictionary_<name>.gz``, both under ``self._workdir``.

    Returns:
        ``False`` when the model path does not exist (nothing is loaded),
        ``True`` once both the model and the dictionary have been loaded.
    """
    modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
    if not modeldir.exists():
        return False

    self._lda = LdaMulticore.load(str(modeldir))
    self._dictionary = Dictionary.load(
        str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
    # Report success explicitly: previously this path returned None, which is
    # falsy just like the failure value above.
    return True
def getTopics(jobs_):
    """Describe the LDA topics of every non-``None`` job.

    Loads the bigram/trigram phrase models, the trigram dictionary and the
    LDA model from the ``data/`` directory, then calls ``lda_description``
    once per job.

    Args:
        jobs_: Iterable of jobs. ``None`` entries are skipped; for the others
            ``job_[1]`` and ``job_[0]`` are passed (in that order) as the last
            two arguments of ``lda_description``.

    Returns:
        List with one ``lda_description`` result per non-``None`` job, in
        input order.
    """
    bigram_model = Phrases.load('data/bigram_model_all')
    trigram_model = Phrases.load('data/trigram_model_all')
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    lda = LdaMulticore.load('data/lda_model_all')

    # Human-readable labels for the model's topic ids.
    topic_names = {
        0: u'Risk Management Bank',
        1: u'Big Data Report',
        2: u'Automotive SAP',
        3: u'Microsoft Java Scrum',
        4: u'Medical Consultant',
        5: u'Java Engineer',
        6: u'Computer Vision Developer',
        7: u'Data Analyst',
        8: u'BI SAP BW',
        9: u'IOT Reporting R',
        10: u'Global Project Presentation',
        11: u'Cloud Engineer IOT',
        12: u'Industry 4.0',
        13: u'Risk Consulting',
        14: u'Machine Learning Data Science',
    }

    topics_ = [
        lda_description(bigram_model, trigram_model, trigram_dictionary, lda,
                        topic_names, job_[1], job_[0])
        for job_ in jobs_
        if job_ is not None
    ]
    # The collected descriptions were previously built and then discarded.
    return topics_
def lda(corpus, num_topics=5, save_as=None, load=None, verbose=True):
    """Train or load an LDA model for ``corpus`` and prepare its visualization.

    Args:
        corpus: Object with an ``apply`` method (e.g. a pandas Series) whose
            items are space-separated token strings.
        num_topics: Number of topics used when training a new model.
        save_as: Optional file name (inside the ``models`` directory next to
            this module) to save a newly trained model under.
        load: Optional file name of a saved model in that same directory.
            When it is a string the model is loaded instead of trained.
        verbose: Print progress messages when true.

    Returns:
        Tuple ``(lda, vis)`` of the model and the data prepared by
        ``pyLDAvis.gensim.prepare``.
    """
    module_path = os.path.dirname(__file__)
    model_path = module_path + "/models"

    if verbose:
        print("prepare data")
    corpus = corpus.apply(lambda x: x.split(" "))
    dictionary = Dictionary(corpus)
    bow = [dictionary.doc2bow(doc) for doc in corpus]

    if isinstance(load, str):
        if verbose:
            print("loading lda")
        lda = LdaMulticore.load(model_path + "/" + load)
    else:
        if verbose:
            print("training lda")
        lda = LdaMulticore(bow, num_topics=num_topics)
        if save_as:
            try:
                os.mkdir(model_path)
            except OSError:
                # An already existing directory is fine; anything else
                # (permissions, missing parent, ...) must not be hidden.
                if not os.path.isdir(model_path):
                    raise
            lda.save(model_path + "/" + save_as)

    if verbose:
        print("generate visualization")
    vis = pyLDAvis.gensim.prepare(lda, bow, dictionary)
    return lda, vis
def get_model(self, n_topics=50, n_workers=6, recalculate=False,
              from_scratch=True):
    """Return the LDA model for ``n_topics``, building it when required.

    Args:
        n_topics: Number of topics; also selects the model file path via
            ``self.paths.get_lda_filepath``.
        n_workers: Worker count passed to ``LdaMulticore`` when training.
        recalculate: Rebuild the model even if a saved file exists.
        from_scratch: Must be true for a (re)build to be allowed.

    Returns:
        The loaded or newly trained model.

    Raises:
        ValueError: If a build is needed but ``from_scratch`` is false.
    """
    filepath = self.paths.get_lda_filepath(n_topics)
    model_on_disk = os.path.isfile(filepath)

    # Fast path: reuse the saved model.
    if model_on_disk and not recalculate:
        print('Loading LDA model (n_topics={})...'.format(n_topics))
        return LdaMulticore.load(filepath)

    if not from_scratch:
        raise ValueError(
            'No LDA file exists but from_scratch is False')

    dictionary = self.get_corpus_dict()
    bow_corpus = self.get_trigram_bow_corpus(dictionary)
    print('Building LDA model...')
    model = LdaMulticore(bow_corpus,
                         num_topics=n_topics,
                         id2word=dictionary,
                         workers=n_workers)
    model.save(filepath)
    print('LDA model (n_topics={}) written to {}'.format(
        n_topics, filepath))
    return model
def test():
    """Round-trip a pickled LDA model through the ``dbo.model_test`` table.

    Loads the model from ``model_dir``, pickles it, inserts it with id 1,
    selects it back, unpickles every returned value and prints it. The
    database connection is closed even when one of the steps fails.
    """
    import pickle
    # from sqlalchemy.dialects.mssql import BINARY

    # Load the model that is going to be pickled.
    listToPickle = LdaMulticore.load(model_dir)
    # Pickle the model into a byte string.
    pickledList = pickle.dumps(listToPickle, pickle.HIGHEST_PROTOCOL)

    connection = engine.connect()
    try:
        # Add the pickled model to the database table.
        connection.execute(
            """INSERT INTO dbo.model_test(id, binary_model) VALUES (?, ?)""",
            (1, pickledList))
        # Select what we just added.
        result = connection.execute(
            """SELECT binary_model FROM dbo.model_test WHERE id = 1""")
        rows = result.fetchall()
        for each in rows:
            # Each row is itself a tuple of column values.
            for pickledStoredList in each:
                # NOTE: unpickling executes arbitrary code; only acceptable
                # because the data was written by this very function.
                unpickledList = pickle.loads(pickledStoredList)
                print(unpickledList)
    finally:
        # Previously the connection was never released.
        connection.close()
def test_lda(sentence):
    """Tests the trained LDA model on an example sentence, i.e. prints the
    topics of that sentence. May only be called after train_lda().

    Args:
        sentence: A sentence to test on, as text or as UTF-8 encoded bytes.

    Raises:
        Exception: If ``sentence`` is ``None`` or empty.
    """
    # validate and process the sentence
    if sentence is None or len(sentence) < 1:
        raise Exception("Missing or empty 'sentence' argument.")
    # Only byte strings need decoding; text (``str`` on Python 3) has no
    # ``decode`` method and is used as-is.
    if isinstance(sentence, bytes):
        sentence = sentence.decode("utf-8")
    sentence = sentence.lower().strip().split(" ")
    if len(sentence) != cfg.LDA_WINDOW_SIZE:
        print("[INFO] the token size of your sentence does not match the defined window " \
              "size (%d vs %d)." % (len(sentence), cfg.LDA_WINDOW_SIZE))

    # load dictionary and trained model
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # sentence to bag of words
    bow = dictionary.doc2bow(sentence)

    # print topics of sentence
    print(lda_model[bow])
def get_required_models(algorithm, best_topic_model_path, topic_modeling_path):
    """Load the pickled dictionary and tf-idf model plus the saved topic model.

    Args:
        algorithm: Name used both as the sub-directory and as the file stem
            of the saved topic model.
        best_topic_model_path: Path prefix of the saved topic models.
        topic_modeling_path: Path prefix of the pickled ``dictionary`` and
            ``tfidf_model`` files.

    Returns:
        Tuple ``(dictionary, tfidf, topic_model)``.
    """
    dictionary_file = topic_modeling_path + "dictionary"
    with open(dictionary_file, 'rb') as handle:
        dictionary = pickle.load(handle)

    tfidf_file = topic_modeling_path + "tfidf_model"
    with open(tfidf_file, 'rb') as handle:
        tfidf = pickle.load(handle)

    model_file = (best_topic_model_path + algorithm + "/model/" + algorithm
                  + ".model")
    topic_model = LdaMulticore.load(model_file)
    return dictionary, tfidf, topic_model
def load(self):
    """Load previously saved LDA results from ``self.lda_out_file_name``.

    Loading is best effort: any ordinary error yields ``None`` instead of
    propagating.

    Returns:
        The loaded model, or ``None`` if it could not be loaded.
    """
    try:
        return LdaMulticore.load(self.lda_out_file_name)
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit through.
        return None
def explore_topic(self, topic_number, topn=20):
    """Print a formatted list of the top terms of one topic.

    The model is loaded from ``self.lda_model_filepath`` on every call.

    Args:
        topic_number: User-supplied id of the topic to show.
        topn: Number of top terms to print.
    """
    lda = LdaMulticore.load(self.lda_model_filepath)

    print("{:20} {} \n".format("term", "frequency"))
    for term, frequency in lda.show_topic(topic_number, topn):
        print("{:20} {:.5f}".format(term, frequency))
def fit_universal_models(self):
    """Fit the LDA, word2vec/k-means and tf-idf features and store them.

    Results are written into ``self.d['all']`` and ``self.d['labeled']``
    under the keys ``'_probas'``, ``'_t'`` and ``'_kmeans'``; the fitted
    tf-idf vectorizer is kept on ``self.tfidf``. The LDA model is cached in
    the file ``lda.modl`` in the current working directory and reused when
    that file exists.
    """
    vec = CountVectorizer(stop_words='english', max_features=10000)
    vec_t = vec.fit_transform(' '.join(x) for x in self.all_sentences)
    # ``items()`` works on Python 2 and 3 (``iteritems`` is Python 2 only).
    id2word = {v: k for k, v in vec.vocabulary_.items()}
    vec_corpus = gensim.matutils.Sparse2Corpus(vec_t.T)

    if os.path.isfile('lda.modl'):
        lda = LdaMulticore.load('lda.modl')
    else:
        lda = LdaMulticore(corpus=vec_corpus, id2word=id2word,
                           iterations=200, num_topics=2, passes=10,
                           workers=4)
        lda.save('lda.modl')

    all_counts = vec.transform(' '.join(x) for x in self.all_sentences)
    self.d['all']['_probas'] = np.array(
        lda.inference(gensim.matutils.Sparse2Corpus(all_counts.T))[0])
    labeled_counts = vec.transform(' '.join(x) for x in self.X)
    self.d['labeled']['_probas'] = np.array(
        lda.inference(gensim.matutils.Sparse2Corpus(labeled_counts.T))[0])

    w2vmodel = Word2Vec(self.all_sentences, size=100, window=5, min_count=3,
                        workers=4)

    # Keep the centroids of the run with the lowest SSE.
    best_centroids = None
    best_score = None
    # ``range`` works on Python 2 and 3 (``xrange`` is Python 2 only).
    for _ in range(10):  # todo -- implement kmeans++ instead of best of 10
        km = Kmeans(50)
        km.fit(w2vmodel.syn0)
        score = km.compute_sse(w2vmodel.syn0)
        if best_score is None or score < best_score:
            best_score = score
            best_centroids = km.centroids
    # ``km`` is the instance from the last run; give it the best centroids.
    km.centroids = best_centroids

    self.tfidf = TfidfVectorizer(stop_words=set(stopwords.words()))
    self.d['all']['_t'] = self.tfidf.fit_transform(
        ' '.join(x) for x in self.all_sentences)
    self.d['labeled']['_t'] = self.tfidf.transform(
        ' '.join(x) for x in self.X)

    self.d['all']['_kmeans'] = np.array(
        kmeans_word2vecify(self.all_sentences, w2vmodel, km,
                           self.d['all']['_t'], self.tfidf))
    self.d['labeled']['_kmeans'] = np.array(
        kmeans_word2vecify(self.X, w2vmodel, km, self.d['labeled']['_t'],
                           self.tfidf))
def lda_show_topic(i=None):
    """Print and return the top 25 terms of the requested topics.

    The model is loaded from ``'../Models/lda_model_all_30'`` and the topic
    names come from ``get_topic_name()``.

    Args:
        i: Iterable of topic ids. Defaults to ``[1]`` when omitted.

    Returns:
        List with one ``show_topic(x, topn=25)`` result per requested topic,
        in input order.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if i is None:
        i = [1]

    lda = LdaMulticore.load('../Models/lda_model_all_30')
    name = get_topic_name()

    lst = []
    for x in i:
        print('subtopic = {}'.format(name[x]))
        # Query the model once and reuse the result for print and return.
        terms = lda.show_topic(x, topn=25)
        print(terms)
        lst.append(terms)
    return lst
def LDA_IO(lda_model_name):
    """Load the pickled dictionary and doc index plus a saved LDA model.

    ``dictionary.pkl`` and ``doc2idx.pkl`` are read from the current working
    directory.

    Args:
        lda_model_name: Path of the saved LDA model.

    Returns:
        Tuple ``(doc2idx, lda_model, token2id)`` where ``token2id`` is the
        ``token2id`` attribute of the unpickled dictionary.
    """
    with open('dictionary.pkl', 'rb') as dictionary_file:
        dictionary = pickle.load(dictionary_file)

    with open('doc2idx.pkl', 'rb') as index_file:
        doc2idx = pickle.load(index_file)

    lda_model = LdaMulticore.load(lda_model_name)
    return doc2idx, lda_model, dictionary.token2id
def get_lda_model(corpus, dictionary, num_topics, SAVE_FILE=OUT_FILE,
                  passes=20, iterations=100):
    """Return the LDA model stored at ``SAVE_FILE + '.lda'``, training it first
    if that file does not exist.

    Args:
        corpus: Corpus used for training.
        dictionary: Passed to ``LdaMulticore`` as ``id2word``.
        num_topics: Number of topics to train.
        SAVE_FILE: Path stem of the model file; defaults to the module-level
            ``OUT_FILE``.
        passes: Training passes.
        iterations: Training iterations.

    Returns:
        The loaded or newly trained (and saved) model.
    """
    model_file = SAVE_FILE + '.lda'

    if os.path.exists(model_file):
        print('LDA model for the file:{} already exists.. loading..'.format(SAVE_FILE))
        return LdaMulticore.load(model_file)

    print('creating lda model for the {} file..'.format(SAVE_FILE))
    print('num_topics: {}'.format(num_topics))
    trained = LdaMulticore(corpus=corpus,
                           id2word=dictionary,
                           num_topics=num_topics,
                           passes=passes,
                           iterations=iterations,
                           chunksize=2500)
    trained.save(model_file)
    return trained
def __init__(self, examples, vocab, lda_vocab_path, lda_model_path, args):
    """Store the examples and vocabularies and load the LDA artefacts.

    Args:
        examples: Stored as ``self.data``.
        vocab: Stored as ``self.vocab``.
        lda_vocab_path: Path of the saved LDA dictionary.
        lda_model_path: Path of the saved LDA model.
        args: Options object; provides the four ``*_sentNum`` limits and is
            passed to ``load_item_vocab``.
    """
    self.data = examples
    self.vocab = vocab
    self.args = args
    self.item_vocab = load_item_vocab(args)

    # LDA dictionary and model, loaded from disk.
    self.lda_vocab = Dictionary.load(lda_vocab_path)
    self.lda_model = LdaMulticore.load(lda_model_path)

    # Sentence-count limits, in the order cp, desc, require, benefit.
    options = self.args
    self.sent_lim = [
        options.cp_sentNum,
        options.desc_sentNum,
        options.require_sentNum,
        options.benefit_sentNum,
    ]
def __init__(self, lda_filepath, dictionary_filepath, cache_filepath=None):
    """Initialize the LDA wrapper.

    Args:
        lda_filepath: Filepath to the trained LDA model.
        dictionary_filepath: Filepath to the dictionary of the LDA.
        cache_filepath: Optional filepath to a shelve cache for the LDA
            results. Without it no cache is opened.
    """
    self.lda = LdaMulticore.load(lda_filepath)
    self.dictionary = gensim.corpora.dictionary.Dictionary.load(dictionary_filepath)
    self.cache_synch_prob = 2  # in percent, 1 to 100
    self.cache_filepath = cache_filepath

    # Open the shelve cache only when a path was given.
    if cache_filepath is None:
        self.cache = None
    else:
        self.cache = shelve.open(cache_filepath)
def display_data(self):
    """Prepare the pyLDAvis data for the saved model and display it.

    Loads the LDA model, the bag-of-words corpus and the dictionary from the
    paths stored on ``self``, writes ``str()`` of the prepared data to
    ``self.LDAvis_data_filepath`` and hands the prepared data to
    ``pyLDAvis.display``.

    Returns:
        The object returned by ``pyLDAvis.display``.
    """
    lda = LdaMulticore.load(self.lda_model_filepath)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
    # NOTE(review): load_from_text expects the text format written by
    # Dictionary.save_as_text -- confirm the dictionary file was saved that way.
    trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(self.LDAvis_data_filepath, 'w') as f:
        f.write(str(LDAvis_prepared))
        # json.dump(LDAvis_prepared.to_json(), f)

    # Display the prepared data itself. Previously the name was rebound to a
    # file handle, and that handle was passed to pyLDAvis.display.
    return pyLDAvis.display(LDAvis_prepared)
def _load_model(self):
    """Load the saved LDA model from ``self.file_path`` if that file exists.

    Returns:
        The object returned by ``LdaMulticore.load`` for ``self.file_path``,
        or ``None`` when no file exists at that path.
    """
    # Guard clause: nothing saved yet.
    if not os.path.isfile(self.file_path):
        return None
    return LdaMulticore.load(self.file_path)
def load_all(modeldesc, sourcedesc):
    """Load a saved LDA model together with its corpus and source data.

    Args:
        modeldesc: Suffix identifying the ``ldamodel-*`` and
            ``ldacorpus-*.json`` model files.
        sourcedesc: Stem of the ``*.prep.json`` and ``*.csv`` source files.

    Returns:
        Tuple ``(ldamodel, corpus, prep_items, source_texts)`` where
        ``source_texts`` maps each CSV row's ``'id'`` to its ``'text'``.
    """
    ldamodel = LdaMulticore.load(model_file('ldamodel-%s' % modeldesc))

    corpus_path = model_file('ldacorpus-%s.json' % modeldesc)
    corpus = file_read_json(corpus_path)

    prep_path = data_source_file(sourcedesc + '.prep.json')
    prep_items = file_read_json(prep_path)

    # Index the raw texts by their id; later rows win on duplicate ids.
    source_texts = {}
    for record in csv_reader(data_source_file(sourcedesc + '.csv')):
        source_texts[record['id']] = record['text']

    return ldamodel, corpus, prep_items, source_texts
def load_topicmodel(self, model_path):
    """Load a saved topic model and its companion object from ``model_path``.

    For ``self.algo == 'gensim'`` the gensim model and dictionary are loaded
    (memory-mapped read-only). Otherwise the pickled model and vectorizer are
    read from ``model_obj.pk`` and ``vectorizer_obj.pk``.

    Args:
        model_path: Directory containing the saved files.
    """
    print("--- Loading Model ---\n")
    if self.algo == 'gensim':
        self.model_path = model_path
        self.ldamodel = LdaMulticore.load(model_path + "/model_obj", mmap='r')
        self.dictionary = Dictionary.load(model_path + "/dictionary_obj",
                                          mmap='r')
        self.num_topics = self.ldamodel.num_topics
    else:
        # pickle.load needs a binary file object, not a path string.
        # NOTE: unpickling executes arbitrary code -- only load trusted files.
        with open(model_path + "/model_obj.pk", 'rb') as model_file:
            self.ldamodel = pickle.load(model_file)
        with open(model_path + "/vectorizer_obj.pk", 'rb') as vectorizer_file:
            self.lda_vectorizer = pickle.load(vectorizer_file)
        self.num_topics = self.ldamodel.n_components
def __init__(self, lda_filepath, dictionary_filepath, cache_filepath=None):
    """Initialize the LDA wrapper.

    Args:
        lda_filepath: Filepath to the trained LDA model.
        dictionary_filepath: Filepath to the dictionary of the LDA.
        cache_filepath: Optional filepath to a shelve cache for the LDA
            results; when omitted ``self.cache`` is ``None``.
    """
    dictionary_cls = gensim.corpora.dictionary.Dictionary

    self.lda = LdaMulticore.load(lda_filepath)
    self.dictionary = dictionary_cls.load(dictionary_filepath)
    self.cache_synch_prob = 2  # in percent, 1 to 100
    self.cache_filepath = cache_filepath

    self.cache = None
    if cache_filepath is not None:
        self.cache = shelve.open(cache_filepath)
def loadModel(self, filename):
    """Load the model, dictionary and corpus saved under ``filename``.

    The dictionary is read from ``filename + '.dict'`` and the corpus from
    ``filename + '.corpus'``. The dictionary and the top five terms of
    topics 0-3 are printed, ``self.loaded`` is set and the topics are
    labelled via ``self.labelTopics``.

    Args:
        filename: Path of the saved model; also the stem of the companion
            files.
    """
    log = self.util.logDebug
    log('LDA', 'Loading model from ' + filename)

    self.model = LdaMulticore.load(fname=filename)
    self.dictionary = Dictionary.load(fname=filename + '.dict')
    self.corpus = MmCorpus(filename + '.corpus')

    print(self.dictionary)
    # Preview the first four topics.
    for topic_id in (0, 1, 2, 3):
        print(self.model.print_topic(topic_id, topn=5))

    self.loaded = True
    log('LDA', 'Model loaded in ' + self.util.stopTimeTrack())
    self.labelTopics(filename)
def show_topics():
    """Print all topics of the trained LDA model.

    May only be called after train_lda(). The model is read from
    ``cfg.LDA_MODEL_FILEPATH`` and ``cfg.LDA_COUNT_TOPICS`` topics with ten
    words each are listed.
    """
    model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    listing = model.show_topics(num_topics=cfg.LDA_COUNT_TOPICS,
                                num_words=10,
                                log=False,
                                formatted=True)

    print("List of topics:")
    for position, entry in enumerate(listing):
        # not adding the topic to the format tuple prevents unicode errors
        print("%3d:" % (position,), entry)
def main(coursesList):
    """Print, for every subject of every study plan, the matching courses.

    Loads the saved LDA model, dictionary and phrase models, builds the
    course/topic matrix through ``create_vector_topics`` and then, for each
    group and each of its study plans read through the module-level
    ``cursor``, scores the plan's subjects against the courses and prints the
    courses whose score is at least 0.2.

    Args:
        coursesList: Table of courses; its ``'description'`` column is
            tokenized and its ``'name'``..``'link'`` columns are printed.
    """
    lda = LDA.load("./best_model.lda")
    dictionary = Dictionary.load("best_model.lda.id2word")
    # Loaded but only referenced by the disabled n-gram step below.
    bigrams = Phraser.load("./bigram_model.pkl")
    trigrams = Phraser.load("./trigram_model.pkl")

    course_tokens = [description.split(' ')
                     for description in coursesList['description']]
    corpus = [dictionary.doc2bow(tokens) for tokens in course_tokens]
    create_vector_topics(lda, corpus, dictionary, coursesList)
    courses_topic = config.matrix_courses_topic.to_numpy()
    #lda, dictionary, bigrams, trigrams = create_LDA_model(coursesList)
    #courses_topic = config.matrix_courses_topic.to_numpy()

    cursor.execute("select id from auth_group")
    group_rows = cursor.fetchall()
    for group_row in group_rows:
        cursor.execute(
            "select distinct studyplan_id from students where group_id = %(id)s ",
            {'id': group_row[0]})
        plan_rows = cursor.fetchall()
        for plan_row in plan_rows:
            empty_subjects = pd.DataFrame(columns=['id_subject', 'description'])
            subject_list = WordProcessing.word_processing(
                get_work_program(plan_row[0], empty_subjects))

            subject_tokens = [text.split(' ')
                              for text in subject_list['description']]
            #subject_tokens = add_n_grams(subject_tokens, bigrams, trigrams)
            subject_corpus = [dictionary.doc2bow(tokens)
                              for tokens in subject_tokens]
            topic_prog = lda.get_document_topics(subject_corpus)

            for row_idx in range(0, len(topic_prog)):
                # Expand the sparse (topic id, weight) pairs to a dense vector.
                profile_student = np.zeros(config.num_lda_topic)
                for topic_id, weight in topic_prog[row_idx]:
                    profile_student[topic_id] += weight
                #mask = np.argsort(profile_student)[::-1][:1]

                scores = linear_kernel(profile_student.reshape(1, -1),
                                       courses_topic).flatten()
                top_courses = np.where(scores >= 0.2)[0]
                print(subject_list.loc[row_idx, 'id_subject'])
                #print(top_courses)
                print(coursesList.loc[top_courses, 'name':'link'])
def visualizeLDA(self, filename):
    """Write the pyLDAvis HTML page for the model saved under ``filename``.

    The dictionary is read from ``filename + '.dict'``, the corpus from
    ``filename + '.corpus'`` and the page is saved to ``filename + '.html'``.

    Args:
        filename: Path of the saved model; also the stem of the companion
            and output files.
    """
    log = self.util.logDebug

    dictionary = Dictionary.load(filename + '.dict')
    corpus = MmCorpus(filename + '.corpus')
    model = LdaMulticore.load(filename)

    log('LDA', 'Preparing HTML ')
    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    log('LDA', 'HTML prepared in ' + self.util.stopTimeTrack())

    pyLDAvis.save_html(prepared, filename + '.html')
    log('LDA', 'HTML saved in ' + self.util.stopTimeTrack())


#
# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
def train_lda(corpus, dictionary, lda_model_filepath, num_topics,
              run_or_load_flag):
    """Train and save an LDA model, or load a previously saved one.

    Args:
        corpus: Training corpus.
        dictionary: Passed to ``LdaMulticore`` as ``id2word``.
        lda_model_filepath: Where the model is saved to / loaded from.
        num_topics: Number of topics to train.
        run_or_load_flag: Truthy to train and save, falsy to load.

    Returns:
        The trained or loaded model.
    """
    if not run_or_load_flag:
        return LdaMulticore.load(lda_model_filepath)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        model = LdaMulticore(corpus,
                             num_topics=num_topics,
                             id2word=dictionary,
                             workers=3)
    model.save(lda_model_filepath)
    return model
def __init__(self, examples, tokenizer, lda_vocab_path, lda_model_path, args):
    """Store the examples, extend the tokenizer and load the LDA artefacts.

    Args:
        examples: Stored as ``self.data``; the first item's ``_fields`` are
            read to determine the text fields.
        tokenizer: Tokenizer that receives the special tokens returned by
            ``load_special_tokens(args)``.
        lda_vocab_path: Path of the saved LDA dictionary.
        lda_model_path: Path of the saved LDA model.
        args: Options object; provides the four ``*_sentNum`` limits.
    """
    self.data = examples
    self.tokenizer = tokenizer

    # Register the new special tokens with the tokenizer.
    special = load_special_tokens(args)
    self.spec_tokens = special
    self.tokenizer.additional_special_tokens = special
    self.tokenizer.add_tokens(special)

    self.args = args
    self.item_vocab = load_item_vocab(args)
    self.lda_vocab = Dictionary.load(lda_vocab_path)
    self.lda_model = LdaMulticore.load(lda_model_path)

    # Sentence-count limits, in the order cp, desc, require, benefit.
    options = self.args
    self.sent_lim = [
        options.cp_sentNum,
        options.desc_sentNum,
        options.require_sentNum,
        options.benefit_sentNum,
    ]
    # The first four fields of an example are kept as its text fields.
    first_example = self.data[0]
    self.text_fields = first_example._fields[:4]
def main():
    """Compute per-document topic features and merge them with the metadata.

    Reads the pickled corpus, the row ids, the metadata and the saved LDA
    model from the ``data`` directory, converts every document's topic
    distribution into a dense row, pickles the resulting frame and writes
    the metadata/topic merge to the output CSV.

    Raises:
        Exception: If the number of row ids differs from the number of
            topic rows.
    """
    options = {
        'corpus_file': 'data\\origtweets_dtm.pkl',
        'id_file': 'data\\row_origtweets.csv',
        'model_file': 'data\\orig_10topics.lda',
        'meta_file': 'data\\origtweets_meta.csv',
        'output_file': 'data\\origtweets_topics.csv'
    }
    start_time = time.time()

    id_df = pd.read_csv(options['id_file'], usecols=['row'], dtype='float')
    meta_df = pd.read_csv(options['meta_file'])
    # Pickles are binary: text mode fails on Python 3 and can corrupt the
    # data on Windows.
    with open(options['corpus_file'], 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    lda = LdaMulticore.load(options['model_file'])

    if len(meta_df) != len(corpus):
        print ('Warning: Some documents may have been deleted during processing.\n')
        print ('metadata size - corpus size = ' + str(len(meta_df) - len(corpus)))

    topic_features = [to_dense(lda[bow], lda.num_topics) for bow in corpus]
    topic_colname = 'topic{0}'.format
    # ``range`` works on Python 2 and 3 (``xrange`` is Python 2 only).
    topic_colnames = [topic_colname(t + 1) for t in range(lda.num_topics)]
    topic_df = pd.DataFrame.from_records(topic_features, columns=topic_colnames)
    with open('data\\topic_df.pkl', 'wb') as pkl_file:
        pickle.dump(topic_df, pkl_file)

    # The printed difference now matches its label (it was id - topic).
    print ('topic size - id size = ' + str(len(topic_df) - len(id_df)))
    if len(id_df) != len(topic_df):
        raise Exception('row id count (%d) does not match topic row count (%d)'
                        % (len(id_df), len(topic_df)))

    topic_df = pd.concat([id_df, topic_df], axis=1)
    merged_df = pd.merge(meta_df, topic_df, on='row', how='right', sort=False)
    merged_df.to_csv(options['output_file'], index=False)

    end_time = time.time()
    print ('running time: ' + str((end_time - start_time)/60) + ' minutes')
def build_model(self, fname=None, save_to=None):
    """Train (or reuse) the LDA model file and load it into ``self.model``.

    When the model file is missing, or the user confirms a rebuild, the
    training parameters are prompted for interactively, the model is trained
    and saved. In every case the model is then loaded from disk.

    Args:
        fname: Model file name. When falsy the user is prompted for one
            (default ``'model.lda'``). The name is resolved via
            ``self.__dest``.
        save_to: Unused by this method; kept for interface compatibility.

    Returns:
        The model loaded from ``fname`` (also stored on ``self.model``).
    """
    id2word = self.id2word or self.build_id2word()
    corpus = self.corpus or self.build_corpus()

    # Ask for the model file name when the caller did not supply one.
    if not fname:
        fname = click.prompt('model file name', type=str, default='model.lda')
    fname = self.__dest(fname)

    # If there is no model file or the user wants to rebuild, build the model.
    if not os.path.isfile(fname) or click.confirm('There already is %s. Do you want to re run lda?' % fname):
        num_procs = click.prompt('Number of processes to launch',
                                 type=int,
                                 default=multiprocessing.cpu_count())
        num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
        num_topics = click.prompt('Number of topics', type=int, default=100)

        # Single-argument, parenthesized print behaves the same on
        # Python 2 and Python 3.
        print('start building model')
        start = time()
        model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics,
                             workers=num_procs, passes=num_epochs)
        model.save(fname)
        print('building model takes: %s' % LdaUtils.human_readable_time(time() - start))

    self.model = LdaMulticore.load(fname)
    return self.model
def get_topics():
    '''Computes distribution over topics for each abstract.

    Every JSON file in ``datasets/dspace`` is read; for files whose
    ``'abstract'`` is not ``None`` the topic vector is stored under
    ``'topics'`` and the document is written to ``datasets/dspace_topics``
    under the same file name.
    '''
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore.load('lda.gensim')
    source_dir = 'datasets/dspace'
    target_dir = 'datasets/dspace_topics'

    for filename in tqdm(os.listdir(source_dir)):
        with open(os.path.join(source_dir, filename), 'r') as src:
            document = json.load(src)

        abstract = document['abstract']
        # NOTE(review): documents without an abstract are assumed to be
        # skipped entirely (not copied) -- confirm against the original layout.
        if abstract is None:
            continue

        bow = dictionary.doc2bow(tokenize(abstract.split()))
        distribution = lda.get_document_topics(bow, minimum_probability=0)
        document['topics'] = to_vec(distribution)

        with open(os.path.join(target_dir, filename), 'w') as dst:
            json.dump(document, dst)
def generate_lda_topics(self):
    """Build the dictionary, bag-of-words corpus, LDA model and pyLDAvis page.

    Every artefact is written to the corresponding ``*_filepath`` attribute
    of ``self``: the dictionary, the serialized corpus, the 3-topic LDA
    model and finally the visualization HTML.
    """
    from gensim.corpora import Dictionary, MmCorpus
    from gensim.models.ldamulticore import LdaMulticore
    import pyLDAvis
    import pyLDAvis.gensim
    import warnings
    import _pickle as pickle

    sentences_path = self.trigram_sentences_filepath

    trigram_dictionary = Dictionary(LineSentence(sentences_path))
    # trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(self.trigram_dictionary_filepath)

    # Stream the sentences as bag-of-words vectors into the corpus file.
    bow_stream = (trigram_dictionary.doc2bow(sentence)
                  for sentence in LineSentence(sentences_path))
    MmCorpus.serialize(self.trigram_bow_filepath, bow_stream)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        trained = LdaMulticore(trigram_bow_corpus,
                               num_topics=3,
                               id2word=trigram_dictionary,
                               workers=3)
    trained.save(self.lda_model_filepath)

    # Continue with the model as it was persisted.
    lda = LdaMulticore.load(self.lda_model_filepath)
    for topic_id in (0, 1, 2):
        lda.show_topic(topic_id)

    prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                       trigram_dictionary)
    pyLDAvis.save_html(prepared, self.LDAvis_html_filepath)
# Training is switched off: the condition below is never true, so the model
# is only (re)trained after this guard is edited by hand.
if 0 == 1:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus, num_topics=5, id2word=trigram_dictionary, workers=3)
    lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

# Inspect the first topic.
explore_topic(topic_number=0)

# Hand-assigned labels for the five topic ids.
topic_names = {
    0: 'looking_at_websites_for_info',
    1: 'doesnt_have_the_negative_exercise_effect',
    2: 'spend_time_looking_on_websites',
    3: 'games_and_information',
    4: 'bad_if_kids_spend_too_much_time'
}

# Persist the labels as topic_names.pkl in the intermediate directory.
topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)
def load_model(self, path):
    """Load the saved LDA model at ``path`` into ``self.lda_model``.

    Args:
        path: Location of the saved model.
    """
    loaded = LdaMulticore.load(path)
    self.lda_model = loaded
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence: a bare parenthesized string is still a
# string, so its individual characters would be used as legend labels.
plt.legend(("coherence_values",), loc='best')
plt.show()
coherence_values
# for m, cv in zip(x, coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
# optimal_model = model_list[1]
# for i, row in enumerate(optimal_model[corpus]):
#     print(i, row)
# optimal_model.save('lda.model')
# model.save('lda.model')

# Load the previously saved model.
model = LdaMulticore.load('lda.model')
# NOTE(review): ``optimal_model`` is only assigned in the disabled code above;
# confirm it is defined earlier, or whether ``model`` was meant here.
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

# def format_topics_sentences(ldamodel, corpus=corpus, texts=texts):
#     # Init output
#     sent_topics_df = pd.DataFrame()
#     # Get main topic in each document
#     for i, row in enumerate(ldamodel[corpus]):
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic