def run_lda(corpus, dictionary, texts, num_topics=10, passes=20, iterations=100):
    """Train one LdaMulticore model and collect its evaluation scores.

    Args:
        corpus: Corpus handed to ``LdaMulticore``, to ``log_perplexity`` and
            to the ``u_mass`` coherence model.
        dictionary: Passed as ``id2word`` to the LDA model and as
            ``dictionary`` to the ``u_mass`` coherence model.
        texts: Passed as ``texts`` to the ``c_uci``, ``c_v`` and ``c_npmi``
            coherence models.
        num_topics: Number of topics requested from the LDA model.
        passes: ``passes`` argument of ``LdaMulticore``.
        iterations: ``iterations`` argument of ``LdaMulticore``.

    Returns:
        A ``model`` namedtuple ``(lda_model, eval_frame)``. ``eval_frame`` is
        a one-row DataFrame holding the topic count, the log-perplexity and
        the four coherence scores; the metric column names carry a
        ``_P_<passes>_I_<iterations>`` suffix.
    """
    run_suffix = 'P_{0}_I_{1}'.format(passes, iterations)
    metric_names = (
        'Log_Perplexity',
        'Topic_Coherence(u_mass)',
        'Topic_Coherence(c_uci)',
        'Topic_Coherence(c_v)',
        'Topic_Coherence(c_npmi)',
    )
    eval_frame = pd.DataFrame(
        columns=['Num_Topics']
        + ['{0}_{1}'.format(name, run_suffix) for name in metric_names]
    )

    logging.debug('******* RUNNING LDA *************')
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        iterations=iterations,
        chunksize=2500,
    )

    # All coherence models are built before any score is computed.
    umass_scorer = CoherenceModel(
        model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass'
    )
    text_scorers = [
        CoherenceModel(model=lda_model, texts=texts, coherence=measure)
        for measure in ('c_uci', 'c_v', 'c_npmi')
    ]

    # Scores are evaluated in column order: perplexity, u_mass, then the
    # three text-based measures.
    scores = [
        num_topics,
        lda_model.log_perplexity(corpus),
        umass_scorer.get_coherence(),
    ]
    scores.extend(scorer.get_coherence() for scorer in text_scorers)
    eval_frame.loc[len(eval_frame)] = scores

    result_type = namedtuple('model', ['lda_model', 'eval_frame'])
    return result_type(lda_model, eval_frame)
def learn_lda(corpus=None, dictionary=None, num_topics=NUM_TOPICS, passes=PASSES, iterations=ITERATION):
    """Train an LdaMulticore model and score it on the training corpus.

    Args:
        corpus: Corpus used for training, for the ``u_mass`` coherence model
            and for ``log_perplexity``.
        dictionary: Passed to ``LdaMulticore`` as ``id2word``.
        num_topics: Number of topics to fit.
        passes: ``passes`` argument of ``LdaMulticore``.
        iterations: ``iterations`` argument of ``LdaMulticore``.

    Returns:
        Tuple ``(ldamodel, coherence, perplexity)`` where ``coherence`` is the
        ``u_mass`` score and ``perplexity`` is the value returned by
        ``ldamodel.log_perplexity(corpus)``.
    """
    print("\nLDA Training...\n")
    training_options = {
        'num_topics': num_topics,
        'id2word': dictionary,
        'passes': passes,
        'workers': WORKERS,
        'iterations': iterations,
    }
    ldamodel = LdaMulticore(corpus, **training_options)
    print("\nLDA Training Done!\n")

    print("\nCoherence | Perplexity computing...\n")
    # Coherence is computed before perplexity, as in the original sequence.
    coherence = CoherenceModel(
        model=ldamodel, corpus=corpus, coherence='u_mass'
    ).get_coherence()
    perplexity = ldamodel.log_perplexity(corpus)

    return ldamodel, coherence, perplexity
def make_ldamodels(pre_processed, max=6):
    """Fit LDA models for 5..max topics and print their scores.

    Args:
        pre_processed: Tokenised documents; used to build the dictionary, the
            bag-of-words corpus and as ``texts`` for the ``c_v`` coherence.
        max: Largest topic count to try (inclusive); the sweep starts at 5.

    Prints one ``(num_topics, log_perplexity, c_v_coherence)`` tuple per
    fitted model once all models have been trained. Returns nothing.
    """
    dictionary = corpora.Dictionary(pre_processed)
    corpus = [dictionary.doc2bow(text) for text in pre_processed]

    def _score(topic_count):
        # Train one model and return its evaluation tuple.
        lda = LdaMulticore(
            corpus,
            num_topics=topic_count,
            id2word=dictionary,
            passes=30,
            random_state=1,
        )
        scorer = CoherenceModel(
            model=lda,
            texts=pre_processed,
            dictionary=dictionary,
            coherence='c_v',
        )
        return (topic_count, lda.log_perplexity(corpus), scorer.get_coherence())

    perplex_coherence = [_score(num) for num in range(5, max + 1)]

    # Results are reported only after the whole sweep has finished.
    for val in perplex_coherence:
        print(val)
def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters,
                  workers, chunksize, logfilename, save=True):
    """Fit one LdaMulticore model per topic count and log its evaluation.

    Args:
        train_corpus: Corpus each model is trained on.
        test_corpus: Corpus the per-word bound / perplexity is computed on.
        id2word: Id-to-word mapping handed to ``LdaMulticore``.
        num_topics_list: List of topic counts; a model is fitted for each.
        iters: Passed to ``LdaMulticore`` as ``iterations``.
        workers: Passed to ``LdaMulticore`` as ``workers``.
        chunksize: Passed to ``LdaMulticore`` as ``chunksize``.
        logfilename: Path of the text log that is (over)written.
        save: Indicates whether each model should be saved to disk.

    Returns:
        topics_dict: Dictionary of topic lists as returned by
        ``model.show_topics``; the key is the number of topics as a string.
    """
    topics_dict = {}

    # The context manager guarantees the log is closed even when training or
    # evaluation raises part-way through the sweep.
    with open(logfilename, 'w') as logfile:
        for num_topics in num_topics_list:
            print('training', num_topics)
            # Re-seed before every fit so each topic count starts from the
            # same NumPy global random state.
            np.random.seed(NUM)

            start_time = time.time()
            model = LdaMulticore(corpus=train_corpus, id2word=id2word,
                                 num_topics=num_topics, iterations=iters,
                                 eval_every=None, workers=workers,
                                 chunksize=chunksize)
            end_time = time.time()

            if save:
                # NOTE(review): Windows-style separator kept byte-for-byte; on
                # other platforms this is a single file name in the cwd.
                fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
                model.save(fname)

            # Perplexity is 2 ** (-per-word bound).
            per_word_bound = model.log_perplexity(test_corpus)
            perplexity = np.exp2(-1.0 * per_word_bound)

            logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
            logfile.write('perplexity: ' + str(perplexity) + '\n')
            logfile.write('train_time: ' + str(end_time - start_time) + '\n'
                          + 'Topics: \n')

            topics = model.show_topics(num_topics=num_topics, num_words=20)
            topics_dict[str(num_topics)] = topics

            for topic in topics:
                # show_topics may yield plain strings or (topic_id, string)
                # pairs depending on the gensim version; log the text part.
                text = topic[1] if isinstance(topic, tuple) else topic
                # Drop non-ASCII characters but keep a text string, so the
                # concatenation below works on Python 3 (str + bytes fails).
                ascii_text = text.encode('ascii', 'ignore').decode('ascii')
                logfile.write('\n\t' + ascii_text + '\n')

    return topics_dict
passes=10, chunksize=5000, id2word=dict([(i, s) for i, s in enumerate(vocab)]), workers=7, ) print("--- %s seconds ---" % (time.time() - start_time)) fname = folder_name + 'LDA' + str(topic_number) + 'topics' model.save(fname) #Load a pretrained model model = LdaModel.load(fname, mmap='r') type(model) #perplexity perplexity = model.log_perplexity(matutils.Sparse2Corpus( X, documents_columns=False), total_docs=None) # batch LDA model_eval = [] for k in range(2, 21): topic_number = k start_time = time.time() model = LdaMulticore( matutils.Sparse2Corpus(X, documents_columns=False), num_topics=topic_number, passes=10, chunksize=5000, id2word=dict([(i, s) for i, s in enumerate(vocab)]), workers=7, )
# num_topics=35, # random_state=100, # update_every=1, # chunksize=100, # passes=10, # alpha='auto', # per_word_topics=True) pprint(model.print_topics()) doc_lda = model[corpus] doc_lda[4] model.get_document_topics(corpus)[1] # Compute Perplexity print('\nPerplexity: ', model.log_perplexity(corpus)) # a measure of how good the model is. lower the better. # Compute Coherence Score coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) pyLDAvis.enable_notebook() vis = pyLDAvis.gensim.prepare(lda_multicore, corpus, dictionary) vis mallet_path = '/home/ubuntu/Signal/mallet-2.0.8/bin/mallet' ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=35, id2word=id2word) coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=dictionary, coherence='c_v') coherence_ldamallet = coherence_model_ldamallet.get_coherence()
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] # add tokens to list other_texts.append(stemmed_tokens) other_corpus = [dictionary.doc2bow(text) for text in other_texts] # unseen_doc = other_corpus[2] # vector = ldamodel[unseen_doc] # print(vector) # generate LDA model------------------------------------------------------------------------- my_loop_num_topics = [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100] for i in my_loop_num_topics: my_num_topics = i print(my_num_topics) # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20) myldamodel = LdaMulticore(corpus, num_topics=my_num_topics, id2word=dictionary, workers=3, alpha=1e-5, eta=5e-1) print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5)) print(myldamodel.log_perplexity(corpus)) print(myldamodel.log_perplexity(other_corpus))
# Show the fitted topics, first as returned by gensim and then parsed.
results = lda.print_topics()
print("-------------")
print("TOPICS (RAW RESULTS)...")
print(results)

parsed_topics = parse_topics(lda)
print("-------------")
print("TOPICS (PARSED RESULTS)...")
pprint(parsed_topics)

# h/t: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling

# Topic distribution of every document; show the first one as a sample.
topics = lda[bags_of_words]
print(topics[0]) #> [(4, 0.3149784), (7, 0.47801575), (13, 0.20485382)]

# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself: perplexity is 2 ** (-bound), so a HIGHER (less negative) bound
# means a better fit.
print("Perplexity:", lda.log_perplexity(bags_of_words)) #> -7.74115184561741

cm = CoherenceModel(model=lda, texts=token_stream(NOVELS_DIRPATH), dictionary=dictionary, coherence="c_v")
print("Coherence Score:", cm.get_coherence()) #> 0.3695864834032673

#vis = pyLDAvis.gensim.prepare(lda, bags_of_words, dictionary)
#vis

# Stop here on purpose; everything below is currently not executed.
# `raise SystemExit` is used instead of exit(), which is a helper injected by
# the `site` module for interactive sessions and is not guaranteed to exist.
raise SystemExit

#
# SPACY NAMED ENTITY APPROACH
#

nlp = spacy.load("en_core_web_md")
matutils.Sparse2Corpus(X,documents_columns=False), num_topics=topic_number,passes=10, chunksize=5000, id2word=dict([(i, s) for i, s in enumerate(vocab)]), workers=7, ) print("--- %s seconds ---" % (time.time() - start_time)) fname = folder_name+'LDA'+str(topic_number)+'topics' model.save(fname) #Load a pretrained model model = LdaModel.load(fname, mmap='r') type(model) #perplexity perplexity = model.log_perplexity(matutils.Sparse2Corpus(X,documents_columns=False), total_docs=None) # batch LDA model_eval = [] for k in range(2,21): topic_number = k start_time = time.time() model = LdaMulticore( matutils.Sparse2Corpus(X,documents_columns=False), num_topics=topic_number,passes=10, chunksize=5000, id2word=dict([(i, s) for i, s in enumerate(vocab)]), workers=7, )
def start(num_topics, kind,
          output_root="/home/marco/Scrivania/tirocinio-unicredit/lda-html"):
    """Run the LDA pipeline for one data kind and report on the models.

    Loads and cleans the data, fits a gensim ``LdaMulticore`` model, prints
    its topics, bound and ``c_v`` coherence, writes a pyLDAvis HTML report,
    and finally grid-searches a scikit-learn ``LatentDirichletAllocation``.

    Args:
        num_topics: Number of topics for the gensim model; also used as the
            HTML report's file name.
        kind: Passed to ``loader.load_data`` and used as a sub-directory of
            the report location.
        output_root: Directory under which ``<kind>/<today>/<num_topics>.html``
            is written. Defaults to the previously hard-coded location.

    Returns:
        None. Results are printed and written to disk.
    """
    data = loader.load_data(kind)
    df = pd.DataFrame(data)
    cleaner.clean(df)

    nlps = {
        'it': spacy.load('it_core_news_lg'),
        'en': spacy.load('en_core_web_lg'),
        'fr': spacy.load('fr'),
        'de': spacy.load('de'),
    }
    tokenizers = {lang: Tokenizer(nlp.vocab) for lang, nlp in nlps.items()}

    # Customize stop words by adding to the default lists of every language.
    stop_words = set(s.ALL_STOPWORDS)
    for nlp in nlps.values():
        stop_words.update(nlp.Defaults.stop_words)

    # all_stop_words = spacy + gensim + wordcloud
    all_stop_words = stop_words.union(SW).union(stopwords)

    cleaner.remove_stopwords(df, tokenizers, all_stop_words)
    cleaner.lemmas(df, nlps)
    tok.tokenize_text(df)

    # Create an id2word dictionary and report its size before and after
    # filtering extremes.
    id2word = Dictionary(df['lemma_tokens'])
    print(len(id2word))
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))

    # Bag-of-words corpus
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Base LDA model
    base_model = LdaMulticore(corpus=corpus, num_topics=num_topics,
                              id2word=id2word, workers=12, passes=5)

    # print_topics() yields (topic_id, formatted_string) pairs and may return
    # only a subset of the topics, so label each entry with its real id
    # rather than its position in the list.
    for topic_id, description in base_model.print_topics():
        top_words = re.findall(r'"([^"]*)"', description)[:10]
        print(f"------ Topic {topic_id} ------")
        print(' '.join(top_words), end="\n\n")

    # log_perplexity() returns the per-word likelihood bound; perplexity is
    # 2 ** (-bound), so a higher (less negative) bound means a better fit.
    base_perplexity = base_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=base_model,
                                     texts=df['lemma_tokens'],
                                     dictionary=id2word, coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    # Interactive visualisation saved as <output_root>/<kind>/<today>/<n>.html
    lda_display = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
    d = pyLDAvis.display(lda_display)

    today = date.today()
    directory_path = os.path.join(output_root, str(kind), str(today))
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(directory_path, exist_ok=True)
    report_path = os.path.join(directory_path, f"{num_topics}.html")
    # Explicit UTF-8: the report embeds vocabulary from several languages.
    with open(report_path, 'w', encoding='utf-8') as report:
        report.write(d.data)

    # scikit-learn LDA tuned by grid search on raw term counts.
    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

    # Define Search Param
    search_params = {
        'n_components': [10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9],
    }

    lda = LatentDirichletAllocation()
    model = GridSearchCV(lda, param_grid=search_params)
    model.fit(data_vectorized)
    # A pasted repr of an old GridSearchCV object used to be evaluated here
    # as a bare expression; it had no effect and used arguments that newer
    # scikit-learn releases reject, so it has been removed.

    best_lda_model = model.best_estimator_
    print("Best Model's Params: ", model.best_params_)
    print("Best Log Likelihood Score: ", model.best_score_)
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
# Topic counts to sweep: 1..50 plus 100.
my_loop_num_topics = list(range(1, 51))  # set number of topics to loop
my_loop_num_topics.append(100)
print(my_loop_num_topics)

# Per-word likelihood bounds (log_perplexity) for each topic count, in the
# same order as my_loop_num_topics.
training_fit = []
test_fit = []

for my_num_topics in my_loop_num_topics:
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
    myldamodel = LdaMulticore(training_set, num_topics=my_num_topics,
                              id2word=dictionary, workers=3,
                              alpha=1e-5, eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))

    # Evaluate each corpus once and reuse the value: log_perplexity is
    # costly, and a second call is not guaranteed to return the same number,
    # so the printed and stored values could otherwise disagree.
    training_bound = myldamodel.log_perplexity(training_set)
    test_bound = myldamodel.log_perplexity(test_set)
    print(training_bound)
    print(test_bound)
    training_fit.append(training_bound)
    test_fit.append(test_bound)

# Each result list is written as a single fully-quoted CSV row.
with open('training_fit.csv', 'w') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(training_fit)

with open('test_fit.csv', 'w') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(test_fit)
level=logging.INFO) # Build LDA model with this number of topics lda_model = LdaMulticore( corpus=corpus, id2word=id2word, num_topics=topics, random_state=100, chunksize=200, passes=1000, # iterations=5000, # minimum_probability=0, per_word_topics=True) #Compute Perplexity perplexity[topics] = lda_model.log_perplexity( corpus) # a measure of how good the model is. lower the better. # Compute Coherence Score coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v') coherence[topics] = coherence_model_lda.get_coherence() #save results lda_model.save( f"trained_models/trained_lda_model_search_broad_{topics}") with open("data/perplexity.pkl", 'wb') as f: pkl.dump(perplexity, f) with open("data/coherence.pkl", 'wb') as f: pkl.dump(coherence, f)
beta=beta, iter=num_iterations) print run_id output_file = output_file_template.format(run_id=run_id) # Train and save print 'Training...' model = LdaMulticore(corpus, alpha=alpha, eta=beta, passes=50, id2word=dictionary, num_topics=num_topics, iterations=num_iterations) # model.save(output_file) print 'Done training' # Print top 10 words in topics, if desired if print_topics: topics = model.show_topics(num_topics=4, formatted=False) for topic in topics: for tup in topic[1]: print tup[0] + ": " + str(tup[1]) print '\n' # Evaluate perplexity ll = model.log_perplexity(test_corpus) print "LL: " + str(ll) print "Perp: " + str(np.exp2(-ll))