def main(fn):
    with open(fn + '.json', 'r') as f:
        data_samples = []
        doc_lengths = []
        lb = datetime(2014, 4, 25)   # datetime(2014, 1, 1)
        ub = datetime(2014, 4, 27)   # datetime(2014, 12, 12)
        for line in f:
            tweet = json.loads(line)
            if getTime(tweet) < lb or getTime(tweet) > ub:
                continue
            sample, length = preprocess_text(getText(tweet))
            data_samples.append(sample)
            doc_lengths.append(length)

    n_features = 1000
    n_topics = 10
    n_top_words = 20
    #lda_topic(data_samples, n_features, n_topics, n_top_words)
    data_viz, _ = lda_viz(data_samples, doc_lengths, n_features, n_topics, n_top_words)
    #data_viz = pyLDAvis.prepare(**data_viz)
    #pyLDAvis.show(data_viz)
    pyLDAvis.save_html(data_viz, 'topics.html')
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(
            df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False).iloc[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')   # set tokenize Reg
    en_stop = get_stop_words('en')                   # create English stop words list
    p_stemmer = PorterStemmer()                      # create p_stemmer of class PorterStemmer

    texts = []       # list for tokenized documents in loop
    text_view = ''

    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
def plot_lda_vis(model_data, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, save_html, show
    model_vis_data = prepare(**model_data)
    if mode == 'save_html' and filename:
        save_html(model_vis_data, filename)
    else:
        show(model_vis_data)
def learn_topic_model(X, vocab, graphlets, config, dbg=False):
    alpha = config['dirichlet_params']['alpha']
    eta = config['dirichlet_params']['eta']
    model = lda.LDA(n_topics=config['n_topics'], n_iter=config['n_iters'],
                    random_state=1, alpha=alpha, eta=eta)
    model.fit(X)                       # model.fit_transform(X) is also available
    topic_word = model.topic_word_     # model.components_ also works
    n_top_words = 30

    feature_freq = (X != 0).sum(axis=0)
    doc_lengths = (X != 0).sum(axis=1)

    try:
        print("phi: %s. theta: %s. nd: %s. vocab: %s. Mw: %s"
              % (model.topic_word_.shape, model.doc_topic_.shape,
                 doc_lengths.shape, len(graphlets.keys()), len(feature_freq)))

        data = {'topic_term_dists': model.topic_word_,
                'doc_topic_dists': model.doc_topic_,
                'doc_lengths': len(graphlets.keys()),
                'vocab': graphlets.keys(),
                'term_frequency': X}

        import pyLDAvis
        vis_data = pyLDAvis.prepare(model.topic_word_, model.doc_topic_,
                                    doc_lengths, graphlets.keys(), feature_freq)
        # vis_data = pp.prepare(model.topic_word_, model.doc_topic_, doc_lengths, graphlets.keys(), feature_freq)
        html_file = "../LDAvis/Learnt_Models/topic_model_" + id + ".html"
        pyLDAvis.save_html(vis_data, html_file)
        print("PyLDAVis ran. output: %s" % html_file)

        """investigate the objects used in the topics"""
        print("\ntype(topic_word): {}".format(type(topic_word)))
        print("shape: {}".format(topic_word.shape))

        topics = {}
        for i, topic_dist in enumerate(topic_word):
            objs = []
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            #print('Topic {}: {}'.format(i, ' '.join([repr(i) for i in topic_words])))
            for j in [graphlets[k] for k in topic_words]:
                objs.extend(object_nodes(j)[0])
            topics[i] = objs
            if dbg:
                print('Topic {}: {}'.format(i, list(set(objs))))

    except ImportError:
        print("No module pyLDAvis. Cannot visualise topic model")

    """investigate the highly probable topics in each document"""
    doc_topic = model.doc_topic_

    # # Each document's most probable topic - don't have the UUIDs, so dont use this.
    # pred_labels = []
    # for n in range(doc_topic.shape[0]):
    #     if max(doc_topic[n]) > config['class_thresh']:
    #         topic_most_pr = doc_topic[n].argmax()
    #         pred_labels.append(topic_most_pr)

    return doc_topic, topic_word  #, pred_labels
def vis_hdpvis(self):
    """
    Produces LDAvis visualization.
    Opens a web browser page with javascript topic viewer.
    """
    hdp_vis_data = pg.prepare(self.hdp, self.cor, self.cor.dictionary)
    pyLDAvis.save_html(hdp_vis_data, '../../data/hdpvis.html')
    vis_path = os.path.realpath('../../data/hdpvis.html')
    webbrowser.open('file://{}'.format(vis_path), new=2)
def vis_ldavis(self):
    """
    Produces LDAvis visualization.
    Opens a web browser page with javascript topic viewer.
    """
    lda_vis_data = pg.prepare(self.lda, self.cor, self.cor.dictionary)
    pyLDAvis.save_html(lda_vis_data, "../../data/ldavis.html")
    vis_path = os.path.realpath("../../data/ldavis.html")
    webbrowser.open("file://{}".format(vis_path), new=2)
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html
    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)
    if mode == 'save_html' and filename:
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data)
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html
    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)
    if mode == 'save_html' and filename:
        logging.info("Saving pyLDAVis to {}".format(filename))
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data, ip="0.0.0.0", port=8888)
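The two lda_vis variants above unpack a dict into pyLDAvis.prepare. The helper _to_py_lda_vis is not shown in this collection, but the dict it has to produce has a fixed shape: the five arrays below. This is a minimal, self-contained sketch with made-up toy numbers (three topics, four documents, five vocabulary terms), included only to illustrate the expected keys and shapes; it is not taken from any of the snippets above.

import pyLDAvis

# Toy illustration of the keyword arguments pyLDAvis.prepare expects.
# All numbers are invented; each distribution row must sum to 1.
model_vis_data = {
    'topic_term_dists': [[0.40, 0.30, 0.15, 0.10, 0.05],   # one row per topic
                         [0.10, 0.15, 0.40, 0.25, 0.10],
                         [0.05, 0.10, 0.20, 0.25, 0.40]],
    'doc_topic_dists':  [[0.70, 0.20, 0.10],                # one row per document
                         [0.25, 0.50, 0.25],
                         [0.10, 0.30, 0.60],
                         [0.33, 0.33, 0.34]],
    'doc_lengths':      [120, 95, 210, 60],                 # token count per document
    'vocab':            ['loan', 'credit', 'report', 'account', 'card'],
    'term_frequency':   [60, 50, 45, 30, 25],               # corpus-wide counts, same order as vocab
}

prepared = pyLDAvis.prepare(**model_vis_data)
pyLDAvis.save_html(prepared, 'toy_vis.html')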
def visualize(self, outfn):
    """
    Produce a pyLDAvis visualization of a model and save to disk at the given location.
    """
    if self.has_viz_data:
        pyLDAvis.save_html(self.vis_data, outfn)
        return
    assert self.has_vocab and self.has_corpus
    assert self.is_trained
    # this might crash. I think because corpus, vocab, and _lda_model are all big.
    self.vis_data = prepare(self._lda_model, self.corpus, self.vocab)
    self.has_viz_data = True
    pyLDAvis.save_html(self.vis_data, outfn)
def vectorize(self):
    '''
    args: none
    output: generates an LDA topic model of the document using gensim and pyLDAvis
    '''
    # tokenize and remove stopwords
    sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip())  # use raw text
    #sentences = Topic(raw_input('topic: ')).text  # get text from wikipedia
    #stoplist = set('for this that by or is a of the and to in are be as an it can on if at which then also with used such not from use other have some these more using has many one was may often but their they than when been its not all may some have had'.split())
    texts = [[word for word in sentence.lower().split() if word not in self.stopwords]
             for sentence in sentences]

    # compute the frequency of each token
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # remove words that appear only once
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    # construct a gensim dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  # currently, "text" is a sentence in the document

    # define LDA model
    lda = models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=10,     # what should this be ???
        update_every=1,
        chunksize=10000,
        passes=1
    )

    # visualize the lda space
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.display(vis_data)
    pyLDAvis.show(vis_data)
    with open('topic_models/' + self.name + '.json', 'a+') as topic_json:
        pyLDAvis.save_json(vis_data, topic_json)
    with open('topic_models/' + self.name + '.html', 'a+') as topic_html:
        pyLDAvis.save_html(vis_data, topic_html)
def narrative_analysis(df):
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')   # set tokenize Reg
    en_stop = get_stop_words('en')                   # create English stop words list
    p_stemmer = PorterStemmer()                      # create p_stemmer of class PorterStemmer

    texts = []   # list for tokenized documents in loop

    for index in range(0, len(df.index)):
        if str(df['narrative'].iloc[index]) != 'nan':
            intext = df['narrative'].iloc[index]
            intext = re.sub(r"X+", "", intext)
            raw = intext.lower()
            tokens = tokenizer.tokenize(raw)
            # remove stop words from tokens
            stopped_tokens = [i for i in tokens if not i in en_stop]
            # stem tokens and add them to list
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            texts.append(stemmed_tokens)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "narrative_lda.html")
    #pyLDAvis.save_json(vis_data, "narrative_lda.json")
    pyLDAvis.save_html(vis_data, "narrative_lda_2016.html")
    pyLDAvis.save_json(vis_data, "narrative_lda_2016.json")

    return 0
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Creating LDA model
print('\nBuilding the model\n')
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=100,
                                            alpha='auto',
                                            per_word_topics=True)

topics = lda_model.print_topics()
print(topics)
#topics = zip(*topics)

# Visualize the topics
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(p, 'lda.html')
def main(self):
    # set seed
    np.random.seed(1)

    # preprocess tweets
    self.preprocess_tweets()

    # word cloud
    word_cloud_img = os.path.join(self.this_dir, "../plots/word_cloud.png")
    if not os.path.isfile(word_cloud_img):
        self.word_cloud(word_cloud_img)

    # initialise the count vectorizer with English stop words
    count_vectorizer = CountVectorizer(stop_words="english")

    # fit and transform preprocessed tweets (counts the num of each word in vector)
    count_data = count_vectorizer.fit_transform(self.tweets_df["content_pro"])

    # most common words
    most_comm_words = self.most_common_words(count_data, count_vectorizer)

    # best fitted LDA model and num of topics
    best_lda_model = self.grid_search(count_data)
    n_topics = best_lda_model.n_components

    # best fitted LDA model performance
    log_like_best, perp_best = self.performance(count_data, best_lda_model)
    print("Model: best_lda_model", end="\n")
    print(f"Best Model's Params: {best_lda_model.get_params()}")
    print(f"Log Likelihood: {log_like_best}")
    print(f"Perplexity: {perp_best}")

    # extract topics from top keywords in each tweet
    topics, doc_topic_df = self.extract_topics(count_data, count_vectorizer, best_lda_model)

    # add topics to self.tweets_df
    self.tweets_df["topic"] = doc_topic_df["topic"].tolist()

    # set LDAvis_prepared paths
    LDAvis_prep_data_path = os.path.join(self.this_dir, "../data/ldavis_data_" + str(n_topics))
    LDAvis_prep_html_path = os.path.join(self.this_dir, "../plots/ldavis_html_" + str(n_topics))

    # load LDAvis_prepared data from disk
    # plot showing topics in topic model that has been fitted to corpus of text data
    try:
        with open(LDAvis_prep_data_path, "rb") as f:
            LDAvis_prep = cPickle.load(f)
    except FileNotFoundError:
        LDAvis_prep = sklearn_lda.prepare(best_lda_model, count_data, count_vectorizer)
        with open(LDAvis_prep_data_path, "wb") as f:
            cPickle.dump(LDAvis_prep, f)

    # save html file
    pyLDAvis.save_html(LDAvis_prep, LDAvis_prep_html_path + ".html")

    # returns interactive plot, groups, and 10 most common words
    return (
        self.tweets_df[["content_pro", "topic"]],
        LDAvis_prep_html_path,
        most_comm_words,
        topics,
    )
from gensim import corpora, models
import pyLDAvis.gensim
import pyLDAvis

dic = corpora.Dictionary.load('data/model/newsgroups.dict')
corp = corpora.MmCorpus('data/model/newsgroups.mm')
lda = models.ldamodel.LdaModel.load('data/model/newsgroups_50.model')

# Prepare the data for the visualization
newsgroup_data = pyLDAvis.gensim.prepare(lda, corp, dic)

# Create the visualization
pyLDAvis.display(newsgroup_data)

# Save the visualization as a html file
pyLDAvis.save_html(newsgroup_data, 'data/model/newsgroup_ldavis.html')
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto",
                                            per_word_topics=True,
                                            )

# 13. View the topics in LDA model
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

# 14. Compute Model Perplexity and Coherence Score
# Compute Perplexity
print("\nPerplexity: ", lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=data_lemmatized,
                                     dictionary=id2word,
                                     coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)

# 15. Visualize the topics-keywords
# Visualize the topics
# pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, "lda.html")
def visualize_data(bow_corpus, tweet_dictionary, lda_model):
    lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, tweet_dictionary)
    pyLDAvis.save_html(lda_visualization, 'vis.html')
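Note that this snippet imports pyLDAvis.gensim_models, while most of the other snippets in this collection use pyLDAvis.gensim; the module was renamed in newer pyLDAvis releases (around the 3.x series). A small version-tolerant import shim, sketched below, lets the same prepare/save_html calls run against either name; the alias gensimvis is my own choice and not part of the snippet above.

import pyLDAvis

# Version-tolerant import: newer pyLDAvis exposes the gensim helpers as
# pyLDAvis.gensim_models, older releases as pyLDAvis.gensim.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# gensimvis.prepare(lda_model, bow_corpus, tweet_dictionary) can then be called
# exactly as in the snippet above, regardless of the installed pyLDAvis version.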
def visualize_model(model, dictcorpus, vectorcorpus, vizfile):
    visualization = pyLDAvis.gensim.prepare(model, vectorcorpus, dictcorpus,
                                            sort_topics=False, mds="mmds")
    pyLDAvis.save_html(visualization, vizfile)
count_vectorizer = CountVectorizer(stop_words=my_stop_words)

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(texts)

# Visualise the 10 most common words
#count_data = crossRef(count_data, men)
#plot_10_most_common_words(count_data, count_vectorizer)

# Tweak the two parameters below
number_topics = 11
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

LDAvis_data_filepath = os.path.join('./ldavis_prepared_' + str(number_topics))

# this is a bit time consuming - make the if statement True
# if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    #f.encode('utf-8').strip()
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './outs/ldavis_prepared_' + str(number_topics) + '.html')
def make(corpus_path, dest_path, num_topics=50, passes=10):
    corpus_id = corpus_path.split("/").pop().replace('.pkl', "")
    cfname = os.path.join(dest_path, "lda_" + corpus_id)

    data_words = create_data_words(corpus_path)

    print("MAKING GRAM MODELS")
    # Build the bigram and trigram models
    # higher threshold fewer phrases.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Ngram models
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(bigram_mod, data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(nlp, data_words_bigrams,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    print("MAKING CORPUS RESOURCES")
    texts = data_lemmatized

    # Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # TDF
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = None
    # Check if model exists
    if not os.path.isfile(cfname):
        # LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=passes,
                                                    alpha='auto',
                                                    per_word_topics=True)
        lda_model.save(cfname)
    else:
        print("MODEL FOUND USING PREVIOUS")
        lda_model = gensim.models.ldamodel.LdaModel.load(cfname)

    doc_lda = lda_model[corpus]

    # Compute Perplexity
    # a measure of how good the model is. lower the better.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
    vis_file_name = dest_path + '/' + str(num_topics) + "_" + corpus_id + ".html"
    pyLDAvis.save_html(vis, vis_file_name)
def get_result_lda(corpus_file_name, paper_file_name, trial_file_name,
                   result_paper_file, result_trail_file, label_word_file,
                   result_topic_file, most_relevant_file, topic_number, html_file):
    """
    Perform lda, get the result
    :param html_file:
    :param topic_number:
    :param most_relevant_file:
    :param result_topic_file:
    :param corpus_file_name: Corpus file
    :param paper_file_name: paper file
    :param trial_file_name: trial file
    :param result_paper_file: Tagged paper file
    :param result_trail_file: Tagged trial file
    :param label_word_file: Main word file
    :return:
    """
    # TODO: data processing
    data = pd.read_excel(corpus_file_name)
    data['context'] = data['context'].apply(lambda x: x if x is not np.nan else '')
    doc_clean = [x.split() for x in list(data['context'])]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    # TODO: model training
    Lda = gensim.models.ldamodel.LdaModel
    ldamodel = Lda(doc_term_matrix, num_topics=topic_number, id2word=dictionary,
                   random_state=4, iterations=1000)

    # TODO: dominant topic, main keywords, Perc Contribution per document
    df_dominant_topic = format_topics_sentences(ldamodel, doc_term_matrix, doc_clean, data['id'])
    df_dominant_topic.columns = ['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'id']

    # Show
    df_dominant_topic.to_excel(result_topic_file)
    labels = pd.DataFrame(list(df_dominant_topic['Dominant_Topic']), columns=['label'])

    paper_data = pd.read_excel(paper_file_name)
    trail_data = pd.read_excel(trial_file_name)
    paper_data['label'] = list(labels['label'])[:len(paper_data)]
    trail_data['label'] = list(labels['label'])[len(paper_data):]
    paper_data.to_excel(result_paper_file, index=False)
    trail_data.to_excel(result_trail_file, index=False)

    # TODO: Statistics of each category:
    print(labels['label'].value_counts())
    print('Statistics of the number of papers in each category')
    data['label'] = labels
    data1 = data[data['type'] == '论文']   # '论文' = paper
    print(data1['label'].value_counts())
    print('Statistics of the number of trial in each categories')
    data2 = data[data['type'] == '试验']   # '试验' = trial
    print(data2['label'].value_counts())

    # TODO: Determine the top words by total frequency in each category
    s = ldamodel.print_topics(num_topics=topic_number, num_words=20)
    result_topic = []
    for doc_class, doc_t in s:
        doc_topics = doc_t.split('+')
        for doc_topic in doc_topics:
            result_topic.append([doc_class,
                                 doc_topic.split('*')[1].strip(),
                                 doc_topic.split('*')[0].strip()])
    result_topic = pd.DataFrame(result_topic, columns=['class', 'topic', 'score'])
    result_topic.to_excel(label_word_file, index=False)

    # TODO: Determine the most similar corpus (corpus, ID)
    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = df_dominant_topic.groupby('Dominant_Topic')
    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat(
            [sent_topics_sorteddf_mallet,
             grp.sort_values(['Topic_Perc_Contrib'], ascending=[0]).head(10)],
            axis=0)

    # Reset Index
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text", 'id']

    # Show
    sent_topics_sorteddf_mallet.head(10)
    sent_topics_sorteddf_mallet.to_excel(most_relevant_file, index=None)

    # TODO: Measurement model (disabled)
    # cm_result = []
    # for coherence in ['u_mass']:
    #     goodcm = CoherenceModel(model=ldamodel, corpus=doc_term_matrix, dictionary=dictionary, coherence=coherence)
    #     cm_result.append(goodcm.get_coherence())
    # for coherence in ['c_v', 'c_uci', 'c_npmi']:
    #     goodcm = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence=coherence)
    #     cm_result.append(goodcm.get_coherence())
    # print(cm_result)

    vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    # pyLDAvis.show(vis)
    pyLDAvis.save_html(vis, html_file)
    print('Clustering complete!')
                          id2word=dictionary,
                          num_topics=k,
                          alpha=alpha,
                          eta=eta,
                          random_state=100,
                          chunksize=100,
                          passes=10,
                          per_word_topics=True)

for idx, item in enumerate(final_model.print_topics(num_topics=-1, num_words=30)):
    print("Topic %s has following keywords: " % (idx))
    patterns = re.findall("\"(.*?)\"", str(item), re.S)
    print(patterns)

# print(cv_score(corpus=corpus, dict_=dictionary, k=30, alpha="symmetric", eta="auto"))

# visualize with pyLDAvis
viz = pyLDAvis.gensim.prepare(final_model, corpus, dictionary)
pyLDAvis.save_html(viz, "./tm_viz.html")   # name of the generated html

# patch the three external asset URLs in the generated page
with open("./tm_viz_new.html", "w") as t1:
    with open("./tm_viz.html", "r") as f2:
        webpage = f2.read()
        # keep the css/js files locally to avoid blocking while they load
        webpage = webpage.replace("https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css", "ldavis.v1.0.0.css")
        webpage = webpage.replace("https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js", "ldavis.v1.0.0.js")
        # the d3.js reference needs fixing: point back to the old v3 release
        webpage = webpage.replace("https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js", "https://d3js.org/d3.v3.js")
    t1.write(webpage)
folder = 'lda/manifesto/'

# this has been added in previous steps (basically the number of topics)
postfix = '_100'

manifestos = [
    'cdu_2002.csv', 'cdu_2005.csv', 'cdu_2009.csv', 'cdu_2013.csv',
    'fdp_2002.csv', 'fdp_2005.csv', 'fdp_2009.csv', 'fdp_2013.csv',
    'gruene_2002.csv', 'gruene_2005.csv', 'gruene_2009.csv', 'gruene_2013.csv',
    'linke_2005.csv', 'linke_2009.csv', 'linke_2013.csv',
    'pds_2002.csv', 'piraten_2013.csv',
    'spd_2002.csv', 'spd_2005.csv', 'spd_2009.csv', 'spd_2013.csv'
]

start = time.time()
for file in manifestos:
    checkpoint = time.time()
    print('starting analysis for file ' + file)
    model = gensim.models.ldamodel.LdaModel.load(folder + file + postfix + '.model')
    corpus = gensim.corpora.mmcorpus.MmCorpus(folder + file + postfix + '.corpus')
    dictionary = gensim.corpora.dictionary.Dictionary.load(folder + file + postfix + '.dictionary')
    visdata = gensimvis.prepare(model, corpus, dictionary, R=15)
    pyLDAvis.save_html(visdata, folder + file + postfix + '.html')
    print('generated html for ' + file + ' in ' + str(time.time() - checkpoint) + 's')

print('generating html for all files took ' + str(time.time() - start) + 's')
def get_pyLDAvis(model, corpus, id2word, current_dir):
    print("\n* Now we will visualize the topics using pyLDAvis.")
    vis = pyLDAvis.gensim.prepare(model, corpus, id2word, sort_topics=False)
    pyLDAvis.save_html(vis, '%s/topic_model.html' % current_dir)
    print("PyLDAvis saved to html.")
text = list(csv.reader(open('result.txt', encoding='UTF-8')))

for i in range(len(text)):
    for j in range(len(text[i])):
        text[i][j] = re.sub(r'[!-@]', "", text[i][j])
        text[i][j] = re.sub(r'[{-~]', "", text[i][j])
        text[i][j] = text[i][j].lower()

data = []
for i in range(len(text)):
    data.append([word for word in text[i]
                 if word not in stop_words and len(word) >= 2])

with open('fil_data.pkl', 'wb') as w:
    pickle.dump(data[::2], w)

dictionary = gensim.corpora.Dictionary(data)
dictionary.filter_extremes(no_below=3, no_above=0.8)
corpus = [dictionary.doc2bow(t) for t in data]
print('vocab size: ', len(dictionary))

# LDAvis
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                            num_topics=6, random_state=0)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
vis
pyLDAvis.save_html(vis, 'LDAvis_output.html')
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Tweak the two parameters below
number_topics = 10
number_words = 20

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

LDAvis_data_filepath = os.path.join(
    output_dir, filename_stem + '_' + 'lda_vis_prepared_' + str(number_topics))

# this is a bit time consuming - make the if statement True
# if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(
    LDAvis_prepared,
    os.path.join(output_dir,
                 filename_stem + '_' + 'lda_vis_prepared_' + str(number_topics) + '.html'))

foo = 1
                                     texts=all_studio_comment_list,
                                     dictionary=id2word,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
# Coherence Score: 0.28624721848288204

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, comment_vectors, id2word, mds='mmds')
#pyLDAvis.show(vis)
pyLDAvis.save_html(vis, 'lda_t15_w40.html')

mallet_path = 'mallet-2.0.8/bin/mallet'  # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=comment_vectors,
                                             num_topics=20, id2word=id2word)

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# endregion 2.4) LDA from Gensim #########

# region 2.5) Build tfidf model
comment_tfidf = TfidfModel(comment_vectors)
def modeling(self):
    data_ = self.data
    string_ = self.string

    stop_words = stopwords.words('english')
    stop_words.extend(list(STOPWORDS))
    stop_words.extend(list(ENGLISH_STOP_WORDS))
    stop_words1 = get_stop_words('english')
    stop_words.extend(stop_words1)
    stop_words = list(set(stop_words))
    stop_words.extend(["_d180g", "Object", "Name", "NaN", "dtype", "Length",
                       "backupnotes", "contact", "history"])

    dataS4 = data_[string_].values.tolist()

    # Word tokenization
    def sent_to_words(sentences):
        for sentence in sentences:
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
                for doc in texts]

    data_words = list(sent_to_words(dataS4))
    data_words = remove_stopwords(data_words)

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=2, threshold=2)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=2)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=9,
                                                random_state=100,
                                                per_word_topics=True)

    # Compute Perplexity
    #print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                         dictionary=id2word, coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    #print('\nCoherence Score: ', coherence_lda)

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, string_ + '.html')

    return (print('\nPerplexity: ', lda_model.log_perplexity(corpus)),
            print('\nCoherence Score: ', coherence_lda))
def exportLDA_vis(best_model, corpus, id2word, filename='pyLDAvis.html'):
    import pyLDAvis
    import pyLDAvis.gensim
    panel = pyLDAvis.gensim.prepare(best_model, corpus, id2word)
    pyLDAvis.save_html(panel, filename)
def visualize_pyldavis(lda_model, corpus, dictionary):
    prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(prepared, 'vis_topic_model_02.html')
    pyLDAvis.show(prepared)
NUM_TOPICS = 8  # optimum = 8

lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# # Build the LSI model
# lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

print("LDA Model:")
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx))

from pyLDAvis import gensim
import pyLDAvis

visualisation = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(visualisation, 'LDA_Visualization.html')

# create lda values
lda_value = []
for token in tokenized_data:
    #print(text)
    bow = dictionary.doc2bow(token)
    lda_value.append(lda_model[bow])

# bow = dictionary.doc2bow(clean_text(text_df[3]))
# print(lda_model[bow])
# bow3 = dictionary.doc2bow(tokenized_data[3])
# print(lda_model[bow3])

embeddings_index_all = {}
def produce_visualization(
        file_names=["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
        tokenizer=stem_tokenizer,
        labels=['ACCOUNT', 'HERO'],
        max_sentences=None,
        as_sentences=False,
        output_file='ldavis'):
    data = load_data.load_xlsx_data(file_names, max_sentences=max_sentences,
                                    as_sentences=as_sentences, labels=labels)
    excerpts = list(data['Excerpts'])

    # exclude labels with no true label
    keep_labels = []
    for lab in labels:
        if sum(data[lab]) > 0:
            keep_labels.append(lab)
        else:
            print(lab + " label not present in files: " + str(file_names))
    labels = keep_labels

    # create a subset of the data frame that is the account label types
    main_types_df = data[labels]
    main_types_df.index = range(1, main_types_df.shape[0] + 1)

    # drop rows and excerpts with no label
    # build vocab and doc_lengths
    all_words = []
    doc_lengths = []
    main_types_excerpts = []
    for idx, doc in enumerate(excerpts):
        if sum(main_types_df.loc[idx + 1]) < 1:
            # if this document had no main type label
            main_types_df = main_types_df.drop([idx + 1], axis=0)
        else:
            main_types_excerpts.append(doc)
            doc_toks = stem_tokenizer(doc)
            all_words.extend(doc_toks)
            doc_lengths.append(len(doc_toks))

    fdist = FreqDist(all_words)
    fdistmc = fdist.most_common()
    vocab = [word for word, count in fdistmc]
    term_frequency = [count for word, count in fdistmc]
    print("number of labelled documents: " + str(len(doc_lengths)))

    # build topic-term distribution
    stop_words = set(stopwords.words('english'))
    freq_dist_dict = {}
    topic_size = []
    topic_num_words = []
    i = 0
    for coln in main_types_df.columns:
        categ_excerpts = list(compress(main_types_excerpts, main_types_df[coln].values))
        exq = [tokenizer(doc) for doc in categ_excerpts]
        excerpt_words = [tok for tok_list in exq for tok in tok_list]
        i = i + 1
        topic_size.append(len(exq))
        topic_num_words.append(len(excerpt_words))
        #print("Topic "+str(i)+": "+coln+" number of excerpts: "+str(len(exq)))
        words = [word for word in excerpt_words
                 if word.lower() not in stop_words and word.isalpha()]
        freq_dist_dict[coln] = FreqDist(words)

    topic_term_dists = []
    for coln in main_types_df.columns:
        ffdist = freq_dist_dict[coln]
        fdist = [ffdist.freq(word) if word in ffdist.keys() else np.nextafter(float(0), (1))
                 for word in vocab]
        #print("categ: "+str(coln)+" len of freq dist "+str(len(fdist))+" sum of vetor: "+str(sum(fdist)))
        topic_term_dists.append([float(i) for i in fdist])

    # Document-topic distribution
    doc_topic_dists = []
    for index, rowi in main_types_df.iterrows():
        row = list(rowi)
        if (sum(row) > 1.01 or sum(row) < 0.99):
            #print(str(index)+" row: "+str(row))
            # normalize row
            row = [r / sum(row) for r in row]
        if (sum(row) == 0):
            print(row)
        doc_topic_dists.append([float(i) for i in row])

    # format for pyLDAvis
    data_dict = {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency
    }
    #print('Topic-Term shape: %s' % str(np.array(data_dict['topic_term_dists']).shape))
    #print('Doc-Topic shape: %s' % str(np.array(data_dict['doc_topic_dists']).shape))

    # save data as json
    with open(output_file + '.json', 'w') as json_file:
        json.dump(data_dict, json_file)

    vis_data = pyLDAvis.prepare(**data_dict, n_jobs=-1)

    # order the columns for pyldavis
    col_order = vis_data.topic_order
    categs = list(main_types_df.columns)
    string_list = [""] * len(col_order)
    for idx, i in enumerate(col_order):
        msg = ("Topic " + str(idx + 1) + ": " + categs[i - 1] +
               ", number of words: " + str(topic_num_words[i - 1]))
        print(msg)
        string_list[idx] = msg

    with open(output_file + '.txt', 'w') as f:
        for msg in string_list:
            f.write("%s\n" % msg)

    pyLDAvis.save_html(vis_data, output_file + '.html')
    #if display:
    #    pyLDAvis.display(vis_data)
    return vis_data
import pickle
import pandas as pd

infile = open('/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/lda_tfidf.pkl', 'rb')
lda_tfidf = pickle.load(infile)

infile = open('/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/dtm_tfidf.pkl', 'rb')
dtm_tfidf = pickle.load(infile)

infile = open('/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/tf_idf_vectorizer.pkl', 'rb')
tf_vectorizer = pickle.load(infile)

import pyLDAvis
import pyLDAvis.sklearn

visualization = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tf_vectorizer)
pyLDAvis.save_html(
    visualization,
    '/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/LDA_Visualization.html')
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
pyLDAvis.save_html(panel, 'models/lda.html')

# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
def topicVisuals(df):
    parent_df = preProcess_data(df)
    parent_df['sat1'] = parent_df['sat1'].astype(int)

    parent_df['bagofwords'].head(10)

    df = parent_df

    # Segmenting the complete data frame based on the quarter
    df_q1 = df[df['rptqtr'] == '201603']
    df_q2 = df[df['rptqtr'] == '201604']
    df_q3 = df[df['rptqtr'] == '201701']
    df_q4 = df[df['rptqtr'] == '201702']
    df_q5 = df[df['rptqtr'] == '201703']
    df_q6 = df[df['rptqtr'] == '201704']
    df_q7 = df[df['rptqtr'] == '201801']
    df_q8 = df[df['rptqtr'] == '201802']
    df_q9 = df[df['rptqtr'] == '201803']

    df_q1['sat1'] = df_q1['sat1'].astype(int)
    df_q2['sat1'] = df_q2['sat1'].astype(int)
    df_q3['sat1'] = df_q3['sat1'].astype(int)
    df_q4['sat1'] = df_q4['sat1'].astype(int)
    df_q5['sat1'] = df_q5['sat1'].astype(int)
    df_q6['sat1'] = df_q6['sat1'].astype(int)
    df_q7['sat1'] = df_q7['sat1'].astype(int)
    df_q8['sat1'] = df_q8['sat1'].astype(int)
    df_q9['sat1'] = df_q9['sat1'].astype(int)

    print(df_q1.shape)
    print(df_q2.shape)
    print(df_q3.shape)
    print(df_q4.shape)
    print(df_q5.shape)
    print(df_q6.shape)
    print(df_q7.shape)
    print(df_q8.shape)
    print(df_q9.shape)

    # Topic modelling
    # First tokenize the data sentence-wise and then word-wise to avoid missing characters like punctuation
    def tokenize_only(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
        tokens = [word.lower() for sent in nltk.sent_tokenize(str(text))
                  for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        return filtered_tokens

    word_list = tokenize_only(df['bagofwords'].tolist())
    stop_words = stopwords.words('english')
    filtered_words = [word for word in word_list if word.lower().strip() not in stop_words]

    tfidf_vect = TfidfVectorizer(norm='l2', min_df=0, use_idf=True,
                                 smooth_idf=False, sublinear_tf=False)
    count_vect = CountVectorizer(max_df=0.80, max_features=50000)
    X = tfidf_vect.fit_transform(filtered_words)
    _X_ = count_vect.fit_transform(filtered_words)

    dense_matrix = _X_.todense()
    print("Sparsity: ", ((dense_matrix > 0).sum() / dense_matrix.size) * 100, "%")

    n_components = 10
    lda = LatentDirichletAllocation(n_components=n_components, learning_method='batch',
                                    max_iter=25, random_state=0)
    document_topics = lda.fit_transform(_X_).T

    sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
    feature_names = np.array(count_vect.get_feature_names())
    mglearn.tools.print_topics(topics=range(n_components), feature_names=feature_names,
                               sorting=sorting, topics_per_chunk=5, n_words=10)

    # Log likelihood: higher is better
    print("Log Likelihood using tf-idf: ", lda.score(_X_))
    print("Perplexity using tf-idf: ", lda.perplexity(_X_))

    lda_model, countMatrix, countVectorizer, tfidfMatrix, tfidfVectorizer = lda, _X_, count_vect, X, tfidf_vect
    p = pyLDAvis.sklearn.prepare(lda_model, countMatrix, countVectorizer, mds='mmds')
    visual_file = 'visuals.html'
    pyLDAvis.save_html(p, os.getcwd() + '/templates/' + visual_file)

    detract_df = parent_df[parent_df['sat1'] < 8]
    promo_df = parent_df[parent_df['sat1'] > 8]

    return (visual_file)
help="specify LDA model.") args.add_argument("-s", "--save_to_file", type=str, help="speficy file which the HTML will be saved to.") args.add_argument("-t", "--use_tfidf", action="store_true", help="use TF-IDF corpus.") args.add_argument( "--method", type=str, default="pcoa", help="specify a method for MDS by one from 'pcoa', 'mmds', or 'tsne'.") return args.parse_args() if __name__ == "__main__": args = parse_arg() model = LdaModel.load(args.model[0]) corpus = MmCorpus(args.corpus[0]) if args.use_tfidf: tfidf = TfidfModel(corpus) corpus = tfidf[corpus] dictionary = Dictionary.load_from_text(args.dictionary[0]) vis = pyLDAvis.gensim.prepare(model, corpus, dictionary, mds=args.method) if args.save_to_file is not None: pyLDAvis.save_html(vis, args.save_to_file) else: pyLDAvis.show(vis)
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import gensim
import csv
import logging
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

tweets = []
with open('data/clear_covid_tweets.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for tweet in reader:
        tweets.append(tweet['text'].split(' '))

dictionary = Dictionary(tweets)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    LdaMallet(path_to_mallet_binary, corpus=corpus, num_topics=50, id2word=dictionary))

vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'lda.html')
def get_vis(model, corpus, dictionary):
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    pyLDAvis.display(vis)
    pyLDAvis.save_html(vis, configuration.lda_dir + 'lda_visualization_test.html')
def main(root_path):
    timeStamp = str(int(time()))

    # todo change this for full run
    num = 1000  # 128915 is the total

    out_file_name = '../out/output-' + timeStamp + "-" + str(num) + '.txt'
    out_file = open(out_file_name, 'w')

    start = time()
    spark = init_spark()
    json_files = read_json_files(root_path, spark, num)
    data = get_body_text(spark, json_files)
    print("data reading done")

    # clean the data
    word_clean_up_F = F.udf(lambda x: clean_up(x), StringType())
    data = data.withColumn("body_text_cleaned", word_clean_up_F("body_text"))
    data = data.select("body_text_cleaned")
    print("data processing done")

    tokenizer = Tokenizer(inputCol="body_text_cleaned", outputCol="words")
    token_DataFrame = tokenizer.transform(data)
    token_DataFrame = token_DataFrame.select("words")

    # Remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned_DataFrame = remover.transform(token_DataFrame)
    cleaned_DataFrame = cleaned_DataFrame.select('filtered')

    # Count vectorizer
    cv_tmp = CountVectorizer(inputCol="filtered", outputCol="count_features")
    cvmodel = cv_tmp.fit(cleaned_DataFrame)
    count_dataframe = cvmodel.transform(cleaned_DataFrame)
    count_dataframe = count_dataframe.select('count_features')

    # TF-IDF Vectorizer
    tfidf = IDF(inputCol="count_features", outputCol="features")
    tfidfmodel = tfidf.fit(count_dataframe)
    tfidf_dataframe = tfidfmodel.transform(count_dataframe).select("features")
    print("Ready to fit with the LDA model")

    # Fit the LDA Model
    num_topics = 5
    max_iterations = 20
    lda_start = time()
    lda = LDA(seed=1, optimizer="em", k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(tfidf_dataframe)
    lda_transformed = lda_model.transform(tfidf_dataframe)
    lda_end = time()
    print("LDA complete")
    # joblib.dump(lda_model, 'lda.csv')

    # Get terms per topic
    topics = lda_model.topicsMatrix()
    vocabArray = cvmodel.vocabulary
    wordNumbers = 15  # number of words per topic
    topicIndices = lda_model.describeTopics(maxTermsPerTopic=wordNumbers).rdd.map(tuple)
    topics_final = topicIndices.map(
        lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()

    for topic in range(len(topics_final)):
        print("Topic " + str(topic) + ":")
        print("Topic " + str(topic) + ":", file=out_file)
        print(topics_final[topic])
        print(topics_final[topic], file=out_file)

    print("Full runtime : {} min. ".format((time() - start) / 60))
    print("LDA runtime : {} min. ".format((lda_end - lda_start) / 60))
    print("Check " + out_file.name)

    cleaned_DataFrame.cache()
    lda_transformed.cache()

    # Data Visualization
    data = format_data_to_pyldavis(cleaned_DataFrame, cvmodel, lda_transformed, lda_model)
    print("Preparing data with pyLDAvis ...")
    filter_bad_docs(data)
    py_lda_prepared_data = pyLDAvis.prepare(**data)
    file_name = '../out/data-viz-' + timeStamp + '.html'
    print("Saving pyLDAvis html page ...")
    pyLDAvis.save_html(py_lda_prepared_data, file_name)
    pyLDAvis.show(py_lda_prepared_data)

    spark.stop()
corpus, keywords = preprocess_data(df)
print(len(corpus))
#print(corpus)

start, stop, step = 2, 12, 1
maximum = plot_graph(corpus, start, stop, step)
print(maximum)

# LSI Model
print("LSI Model")
number_of_topics = maximum
words = 15
#document_list,titles=load_data("","articles.txt")
#clean_text=preprocess_data(document_list)
model = create_gensim_lsa_model(corpus, number_of_topics, words)

# LDA Model
print("\n LDA Model")
number_of_topics = maximum
words = 15
#document_list,titles=load_data("","articles.txt")
#clean_text=preprocess_data(document_list)
lda_model, dictionary, corpus_out = create_gensim_lda_model(corpus, number_of_topics)
#print(keywords)

vis_file = open("full_lak_lda_vis.html", "w")
vis = pyLDAvis.gensim.prepare(lda_model, corpus_out, dictionary)
#pyLDAvis.display(vis)
pyLDAvis.save_html(vis, vis_file)
def main():
    t = time()

    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--path', required=False, default=".", type=str)
    args = parser.parse_args()
    path = args.path

    """ -------------------------------- Bag of words --------------------------------- """
    with open(path + '\\documents.pkl', 'rb') as f:
        documents = pkl.load(f)

    dictionary = gensim.corpora.Dictionary(documents)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

    """ --------------------- Coherence Values and Num Topics Graph --------------------- """
    def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            print("Working on next model, num_topics =", num_topics, "...")
            model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                               num_topics=num_topics,
                                               id2word=dictionary,
                                               passes=3,
                                               workers=3,
                                               random_state=0)
            model_list.append(model)
            coherencemodel = model.log_perplexity(bow_corpus)
            #coherencemodel = CoherenceModel(model=model, texts=documents, dictionary=dictionary, coherence='c_npmi')
            print("Perplexity: ", coherencemodel)
            coherence_values.append(coherencemodel)
        return model_list, coherence_values

    # Can take a long time to run.
    print("Computing coherence values...")
    model_list, coherence_values = compute_coherence_values(dictionary=dictionary,
                                                            corpus=bow_corpus,
                                                            texts=documents,
                                                            start=2, limit=40, step=6)

    # Save graph
    limit = 40; start = 2; step = 6
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Perplexity score")
    plt.legend(["coherence_values"], loc='best')
    print('Time for this WHOLE thing: {} mins'.format(round((time() - t) / 60, 2)))
    plt.savefig(path + '\\perplex.png')

    """ -------------------------------------- LDA -------------------------------------- """
    print("\nWorking on simple LDA num_topics=7, passes=3...")
    lda_model_bow = gensim.models.LdaModel(bow_corpus,
                                           num_topics=7,
                                           id2word=dictionary,
                                           passes=3,
                                           random_state=0)

    f = open(path + "\\stats.txt", 'w')
    for idx, topic in lda_model_bow.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
        f.write(str('\nTopic: {} \nWords: {}'.format(idx, topic)))

    print('\nPerplexity: ', lda_model_bow.log_perplexity(bow_corpus))
    f.write('\nPerplexity: ' + str(lda_model_bow.log_perplexity(bow_corpus)))
    f.write('\n')

    """
    coherence_model_lda = CoherenceModel(model=lda_model_bow, texts=documents,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    f.write('\nCoherence Score: ' + str(coherence_lda))
    """
    f.close()

    print('\nworking on topic visualization')
    vis = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
    pyLDAvis.save_html(vis, path + '\\LDA_visualized.html')
    print('Time for this WHOLE thing: {} mins'.format(round((time() - t) / 60, 2)))
id2word.save('dictionary.gensim')

# generate corpus with doc2bow
corpus = [id2word.doc2bow(text) for text in cleandoc]

# generate LDA model
print("generating LDA model")
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=i,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)
print("print topics")
print(lda_model.show_topics())
lda_model.save(str(i) + 'topicLDAModel.gensim')

# generate Visualization
#pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
print("Open interactive visualization in web browser")
pyLDAvis.display(vis)
pyLDAvis.save_html(vis, visavestr)

print("--- %s seconds ---" % (time.time() - start_time))
    # Plot topic labels and terms labels separately to have different colours
    g = G.subgraph([topic for topic, _ in pos.items() if topic in t])
    nx.draw_networkx_labels(g, pos, font_size=20, font_color='r')

    # If network graph is difficult to read, don't plot ngrams titles.
    #g = G.subgraph([term for term, _ in pos.items() if str(term) not in t])
    #nx.draw_networkx_labels(g, pos, font_size=12, font_color='orange')

    # Plot edges
    nx.draw_networkx_edges(G, pos, edgelist=G.edges(), alpha=0.3)

    # Having trouble saving graph to file automatically; below code not working. Must manually save.
    plt.axis('off')
    plt.show(block=False)
    plt.savefig('/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/TopicNetwork' + num + '.png',
                bbox_inches='tight')

graph_terms_to_topics(lda, num_terms=num_top)

# Create interactive graph to examine top 30 ngrams in each topic.
# Use pyLDAvis to visualize the topics in a network using
# Jensen-Shannon divergence as metric of distance between the topics.
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# Create data to visualize.
vis_data = gensimvis.prepare(lda, corpus, dictionary)
#pyLDAvis.display(vis_data)

# Use vis_data "prepared" in earlier step.
# Now display the visualization in a local server page.
#pyLDAvis.show(vis_data)

# Save the visualization to an html file.
pyLDAvis.save_html(vis_data, '/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/ClaimsInteractVis' + num + '.html')
def save_lda_vis_as_html(self, filename="./pyldavis_output.html", method=None):
    if method is None:
        vis = pyLDAvis.gensim.prepare(self.model.lda, self.data.corpuses, self.model.dictionary,
                                      n_jobs=1, sort_topics=False)
    else:
        vis = pyLDAvis.gensim.prepare(self.model.lda, self.data.corpuses, self.model.dictionary,
                                      n_jobs=1, mds=method, sort_topics=False)
    pyLDAvis.save_html(vis, filename)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]
#pprint(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/trends.dict')   # store the dictionary, for future reference
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/trends.mm', corpus)   # store to disk, for later use

preprocessdocuments()

dictionary = corpora.Dictionary.load('/tmp/trends.dict')
corpus = corpora.MmCorpus('/tmp/trends.mm')
tfidf = models.TfidfModel(corpus)
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=3)

print('Topics: ')
print(model.print_topics(3, 3))

vis_data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'e.html')

# print('Test: ')
# print(model[tfidf[dictionary.doc2bow(['smartes', 'armband', 'fitnessarmband', 'dienen', 'sms', 'emails', 'anzeigen'])]])
# print(model[tfidf[dictionary.doc2bow(['verfolgen', 'produktion', 'sicherstellen', 'richtigen', 'kunden', 'kunde', 'tag'])]])
# print(model[tfidf[dictionary.doc2bow(['sunpartner','transparente','solarfolie','entwickelt'])]])
plt.tight_layout()
plt.show()

# save as png
plt.savefig('work/wordcloud.png')

# %%
# Vis PCoA
vis_pcoa = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
vis_pcoa

# save as html
pyLDAvis.save_html(vis_pcoa, 'work/pyldavis_output_pcoa.html')

# %%
data = []
for c, words, fileName, title, category in zip(corpus, df['words'], df['file'], df['title'], df['category']):
    topics = []
    for topic, score in lda_model[c]:
        if (score > 0.7):
            topics.append(str(topic))
    data.append([fileName, title, category, ','.join(topics)])

df_topic = pd.DataFrame(data, columns=['file', 'title', 'category', 'topics'])
df_topic.head()

# %%