def issue_analysis(df):
    df_sub = df[['Issue']].copy()  # copy avoids pandas' SettingWithCopy warning
    df_sub.insert(0, 'count', 1)

    # Collect the 50 most frequent issues
    # (sort_values/iloc replace the deprecated sort_index(by=...) and .ix)
    issue_counts = df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False)
    Issue_List = [issue_counts.iloc[i].name for i in range(50)]

    tokenizer = RegexpTokenizer(r"[A-Za-z0-9']+")  # set tokenizer regex
    en_stop = get_stop_words('en')                 # create English stop words list
    p_stemmer = PorterStemmer()                    # create Porter stemmer

    texts = []      # list of tokenized documents
    text_view = ''  # concatenated tokens for the word cloud

    # loop through the issue list
    for issue in Issue_List:
        # clean and tokenize the issue string
        raw = issue.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [t for t in tokens if t not in en_stop]
        # stem tokens and add them to the list
        stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
        texts.append(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens) + ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Top issued words", fontdict={'fontsize': 25})
    ax.imshow(wordcloud)
    ax.axis("off")
    plt.savefig('ComplainCount_WC_2016.png')

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    # print("\nTopic analysis result for top 25 issues with LDA")
    # print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    # pyLDAvis.show(vis_data)
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")
    return 0
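# Usage sketch for issue_analysis(), not part of the original script: it assumes a
# complaints CSV with an 'Issue' column; the filename below is hypothetical.
import pandas as pd

df = pd.read_csv('complaints_2016.csv')
issue_analysis(df)  # writes ComplainCount_WC_2016.png plus issue_lda_2016.html/.json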
def vis_hdpvis(self):
    """
    Produces LDAvis visualization.
    Opens a web browser page with javascript topic viewer.
    """
    hdp_vis_data = pg.prepare(self.hdp, self.cor, self.cor.dictionary)
    pyLDAvis.save_html(hdp_vis_data, '../../data/hdpvis.html')
    vis_path = os.path.realpath('../../data/hdpvis.html')
    webbrowser.open('file://{}'.format(vis_path), new=2)
def vis_ldavis(self):
    """
    Produces LDAvis visualization.
    Opens a web browser page with javascript topic viewer.
    """
    lda_vis_data = pg.prepare(self.lda, self.cor, self.cor.dictionary)
    pyLDAvis.save_html(lda_vis_data, "../../data/ldavis.html")
    vis_path = os.path.realpath("../../data/ldavis.html")
    webbrowser.open("file://{}".format(vis_path), new=2)
def visualize(self, outfn):
    """
    Produce a pyLDAvis visualization of a model and save it to disk at the given location.
    """
    if self.has_viz_data:
        pyLDAvis.save_html(self.vis_data, outfn)
        return
    assert self.has_vocab and self.has_corpus
    assert self.is_trained
    # This step may exhaust memory, likely because corpus, vocab, and _lda_model are all large.
    self.vis_data = prepare(self._lda_model, self.corpus, self.vocab)
    self.has_viz_data = True
    pyLDAvis.save_html(self.vis_data, outfn)
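# Hedged usage sketch for visualize(): `model` stands in for a trained instance of the
# owning class; the output paths are assumptions, not from the original code.
model.visualize('lda_vis.html')       # first call runs prepare() and caches vis_data
model.visualize('lda_vis_copy.html')  # later calls reuse the cached vis_data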
def narrative_analysis(df):
    tokenizer = RegexpTokenizer(r"[A-Za-z0-9']+")  # set tokenizer regex
    en_stop = get_stop_words('en')                 # create English stop words list
    p_stemmer = PorterStemmer()                    # create Porter stemmer

    texts = []  # list of tokenized documents

    for index in range(len(df.index)):
        # .iloc replaces the deprecated .ix indexer
        if str(df['narrative'].iloc[index]) != 'nan':
            intext = df['narrative'].iloc[index]
            intext = re.sub(r"X+", "", intext)  # strip the XXXX redaction masks
            raw = intext.lower()
            tokens = tokenizer.tokenize(raw)
            # remove stop words from tokens
            stopped_tokens = [t for t in tokens if t not in en_stop]
            # stem tokens and add them to the list
            stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
            texts.append(stemmed_tokens)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    # print("\nTopic analysis result with LDA")
    # print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    # pyLDAvis.show(vis_data)
    pyLDAvis.save_html(vis_data, "narrative_lda_2016.html")
    pyLDAvis.save_json(vis_data, "narrative_lda_2016.json")
    return 0
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import gensim
import csv
import logging
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

tweets = []
with open('data/clear_covid_tweets.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for tweet in reader:
        tweets.append(tweet['text'].split(' '))

dictionary = Dictionary(tweets)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

# Train with Mallet, then wrap the result as a standard gensim LdaModel so pyLDAvis can read it
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    LdaMallet(path_to_mallet_binary, corpus=corpus, num_topics=50, id2word=dictionary))

vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'lda.html')
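# Optional follow-up sketch, not in the original script: once the Mallet model is
# wrapped as a plain gensim LdaModel, the usual inspection calls work. This reuses
# only the objects built above.
for topic_id, words in model.print_topics(num_topics=10, num_words=8):
    print(topic_id, words)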
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
"""

# Apply Mallet LDA
mallet_path = 'source/mallet-2.0.8/bin/mallet'
lda_model = models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)
pprint(lda_model.print_topics())

# Compute coherence score
coherence_model_lda = models.CoherenceModel(model=lda_model, texts=totalCorpus,
                                            dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Convert (wrap) the Mallet model into a gensim LDA model
lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(lda_model)
prepared_data = gensimvis.prepare(lda_model, corpus, id2word)
pyldavis_html_path = 'LDAresult/' + filename + '.html'
pyLDAvis.save_html(prepared_data, pyldavis_html_path)
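# A minimal sketch of choosing num_topics by coherence rather than fixing it at 20,
# under the same assumptions as above (mallet_path, corpus, id2word, and totalCorpus
# all exist); the candidate values are illustrative, not from the original code.
for k in (10, 20, 30):
    candidate = models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=k, id2word=id2word)
    cm = models.CoherenceModel(model=candidate, texts=totalCorpus,
                               dictionary=id2word, coherence='c_v')
    print(k, 'topics -> coherence', cm.get_coherence())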
def visualize(self):
    lda_display = ldvis.prepare(self.model, self.corpus, self.dictionary)
    pyLDAvis.save_html(lda_display, self.fileprefix + ".html")
    # return the HTML object so it renders when the method is called from a notebook cell
    return pyLDAvis.display(lda_display)
dk = pd.read_csv('TW_Tweet.csv', encoding='UTF-8', low_memory=False)
df = pd.DataFrame(dk, columns=['id', 'keyword', 'created', 'language', 'message'])
df.columns = ['id', 'key', 'created_time', 'language', 'message']

rm_duplicates = df.drop_duplicates(subset=['key', 'message'])
rm_na = rm_duplicates.dropna()
dtime = rm_na.sort_values(['created_time'])
dtime.index = range(len(dtime))
dlang = dtime[dtime['language'] == 'en']
data = dlang[dlang['key'] != 'johnson & johnson']
data = data[data['key'] != 'johnson&johnson']
data.index = range(len(data))

# ldamodel = LdaModel(finalcorpus, num_topics=30, id2word=dictionary, update_every=10,
#                     chunksize=2000, passes=10, alpha=0.05)
# ldamodel.save('lda30.model')
ldamodel = LdaModel.load('lda30.model')
vis_data = gensimvis.prepare(ldamodel, finalcorpus, dictionary)
# pyLDAvis.save_html(vis_data, 'lda30.html')

# vis_data is a PreparedData namedtuple; field 6 is topic_order, the mapping from
# pyLDAvis's re-sorted topic numbers back to the model's own topic ids
vistopicid = vis_data[6]
idlist = []
for j in range(1, len(vistopicid) + 1):
    idlist.append([i for i, x in enumerate(vistopicid) if x == j][0])

topicwords = {}
no = 0
for prob in ldamodel.show_topics(30, 7):
    tokens = ' '.join(re.findall(r"[\w']+", str(prob[1]))).lower().split()
    x = [''.join(c for c in s if c not in string.punctuation) for s in tokens]
    result = ' '.join([i for i in x if not i.isdigit()])
    topicwords[idlist[no]] = result.split()
    no += 1

for i in range(30):
    print("Topic", i + 1, ": ", topicwords[i])
i = 6
while i > 0:
    if i != 4:
        # pickle files must be opened in binary mode
        ldamodel = pickle.load(open('ldamodel_doc_topics' + str(i * 5), 'rb'))
        prepared = pg.prepare(ldamodel, corpus, dictionary)
        pyLDAvis.save_html(prepared, open('lda_doc_topics' + str(i * 5) + '.html', 'w'))
    i -= 1

ldamodel = pickle.load(open('ldamodel_doc_topics30', 'rb'))
prepared = pg.prepare(ldamodel, corpus, dictionary)  # re-prepare; the loop's `prepared` was stale
pyLDAvis.save_html(prepared, open('lda_doc_topics30.html', 'w'))

############################LDA VISUALIZATION##########################################
import pyLDAvis
from pyLDAvis import gensim as pg

prepared = pg.prepare(ldamodel, corpus, dictionary)
# save_html expects the PreparedData itself, not the output of pyLDAvis.display()
pyLDAvis.save_html(prepared, open('lda.html', 'w'))
#######################################################################################

# GET ALL DATES
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

datesu = []
titles = []

def get_titles():
    url = 'http://www.narendramodi.in/category/text-speeches'
    driver = webdriver.Chrome()
texts = [[token for token in text if frequency[token] > repetition_threshold]
         for text in texts]
print(texts)

# Construct a document-term matrix to understand how frequently each term occurs
# within each document.
# Dictionary() traverses texts, assigning a unique integer id to each unique token
# while also collecting word counts and relevant statistics.
# To see each token's unique integer id, try print(dictionary.token2id).
dictionary = corpora.Dictionary(texts)
dictionary.compactify()
dictionary.save('dict.dict')

# Convert the dictionary to a BoW corpus.
# The result is a list of vectors, one per document; each document contains
# tuples of (term id, term frequency).
corpus = [dictionary.doc2bow(text) for text in texts]
texts = []

# Randomize the order of the training documents
corpus = np.random.permutation(corpus)
gensim.corpora.MmCorpus.serialize('corpus.mm', corpus)

# Create a CSC matrix of the corpus (speeds things up if prepare() is called multiple times)
# corpus_csc = gensim.matutils.corpus2csc(corpus)

dictionary = gensim.corpora.Dictionary.load('dict.dict')
corpus = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.ldamodel.LdaModel.load(model_path)

vis_data = prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'visualization.html')
pyLDAvis.display(vis_data)
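# Side note, not from the original script: gensim's built-in filter can stand in for
# the manual frequency threshold above, with the caveat that filter_extremes counts
# *document* frequency rather than raw token counts. Thresholds here are illustrative.
dictionary.filter_extremes(no_below=5, no_above=0.5)
dictionary.compactify()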
# generate TF-IDF, LDA model
from gensim import models

tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]

print("\n", "=========== TF-IDF ============")
# print the first 10 elements of the first document's tf-idf vector
# (tfidf_model[corpus[0]] applies the transform; tfidf.corpus[0] would return the raw BoW vector)
print("\n", tfidf_model[corpus[0]][:10])
# print the top 10 elements of the first document's tf-idf vector, by weight
print("\n", sorted(tfidf_model[corpus[0]], key=lambda x: x[1], reverse=True)[:10])
# print the token of the most frequent element
# print("\n", dictionary.get(13))

n_topics = 5
lda = models.ldamodel.LdaModel(tfidf, num_topics=n_topics, id2word=dictionary, passes=1)

print("\n", "=========== lda.show_topics() ============")
# print(lda.show_topics())
print(lda.print_topics(num_topics=n_topics, num_words=10))

import matplotlib
matplotlib.use('qt5agg')
import pyLDAvis.gensim as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(lda, corpus, dictionary)
x = pyLDAvis.prepared_data_to_html(vis_data)
print(x)
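# prepared_data_to_html returns a plain HTML string, so instead of printing it the
# page can be written straight to disk (the filename here is illustrative):
with open('lda_vis.html', 'w') as f:
    f.write(x)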
def get_lda():
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # create sample documents
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."

    doc_set2 = [i for i in range(1, 10)]
    print(doc_set2)

    # compile sample documents into a list
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

    # list for tokenized documents in loop
    texts = []

    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        tagged_tokens = nltk.pos_tag(tokens)
        print(tagged_tokens)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
        print(tokens)
        print(stemmed_tokens)
        print("--------------------------------")

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    print(lda.show_topics())

    import matplotlib
    matplotlib.use('qt5agg')
    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    x = pyLDAvis.prepared_data_to_html(vis_data)
    # print(x)
    return x
topics = 15
words = 20

lda_model = gensim.models.LdaMulticore(corpus_md_tfidf, num_topics=topics,
                                       id2word=dictionary, passes=2, workers=4)
lsi_model = gensim.models.LsiModel(corpus_md_tfidf, num_topics=topics, id2word=dictionary)

print("LDA Model:")
for idx in range(topics):
    # print the 10 most representative words for each topic
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
print("=" * 20)

print("LSI Model:")
for idx in range(topics):
    # print the 10 most representative words for each topic
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))

lda_vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_vis)
print("Done")
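# Hedged follow-up sketch: the two models can also be compared quantitatively with
# topic coherence. It assumes the tokenized documents that built `dictionary` are
# available as `texts`; the 'c_v' measure is one common choice, not the author's.
from gensim.models import CoherenceModel

for name, m in (("LDA", lda_model), ("LSI", lsi_model)):
    top_terms = [[w for w, _ in m.show_topic(t, topn=10)] for t in range(topics)]
    cm = CoherenceModel(topics=top_terms, texts=texts, dictionary=dictionary, coherence='c_v')
    print(name, "coherence:", cm.get_coherence())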
rn = ReviewNormalizer()
normalized_reviews = [rn.tokenize(r) for r in reviews]

# len(...) - 1 keeps the stdlib randint (inclusive upper bound) inside the list
pretty_print_html([" ".join(normalized_reviews[randint(0, len(normalized_reviews) - 1)]),
                   " ".join(normalized_reviews[randint(0, len(normalized_reviews) - 1)])])

# #### Training the model (this might take a while...)

# In[12]:

dictionary = corpora.Dictionary(normalized_reviews)
corpus = [dictionary.doc2bow(r) for r in normalized_reviews]
lda = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, passes=100)

# #### Prepare data and visualize!

# In[14]:

prepared_data = prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared_data)

# In[ ]:
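# Optional: persist the notebook visualization as a standalone page
# (the filename is illustrative, not from the original notebook).
pyLDAvis.save_html(prepared_data, 'reviews_lda.html')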
# ldamodel = lda.LdaModel.load(fname, mmap='r')
pprint.pprint(ldamodel.show_topics(num_topics=50, num_words=20))
# pprint.pprint(ldamodel.top_topics(corpus, num_words=10))

ldatopics = [[word for word, prob in topic]
             for topicid, topic in ldamodel.show_topics(formatted=False)]
lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary,
                               window_size=5, coherence='c_v')
print(lda_coherence.get_coherence())
print(lda_coherence.get_coherence_per_topic())

vis_data = ldavis.prepare(ldamodel, corpus, dictionary)
# pyLDAvis.display(vis_data)
pyLDAvis.save_html(vis_data, 'world_lda50.html')

# select top n words for each of the LDA topics
top_words = [[word for word, _ in ldamodel.show_topic(topicno, topn=10)]
             for topicno in range(ldamodel.num_topics)]

# get all top words in all topics, as one large set
all_words = set(itertools.chain.from_iterable(top_words))

print("Can you spot the misplaced word in each topic?")
# Visualize the LDA Mallet terms as wordclouds
from wordcloud import WordCloud  # import wordclouds

# Initiate the wordcloud object
wc = WordCloud(background_color="white", colormap="Dark2", max_font_size=150, random_state=42)

plt.rcParams['figure.figsize'] = [20, 15]

# Create subplots for each topic
for i in range(25):
    wc.generate(text=topics_df["Terms per Topic"][i])
    plt.subplot(5, 5, i + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(topics_df.index[i])

plt.show()

# In[55]:

import pyLDAvis.gensim as gensimvis

# sort_topics=False keeps pyLDAvis topic numbering aligned with gensim's topic ids
vis_data = gensimvis.prepare(ldagensim, corpus, id2word, sort_topics=False)
pyLDAvis.display(vis_data)

# In[ ]:
print(type(medical_df))
text = medical_df['transcription']
print(type(text))
docs = array(text)
print(type(docs))

# =============================
# LDA
lda = LDAAnalysis(docs)

do_process = True
if do_process:
    lda.fit()
    with open("data/cache/LDAAnalysis.pkl", "wb") as pickle_LDAAnalysis:
        pickle.dump(lda, pickle_LDAAnalysis)
else:
    # pickle.load needs an open binary file handle, not a path string;
    # load into `lda` rather than shadowing the LDAAnalysis class
    with open("data/cache/LDAAnalysis.pkl", "rb") as pickle_LDAAnalysis:
        lda = pickle.load(pickle_LDAAnalysis)

lda.coherence_values()

lda_vis = gensimvis.prepare(lda, lda.corpus, lda.dictionary)
pyLDAvis.display(lda_vis)

# =============================
# NNMF
nnmf = NNMFTopicAnalysis(docs=docs)
nnmf.fit()
print('Done')
print("Making the BOW list") corpus = [dictionary.doc2bow(text) for text in text_data] ### Save the dictionary and Corpus so they can be used later ### pickle.dump(corpus, open('corpus.pkl', 'wb')) dictionary.save('dictionary.gensim') print("TF-IDF") model = TfidfModel(corpus) tfidfCorpus = model[corpus] #print(vector) print("Training the network") NUM_TOPICS = 40 #ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15) #ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=100) ldamodel = gensim.models.ldamulticore.LdaMulticore(tfidfCorpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=50) ldamodel.save('ldamodel.gensim') lda = gensim.models.ldamodel.LdaModel.load('ldamodel.gensim') import pyLDAvis.gensim as gensimvis import pyLDAvis.gensim #lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False) lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False) #gensimvis.display(lda_display) pyLDAvis.display(lda_display) pyLDAvis.show(lda_display)
folder = 'lda/manifesto/'
# this has been added in previous steps (basically the number of topics)
postfix = '_100'

manifestos = [
    'cdu_2002.csv', 'cdu_2005.csv', 'cdu_2009.csv', 'cdu_2013.csv',
    'fdp_2002.csv', 'fdp_2005.csv', 'fdp_2009.csv', 'fdp_2013.csv',
    'gruene_2002.csv', 'gruene_2005.csv', 'gruene_2009.csv', 'gruene_2013.csv',
    'linke_2005.csv', 'linke_2009.csv', 'linke_2013.csv',
    'pds_2002.csv', 'piraten_2013.csv',
    'spd_2002.csv', 'spd_2005.csv', 'spd_2009.csv', 'spd_2013.csv'
]

start = time.time()
for file in manifestos:
    checkpoint = time.time()
    print('starting analysis for file ' + file)
    model = gensim.models.ldamodel.LdaModel.load(folder + file + postfix + '.model')
    corpus = gensim.corpora.mmcorpus.MmCorpus(folder + file + postfix + '.corpus')
    dictionary = gensim.corpora.dictionary.Dictionary.load(folder + file + postfix + '.dictionary')
    # R=15 limits the term barcharts to the 15 most relevant terms per topic
    visdata = gensimvis.prepare(model, corpus, dictionary, R=15)
    pyLDAvis.save_html(visdata, folder + file + postfix + '.html')
    print('generated html for ' + file + ' in ' + str(time.time() - checkpoint) + 's')

print('generating html for all files took ' + str(time.time() - start) + 's')
def visualize_topics(lda_model, corpus, dictionary):
    """Visualize topics using pyLDAvis."""
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
    # return the HTML object so it renders when called from a notebook cell
    return pyLDAvis.display(vis_data)
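# Usage sketch for visualize_topics(); the argument names are assumptions standing in
# for any trained gensim LdaModel, its BoW corpus, and the matching Dictionary.
visualize_topics(ldamodel, corpus, dictionary)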
# Plot topic labels and term labels separately to give them different colours
g = G.subgraph([topic for topic, _ in pos.items() if topic in t])
nx.draw_networkx_labels(g, pos, font_size=20, font_color='r')

# If the network graph is difficult to read, skip plotting the ngram titles.
# g = G.subgraph([term for term, _ in pos.items() if str(term) not in t])
# nx.draw_networkx_labels(g, pos, font_size=12, font_color='orange')

# Plot edges
nx.draw_networkx_edges(G, pos, edgelist=G.edges(), alpha=0.3)

# savefig must run before plt.show(); with the original order the figure was
# already cleared at save time, which is why automatic saving appeared broken
plt.axis('off')
plt.savefig('/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/TopicNetwork' + num + '.png',
            bbox_inches='tight')
plt.show(block=False)

graph_terms_to_topics(lda, num_terms=num_top)

# Create an interactive graph to examine the top 30 ngrams in each topic.
# Use pyLDAvis to visualize the topics in a network, using
# Jensen-Shannon divergence as the metric of distance between the topics.
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# Create the data to visualize.
vis_data = gensimvis.prepare(lda, corpus, dictionary)
# pyLDAvis.display(vis_data)  # display the "prepared" data inline in a notebook
# pyLDAvis.show(vis_data)     # or serve the visualization on a local server page
# Save the visualization to an html file.
pyLDAvis.save_html(vis_data, '/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/ClaimsInteractVis' + num + '.html')
clean_emails = pickle.load(open("output/clean_emails.p", "rb"))

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, so punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [p_stemmer.stem(t) for t in filtered_tokens]
    return stems

from gensim import corpora, models, similarities

# tokenize
token_emails = [tokenize_and_stem(text) for text in clean_emails]

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(token_emails)

# remove extremes
dictionary.filter_extremes(no_below=1, no_above=0.8)
dictionary.compactify()

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in token_emails]

final = models.ldamodel.LdaModel.load('output/final_topic10.model')

import pyLDAvis.gensim as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(final, corpus, dictionary)
pyLDAvis.display(vis_data)