def issue_analysis(df):
    df_sub = df[['Issue']].copy()
    df_sub.insert(0, 'count', 1)

    # collect the 50 most frequent issues
    Issue_List = []
    issue_counts = df_sub.groupby(['Issue']).sum().sort_values('count', ascending=False)
    for i in range(0, 50):
        Issue_List.append(issue_counts.iloc[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # tokenizer regex
    en_stop = get_stop_words('en')                  # English stop-word list
    p_stemmer = PorterStemmer()                     # Porter stemmer

    texts = []       # tokenized documents
    text_view = ''   # concatenated tokens for the word cloud

    # loop through the issue list
    for issue in Issue_List:
        # clean and tokenize the issue string
        raw = issue.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [t for t in tokens if t not in en_stop]
        # stem tokens and add them to the list
        stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
        texts.append(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens) + ' '

    # word cloud of the top issue terms
    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Top issued words", fontdict={'fontsize': 25})
    ax.imshow(wordcloud)
    ax.axis("off")
    plt.savefig('ComplainCount_WC_2016.png')

    # turn the tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)  # top topics, kept for inspection

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")
    return 0
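# A minimal, hypothetical driver for issue_analysis() above. It assumes the complaint
# data set is available as a CSV with an 'Issue' column; the file name is illustrative.
import pandas as pd

complaints_df = pd.read_csv('consumer_complaints_2016.csv')
issue_analysis(complaints_df)   # writes ComplainCount_WC_2016.png and issue_lda_2016.html/.json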
def save_lda_model(self, lda_model, corpus, dictionary):
    # save the pyLDAvis visualization data for the web front end
    pyLDAvis.save_json(
        pyLDAvis.gensim.prepare(lda_model, corpus, dictionary),
        './../static/js/lda.json')
    print(lda_model.print_topics())
    # persist the model, dictionary, and corpus
    lda_model.save('./../lda/model.lda')
    dictionary.save('./../lda/dict.lda')
    corpora.MmCorpus.serialize('./../lda/corpus.mm', corpus)
def save_lda_model(self, lda_model, corpus, dictionary, index):
    # persist the similarity index alongside the other model artifacts
    index.save(self.lda_path + 'index.lda')
    pyLDAvis.save_json(
        pyLDAvis.gensim.prepare(lda_model, corpus, dictionary),
        self.lda_path + '/../static/js/lda.json')
    print(lda_model.print_topics())
    lda_model.save(self.lda_path + 'model.lda')
    dictionary.save(self.lda_path + 'dict.lda')
    corpora.MmCorpus.serialize(self.lda_path + 'corpus.mm', corpus)
def view_lda_model(self, model, corpus, dictionary):
    # corpus = [dictionary.doc2bow(doc) for doc in corpus]
    prepared_data = gensimvis.prepare(model, corpus, dictionary, mds='mmds')
    pyLDAvis.save_json(prepared_data,
                       self.model_path + self.data_name + '_vis_result.json')
    pyLDAvis.save_html(prepared_data,
                       self.model_path + self.data_name + '_vis_result.html')
def new(cls, name: str, dataset: Dataset, model: TopicModel, **kwargs) -> "Visualizer":
    path = common.PROJDIR / (name + ".LDAvis.json")
    pyLDAvis.save_json(
        pyLDAvis.prepare(
            model.get_topic_word_matrix(normalize=True),
            model.get_doc_topic_matrix(normalize=True),
            dataset.get_count_matrix().sum(axis=1).squeeze(),
            [word.decode() for word in dataset.get_vocab()],
            dataset.get_count_matrix().sum(axis=0).squeeze(),
            **kwargs),
        str(path),
    )
    return cls(path)
def narrative_analysis(df):
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # tokenizer regex
    en_stop = get_stop_words('en')                  # English stop-word list
    p_stemmer = PorterStemmer()                     # Porter stemmer

    texts = []   # tokenized documents
    for index in range(0, len(df.index)):
        if str(df['narrative'].iloc[index]) != 'nan':
            intext = df['narrative'].iloc[index]
            intext = re.sub(r"X+", "", intext)   # strip the XXXX redaction markers
            raw = intext.lower()
            tokens = tokenizer.tokenize(raw)
            # remove stop words from tokens
            stopped_tokens = [t for t in tokens if t not in en_stop]
            # stem tokens and add them to the list
            stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
            texts.append(stemmed_tokens)

    # turn the tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)  # top topics, kept for inspection

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(vis_data, "narrative_lda_2016.html")
    pyLDAvis.save_json(vis_data, "narrative_lda_2016.json")
    return 0
def vectorize(self):
    '''
    args: none
    output: generates an LDA topic model of the document using gensim and pyLDAvis
    '''
    # tokenize and remove stopwords
    sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip())  # use raw text
    #sentences = Topic(raw_input('topic: ')).text  # get text from wikipedia
    texts = [[word for word in sentence.lower().split() if word not in self.stopwords]
             for sentence in sentences]

    # compute the frequency of each token
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # remove words that appear only once
    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    # construct a gensim dictionary and corpus (bag of words);
    # here each "text" is one sentence of the document
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # define the LDA model
    lda = models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=10,   # number of topics is a tuning choice
        update_every=1,
        chunksize=10000,
        passes=1
    )

    # visualize the LDA space and save the results
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.show(vis_data)
    with open('topic_models/' + self.name + '.json', 'a+') as topic_json:
        pyLDAvis.save_json(vis_data, topic_json)
    with open('topic_models/' + self.name + '.html', 'a+') as topic_html:
        pyLDAvis.save_html(vis_data, topic_html)
def topic_modelling(self):
    sec = self.__preprocess_text_pylda(self.text)
    dictionary = corpora.Dictionary(sec)
    doc_term_matrix = [dictionary.doc2bow(rev) for rev in sec]

    # Build the LDA model
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(corpus=doc_term_matrix,
                    id2word=dictionary,
                    num_topics=self.no_of_topics,
                    random_state=100,
                    chunksize=1000,
                    passes=50)

    #pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
    pyLDAvis.save_html(vis, directory + "\\templates\\abc.html")
    pyLDAvis.save_json(vis, directory + "\\Text\\Output\\abc.json")
    #Clodinary_url = self.upload_cloudinary(directory + "\\Text\\Output\\abc.html")

    # map topic id -> raw "weight*word + ..." string
    topics_ = dict()
    for topic_id, topic_terms in lda_model.print_topics():
        topics_[topic_id] = topic_terms

    # map topic id -> list of its top words
    topic_list = dict()
    for topic_id in topics_:
        terms = topics_[topic_id].split("+")
        words = []
        for term in terms:
            weight, word = term.split("*")
            words.append(word.strip().strip('"'))
        topic_list[topic_id] = words

    Clodinary_url = "http://localhost:5000/abc.html"
    return topics_, topic_list, self.text, Clodinary_url
def LDA(tokens, start, stop, step=1):
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]

    model_list = []
    coherence_values = []
    max_topic_num = 0
    for i in range(start, stop, step):
        print('steps ', i)
        model = LdaModel(corpus, id2word=dictionary, num_topics=i + 1)  # LDA model
        model_list.append(model)
        # coherence score (c_v) for this number of topics
        coherence_model_lda = CoherenceModel(model, texts=tokens,
                                             dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        if i != start and coherence_lda > max(coherence_values):
            max_topic_num = i
        coherence_values.append(coherence_lda)

    # plot the coherence score against the number of topics
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(["coherence_values"], loc='best')
    plt.show()

    # keep the model with the best coherence and export its visualization
    max_ind = coherence_values.index(max(coherence_values))
    model_list[max_ind].save("result_model")
    prepared_data = gensimvis.prepare(model_list[max_ind], corpus=corpus, dictionary=dictionary)
    pyLDAvis.save_html(prepared_data, 'res.html')   # save the LDA result as an HTML file
    pyLDAvis.save_json(prepared_data, 'res.json')   # save the LDA result as a JSON file
    return model_list[max_ind], coherence_values[max_ind], max_topic_num
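# A minimal, hypothetical driver for LDA() above. `documents` (a list of raw strings)
# is assumed to exist; the topic range and whitespace tokenization are illustrative.
tokens = [doc.lower().split() for doc in documents]
best_model, best_coherence, best_num_topics = LDA(tokens, start=2, stop=10)
print(best_num_topics, best_coherence)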
# for each (topic, probability) for each document,
# append [(topic, probability), document id] to cluster1
cluster1.append((x, j, l))

# Save topics
with open('lda_topic.txt', 'w') as file:
    for i in lda.show_topics(num_topics=topic_number_setup):
        file.write(str(i) + '\n')

# topic cluster visualization
# topic-term relation JSON save
movies = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(movies, 'LDA_Visualization.html')

# Topic-Term relationship matrix
pyLDAvis.save_json(movies, 'topic_term.json')
with open('topic_term.json') as json_data:
    d = json.load(json_data)
mat = np.column_stack((d['token.table']['Topic'],
                       d['token.table']['Freq'],
                       d['token.table']['Term']))

# load movie metadata:
meta_dict = {}
with open("movie.metadata.tsv") as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    for line in tsvreader:
        meta_dict[line[0]] = line[2]

# Enable topic document search
def enable_search():
def write_json_data(self, ldaviz_model, n_topics):
    pyLDAvis.save_json(ldaviz_model, self.paths.ldaviz_json(n_topics))
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

##### Visualise topics #####
print('Visualising topics...')
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#15visualizethetopicskeywords
import IPython  # still required
import pyLDAvis
from pyLDAvis import gensim

# Visualize the topics (in a notebook you would enable the notebook renderer
# and display the prepared data directly)
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, mds='mmds')
pyLDAvis.show(vis)
pyLDAvis.save_html(vis, 'topics-lda.html')

print('Save visualisation to json...')
# pyLDAvis.save_json writes to a file object (or path) and returns None
with open('pylda-vis' + t + '.json', 'w') as vis_json:
    pyLDAvis.save_json(vis, vis_json)
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# Print the keywords of the topics
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

# Visualize the topics
#pyLDAvis.enable_notebook()
print('Working on creating visualization...')
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, n_jobs=1, R=50)  # n_jobs=1 keeps CPU usage down
print('Going to save html and json...')
pyLDAvis.save_html(vis, 'LDA_Visualization_{}.html'.format(newsSource))
pyLDAvis.save_json(vis, 'LDA_Visualization_{}.json'.format(newsSource))

# TODO NEXT: get keywords from json, save to a .txt file (as temp[NewsSource]), and format
vis_data = {}
gensim_lda_model = {}
for c in cmallet.keys():
    vis_data[c] = {}
    gensim_lda_model[c] = {}
    for i in cmallet[c].keys():
        # convert the Mallet wrapper model to a native gensim LdaModel
        gensim_lda_model[c][i] = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(cmallet[c][i])
        vis_data[c][i] = pyLDAvis.gensim.prepare(gensim_lda_model[c][i],
                                                 corpora[c],
                                                 dictionary=cmallet[c][i].id2word,
                                                 mds='tsne')
        pyLDAvis.save_json(vis_data[c][i], outdir + f'pyldavis_{c}_{i}.json')
        print(outdir + f'pyldavis_{c}_{i}.json')
        ofdir = web_out_dir + f'{c}-{i}/'
        os.makedirs(ofdir, mode=out_path_mode, exist_ok=True)
        pyLDAvis.save_html(vis_data[c][i],
                           ofdir + f'pyldavis_{c}_{i}.html',
                           ldavis_url=MODIFIED_LDAVIS_URL)
        print(web_out_dir + f'{c}-{i}/pyldavis_{c}_{i}.html')

# #### Save Gensim Mallet Models

# In[38]:

for c in gensim_lda_model.keys():
    for i in gensim_lda_model[c].keys():
        gensim_lda_model[c][i].save(
def pyLDAvisUI(lda, tf, tf_vectorizer):
    page = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    pyLDAvis.save_html(page, 'lda.html')   # save the topic visualization data as an HTML file
    pyLDAvis.save_json(page, 'lda.json')
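# A minimal, hypothetical driver for pyLDAvisUI() above, showing how the lda / tf /
# tf_vectorizer arguments might be built with scikit-learn. `docs` (a list of raw
# document strings) is assumed to exist and the parameter values are illustrative.
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(docs)                                    # document-term counts
lda = LatentDirichletAllocation(n_components=10, random_state=0).fit(tf)  # fit the topic model
pyLDAvisUI(lda, tf, tf_vectorizer)                                        # writes lda.html and lda.json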
    corpus=bow_corpus,
    dictionary=cdict,
    doc_topic_dist=None,
    R=30,
    lambda_step=0.2,
    mds='tsne',   # mds=<function js_PCoA> by default
    n_jobs=-1,
    plot_opts={'xlab': 'PC1', 'ylab': 'PC2'},
    sort_topics=True,
)

LDA_HTML = f'data/lda_vis_result_{LDA_TOPIC_NUM}_topics.html'
LDA_JSON = f'data/lda_vis_result_{LDA_TOPIC_NUM}_topics.json'
pyLDAvis.save_html(prepared_data, LDA_HTML)
pyLDAvis.save_json(prepared_data, LDA_JSON)

# %%
# pyLDAvis.display(prepared_data, local=False)
print("Test: 'pyLDAvis' finished.")

# %%
            vals = line.rstrip('\r\n').split('\t')
            doc_id = int(vals[0])
            word_id = int(vals[1])
            word_count = int(vals[2])
            doc_lengths[doc_id] += word_count
            term_frequency[word_id] += word_count

        # Dictionary terms
        vocab = [corpus_dictionary[word_id] for word_id in range(V)]

        # Generate the JSON and html
        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists, doc_lengths,
                                         vocab, term_frequency, R=corpus_n_terms)
        pyLDAvis.save_json(prepared_data, pyLDAvis_dir + '/' + json_name + '.json')
        print('Generating ' + json_name + '.html')
        generate_LDAvis_html(json_name, pyLDAvis_dir)
    else:
        print('Warning: ' + pyLDAvis_dir + '/' + json_name + '.json already exists')

if model == 'ngppf' or model == 'jgppf':
    # Generate the network files
    json_name = 'network'
    json_names.append(json_name)
    if not os.path.exists(pyLDAvis_dir + '/' + json_name + '.json'):
        print('Generating ' + json_name + '.json')

        # Topic-term probabilities
        topic_term_dists = P_nk(rkB, phink)
        topic_term_dists = np.array(