# Imports assumed by this snippet (not shown in the original excerpt):
import re
import gensim
import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from wordcloud import WordCloud


def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    # top 50 issues by complaint count; sort_values/.iloc replace the
    # deprecated sort_index(by=...)/.ix idioms
    issue_counts = df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False)
    Issue_List = [issue_counts.iloc[i].name for i in range(50)]

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        # print(' '.join(stemmed_tokens))
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(1,1,1)
    ax.set_title("Top issue words", fontdict={'fontsize':25})
    ax.imshow(wordcloud)
    ax.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    # print("\n Topic analysis result for top 25 issues with LDA")
    # print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
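The top-issue extraction above can be written more directly with pandas' value_counts, which already sorts in descending order; a minimal equivalent sketch (column name 'Issue' as in the snippet, function name hypothetical):

import pandas as pd

def top_issues(df, n=50):
    # value_counts() sorts descending by count, so head(n) gives the top-n issues
    return df['Issue'].value_counts().head(n).index.tolist()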
Example No. 2
    def vis_hdpvis(self):
        """
        Produces LDAvis visualization.

        Opens a web browser page with javascript topic viewer.
        """
        hdp_vis_data = pg.prepare(self.hdp, self.cor, self.cor.dictionary)
        pyLDAvis.save_html(hdp_vis_data, '../../data/hdpvis.html')
        vis_path = os.path.realpath('../../data/hdpvis.html')
        webbrowser.open('file://{}'.format(vis_path), new=2)
Example No. 3
    def vis_ldavis(self):
        """
        Produces LDAvis visualization.

        Opens a web browser page with javascript topic viewer.
        """
        lda_vis_data = pg.prepare(self.lda, self.cor, self.cor.dictionary)
        pyLDAvis.save_html(lda_vis_data, "../../data/ldavis.html")
        vis_path = os.path.realpath("../../data/ldavis.html")
        webbrowser.open("file://{}".format(vis_path), new=2)
Example No. 4
    def visualize(self, outfn):
        """
        Produce a pyLDAvis visualization of a model and save it to disk at the given location.
        """
        if self.has_viz_data:
            pyLDAvis.save_html(self.vis_data, outfn)
            return
        assert self.has_vocab and self.has_corpus
        assert self.is_trained
        # preparing the data can exhaust memory, since corpus, vocab, and
        # _lda_model are all large
        self.vis_data = prepare(self._lda_model, self.corpus, self.vocab)
        self.has_viz_data = True
        pyLDAvis.save_html(self.vis_data, outfn)
Example No. 5
def narrative_analysis(df):
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop

    for index in range(len(df.index)):
        if pd.notna(df['narrative'].iloc[index]):
            intext = df['narrative'].iloc[index]
            intext = re.sub(r"X+", "", intext)
            raw = intext.lower()
            tokens = tokenizer.tokenize(raw)
       
            # remove stop words from tokens
            stopped_tokens = [i for i in tokens if i not in en_stop]
        
            # stem tokens and add them to list
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            texts.append(stemmed_tokens)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    # print("\n Topic analysis result for top 25 issues with LDA")
    # print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "narrative_lda.html")
    #pyLDAvis.save_json(vis_data, "narrative_lda.json")
    pyLDAvis.save_html(vis_data, "narrative_lda_2016.html")
    pyLDAvis.save_json(vis_data, "narrative_lda_2016.json")

    return 0
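The re.sub(r"X+", "", intext) step strips the XXXX redaction masks common in consumer-complaint narratives; a quick check of that behavior on a made-up sentence:

import re

sample = "I called XXXX on XX/XX/2016 about my card."
print(re.sub(r"X+", "", sample))  # -> 'I called  on //2016 about my card.'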
Example No. 6
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import gensim
import csv
import logging

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

tweets = []
with open('data/clear_covid_tweets.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for tweet in reader:
        tweets.append(tweet['text'].split(' '))

dictionary = Dictionary(tweets)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    LdaMallet(path_to_mallet_binary,
              corpus=corpus,
              num_topics=50,
              id2word=dictionary))

vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'lda.html')
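A quick sanity check on the converted model before (or instead of) rendering the visualization, reusing the model variable from above:

# print the 10 most probable words for each of the first 5 topics
for topic_id, words in model.print_topics(num_topics=5, num_words=10):
    print(topic_id, words)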
Example No. 7
# (Truncated in the source: the lines below are the tail of a commented-out
# gensim LdaModel call.)
# random_state=100, update_every=1, chunksize=100, passes=10,
# alpha='auto', per_word_topics=True)

# apply Mallet LDA
mallet_path = 'source/mallet-2.0.8/bin/mallet'
lda_model = models.wrappers.LdaMallet(mallet_path,
                                      corpus=corpus,
                                      num_topics=20,
                                      id2word=id2word)

pprint(lda_model.print_topics())

# Compute Coherence Score
coherence_model_lda = models.CoherenceModel(model=lda_model,
                                            texts=totalCorpus,
                                            dictionary=id2word,
                                            coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# convert the Mallet model to a gensim LDA model (wrapping)
lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(lda_model)

prepared_data = gensimvis.prepare(lda_model, corpus, id2word)
pyldavis_html_path = 'LDAresult/' + filename + '.html'
pyLDAvis.save_html(prepared_data, pyldavis_html_path)
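A common extension of the coherence check above is to sweep num_topics and keep the Mallet model with the best c_v score; a rough sketch reusing the snippet's mallet_path, corpus, id2word, and totalCorpus (candidate counts are illustrative):

def best_mallet_model(candidates=(10, 20, 30, 40)):
    best_model, best_score = None, float('-inf')
    for k in candidates:
        m = models.wrappers.LdaMallet(mallet_path, corpus=corpus,
                                      num_topics=k, id2word=id2word)
        score = models.CoherenceModel(model=m, texts=totalCorpus,
                                      dictionary=id2word,
                                      coherence='c_v').get_coherence()
        print(k, 'topics -> c_v =', score)
        if score > best_score:
            best_model, best_score = m, score
    return best_model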
Example No. 8
    def visualize(self):
        lda_display = ldvis.prepare(self.model, self.corpus, self.dictionary)

        pyLDAvis.save_html(lda_display, self.fileprefix + ".html")
        pyLDAvis.display(lda_display)
Example No. 9
dk = pd.read_csv('TW_Tweet.csv', encoding='UTF-8', low_memory=False)
df = pd.DataFrame(dk,
                  columns=['id', 'keyword', 'created', 'language', 'message'])
df.columns = ['id', 'key', 'created_time', 'language', 'message']
rm_duplicates = df.drop_duplicates(subset=['key', 'message'])
rm_na = rm_duplicates.dropna()
dtime = rm_na.sort_values(['created_time'])
dtime.index = range(len(dtime))
dlang = dtime[dtime['language'] == 'en']
data = dlang[dlang['key'] != 'johnson & johnson']
data = data[data['key'] != 'johnson&johnson']
data.index = range(len(data))
# ldamodel = LdaModel(finalcorpus, num_topics = 30, id2word = dictionary, update_every = 10, chunksize=2000, passes=10, alpha=0.05)
# ldamodel.save('lda30.model')
ldamodel = LdaModel.load('lda30.model')
vis_data = gensimvis.prepare(ldamodel, finalcorpus, dictionary)
# pyLDAvis.save_html(vis_data, 'lda30.html')
vistopicid = vis_data[6]
idlist = []
for j in range(1, len(vistopicid) + 1):
    idlist.append([i for i, x in enumerate(vistopicid) if x == j][0])
topicwords = {}
no = 0
for prob in ldamodel.show_topics(30, 7):
    tokens = ' '.join(re.findall(r"[\w']+", str(prob[1]))).lower().split()
    x = [''.join(c for c in s if c not in string.punctuation) for s in tokens]
    result = ' '.join([i for i in x if not i.isdigit()])
    topicwords[idlist[no]] = result.split()
    no += 1
for i in range(30):
    print("Topic", i + 1, ": ", topicwords[i])
Example No. 10
i = 6
while i > 0:
    if (i != 4):
        ldamodel = pickle.load(open('ldamodel_doc_topics' + str(i * 5), 'rb'))
        prepared = pg.prepare(ldamodel, corpus, dictionary)
        pyLDAvis.save_html(prepared,
                           open('lda_doc_topics' + str(i * 5) + '.html', 'w'))
    i -= 1
ldamodel = pickle.load(open('ldamodel_doc_topics30', 'rb'))
prepared = pg.prepare(ldamodel, corpus, dictionary)
pyLDAvis.save_html(prepared, open('lda_doc_topics30.html', 'w'))

############################LDA VISUALIZATION##########################################
import pyLDAvis
from pyLDAvis import gensim as pg
prepared = pg.prepare(ldamodel, corpus, dictionary)
pyLDAvis.save_html(prepared, open('lda.html', 'w'))

#######################################################################################
#GET ALL DATES#
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

datesu = []
titles = []


def get_titles():
    url = 'http://www.narendramodi.in/category/text-speeches'
    driver = webdriver.Chrome()
Example No. 11
texts = [[token for token in text if frequency[token] > repetition_threshold]
         for text in texts]
print(texts)
# Construct a document-term matrix to understand how frequently each term occurs within each document
# The Dictionary() function traverses texts, assigning a unique integer id to each unique token while also collecting word counts and relevant statistics.
# To see each token's unique integer id, try print(dictionary.token2id)
dictionary = corpora.Dictionary(texts)
dictionary.compactify()
dictionary.save('dict.dict')

# Convert dictionary to a BoW
# The result is a list of vectors equal to the number of documents. Each document contains tuples of (term ID, term frequency)
corpus = [dictionary.doc2bow(text) for text in texts]
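# A tiny illustration of what the two steps above produce (demo data, not
# from the original corpus):
#   demo = [["cat", "sat"], ["cat", "mat", "cat"]]
#   d = corpora.Dictionary(demo)
#   d.token2id          -> {'cat': 0, 'sat': 1, 'mat': 2}
#   d.doc2bow(demo[1])  -> [(0, 2), (2, 1)]   # (term id, term frequency)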

texts = []

#Randomize training elements
corpus = np.random.permutation(corpus)
gensim.corpora.MmCorpus.serialize('corpus.mm', corpus)

# Create csc matrix of corpus (speed up if calling multiple times prepare)
#corpus_csc = gensim.matutils.corpus2csc(corpus)

dictionary = gensim.corpora.Dictionary.load('dict.dict')
corpus = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.ldamodel.LdaModel.load(model_path)

vis_data = prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'visualization.html')
pyLDAvis.display(vis_data)
Example No. 12
# generate TF-IDF, LDA model
from gensim import models
tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]
print("\n","=========== TF-IDF ============")
# tfidf.corpus is the raw bag-of-words corpus; apply the model to get the
# actual tf-idf vector of the first document
first_doc = tfidf_model[corpus[0]]
# print first 10 elements of first document's tf-idf vector
print("\n", first_doc[:10])
# print top 10 elements of first document's tf-idf vector
print("\n", sorted(first_doc, key=lambda x: x[1], reverse=True)[:10])
# print token of most frequent element
#print("\n",dictionary.get(13))

n_topics = 5
lda = models.ldamodel.LdaModel(tfidf, num_topics=n_topics, id2word=dictionary, passes=1)
print("\n","=========== lda.show_topics() ============")
#print(lda.show_topics())
print(lda.print_topics(num_topics=n_topics, num_words=10))

import matplotlib
matplotlib.use('qt5agg')

import pyLDAvis.gensim as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(lda, corpus, dictionary)
x = pyLDAvis.prepared_data_to_html(vis_data)
print (x)
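Since prepared_data_to_html returns a standalone HTML string, the result can be written to disk instead of printed; a minimal sketch (file name is illustrative):

with open('lda_vis.html', 'w', encoding='utf-8') as f:
    f.write(x)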


Example No. 13
def get_lda():
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # create sample documents
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."


    # compile sample documents into a list
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

    # list for tokenized documents in loop
    texts = []

    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        tagged_tokens = nltk.pos_tag(tokens)
        print(tagged_tokens)
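        # Note: tagged_tokens is computed but never used below. A typical next
        # step (hypothetical here, not in the original) would keep only nouns
        # before stop-word removal:
        #   tokens = [w for w, tag in tagged_tokens if tag.startswith('NN')]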
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in en_stop]

        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

        # add tokens to list
        texts.append(stemmed_tokens)
        print (tokens)
        print (stemmed_tokens)
        print ("--------------------------------")
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

    print (lda.show_topics())



    import matplotlib
    matplotlib.use('qt5agg')

    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    x = pyLDAvis.prepared_data_to_html(vis_data)
    #print (x)



    return x


    print("-------------------")

    '''
Example No. 14
    topics = 15
    words = 20

    lda_model = gensim.models.LdaMulticore(corpus_md_tfidf,
                                           num_topics=topics,
                                           id2word=dictionary,
                                           passes=2,
                                           workers=4)

    lsi_model = gensim.models.LsiModel(corpus_md_tfidf,
                                       num_topics=topics,
                                       id2word=dictionary)

    print("LDA Model:")

    for idx in range(topics):
        # Print the first 10 most representative topics
        print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

    print("=" * 20)

    print("LSI Model:")

    for idx in range(topics):
        # Print the first 10 most representative topics
        print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))

    lda_vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
    pyLDAvis.display(lda_vis)

    print("Done")
Example No. 15
rn = ReviewNormalizer()
normalized_reviews = [rn.tokenize(r)
                      for r in reviews]
# randint is inclusive on both ends, so the upper bound must be len - 1
pretty_print_html([" ".join(normalized_reviews[randint(0, len(normalized_reviews) - 1)]),
                   " ".join(normalized_reviews[randint(0, len(normalized_reviews) - 1)])])


# #### Training the model (this might take a while...)

# In[12]:

dictionary = corpora.Dictionary(normalized_reviews)
corpus = [dictionary.doc2bow(r)
          for r in normalized_reviews]
lda = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, passes=100)


# #### Prepare data and visualize!

# In[14]:

prepared_data = prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared_data)
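# Note: pyLDAvis.display() only renders inline once the notebook backend is
# enabled; calling this once (e.g., in the first cell) takes care of it:

pyLDAvis.enable_notebook()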


Example No. 16
#ldamodel = lda.LdaModel.load(fname, mmap='r')

pprint.pprint(ldamodel.show_topics(num_topics=50, num_words=20))
#pprint.pprint(ldamodel.top_topics(corpus,num_words=10))

ldatopics = [[word for word, prob in topic]
             for topicid, topic in ldamodel.show_topics(formatted=False)]
lda_coherence = CoherenceModel(topics=ldatopics,
                               texts=texts,
                               dictionary=dictionary,
                               window_size=5,
                               coherence='c_v')
print(lda_coherence.get_coherence())
print(lda_coherence.get_coherence_per_topic())

vis_data = ldavis.prepare(ldamodel, corpus, dictionary)
#pyLDAvis.display(vis_data)
pyLDAvis.save_html(vis_data, 'world_lda50.html')
""""
hey, do you want to play a game?
oh come on!
let's play
"""

# select top n words for each of the LDA topics
top_words = [[word for word, _ in ldamodel.show_topic(topicno, topn=10)]
             for topicno in range(ldamodel.num_topics)]

# get all top words in all topics, as one large set
all_words = set(itertools.chain.from_iterable(top_words))
print("Can you spot the misplaced word in each topic?")
# Visualize the LDA Mallet terms as wordclouds
from wordcloud import WordCloud  # Import wordclouds

# Initiate the wordcloud object
wc = WordCloud(background_color="white",
               colormap="Dark2",
               max_font_size=150,
               random_state=42)

plt.rcParams['figure.figsize'] = [20, 15]

# Create subplots for each topic
for i in range(25):

    wc.generate(text=topics_df["Terms per Topic"][i])

    plt.subplot(5, 5, i + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(topics_df.index[i])

plt.show()

# In[55]:

import pyLDAvis.gensim as gensimvis
vis_data = gensimvis.prepare(ldagensim, corpus, id2word, sort_topics=False)
pyLDAvis.display(vis_data)

# In[ ]:
Example No. 18
    print(type(medical_df))
    text = medical_df['transcription']
    print(type(text))
    docs = array(text)
    print(type(docs))
    # =============================
    # LDA
    lda = LDAAnalysis(docs)

    do_process = True

    if do_process:
        lda.fit()
        pickle_LDAAnalysis = open("data/cache/LDAAnalysis.pkl", "wb")
        pickle.dump(lda, pickle_LDAAnalysis)
        pickle_LDAAnalysis.close()
    else:
        lda = pickle.load(open("data/cache/LDAAnalysis.pkl", "rb"))

    lda.coherence_values()

    lda_vis = gensimvis.prepare(lda, lda.corpus, lda.dictionary)
    pyLDAvis.display(lda_vis)

    # =============================
    # NNMF
    nnmf = NNMFTopicAnalysis(docs=docs)
    nnmf.fit()

    print('Done')
Example No. 19
    print("Making the BOW list")
    corpus = [dictionary.doc2bow(text) for text in text_data]

    ### Save the dictionary and Corpus so they can be used later ###
    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')

    print("TF-IDF")
    model = TfidfModel(corpus)
    tfidfCorpus = model[corpus]
    #print(vector)

    print("Training the network")
    NUM_TOPICS = 40
    #ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
    #ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=100)
    ldamodel = gensim.models.ldamulticore.LdaMulticore(tfidfCorpus,
                                                       num_topics=NUM_TOPICS,
                                                       id2word=dictionary,
                                                       passes=50)
    ldamodel.save('ldamodel.gensim')

    lda = gensim.models.ldamodel.LdaModel.load('ldamodel.gensim')
    import pyLDAvis.gensim as gensimvis
    import pyLDAvis  # needed for display()/show() below
    #lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
    lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
    #gensimvis.display(lda_display)
    pyLDAvis.display(lda_display)
    pyLDAvis.show(lda_display)
Example No. 20
folder = 'lda/manifesto/'
# suffix appended in earlier steps (it encodes the number of topics)
postfix = '_100'
manifestos = [
    'cdu_2002.csv', 'cdu_2005.csv', 'cdu_2009.csv', 'cdu_2013.csv',
    'fdp_2002.csv', 'fdp_2005.csv', 'fdp_2009.csv', 'fdp_2013.csv',
    'gruene_2002.csv', 'gruene_2005.csv', 'gruene_2009.csv', 'gruene_2013.csv',
    'linke_2005.csv', 'linke_2009.csv', 'linke_2013.csv', 'pds_2002.csv',
    'piraten_2013.csv', 'spd_2002.csv', 'spd_2005.csv', 'spd_2009.csv',
    'spd_2013.csv'
]

start = time.time()

for file in manifestos:
    checkpoint = time.time()
    print('starting analysis for file ' + file)
    model = gensim.models.ldamodel.LdaModel.load(folder + file + postfix +
                                                 '.model')
    corpus = gensim.corpora.mmcorpus.MmCorpus(folder + file + postfix +
                                              '.corpus')
    dictionary = gensim.corpora.dictionary.Dictionary.load(
        folder + file + postfix + '.dictionary', )

    visdata = gensimvis.prepare(model, corpus, dictionary, R=15)
    pyLDAvis.save_html(visdata, folder + file + postfix + '.html')
    print('generated html for ' + file + ' in ' +
          str(time.time() - checkpoint) + 's')

print('generating html for all files took ' + str(time.time() - start) + 's')
Example No. 21
def visualize_topics(lda_model, corpus, dictionary):
    """
    Function to visualize topics using pyLDAvis
    """
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
    pyLDAvis.display(vis_data)
    
    #Plot topic labels and terms labels separately to have different colours
    g = G.subgraph([topic for topic, _ in pos.items() if topic in t])
    nx.draw_networkx_labels(g, pos, font_size=20, font_color='r')
    #If network graph is difficult to read, don't plot ngrams titles.
    #g = G.subgraph([term for term, _ in pos.items() if str(term) not in t])
    #nx.draw_networkx_labels(g, pos, font_size=12, font_color='orange')
    #Plot edges
    nx.draw_networkx_edges(G, pos, edgelist=G.edges(), alpha=0.3)
    #Save before plt.show(); saving afterwards can produce a blank image,
    #which is likely why the automatic save here was failing.
    plt.axis('off')
    plt.savefig('/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/TopicNetwork'+num+'.png', bbox_inches='tight')
    plt.show(block=False)

graph_terms_to_topics(lda, num_terms=num_top) 


#Create interactive graph to examine top 30 ngrams in each topic.
#Use pyLDAvis to visualize the topics in a network using 
#   Jensen-Shannon divergence as metric of distance between the topics.
import pyLDAvis.gensim as gensimvis
import pyLDAvis
#Create data to visualize.
vis_data = gensimvis.prepare(lda, corpus, dictionary)
#pyLDAvis.display(vis_data)
#Use vis_data "prepared" in earlier step.
#Now display the visualization in a local server page. 
#pyLDAvis.show(vis_data) 
#Save the visualization to an html file.
pyLDAvis.save_html(vis_data, '/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/ClaimsInteractVis'+num+'.html')
Example No. 23
# Imports assumed by this snippet (not shown in the original excerpt):
import pickle
import re
import nltk
from nltk.stem.porter import PorterStemmer

clean_emails = pickle.load(open("output/clean_emails.p", "rb"))
p_stemmer = PorterStemmer()  # stemmer used below; defined here so the snippet runs


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [p_stemmer.stem(t) for t in filtered_tokens]
    return stems

from gensim import corpora, models, similarities 
#tokenize
token_emails = [tokenize_and_stem(text) for text in clean_emails]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(token_emails)

#remove extremes
dictionary.filter_extremes(no_below=1, no_above=0.8)

dictionary.compactify()

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in token_emails]
final=models.ldamodel.LdaModel.load('output/final_topic10.model')
import pyLDAvis.gensim as gensimvis
import pyLDAvis
vis_data = gensimvis.prepare(final, corpus, dictionary)
pyLDAvis.display(vis_data)