Example #1
def main(fn):
    with open(fn + '.json', 'r') as f:
        data_samples = []
        doc_lengths = []
        lb = datetime(2014, 4, 25)  # datetime(2014, 1, 1)
        ub = datetime(2014, 4, 27)  # datetime(2014, 12, 12)

        for line in f:
            tweet = json.loads(line)
            if getTime(tweet) < lb or getTime(tweet) > ub:
                continue
            
            sample, length = preprocess_text(getText(tweet))
            data_samples.append(sample)
            doc_lengths.append(length)

    n_features = 1000
    n_topics = 10
    n_top_words = 20

    #lda_topic(data_samples, n_features, n_topics, n_top_words)
    data_viz, _ = lda_viz(data_samples, doc_lengths, n_features, n_topics, n_top_words)
    #data_viz = pyLDAvis.prepare(**data_viz)
    #pyLDAvis.show(data_viz)
    pyLDAvis.save_html(data_viz, 'topics.html')
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List=[]
    for i in range(0,50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_values(by='count', ascending=False).iloc[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
Example #3
def plot_lda_vis(model_data, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, save_html, show
    model_vis_data = prepare(**model_data)
    if mode == 'save_html' and filename:
        save_html(model_vis_data, filename)
    else:
        show(model_vis_data)
Example #4
def learn_topic_model(X, vocab, graphlets, config, dbg=False):

    alpha = config['dirichlet_params']['alpha']
    eta = config['dirichlet_params']['eta']
    model = lda.LDA(n_topics=config['n_topics'], n_iter=config['n_iters'], random_state=1, alpha=alpha, eta=eta)

    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 30

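    # per-term document frequencies and per-document lengths (non-zero counts), used by pyLDAvis.prepare below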
    feature_freq = (X != 0).sum(axis=0)
    doc_lengths = (X != 0).sum(axis=1)

    try:
        print "phi: %s. theta: %s. nd: %s. vocab: %s. Mw: %s" \
        %( model.topic_word_.shape, model.doc_topic_.shape, doc_lengths.shape, len(graphlets.keys()), len(feature_freq))
        data = {'topic_term_dists': model.topic_word_,
                'doc_topic_dists': model.doc_topic_,
                'doc_lengths': doc_lengths,
                'vocab': graphlets.keys(),
                'term_frequency': feature_freq}

        import pyLDAvis
        vis_data = pyLDAvis.prepare(model.topic_word_, model.doc_topic_, doc_lengths, graphlets.keys(), feature_freq)
        # vis_data = pp.prepare(model.topic_word_, model.doc_topic_, doc_lengths, graphlets.keys(), feature_freq)
        html_file = "../LDAvis/Learnt_Models/topic_model_" + id + ".html"
        pyLDAvis.save_html(vis_data, html_file)
        print "PyLDAVis ran. output: %s" % html_file

        """investigate the objects used in the topics"""
        print("\ntype(topic_word): {}".format(type(topic_word)))
        print("shape: {}".format(topic_word.shape))
        topics = {}
        for i, topic_dist in enumerate(topic_word):
            objs = []
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            #print('Topic {}: {}'.format(i, ' '.join( [repr(i) for i in topic_words] )))
            for j in [graphlets[k] for k in topic_words]:
                objs.extend(object_nodes(j)[0])
            topics[i] = objs
            if dbg:
                print('Topic {}: {}'.format(i, list(set(objs))))

    except ImportError:
        print "No module pyLDAvis. Cannot visualise topic model"

    """investigate the highly probably topics in each document"""
    doc_topic = model.doc_topic_
    # #Each document's most probable topic - don't have the UUIDs, so dont use this.
    # pred_labels = []
    # for n in range(doc_topic.shape[0]):
    #     if max(doc_topic[n]) > config['class_thresh']:
    #         topic_most_pr = doc_topic[n].argmax()
    #         pred_labels.append(topic_most_pr)

    return doc_topic, topic_word #, pred_labels
Example #5
    def vis_hdpvis(self):
        """
        Produces LDAvis visualization.

        Opens a web browser page with javascript topic viewer.
        """
        hdp_vis_data = pg.prepare(self.hdp, self.cor, self.cor.dictionary)
        pyLDAvis.save_html(hdp_vis_data, '../../data/hdpvis.html')
        vis_path = os.path.realpath('../../data/hdpvis.html')
        webbrowser.open('file://{}'.format(vis_path), new=2)
Example #6
    def vis_ldavis(self):
        """
        Produces LDAvis visualization.

        Opens a web browser page with javascript topic viewer.
        """
        lda_vis_data = pg.prepare(self.lda, self.cor, self.cor.dictionary)
        pyLDAvis.save_html(lda_vis_data, "../../data/ldavis.html")
        vis_path = os.path.realpath("../../data/ldavis.html")
        webbrowser.open("file://{}".format(vis_path), new=2)
Example #7
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html

    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)

    if mode == 'save_html' and filename:
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data)
Example #8
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html

    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)
    if mode == 'save_html' and filename:
        logging.info("Saving pyLDAVis to {}".format(filename))
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data, ip="0.0.0.0", port=8888)
Example #9
    def visualize(self, outfn):
        """
        Produce a pyLDAvis visualization of a model and save to disk at the given location.
        """
        if self.has_viz_data:
            pyLDAvis.save_html(self.vis_data, outfn)
            return
        assert(self.has_vocab and self.has_corpus)
        assert(self.is_trained)
        # this might crash. I think because corpus, vocab, and _lda_model are all big.
        self.vis_data = prepare(self._lda_model, self.corpus, self.vocab)
        self.has_viz_data = True
        pyLDAvis.save_html(self.vis_data, outfn)
Example #10
    def vectorize(self):
        '''
        args:
            none
        output:
            generates an LDA topic model of the document using gensim and pyLDAvis
        '''
        # tokenize and remove stopwords
        sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip())  # use raw text
        #sentences = Topic(raw_input('topic: ')).text # get text from wikipedia
        #stoplist  = set('for this that by or is a of the and to in are be as an it can on if at which then also with used such not from use other have some these more using has many one was may often but their they than when been its not all may some have had'.split())
        texts = [[word for word in sentence.lower().split() if word not in self.stopwords] for sentence in sentences]

        # compute the frequency of each token
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        # remove words that appear only once
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        # construct a gensim dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]  # currently, "text" is a sentence in the document

        # define LDA model
        lda = models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=10,  # what should this be ???
                                       update_every=1,
                                       chunksize=10000,
                                       passes=1)

        # visualize the lda space
        vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        pyLDAvis.display(vis_data)
        pyLDAvis.show(vis_data)
        with open('topic_models/' + self.name + '.json', 'a+') as topic_json:
            pyLDAvis.save_json(vis_data, topic_json)
        with open('topic_models/' + self.name + '.html', 'a+') as topic_html:
            pyLDAvis.save_html(vis_data, topic_html)
def narrative_analysis(df):
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop

    for index in range(0,len(df.index)):
        if str(df['narrative'].iloc[index]) != 'nan':
            intext = df['narrative'].iloc[index]
            intext = re.sub(r"X+", "", intext)
            raw = intext.lower()
            tokens = tokenizer.tokenize(raw)
       
            # remove stop words from tokens
            stopped_tokens = [i for i in tokens if not i in en_stop]
        
            # stem tokens and add them to list
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            texts.append(stemmed_tokens)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "narrative_lda.html")
    #pyLDAvis.save_json(vis_data, "narrative_lda.json")
    pyLDAvis.save_html(vis_data, "narrative_lda_2016.html")
    pyLDAvis.save_json(vis_data, "narrative_lda_2016.json")

    return 0
Example #12
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

#Creating LDA model
print('\nBuilding the model\n')
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=100,
                                            alpha='auto',
                                            per_word_topics=True)
topics = lda_model.print_topics()

print(topics)
#topics = zip(*topics)

# Visualize the topics
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(p, 'lda.html')
    def main(self):
        # set seed
        np.random.seed(1)

        # preprocess tweets
        self.preprocess_tweets()

        # word cloud
        word_cloud_img = os.path.join(self.this_dir, "../plots/word_cloud.png")
        if not os.path.isfile(word_cloud_img):
            self.word_cloud(word_cloud_img)

        # initialise the count vectorizer with English stop words
        count_vectorizer = CountVectorizer(stop_words="english")

        # fit and transform preprocessed tweets (counts the num of each word in vector)
        count_data = count_vectorizer.fit_transform(
            self.tweets_df["content_pro"])

        # most common words
        most_comm_words = self.most_common_words(count_data, count_vectorizer)

        # best fitted LDA model and num of topics
        best_lda_model = self.grid_search(count_data)
        n_topics = best_lda_model.n_components

        # best fitted LDA model performance
        log_like_best, perp_best = self.performance(count_data, best_lda_model)
        print("Model: best_lda_model", end="\n")
        print(f"Best Model's Params: {best_lda_model.get_params()}")
        print(f"Log Likelihood: {log_like_best}")
        print(f"Perplexity: {perp_best}")

        # extract topics from top keywords in each tweet
        topics, doc_topic_df = self.extract_topics(count_data,
                                                   count_vectorizer,
                                                   best_lda_model)

        # add topics to self.tweets_df
        self.tweets_df["topic"] = doc_topic_df["topic"].tolist()

        # set LDAvis_prepared paths
        LDAvis_prep_data_path = os.path.join(
            self.this_dir, "../data/ldavis_data_" + str(n_topics))
        LDAvis_prep_html_path = os.path.join(
            self.this_dir, "../plots/ldavis_html_" + str(n_topics))

        # load LDAvis_prepared data from disk
        # plot showing topics in topic model that has been fitted to corpus of text data
        try:
            with open(LDAvis_prep_data_path, "rb") as f:
                LDAvis_prep = cPickle.load(f)

        except FileNotFoundError:
            LDAvis_prep = sklearn_lda.prepare(best_lda_model, count_data,
                                              count_vectorizer)
            with open(LDAvis_prep_data_path, "wb") as f:
                cPickle.dump(LDAvis_prep, f)

            # save html file
            pyLDAvis.save_html(LDAvis_prep, LDAvis_prep_html_path + ".html")

        # returns interactive plot, groups, and 10 most common words
        return (
            self.tweets_df[["content_pro", "topic"]],
            LDAvis_prep_html_path,
            most_comm_words,
            topics,
        )
Example #14
from gensim import corpora, models
import pyLDAvis.gensim
import pyLDAvis

dic = corpora.Dictionary.load('data/model/newsgroups.dict')
corp = corpora.MmCorpus('data/model/newsgroups.mm')
lda = models.ldamodel.LdaModel.load('data/model/newsgroups_50.model')

# Prepare the data for the visualization
newsgroup_data = pyLDAvis.gensim.prepare(lda, corp, dic)

# Create the visualization
pyLDAvis.display(newsgroup_data)

# Save the visualization as a html file 
pyLDAvis.save_html(newsgroup_data, 'data/model/newsgroup_ldavis.html')
Example #15
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
    per_word_topics=True,
)

# 13. View the topics in LDA model
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

# 14. Compute Model Perplexity and Coherence Score
# Compute Perplexity
print("\nPerplexity: ", lda_model.log_perplexity(
    corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=data_lemmatized,
                                     dictionary=id2word,
                                     coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)

# 15. Visualize the topics-keywords
# Visualize the topics
# pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, "lda.html")
Example #16
def visualize_data(bow_corpus, tweet_dictionary, lda_model):
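    # pyLDAvis.gensim_models is the gensim helper module in recent pyLDAvis releases
    # (older examples in this collection import it as pyLDAvis.gensim)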
    lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus,
                                                       tweet_dictionary)
    pyLDAvis.save_html(lda_visualization, 'vis.html')
Example #17
def visualize_model(model, dictcorpus, vectorcorpus, vizfile):
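    # sort_topics=False keeps pyLDAvis numbering aligned with the model's own topic ids;
    # mds="mmds" selects metric MDS for the inter-topic distance map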
    visualization = pyLDAvis.gensim.prepare(model, vectorcorpus, dictcorpus, sort_topics=False, mds="mmds")
    pyLDAvis.save_html(visualization, vizfile)
Example #18
count_vectorizer = CountVectorizer(stop_words=my_stop_words)

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(texts)

# Visualise the 10 most common words
#count_data = crossRef(count_data, men)
#plot_10_most_common_words(count_data, count_vectorizer)


# Tweak the two parameters below
number_topics = 11
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)



LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    f.close()
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    #f.encode('utf-8').strip()
    LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared, './outs/ldavis_prepared_'+ str(number_topics) +'.html')
Example #19
def make(corpus_path, dest_path, num_topics=50, passes=10):
    corpus_id = corpus_path.split("/").pop().replace('.pkl', "")
    cfname = os.path.join(dest_path, "lda_" + corpus_id)
    data_words = create_data_words(corpus_path)

    print "MAKING GRAM MODELS"
    # Build the bigram and trigram models
    # higher threshold fewer phrases.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Ngram models
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(bigram_mod, data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(
        nlp,
        data_words_bigrams,
        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    print "MAKING CORPUS RESOURCES"
    texts = data_lemmatized

    # Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # TDF
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = None

    # # Check if model exists
    if not os.path.isfile(cfname):
        # LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=passes,
                                                    alpha='auto',
                                                    per_word_topics=True)
        lda_model.save(cfname)
    else:
        print "MODEL FOUND USING PREVIOUS"
        lda_model = gensim.models.ldamodel.LdaModel.load(cfname)

    doc_lda = lda_model[corpus]

    # Compute Perplexity
    # a measure of how good the model is. lower the better.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
    vis_file_name = dest_path + '/' + \
        str(num_topics) + "_" + corpus_id + ".html"
    pyLDAvis.save_html(vis, vis_file_name)
Example #20
def get_result_lda(corpus_file_name, paper_file_name, trial_file_name,
                   result_paper_file, result_trail_file, label_word_file,
                   result_topic_file, most_relevant_file, topic_number,
                   html_file):
    """
    Perform lda, get the result
    :param html_file:
    :param topic_number:
    :param most_relevant_file:
    :param result_topic_file:
    :param corpus_file_name: Corpus file
    :param paper_file_name: paper file
    :param trial_file_name: trial file
    :param result_paper_file: Tagged paper file
    :param result_trail_file: Tagged trial file
    :param label_word_file: Main word file
    :return:
    """
    # TODO: data processing
    data = pd.read_excel(corpus_file_name)
    data['context'] = data['context'].apply(lambda x: x
                                            if x is not np.nan else '')
    doc_clean = [x.split() for x in list(data['context'])]
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    # TODO: model training
    Lda = gensim.models.ldamodel.LdaModel
    ldamodel = Lda(doc_term_matrix,
                   num_topics=topic_number,
                   id2word=dictionary,
                   random_state=4,
                   iterations=1000)
    # TODO: assign labels, main keywords, Perc Contribution
    df_dominant_topic = format_topics_sentences(ldamodel, doc_term_matrix,
                                                doc_clean, data['id'])
    df_dominant_topic.columns = [
        'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'id'
    ]  # Show
    df_dominant_topic.to_excel(result_topic_file)

    labels = pd.DataFrame(list(df_dominant_topic['Dominant_Topic']),
                          columns=['label'])
    paper_data = pd.read_excel(paper_file_name)
    trail_data = pd.read_excel(trial_file_name)
    paper_data['label'] = list(labels['label'])[:len(paper_data)]
    trail_data['label'] = list(labels['label'])[len(paper_data):]
    paper_data.to_excel(result_paper_file, index=False)
    trail_data.to_excel(result_trail_file, index=False)
    # TODO: Statistics of each category:
    print(labels['label'].value_counts())
    print('Statistics of the number of papers in each category')
    data['label'] = labels
    data1 = data[data['type'] == '论文']
    print(data1['label'].value_counts())
    print('Statistics of the number of trials in each category')
    data2 = data[data['type'] == '试验']
    print(data2['label'].value_counts())

    # TODO:Determine the top 10 words in the total frequency of each category
    s = ldamodel.print_topics(num_topics=topic_number, num_words=20)
    result_topic = []
    for doc_class, doc_t in s:
        doc_topics = doc_t.split('+')
        for doc_topic in doc_topics:
            result_topic.append([
                doc_class,
                doc_topic.split('*')[1].strip(),
                doc_topic.split('*')[0].strip()
            ])
    result_topic = pd.DataFrame(result_topic,
                                columns=['class', 'topic', 'score'])
    result_topic.to_excel(label_word_file, index=False)
    # TODO: Determine the most similar corpus (corpus, ID)
    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = df_dominant_topic.groupby('Dominant_Topic')
    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([
            sent_topics_sorteddf_mallet,
            grp.sort_values(['Topic_Perc_Contrib'], ascending=[0]).head(10)
        ],
                                                axis=0)
    # Reset Index
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
    # Format
    sent_topics_sorteddf_mallet.columns = [
        'Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text", 'id'
    ]
    # Show
    sent_topics_sorteddf_mallet.head(10)
    sent_topics_sorteddf_mallet.to_excel(most_relevant_file, index=None)
    # TODO: Measurement model
    # cm_result = []
    # for coherence in ['u_mass']:
    #     goodcm = CoherenceModel(model=ldamodel, corpus=doc_term_matrix, dictionary=dictionary, coherence=coherence)
    #     cm_result.append(goodcm.get_coherence())
    # for coherence in ['c_v', 'c_uci', 'c_npmi']:
    #     goodcm = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence=coherence)
    #     cm_result.append(goodcm.get_coherence())
    # print(cm_result)
    vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    # pyLDAvis.show(vis)
    pyLDAvis.save_html(vis, html_file)

    print('Clustering complete!')
                                  id2word = dictionary,
                                  num_topics = k,
                                  alpha = alpha,
                                  eta = eta,
                                  random_state = 100,
                                  chunksize = 100,
                                  passes = 10,
                                  per_word_topics = True)
for idx, item in enumerate(final_model.print_topics(num_topics = -1, num_words = 30)):
    print("Topic %s has following keywords: "%(idx))
    patterns=re.findall("\"(.*?)\"",str(item),re.S)
    print(patterns)
    # print(cv_score(corpus=corpus, dict_ = dictionary, k=30, alpha="symmetric", eta="auto"))
# visualize with pyLDAvis
viz = pyLDAvis.gensim.prepare(final_model, corpus, dictionary)
pyLDAvis.save_html(viz, "./tm_viz.html")
# name the generated html
# modify the 3 asset references in the page
with open("./tm_viz_new.html", "w") as t1:
    with open("./tm_viz.html", "r") as f2:
        webpage = f2.read()
        # keep the css/js files locally to avoid blocking while loading
        webpage = webpage.replace("https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css",
                                  "ldavis.v1.0.0.css")
        webpage = webpage.replace("https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js",
                                  "ldavis.v1.0.0.js")
        # need to fix the d3.js reference, reverting to the older version
        webpage = webpage.replace("https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js",
                                  "https://d3js.org/d3.v3.js")
    t1.write(webpage)
folder = 'lda/manifesto/'
# this has been added in previous steps (basically the number of topics)
postfix = '_100'
manifestos = [
    'cdu_2002.csv', 'cdu_2005.csv', 'cdu_2009.csv', 'cdu_2013.csv',
    'fdp_2002.csv', 'fdp_2005.csv', 'fdp_2009.csv', 'fdp_2013.csv',
    'gruene_2002.csv', 'gruene_2005.csv', 'gruene_2009.csv', 'gruene_2013.csv',
    'linke_2005.csv', 'linke_2009.csv', 'linke_2013.csv', 'pds_2002.csv',
    'piraten_2013.csv', 'spd_2002.csv', 'spd_2005.csv', 'spd_2009.csv',
    'spd_2013.csv'
]

start = time.time()

for file in manifestos:
    checkpoint = time.time()
    print('starting analysis for file ' + file)
    model = gensim.models.ldamodel.LdaModel.load(folder + file + postfix +
                                                 '.model')
    corpus = gensim.corpora.mmcorpus.MmCorpus(folder + file + postfix +
                                              '.corpus')
    dictionary = gensim.corpora.dictionary.Dictionary.load(
        folder + file + postfix + '.dictionary', )

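    # R=15 limits the term barcharts to the 15 most relevant terms per topic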
    visdata = gensimvis.prepare(model, corpus, dictionary, R=15)
    pyLDAvis.save_html(visdata, folder + file + postfix + '.html')
    print('generated html for ' + file + ' in ' +
          str(time.time() - checkpoint) + 's')

print('generating html for all files took ' + str(time.time() - start) + 's')
def get_pyLDAvis(model, corpus, id2word, current_dir):
    print("\n* Now we will visualize the topics using pyLDAvis.")
    vis = pyLDAvis.gensim.prepare(model, corpus, id2word, sort_topics=False)
    pyLDAvis.save_html(vis, '%s/topic_model.html' % current_dir)
    print("PyLDAvis saved to html.")
Example #24
text = list(csv.reader(open('result.txt', encoding='UTF-8')))
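# each row of result.txt is read with csv.reader, so every document starts as a list of comma-separated tokens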

for i in range(len(text)):
    for j in range(len(text[i])):
        text[i][j] = re.sub(r'[!-@]', "", text[i][j])
        text[i][j] = re.sub(r'[{-~]', "", text[i][j])
        text[i][j] = text[i][j].lower()

data = []
for i in range(len(text)):
    data.append([
        word for word in text[i] if word not in stop_words and len(word) >= 2
    ])

with open('fil_data.pkl', 'wb') as w:
    pickle.dump(data[::2], w)

dictionary = gensim.corpora.Dictionary(data)
dictionary.filter_extremes(no_below=3, no_above=0.8)
corpus = [dictionary.doc2bow(t) for t in data]
print('vocab size: ', len(dictionary))

#LDAvis
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=6,
                                            random_state=0)
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
vis
pyLDAvis.save_html(vis, 'LDAvis_output.html')
Example #25
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Tweak the two parameters below
number_topics = 10
number_words = 20
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

LDAvis_data_filepath = os.path.join(
    output_dir, filename_stem + '_' + 'lda_vis_prepared_' + str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(
    LDAvis_prepared,
    os.path.join(
        output_dir, filename_stem + '_' + 'lda_vis_prepared_' +
        str(number_topics) + '.html'))

foo = 1
Example #26
                                             texts=all_studio_comment_list,
                                             dictionary=id2word,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

        #Coherence Score:  0.28624721848288204

        # Visualize the topics
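        # enable_notebook() makes pyLDAvis render inline in a Jupyter notebook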
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model,
                                      comment_vectors,
                                      id2word,
                                      mds='mmds')
        #pyLDAvis.show(vis)
        pyLDAvis.save_html(vis, 'lda_t15_w40.html')

        mallet_path = 'mallet-2.0.8/bin/mallet'  # update this path
        ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                                     corpus=comment_vectors,
                                                     num_topics=20,
                                                     id2word=id2word)

        # Show Topics
        pprint(ldamallet.show_topics(formatted=False))

        #endregion 2.4) LDA from Gensim

        #########
        # region 2.5) Build tfidf model
        comment_tfidf = TfidfModel(comment_vectors)
Example #27
    def modeling(self):
        data_ = self.data
        string_ = self.string
        stop_words = stopwords.words('english')
        stop_words.extend(list(STOPWORDS))
        stop_words.extend(list(ENGLISH_STOP_WORDS))
        stop_words1 = get_stop_words('english')
        stop_words.extend(stop_words1)
        stop_words = list(set(stop_words))
        stop_words.extend([
            "_d180g", "Object", "Name", "NaN", "dtype", "Length",
            "backupnotes", "contact", "history"
        ])
        dataS4 = data_[string_].values.tolist()

        # Word tokenization
        def sent_to_words(sentences):
            for sentence in sentences:
                yield (gensim.utils.simple_preprocess(str(sentence),
                                                      deacc=True)
                       )  # deacc=True removes punctuations

        def remove_stopwords(texts):
            return [[
                word for word in simple_preprocess(str(doc))
                if word not in stop_words
            ] for doc in texts]

        data_words = list(sent_to_words(dataS4))
        data_words = remove_stopwords(data_words)
        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(
            data_words, min_count=2,
            threshold=2)  # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[data_words], threshold=2)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        def make_bigrams(texts):
            return [bigram_mod[doc] for doc in texts]

        def make_trigrams(texts):
            return [trigram_mod[bigram_mod[doc]] for doc in texts]

        def lemmatization(texts,
                          allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
            """https://spacy.io/api/annotation"""
            texts_out = []
            for sent in texts:
                doc = nlp(" ".join(sent))
                texts_out.append([
                    token.lemma_ for token in doc
                    if token.pos_ in allowed_postags
                ])
            return texts_out

        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)

        # Form Bigrams
        data_words_bigrams = make_bigrams(data_words_nostops)

        # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
        # python3 -m spacy download en
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

        # Do lemmatization keeping only noun, adj, vb, adv
        data_lemmatized = lemmatization(
            data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Create Corpus
        texts = data_lemmatized

        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
        # Build LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=9,
                                                    random_state=100,
                                                    per_word_topics=True)
        # Compute Perplexity
        #print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_lemmatized,
                                             dictionary=id2word,
                                             coherence='u_mass')
        coherence_lda = coherence_model_lda.get_coherence()
        #print('\nCoherence Score: ', coherence_lda)
        # Visualize the topics
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
        pyLDAvis.save_html(vis, string_ + '.html')
        return (print('\nPerplexity: ', lda_model.log_perplexity(corpus)),
                print('\nCoherence Score: ', coherence_lda))
Example #28
def exportLDA_vis(best_model, corpus, id2word, filename='pyLDAvis.html'):
    import pyLDAvis
    import pyLDAvis.gensim
    panel = pyLDAvis.gensim.prepare(best_model, corpus, id2word)
    pyLDAvis.save_html(panel, filename)
Example #29
def visualize_pyldavis(lda_model, corpus, dictionary):
    prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(prepared, 'vis_topic_model_02.html')
    pyLDAvis.show(prepared)
Example #30
NUM_TOPICS = 8  # optimum = 8
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# # Build the LSI model
# lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

print("LDA Model:")
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx))


from pyLDAvis import gensim
import pyLDAvis
visualisation = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(visualisation, 'LDA_Visualization.html')



# create lda values
lda_value = []
for token in tokenized_data:
    #print(text)
    bow = dictionary.doc2bow(token)
    lda_value.append(lda_model[bow])

# bow = dictionary.doc2bow(clean_text(text_df[3]))
# print(lda_model[bow])
# bow3 = dictionary.doc2bow(tokenized_data[3])
# print(lda_model[bow3])
embeddings_index_all = {}
def produce_visualization(
        file_names=["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
        tokenizer=stem_tokenizer,
        labels=['ACCOUNT', 'HERO'],
        max_sentences=None,
        as_sentences=False,
        output_file='ldavis'):
    data = load_data.load_xlsx_data(file_names,
                                    max_sentences=max_sentences,
                                    as_sentences=as_sentences,
                                    labels=labels)

    excerpts = list(data['Excerpts'])

    # exclude labels with no true label
    keep_labels = []
    for lab in labels:
        if sum(data[lab]) > 0:
            keep_labels.append(lab)
        else:
            print(lab + " label not present in files: " + str(file_names))
    labels = keep_labels

    # create a subset of the data frame that is the account label types
    main_types_df = data[labels]

    main_types_df.index = range(1, main_types_df.shape[0] + 1)

    # drop rows and excerpts with no label
    # build vocab and doc_lengths
    all_words = []
    doc_lengths = []
    main_types_excerpts = []
    for idx, doc in enumerate(excerpts):
        if sum(main_types_df.loc[idx + 1]) < 1:
            # if this document had no main type label
            main_types_df = main_types_df.drop([idx + 1], axis=0)
        else:
            main_types_excerpts.append(doc)
            doc_toks = stem_tokenizer(doc)
            all_words.extend(doc_toks)
            doc_lengths.append(len(doc_toks))
    fdist = FreqDist(all_words)
    fdistmc = fdist.most_common()
    vocab = [word for word, count in fdistmc]
    term_frequency = [count for word, count in fdistmc]
    print("number of labelled documents: " + str(len(doc_lengths)))

    # build topic-term distribution
    stop_words = set(stopwords.words('english'))
    freq_dist_dict = {}
    topic_size = []
    topic_num_words = []
    i = 0
    for coln in main_types_df.columns:
        categ_excerpts = list(
            compress(main_types_excerpts, main_types_df[coln].values))
        exq = [tokenizer(doc) for doc in categ_excerpts]
        excerpt_words = [tok for tok_list in exq for tok in tok_list]
        i = i + 1
        topic_size.append(len(exq))
        topic_num_words.append(len(excerpt_words))
        #print("Topic "+str(i)+": "+coln+" number of excerpts: "+str(len(exq)))
        words = [
            word for word in excerpt_words
            if word.lower() not in stop_words and word.isalpha()
        ]
        freq_dist_dict[coln] = FreqDist(words)

    topic_term_dists = []

    for coln in main_types_df.columns:
        ffdist = freq_dist_dict[coln]
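        # fall back to the smallest positive float for terms a topic never saw,
        # so each topic-term distribution stays strictly positive for pyLDAvis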
        fdist = [
            ffdist.freq(word) if word in ffdist.keys() else np.nextafter(
                float(0), (1)) for word in vocab
        ]
        #print("categ: "+str(coln)+" len of freq dist "+str(len(fdist))+" sum of vetor: "+str(sum(fdist)))
        topic_term_dists.append([float(i) for i in fdist])

    # Document-topic distribution
    doc_topic_dists = []
    for index, rowi in main_types_df.iterrows():
        row = list(rowi)
        if (sum(row) > 1.01 or sum(row) < 0.99):
            #print(str(index)+" row: "+str(row))
            # normalize row
            row = [r / sum(row) for r in row]
        if (sum(row) == 0):
            print(row)
        doc_topic_dists.append([float(i) for i in row])

    # format for pyLDAvis
    data_dict = {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency
    }
    #print('Topic-Term shape: %s' % str(np.array(data_dict['topic_term_dists']).shape))
    #print('Doc-Topic shape: %s' % str(np.array(data_dict['doc_topic_dists']).shape))

    # save data as json
    with open(output_file + '.json', 'w') as json_file:
        json.dump(data_dict, json_file)

    vis_data = pyLDAvis.prepare(**data_dict, n_jobs=-1)

    # order the columns for pyldavis
    col_order = vis_data.topic_order
    categs = list(main_types_df.columns)
    string_list = [""] * len(col_order)
    for idx, i in enumerate(col_order):
        msg = "Topic " + str(idx + 1) + ": " + categs[
            i - 1] + ", number of words: " + str(topic_num_words[i - 1])
        print(msg)
        string_list[idx] = msg

    with open(output_file + '.txt', 'w') as f:
        for msg in string_list:
            f.write("%s\n" % msg)

    pyLDAvis.save_html(vis_data, output_file + '.html')
    #if display:
    #pyLDAvis.display(vis_data)
    return vis_data
Example #32
import pickle
import pandas as pd

infile = open(
    '/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/lda_tfidf.pkl',
    'rb')
lda_tfidf = pickle.load(infile)

infile = open(
    '/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/dtm_tfidf.pkl',
    'rb')
dtm_tfidf = pickle.load(infile)

infile = open(
    '/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/tf_idf_vectorizer.pkl',
    'rb')
tf_vectorizer = pickle.load(infile)

import pyLDAvis
import pyLDAvis.sklearn

visualization = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tf_vectorizer)

pyLDAvis.save_html(
    visualization,
    '/Users/gavin/Documents/Metis/Coursework/Project_4/notebooks/LDA_Visualization.html'
)
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

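# mds='tsne' uses t-SNE instead of the default PCoA to lay out the inter-topic distance map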
panel = pyLDAvis.sklearn.prepare(best_lda_model,
                                 data_vectorized,
                                 vectorizer,
                                 mds='tsne')
pyLDAvis.save_html(panel, 'models/lda.html')

# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                 columns=topicnames,
                                 index=docnames)
def topicVisuals(df):

    parent_df = preProcess_data(df)

    parent_df['sat1'] = parent_df['sat1'].astype(int)
    # In[8]:

    parent_df['bagofwords'].head(10)

    # In[9]:

    df = parent_df

    # In[10]:

    #Segmenting the complete data frame based on the quarter
    df_q1 = df[df['rptqtr'] == '201603']
    df_q2 = df[df['rptqtr'] == '201604']
    df_q3 = df[df['rptqtr'] == '201701']
    df_q4 = df[df['rptqtr'] == '201702']
    df_q5 = df[df['rptqtr'] == '201703']
    df_q6 = df[df['rptqtr'] == '201704']
    df_q7 = df[df['rptqtr'] == '201801']
    df_q8 = df[df['rptqtr'] == '201802']
    df_q9 = df[df['rptqtr'] == '201803']

    df_q1['sat1'] = df_q1['sat1'].astype(int)
    df_q2['sat1'] = df_q2['sat1'].astype(int)
    df_q3['sat1'] = df_q3['sat1'].astype(int)
    df_q4['sat1'] = df_q4['sat1'].astype(int)
    df_q5['sat1'] = df_q5['sat1'].astype(int)
    df_q6['sat1'] = df_q6['sat1'].astype(int)
    df_q7['sat1'] = df_q7['sat1'].astype(int)
    df_q8['sat1'] = df_q8['sat1'].astype(int)
    df_q9['sat1'] = df_q9['sat1'].astype(int)

    print(df_q1.shape)
    print(df_q2.shape)
    print(df_q3.shape)
    print(df_q4.shape)
    print(df_q5.shape)
    print(df_q6.shape)
    print(df_q7.shape)
    print(df_q8.shape)
    print(df_q9.shape)

    # In[11]:

    #Topic modelling

    # First tokenize by sentence and then by word so that punctuation-like characters are not missed
    def tokenize_only(text):
        # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
        tokens = [
            word.lower() for sent in nltk.sent_tokenize(str(text))
            for word in nltk.word_tokenize(sent)
        ]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        return filtered_tokens

    # In[12]:

    word_list = tokenize_only(df['bagofwords'].tolist())
    stop_words = stopwords.words('english')

    filtered_words = [
        word for word in word_list if word.lower().strip() not in stop_words
    ]

    tfidf_vect = TfidfVectorizer(norm='l2',
                                 min_df=0,
                                 use_idf=True,
                                 smooth_idf=False,
                                 sublinear_tf=False)
    count_vect = CountVectorizer(max_df=0.80, max_features=50000)

    X = tfidf_vect.fit_transform(filtered_words)
    _X_ = count_vect.fit_transform(filtered_words)

    dense_matrix = _X_.todense()

    print("Sparsity: ", ((dense_matrix > 0).sum() / dense_matrix.size) * 100,
          "%")

    n_components = 10
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_method='batch',
                                    max_iter=25,
                                    random_state=0)
    document_topics = lda.fit_transform(_X_).T
    sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
    feature_names = np.array(count_vect.get_feature_names())
    mglearn.tools.print_topics(topics=range(n_components),
                               feature_names=feature_names,
                               sorting=sorting,
                               topics_per_chunk=5,
                               n_words=10)
    # Log likelihood: higher is better
    print("Log Likelihood using tf-idf: ", lda.score(_X_))
    print("Perplexity using tf-idf: ", lda.perplexity(_X_))
    # In[13]:

    lda_model, countMatrix, countVectorizer, tfidfMatrix, tfidfVectorizer = lda, _X_, count_vect, X, tfidf_vect

    p = pyLDAvis.sklearn.prepare(lda_model,
                                 countMatrix,
                                 countVectorizer,
                                 mds='mmds')
    visual_file = 'visuals.html'
    pyLDAvis.save_html(p, os.getcwd() + '/templates/' + visual_file)

    # In[14]:

    detract_df = parent_df[parent_df['sat1'] < 8]

    promo_df = parent_df[parent_df['sat1'] > 8]

    return (visual_file)
Example #35
                      help="specify LDA model.")
    args.add_argument("-s",
                      "--save_to_file",
                      type=str,
                      help="speficy file which the HTML will be saved to.")
    args.add_argument("-t",
                      "--use_tfidf",
                      action="store_true",
                      help="use TF-IDF corpus.")
    args.add_argument(
        "--method",
        type=str,
        default="pcoa",
        help="specify a method for MDS by one from 'pcoa', 'mmds', or 'tsne'.")
    return args.parse_args()


if __name__ == "__main__":
    args = parse_arg()
    model = LdaModel.load(args.model[0])
    corpus = MmCorpus(args.corpus[0])
    if args.use_tfidf:
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]
    dictionary = Dictionary.load_from_text(args.dictionary[0])
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary, mds=args.method)
    if args.save_to_file is not None:
        pyLDAvis.save_html(vis, args.save_to_file)
    else:
        pyLDAvis.show(vis)
Example #36
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import gensim
import csv
import logging

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

tweets = []
with open('data/clear_covid_tweets.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for tweet in reader:
        tweets.append(tweet['text'].split(' '))

dictionary = Dictionary(tweets)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    LdaMallet(path_to_mallet_binary,
              corpus=corpus,
              num_topics=50,
              id2word=dictionary))

vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'lda.html')
Example #37
def get_vis(model,corpus,dictionary):
    vis=pyLDAvis.gensim.prepare(model,corpus,dictionary)
    pyLDAvis.display(vis)
    pyLDAvis.save_html(vis,configuration.lda_dir + 'lda_visualization_test.html')
Example #38
def main(root_path):
    timeStamp = str(int(time()))
    # todo change this for full run
    num = 1000  # 128915 is the total
    out_file_name = '../out/output-' + timeStamp + "-" + str(num) + '.txt'
    out_file = open(out_file_name, 'w')

    start = time()
    spark = init_spark()
    json_files = read_json_files(root_path, spark, num)
    data = get_body_text(spark, json_files)
    print("data reading done")

    # clean the data
    word_clean_up_F = F.udf(lambda x: clean_up(x), StringType())
    data = data.withColumn("body_text_cleaned", word_clean_up_F("body_text"))
    data = data.select("body_text_cleaned")
    print("data processing done")

    tokenizer = Tokenizer(inputCol="body_text_cleaned", outputCol="words")
    token_DataFrame = tokenizer.transform(data)
    token_DataFrame = token_DataFrame.select("words")

    # Remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned_DataFrame = remover.transform(token_DataFrame)
    cleaned_DataFrame = cleaned_DataFrame.select('filtered')

    # Count vectorizer
    cv_tmp = CountVectorizer(inputCol="filtered", outputCol="count_features")
    cvmodel = cv_tmp.fit(cleaned_DataFrame)
    count_dataframe = cvmodel.transform(cleaned_DataFrame)
    count_dataframe = count_dataframe.select('count_features')

    # TF-IDF Vectorizer
    tfidf = IDF(inputCol="count_features", outputCol="features")
    tfidfmodel = tfidf.fit(count_dataframe)
    tfidf_dataframe = tfidfmodel.transform(count_dataframe).select("features")

    print("Ready to fit with the LDA model")
    # Fit the LDA Model
    num_topics = 5
    max_iterations = 20
    lda_start = time()
    lda = LDA(seed=1, optimizer="em", k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(tfidf_dataframe)
    lda_transformed = lda_model.transform(tfidf_dataframe)
    lda_end = time()
    print("LDA complete")
    # joblib.dump(lda_model, 'lda.csv')

    # Get terms per topic
    topics = lda_model.topicsMatrix()
    vocabArray = cvmodel.vocabulary

    wordNumbers = 15  # number of words per topic
    topicIndices = lda_model.describeTopics(maxTermsPerTopic=wordNumbers).rdd.map(tuple)

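    # topic_render (a helper defined elsewhere in this project) maps term indices back to vocabulary words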
    topics_final = topicIndices.map(lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()

    for topic in range(len(topics_final)):
        print("Topic " + str(topic) + ":")
        print("Topic " + str(topic) + ":", file=out_file)
        print(topics_final[topic])
        print(topics_final[topic], file=out_file)

    print("Full runtime : {} min. ".format((time() - start) / 60))
    print("LDA runtime : {} min. ".format((lda_end - lda_start) / 60))
    print("Check" + out_file.name)

    cleaned_DataFrame.cache()
    lda_transformed.cache()

    # Data Visualization
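    # format_data_to_pyldavis and filter_bad_docs are helpers defined elsewhere in this project;
    # they convert the Spark output into the keyword dict pyLDAvis.prepare expects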
    data = format_data_to_pyldavis(cleaned_DataFrame, cvmodel, lda_transformed, lda_model)
    print("Preparing data with pyLDAvis ...")
    filter_bad_docs(data)
    py_lda_prepared_data = pyLDAvis.prepare(**data)
    file_name = '../out/data-viz-' + timeStamp + '.html'
    print("Saving pyLDAvis html page ...")
    pyLDAvis.save_html(py_lda_prepared_data, file_name)
    pyLDAvis.show(py_lda_prepared_data)
    spark.stop()
Example #39
corpus, keywords = preprocess_data(df)
print(len(corpus))
#print(corpus)

start, stop, step = 2, 12, 1
maximum = plot_graph(corpus, start, stop, step)
print(maximum)

# LSI Model
print("LSI Model")
number_of_topics = maximum
words = 15
#document_list,titles=load_data("","articles.txt")
#clean_text=preprocess_data(document_list)
model = create_gensim_lsa_model(corpus, number_of_topics, words)

# LDA Model
print("\n LDA Model")
number_of_topics = maximum
words = 15
#document_list,titles=load_data("","articles.txt")
#clean_text=preprocess_data(document_list)
lda_model, dictionary, corpus_out = create_gensim_lda_model(
    corpus, number_of_topics)

#print(keywords)
vis_file = open("full_lak_lda_vis.html", "w")
vis = pyLDAvis.gensim.prepare(lda_model, corpus_out, dictionary)
#pyLDAvis.display(vis)
pyLDAvis.save_html(vis, vis_file)
Example #40
def main():
	
	t = time()

	parser = argparse.ArgumentParser(description='Process some integers.')
	parser.add_argument('--path', required=False,default=".", type=str)
	args = parser.parse_args()
	path = args.path
	
	""" ------------------------------------------------- Bag of words -------------------------------------------------------- """


	with open(path+'\\documents.pkl','rb')as f:
		documents = pkl.load(f)	

	dictionary = gensim.corpora.Dictionary(documents)

	dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

	bow_corpus = [dictionary.doc2bow(doc) for doc in documents]


	""" ---------------------------------------- Coherence Values and Num Topics Graph ---------------------------------------- """

	def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):

		coherence_values = []
		model_list = []
		for num_topics in range(start, limit, step):
			print("Working on next model, num_topics =", num_topics, "...")
			model = gensim.models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=3, workers=3, random_state=0)
			model_list.append(model)
			# Despite the function name, each model is scored by its log perplexity bound;
			# the commented CoherenceModel line below is the actual coherence alternative.
			score = model.log_perplexity(corpus)
			#score = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_npmi').get_coherence()
			print("Perplexity: ", score)
			coherence_values.append(score)

		return model_list, coherence_values

	# Can take a long time to run.
	print("Computing coherence values...")
	model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=documents, start=2, limit=40, step=6)

	# Save graph
	limit=40; start=2; step=6;
	x = range(start, limit, step)
	plt.plot(x, coherence_values)
	plt.xlabel("Num Topics")
	plt.ylabel("Perplexity score")
	plt.legend(["perplexity"], loc='best')
	print('Time so far: {} mins'.format(round((time() - t) / 60, 2)))
	plt.savefig(path+'\\perplex.png')

	

	""" --------------------------------------------------- LDA -------------------------------------------------------------- """

	
	print("\nWorking on simple LDA num_topics=7, passes=3...")


	lda_model_bow = gensim.models.LdaModel(bow_corpus, num_topics=7, id2word=dictionary, passes=3, random_state=0)

	f = open(path+"\\stats.txt",'w')
	
	for idx, topic in lda_model_bow.print_topics(-1):
		print('Topic: {} \nWords: {}'.format(idx, topic))
		f.write(str('\nTopic: {} \nWords: {}'.format(idx, topic)))
	

	print('\nPerplexity: ', lda_model_bow.log_perplexity(bow_corpus))
	f.write('\nPerplexity: '+str(lda_model_bow.log_perplexity(bow_corpus)))
	f.write('\n')

	"""
	coherence_model_lda = CoherenceModel(model=lda_model_bow, texts=documents, dictionary=dictionary, coherence='c_v')
	coherence_lda = coherence_model_lda.get_coherence()
	print('\nCoherence Score: ', coherence_lda)
	f.write('\nCoherence Score: '+str(coherence_lda))
	"""

	f.close()
	print('\nworking on topic visualization')
	
	vis = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
	pyLDAvis.save_html(vis,path+'\\LDA_visualized.html')

	print('Time for this WHOLE thing: {} mins'.format(round((time() - t) / 60, 2)))
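
# NOTE (addition, not in the source): compute_coherence_values() above returns model_list together
# with each model's log_perplexity bound. To pick a model from such a sweep automatically, one
# rough option is to take the highest bound; gensim's log_perplexity is a per-word likelihood
# bound, so a higher (less negative) value corresponds to a lower perplexity.
def select_best_model(model_list, scores, start=2, step=6):
	best_index = max(range(len(scores)), key=lambda i: scores[i])
	print("Best num_topics by perplexity bound:", start + best_index * step)
	return model_list[best_index]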
Пример #41
    id2word.save('dictionary.gensim')
    #generate corpus with doc2bow
    corpus = [id2word.doc2bow(text) for text in cleandoc]
    
    #generate LDA model with the chosen number of topics (i)
    print("generating LDA model")
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=i, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)
    
    print("print topics")
    print(lda_model.show_topics())
    lda_model.save(str(i)+'topicLDAModel.gensim')


    #generate visualization
    #pyLDAvis.enable_notebook()   # only needed when running inside a Jupyter notebook
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    print("Saving interactive visualization; open the html file in a web browser")
    #pyLDAvis.display(vis)        # display() renders only inside a notebook, not in a plain script
    pyLDAvis.save_html(vis, visavestr)
print("--- %s seconds ---" % (time.time() - start_time))

   
  
    
    #Plot topic labels and terms labels separately to have different colours
    g = G.subgraph([topic for topic, _ in pos.items() if topic in t])
    nx.draw_networkx_labels(g, pos, font_size=20, font_color='r')
    #If network graph is difficult to read, don't plot ngrams titles.
    #g = G.subgraph([term for term, _ in pos.items() if str(term) not in t])
    #nx.draw_networkx_labels(g, pos, font_size=12, font_color='orange')
    #Plot edges
    nx.draw_networkx_edges(G, pos, edgelist=G.edges(), alpha=0.3)
    #Save the figure before calling plt.show(); saving after show() can produce a blank image on some backends.
    plt.axis('off')
    plt.savefig('/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/TopicNetwork'+num+'.png', bbox_inches='tight')
    plt.show(block=False)

graph_terms_to_topics(lda, num_terms=num_top) 


#Create interactive graph to examine top 30 ngrams in each topic.
#Use pyLDAvis to visualize the topics in a network using 
#   Jensen-Shannon divergence as metric of distance between the topics.
import pyLDAvis.gensim as gensimvis
import pyLDAvis
#Create data to visualize.
vis_data = gensimvis.prepare(lda, corpus, dictionary)
#pyLDAvis.display(vis_data)
#Use vis_data "prepared" in earlier step.
#Now display the visualization in a local server page. 
#pyLDAvis.show(vis_data) 
#Save the visualization to an html file.
pyLDAvis.save_html(vis_data, '/Users/Marcia/OneDrive/UNCC General/DSBA_6880/Misc_Analysis_Files/ClaimsInteractVis'+num+'.html')
 def save_lda_vis_as_html(self, filename="./pyldavis_output.html", method=None):
     if method is None:
         vis = pyLDAvis.gensim.prepare(self.model.lda, self.data.corpuses, self.model.dictionary, n_jobs=1, sort_topics=False)
     else:
         vis = pyLDAvis.gensim.prepare(self.model.lda, self.data.corpuses, self.model.dictionary, n_jobs=1, mds=method, sort_topics=False)
     pyLDAvis.save_html(vis, filename)
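
# NOTE (assumed usage, not in the source): the mds argument forwarded above selects pyLDAvis's
# dimension-reduction method for the inter-topic distance map; pyLDAvis accepts 'pcoa' (the
# default), 'mmds' and 'tsne'. Assuming an object called `analyzer` exposes this method:
#
#   analyzer.save_lda_vis_as_html("./pyldavis_pcoa.html")                  # default PCoA layout
#   analyzer.save_lda_vis_as_html("./pyldavis_tsne.html", method="tsne")   # t-SNE layout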
Пример #44
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    #pprint(texts)

    dictionary = corpora.Dictionary(texts)
    dictionary.save('/tmp/trends.dict')  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/trends.mm', corpus)  # store to disk, for later use


preprocessdocuments()
dictionary = corpora.Dictionary.load('/tmp/trends.dict')
corpus = corpora.MmCorpus('/tmp/trends.mm')
tfidf = models.TfidfModel(corpus)
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=3)
print('Topics: ')
print(model.print_topics(3, 3))

vis_data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'e.html')

# print('Test: ')
# print(model[tfidf[dictionary.doc2bow(['smartes', 'armband', 'fitnessarmband', 'dienen', 'sms', 'emails', 'anzeigen'])]])
# print(model[tfidf[dictionary.doc2bow(['verfolgen', 'produktion', 'sicherstellen', 'richtigen', 'kunden', 'kunde', 'tag'])]])
# print(model[tfidf[dictionary.doc2bow(['sunpartner','transparente','solarfolie','entwickelt'])]])
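
# NOTE (addition, not in the source): the commented tests above feed tfidf-weighted vectors into a
# model that was trained on plain bag-of-words counts. To score a new document consistently with
# how the model was trained, query it with a plain doc2bow vector, for example:
new_doc = ['smartes', 'armband', 'fitnessarmband', 'dienen', 'sms', 'emails', 'anzeigen']
print(model[dictionary.doc2bow(new_doc)])   # list of (topic id, probability) pairs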
Пример #45
plt.tight_layout()

# save as png (before show(), so the saved figure is not blank when run as a plain script)
plt.savefig('work/wordcloud.png')
plt.show()

# %%
# Vis PCoA
vis_pcoa = pyLDAvis.gensim.prepare(lda_model,
                                   corpus,
                                   dictionary,
                                   sort_topics=False)
vis_pcoa

# save as html
pyLDAvis.save_html(vis_pcoa, 'work/pyldavis_output_pcoa.html')

# %%
data = []
for c, words, fileName, title, category in zip(corpus, df['words'], df['file'],
                                               df['title'], df['category']):
    topics = []
    for topic, score in lda_model[c]:
        if (score > 0.7):
            topics.append(str(topic))
    data.append([fileName, title, category, ','.join(topics)])

df_topic = pd.DataFrame(data, columns=['file', 'title', 'category', 'topics'])
df_topic.head()
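
# NOTE (addition, not in the source): the threshold loop above can also be written with gensim's
# get_document_topics(), which applies the probability cut-off directly, e.g.:
# topics = [str(topic) for topic, score in lda_model.get_document_topics(c, minimum_probability=0.7)]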

# %%