Example #1
    def visualize_topics(self, notebook_mode: bool = False, mds: str = 'pcoa'):
        """
        Visualize important topics based on the decomposition.

        Parameters
        ----------
        mds : str, optional (default='pcoa')
            2D Decomposition. Allowed values:

            * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
            * ``'mmds'`` - Dimension reduction via Multidimensional scaling
            * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
        """

        if not isinstance(self.comp, LatentDirichletAllocation):
            raise ValueError('only support lda_topic_modelling()')

        import pyLDAvis
        import pyLDAvis.sklearn

        if notebook_mode:
            pyLDAvis.enable_notebook()

        prepared_vis_data = pyLDAvis.sklearn.prepare(self.comp,
                                                     self._vectors,
                                                     self.vectorizer,
                                                     mds=mds)
        if notebook_mode:
            return prepared_vis_data
        else:
            pyLDAvis.show(prepared_vis_data)
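A minimal way to call the method above (the lda_topic_modelling() helper named in its error message is assumed to construct the surrounding object; it is not shown here):

    model = lda_topic_modelling(corpus)                      # hypothetical constructor from the same library
    model.visualize_topics(notebook_mode=True, mds='tsne')   # in a notebook: returns the prepared pyLDAvis data
    model.visualize_topics(mds='pcoa')                       # in a script: serves the visualization via pyLDAvis.show()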
Example #2
def LDAvisualization(lda, X_train, vectorizer):
    # Creates an HTML document that graphically shows
    # the performance of an LDA model.
    import pyLDAvis
    import pyLDAvis.sklearn

    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(lda, X_train, vectorizer, mds='tsne')
    pyLDAvis.save_html(panel, "./ldavis_prepared.html")
Example #3
def files_10():
    import pandas as pd
    import gensim
    import pyLDAvis
    import pyLDAvis.gensim

    # Change number to select file
    count = 7
    data = pd.read_csv('file' + str(count) + '.csv')
    df = pd.DataFrame(data)
    result = []
    for index, c in df.iterrows():
        # the column header is quoted in the CSV file, e.g. "QUESTION7"
        result.append(preprocess(c['"QUESTION' + str(count) + '"']))
    print(result)
    text = 'Question ' + str(count)
    # start with an empty results file; topics are appended below
    with open(text, "w") as result_file:
        result_file.write('')
    dictionary = gensim.corpora.Dictionary(result)
    bow_corpus = [dictionary.doc2bow(doc) for doc in result]
    bow_doc_x = bow_corpus[0]
    for i in range(len(bow_doc_x)):
        print("Word {} (\"{}\") appears {} time.".format(
            bow_doc_x[i][0], dictionary[bow_doc_x[i][0]], bow_doc_x[i][1]))
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=6,
                                           id2word=dictionary,
                                           passes=10,
                                           workers=2,
                                           per_word_topics=True)

    for idx, topic in lda_model.print_topics(-1):
        with open(text, "a") as result_file:
            result_file.write("Topic: {} \nWords: {}".format(idx, topic) +
                              "\n")
    vis = pyLDAvis.gensim.prepare(topic_model=lda_model,
                                  corpus=bow_corpus,
                                  dictionary=dictionary)
    pyLDAvis.enable_notebook()
    pyLDAvis.show(vis)
Example #4
    def visualize_LDA_model(self):
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(self.LDA_model, self.corpus,
                                      self.id2word)
        self.plot = vis
        return vis
Example #5
def topic_visual(best_lda_model, data_vectorized, vectorizer):
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(best_lda_model,
                                     data_vectorized,
                                     vectorizer,
                                     mds='tsne')
    pyLDAvis.show(panel)
Example #6
    def view_clusters(self):
        '''
        Build an LDA model with the configured number of topics and
        return a pyLDAvis visualization of the resulting topic clusters.
        '''
        if self.number_of_topics is None:
            print('Error: Number of topics not set.')
            print('Set number of topics with [object].set_number_of_topics(X)')
            return
        self.id2word = hf.create_id2word(self.texts)
        self.corpus = hf.create_corpus(self.id2word, self.texts)

        clusters = self.number_of_topics

        # Build LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=self.corpus,
                                                    id2word=self.id2word,
                                                    num_topics=clusters,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)

        # Display clusters
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model, self.corpus, self.id2word)
        pyLDAvis.display(vis)
        return vis
Example #7
File: lab4.py  Project: UppsalaIM/2IS060
def visualize_lda_model():
    data = preprocess_to_lemmatization()
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    my_stopwords = stopwords.words(
        'english') + stopwords_verbs + stopwords_other
    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens if token.isalpha() and token.lower()
        not in my_stopwords and len(token) > 1
    ])
    tokens = data['tokens'].tolist()
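    # learn two-word collocations first, then three-word collocations on top of the bigram output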
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      passes=4, alpha=[0.01]*num_topics, \
                                      eta=[0.01]*len(dictionary_LDA.keys()))
    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
Example #8
def visualize():
    # just for later
    import pyLDAvis
    import pyLDAvis.gensim
    vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(vis)
Example #9
    def plot_pyLDAvis(self):
        import pyLDAvis.gensim
        import pickle
        import pyLDAvis
        # Visualize the topics
        pyLDAvis.enable_notebook()
        LDAvis_prepared = pyLDAvis.gensim.prepare(self.lda_nounAdj, self.corpus_NounAdj, self.id2word_nounAdj)
        return LDAvis_prepared
Example #10
def visLDAIPython(model, data, vectorizer, ip, port):
    pyLDAvis.enable_notebook()
    # https://github.com/bmabey/pyLDAvis/issues/69
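    # sort_topics=False keeps pyLDAvis topic numbers aligned with the model's own topic ids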
    visData = pyLDAvis.sklearn.prepare(model,
                                       data,
                                       vectorizer,
                                       mds='mmds',
                                       sort_topics=False)
    pyLDAvis.show(visData, ip=ip, port=port)
Example #11
def visualize_lda(model, corpus, dictionary):
    """returns the pyLDAvis PreparedData given model, corpus, dictionary"""
    """Could pickle this to save it"""
    """pyLDAvis.save_html(vis, "filename") also works to export in html"""
    pyLDAvis.enable_notebook()
    t0 = time.time()
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    print('{} seconds'.format(time.time() - t0))
    return vis
Example #12
def visualize_topics(model, corpus, id2word, cv):
    # Wrap the scikit-learn CountVectorizer vocabulary in a gensim Dictionary
    # so pyLDAvis.gensim can map token ids back to words.
    d = corpora.Dictionary()
    word2id = dict((k, v) for k, v in cv.vocabulary_.items())
    d.id2word = id2word
    d.token2id = word2id

    pyLDAvis.enable_notebook()
    visualization = pyLDAvis.gensim.prepare(model, corpus, d)

    return visualization
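Example #12 assumes the gensim model was trained on a corpus derived from a scikit-learn CountVectorizer. A minimal sketch of how corpus, id2word and cv could be produced (the sample documents and variable names are illustrative assumptions, not part of the original snippet):

    from gensim import matutils, models
    from sklearn.feature_extraction.text import CountVectorizer

    raw_documents = ["the cat sat on the mat", "dogs and cats are friendly pets"]
    cv = CountVectorizer(stop_words='english')
    X = cv.fit_transform(raw_documents)                          # document-term matrix
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)  # gensim-style corpus
    id2word = {i: w for w, i in cv.vocabulary_.items()}          # word id -> token mapping
    model = models.LdaModel(corpus, num_topics=2, id2word=id2word)
    vis = visualize_topics(model, corpus, id2word, cv)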
Example #13
def visualise_lda_topics(lda_model, corpus, id2word):
    '''
    Visualizes the topics for Gensim's LDA implementation
    :param lda_model: trained gensim LDA model
    :param corpus: bag-of-words corpus the model was trained on
    :param id2word: gensim dictionary mapping word ids to tokens
    :return: visualisation
    '''
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    return vis
Example #14
def visualize(ldamodel, doc_term_matrix, dictionary):
    import pyLDAvis
    import pyLDAvis.gensim
    try:
        pyLDAvis.enable_notebook()
    except Exception:
        print('not in jupyter notebook')

    viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

    pyLDAvis.save_html(viz, 'TM_viz50Com.html')

    return viz
Example #15
def get_visualization(top_dir, nr_samples, nr_topics):
    saved_model = SavedLdaModel(top_dir, nr_samples, nr_topics)

    lda_model = saved_model.get_model()
    corpus = saved_model.get_corpus()
    id2word = saved_model.get_dict()

    # Visualize the topic
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

    return vis
Example #16
def showPyLDAvisNB(allDict, numTopics=30):
    # TODO: see if we can get ngrams into pyLDAvis

    dataTuple = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    data = pyLDAvis.gensim.prepare(dataTuple[0], dataTuple[1], dataTuple[2])
    output_notebook()
    pyLDAvis.enable_notebook(True)
    p = pyLDAvis.display(data, template_type='general')
    plt.tight_layout()

    display(p)
    return
Example #17
def LDA(doc_term_matrix):
    # Creating the object for LDA model using gensim library
    Lda = gensim.models.LdaMulticore
    # Running and training the LDA model on the document-term matrix.
    ldamodel = Lda(doc_term_matrix, num_topics=25, id2word = dictionary, passes=50, workers=4)
    ldamodel.save("ldamodel_sample")
    # Load a potentially pretrained model from disk.
    ldamodel = gensim.models.LdaMulticore.load("ldamodel_sample")
    pprint(ldamodel.print_topics(num_topics=15, num_words=5))
    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
    pyLDAvis.save_html(vis,fileobj='visuals.html')
Example #18
def visualise_ldamallet_topics(dataset, alpha, num_topic):
    '''
    Extracts relevant information from ldamallet's LDA model and visualizes the topics with Gensim's LDA visualisation
    :return: visualisation
    '''
    ldamallet_dir = 'data/topic_models/basic/{}_alpha{}_{}/ldamallet'.format(
        dataset, alpha, num_topic)  # e.g. Semeval_alpha50_20
    convertedLDAmallet = convertLDAmallet(dataDir=ldamallet_dir,
                                          filename='state.mallet.gz')
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.prepare(**convertedLDAmallet)
    # pyLDAvis.display(vis)
    return vis
Example #19
def visualize_model(model, corpus, id2word):
    """
    Parameters:
        - `model`
            a gensim LDA model
        - `corpus`
            the corpus on which the model was trained
        - `id2word`
            the dictionary on which the model was trained
    
    Returns: a pyLDAvis visualization
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim.prepare(model, corpus, id2word, mds='mmds')
Example #20
def plot_date_model(lda_model, date="2018-12-25"):
    df = result[result["created_at"] == date]
    
    df = df.dropna()
    xs = [df["cleaned_text"].iloc[i].split() for i in range(df.shape[0])]
    
    id2word = corpora.Dictionary(xs)
    texts = xs
    corpus = [id2word.doc2bow(text) for text in texts]
    
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    
    return vis
Example #21
def visualize():
	lda_model, corpus, data_lemmatized, dictionary = train()

	# Perplexity
	print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is; lower is better.

	# Coherence score
	coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
	coherence_lda = coherence_model_lda.get_coherence()
	print('\nCoherence Score: ', coherence_lda)

	# Visualize the topics
	pyLDAvis.enable_notebook()
	vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
	return vis
Example #22
def showPyLDAvis(allDict, notebook=True, numTopics=30):
    # TODO: see if we can get ngrams into pyLDAvis

    dataTuple = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    data = pyLDAvis.gensim.prepare(dataTuple[0], dataTuple[1], dataTuple[2])
    if notebook == True:
        output_notebook()
        pyLDAvis.enable_notebook(True)
        p = pyLDAvis.display(data, template_type='general')
        display(p)
    else:
        output_file("pyDAVis.html")
        # pyLDAvis.show() serves the visualization in its own browser window
        pyLDAvis.show(data)
    return
Example #23
def lda(df, n_topics=5, lda_str='all'):
    all_words = []
    for text in df['text']:
        all_words.append(text)

    # Create dictionary and corpus
    word2num = cp.Dictionary(all_words)
    texts = all_words

    # Get term frequency
    corpus = [word2num.doc2bow(text) for text in texts]

    lda_model = gs.models.LdaMulticore(corpus=corpus,
                                       id2word=word2num,
                                       num_topics=n_topics)
    doc_lda = lda_model[corpus]

    print('\nTopics')
    print(lda_model.print_topics())

    print('\nScores')
    for i in range(0, len(corpus), 500):
        for index, score in sorted(lda_model[corpus[i]],
                                   key=lambda tup: -1 * tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(
                score, lda_model.print_topic(index, 10)))

    # prepare the visualization and export it to HTML
    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, word2num)
    pyLDAvis.save_html(LDAvis_prepared,
                       './html/{}_lda_n{}.html'.format(lda_str, n_topics))
Example #24
def lda_vis(lda_model, corpus, dictionary):

    # visualize the topics and words

    import pyLDAvis
    import pyLDAvis.gensim  # don't skip this
    import matplotlib.pyplot as plt
    #%matplotlib inline

    pyLDAvis.enable_notebook()

    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

    return vis
Example #25
def topicModeling(corpus, dictionary, texts):

    ldamodel = LdaModel(corpus=corpus,
                        num_topics=3,
                        id2word=dictionary,
                        passes=5)

    x = ldamodel.show_topics()  #show generated topics

    #----------------------------------------------------------
    dominant_rows = []

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                dominant_rows.append(
                    [int(topic_num), round(prop_topic, 4), topic_keywords])
            else:
                break
    # DataFrame.append was removed in pandas 2.0, so build the frame from a list of rows
    sent_topics_df = pd.DataFrame(dominant_rows,
                                  columns=['Dominant_Topic',
                                           'Perc_Contribution',
                                           'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    #-------Generate Visualization------------------------------

    pyLDAvis.enable_notebook()

    topicModel = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    pyLDAvis.save_html(
        topicModel,
        '/Users/[email protected]/Documents/projects/PEM/elon.html')

    pyLDAvis.show(topicModel)

    return x, sent_topics_df
Example #26
    def visualize(self):
        import pyLDAvis
        import pyLDAvis.gensim
        try:
            pyLDAvis.enable_notebook()
        except Exception:
            print('not in jupyter notebook')
            
        start = time()

        self.viz = pyLDAvis.gensim.prepare(self.ldamodel, self.doc_term_matrix, self.dictionary)

        print ('used: {:.2f}s'.format(time()-start))
        print ('saving viz to '+self.name+'_viz.html')
        
        pyLDAvis.save_html(self.viz, self.name+'_viz.html')
        
        return self.viz
Example #27
def visualize_topics(id2word, corpus, lda_model, path='./', num_topics=10):
    print('Creating visualization html at {}'.format(path))
    #visualizing topics
    pyLDAvis.enable_notebook()
    LDAvis_data_filepath = os.path.join('{}ldavis_prepared_{}topics'.format(
        path, str(num_topics)))
    #this is a bit time consuming
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    #load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(
        LDAvis_prepared,
        '{}ldavis_prepared_{}topics{}'.format(path, str(num_topics), '.html'))
    return LDAvis_prepared
Example #28
def main():
	# Training data preprocessing
	docs, asin_list, test_docs = read_content('../data/product_description_complete.tsv')
	docs = tokenize(docs)
	docs = lemmatize(docs)
	docs = compute_bigrams(docs)
	dictionary = remove_rare_common_words(docs)
	corpus = vectorize(dictionary, docs)

	# Train model
	(model,id2word) = train(dictionary,corpus,17,docs)

	# Print topics
	for i in range(17):
		topics = model.show_topic(i)
		print(i,[topic[0] for topic in topics])


	# Testing data preprocessing
	test_docs = tokenize(test_docs)
	test_docs = lemmatize(test_docs)
	test_docs = compute_bigrams(test_docs)
	test_dictionary = remove_rare_common_words(test_docs)
	test_corpus = vectorize(test_dictionary, test_docs)

	# Write predicted results
	i = 0
	with open('../results/product_description_complete.tsv', 'wt') as tsvfile:
		writer = csv.writer(tsvfile, delimiter='\t')
		writer.writerow(["asin", "topic_distribution"])
		for c in test_corpus:
			writer.writerow([asin_list[i], model[c]])
			i += 1

	# Visualize the topics (the following code can only be run in a notebook)
	pyLDAvis.enable_notebook()
	LDAvis_prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
	LDAvis_prepared




if __name__ == '__main__':
	main()
Example #29
def build_topic_model_dict(text_array):
    global dictionary
    dictionary = corpora.Dictionary(text_array)

    global doc_term_matrix
    doc_term_matrix = [dictionary.doc2bow(rev) for rev in text_array]

    # creating the object for LDA model using gensim library
    LDA = gensim.models.ldamodel.LdaModel

    #build the model
    global lda_model
    lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=7,
                    random_state=100, chunksize=1000, passes=50)

    #print topics
    #lda_model.print_topics()

    pyLDAvis.enable_notebook()
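Example #29 stops right after enable_notebook(); a natural continuation inside the same function, following the other gensim examples on this page (this step is an assumption, not part of the original snippet), would be:

    vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
    return vis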
Example #30
    def visualize_topics(self, notebook_mode = False, mds = 'pcoa'):
        """
        Visualize important topics based on the decomposition.

        Parameters
        ----------
        mds : str, optional (default='pcoa')
            2D Decomposition. Allowed values:

            * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
            * ``'mmds'`` - Dimension reduction via Multidimensional scaling
            * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
        """
        if not isinstance(mds, str):
            raise ValueError('mds must be a string')
        if not isinstance(notebook_mode, bool):
            raise ValueError('notebook_mode must be a boolean')
        try:
            import pyLDAvis
            import pyLDAvis.sklearn
        except ImportError:
            raise Exception(
                'pyldavis not installed. Please install it and try again.'
            )

        if notebook_mode:
            pyLDAvis.enable_notebook()

        vis_data = _prepare_topics(
            self._doc_embed,
            self._topic_embed,
            self._word_embed,
            np.array(self._features),
            doc_lengths = self._doc_len,
            term_frequency = self._freqs,
            normalize = True,
        )
        prepared_vis_data = pyLDAvis.prepare(**vis_data)
        if notebook_mode:
            return prepared_vis_data
        else:
            pyLDAvis.show(prepared_vis_data)
ntvf = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=0.95, ngram_range=(1,1), sublinear_tf=True)
ntvf_features = ntvf.fit_transform(negative_reviews)
# view feature set dimensions
print(ptvf_features.shape, ntvf_features.shape)


# # Topic Modeling on Reviews

# In[4]:

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import NMF
import topic_model_utils as tmu

pyLDAvis.enable_notebook()
total_topics = 10


# ## Display and visualize topics for positive reviews

# In[5]:

# build topic model on positive sentiment review features
pos_nmf = NMF(n_components=total_topics, 
          random_state=42, alpha=0.1, l1_ratio=0.2)
pos_nmf.fit(ptvf_features)      
# extract features and component weights
pos_feature_names = ptvf.get_feature_names()
pos_weights = pos_nmf.components_
# extract and display topics and their components