示例#1
0
def visualize():
    """Prepare and display the pyLDAvis view of the module-level LDA model.

    Reads the module-level names ``lda_model``, ``corpus`` and
    ``dictionary_LDA``.

    Returns
    -------
    The object produced by ``pyLDAvis.display``. It is returned (instead of
    being discarded) so that a notebook cell ending with ``visualize()``
    actually renders it.
    """
    # just for later
    import pyLDAvis
    import pyLDAvis.gensim

    vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(vis)
示例#2
0
    def view_clusters(self):
        '''
        Fit an LDA model on ``self.texts`` and show it with pyLDAvis.

        ``self.id2word`` and ``self.corpus`` are rebuilt from ``self.texts``
        through the ``hf`` helpers, a gensim ``LdaModel`` with
        ``self.number_of_topics`` topics is trained on them, and the
        pyLDAvis data prepared from that model is returned.

        When the number of topics has not been set, two hint lines are
        printed and ``None`` is returned.
        '''
        if self.number_of_topics is None:
            print('Error: Number of topics not set.')
            print('Set number of topics with [object].set_number_of_topics(X)')
            return

        self.id2word = hf.create_id2word(self.texts)
        self.corpus = hf.create_corpus(self.id2word, self.texts)

        topic_count = self.number_of_topics

        # Train the topic model
        trained_model = gensim.models.ldamodel.LdaModel(
            corpus=self.corpus,
            id2word=self.id2word,
            num_topics=topic_count,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )

        # Show the topics in the notebook
        pyLDAvis.enable_notebook()
        prepared = pyLDAvis.gensim.prepare(trained_model, self.corpus, self.id2word)
        pyLDAvis.display(prepared)
        return prepared
示例#3
0
    def visualize(self, mds='pcoa'):
        """
        Prepare the pyLDAvis visualization of the LDA model.

        see: https://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb#topic=8&lambda=1&term=
        paper: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

        Parameters
        ----------
        mds: str
            scaling function, forwarded to ``pyLDAvis.gensim.prepare``
            valid options are ['pcoa', 'mmds', 'tsne']

        Returns
        -------
        The prepared pyLDAvis data built from ``self.model``, ``self.corpus``
        and ``self.dictionary``. Rendering it is left to the notebook (see
        the printed reminder).
        """
        import pyLDAvis
        import pyLDAvis.gensim

        print("Make sure you have pyLDAviz imported in the notebook:\n\n"
              "import pyLDAvis\n"
              "pyLDAvis.enable_notebook()\n")

        ldavis = pyLDAvis.gensim.prepare(self.model,
                                         self.corpus,
                                         self.dictionary,
                                         mds=mds)
        # The former ``pyLDAvis.display(ldavis)`` call was dropped: its result
        # was discarded, so it only rendered HTML that nobody received.
        return ldavis
def pylda_visualize(csv_chemin, ecriture_chemin, tfidf_visualization = False, num_topic=3, filter_by_cluster=None):
    """Train an LDA model on clustered texts and write its pyLDAvis page.

    Parameters
    ----------
    csv_chemin : str
        Path of a CSV file with a ``pred_cluster`` column and a ``text``
        column.
    ecriture_chemin : str
        Output path prefix. The files ``<prefix>.mm`` (bag-of-words corpus),
        ``<prefix>.dict``, ``<prefix>.model`` and ``<prefix>.html`` are
        written.
    tfidf_visualization : bool
        When true, documents are fed to LDA as ``(word_id, tfidf_weight)``
        pairs instead of ``(word_id, count)`` pairs.
    num_topic : int
        Number of topics to extract.
    filter_by_cluster : optional
        Cluster index; when given (0 included), only rows of that cluster
        are used.

    Returns
    -------
    The object produced by ``pyLDAvis.display`` for the prepared data.
    """
    clustering_result_df = pd.read_csv(csv_chemin)
    # ``is not None`` so that cluster 0 can be selected, and the filtered
    # frame is actually kept (the selection used to be thrown away).
    if filter_by_cluster is not None:
        clustering_result_df = clustering_result_df[
            clustering_result_df['pred_cluster'] == filter_by_cluster]

    cleaned_texts = clustering_result_df['text'].apply(clean)
    docs = pd.DataFrame(list(map(load_doc, enumerate(list(cleaned_texts)))))

    # dictionary : keys = word_id ; value = word
    # corpus[i] = list of (word_id, count) tuples for document i
    dictionary, corpus = prep_corpus(docs['tokens'])

    if tfidf_visualization:
        # Applying the TF-IDF model to a document yields its
        # (word_id, tfidf_weight) pairs directly.
        tfidf = TfidfModel(corpus)
        model_corpus = [tfidf[document] for document in corpus]
    else:
        model_corpus = corpus

    # The serialized corpus is the raw bag-of-words one in both modes.
    MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
    dictionary.save(ecriture_chemin + '.dict')

    lda = models.ldamodel.LdaModel(corpus=model_corpus, id2word=dictionary,
                                   num_topics=num_topic, passes=10)
    lda.save(ecriture_chemin + '.model')

    vis_data = gensimvis.prepare(lda, model_corpus, dictionary)
    pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')
    return pyLDAvis.display(vis_data)
示例#5
0
 def display_data(self):
     """Load the saved LDA artifacts, dump the prepared data and display it.

     Loads the model from ``self.lda_model_filepath``, the corpus from
     ``self.trigram_bow_filepath`` and the dictionary from
     ``self.trigram_dictionary_filepath``, prepares the pyLDAvis data and
     writes its ``str()`` form to ``self.LDAvis_data_filepath``.

     Returns the object produced by ``pyLDAvis.display``.
     """
     lda = LdaMulticore.load(self.lda_model_filepath)
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
     LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                               trigram_dictionary)
     # NOTE(review): this stores the str() of the prepared data, which cannot
     # be loaded back; pyLDAvis.save_json may be what was intended - confirm.
     with open(self.LDAvis_data_filepath, 'w') as f:
         f.write(str(LDAvis_prepared))
     # Display the in-memory data. The old code re-opened the dump and passed
     # the (already closed) file handle to pyLDAvis.display.
     return pyLDAvis.display(LDAvis_prepared)
def ldavis_create(lda,
                  corpus,
                  gensim_dict,
                  LDAvis_data_filepath=fpathroot + fpathappend + '_lda_vis',
                  return_ldavis=False):
    """Prepare pyLDAvis data, pickle it to disk, then return or display it.

    Parameters
    ----------
    lda, corpus, gensim_dict
        Passed unchanged to ``pyLDAvis.prepare``.
    LDAvis_data_filepath : str
        Where the pickled prepared data is written. The default is computed
        once, at definition time, from ``fpathroot`` and ``fpathappend``.
    return_ldavis : bool
        When true the prepared data is returned; otherwise the object
        produced by ``pyLDAvis.display`` is returned.
    """
    # NOTE(review): a model/corpus/dictionary triple is usually passed to
    # pyLDAvis.gensim.prepare, not pyLDAvis.prepare - confirm what the name
    # ``pyLDAvis`` is bound to in this module.
    LDAvis_prepared = pyLDAvis.prepare(lda, corpus, gensim_dict)
    # pickle produces bytes, so the file has to be opened in binary mode.
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    if return_ldavis:
        return LDAvis_prepared
    return pyLDAvis.display(LDAvis_prepared)
示例#7
0
def visualise(model_file, corpus_file, dictionary_file):
    """Load a saved corpus, dictionary and LDA model and display them.

    Parameters
    ----------
    model_file : str
        Path of a saved ``LdaMulticore`` model.
    corpus_file : str
        Path of a Matrix Market corpus.
    dictionary_file : str
        Path of a saved gensim ``Dictionary``.

    Returns
    -------
    The object produced by ``pyLDAvis.display``; it is returned so that a
    notebook can render it (a discarded result shows nothing).
    """
    # use Notebook version if not working

    print('Loading corpus from ' + corpus_file)
    corpus = MmCorpus(corpus_file)
    print('Loading dictionary from ' + dictionary_file)
    dictionary = Dictionary.load(dictionary_file)
    print('Loading model from ' + model_file)
    model = models.ldamulticore.LdaMulticore.load(model_file)

    vis_data = gensimvis.prepare(model, corpus, dictionary)
    rendered = pyLDAvis.display(vis_data)
    print('Please use Jupyter notebook visualise.ipynb if not working')
    return rendered
示例#8
0
def topicmodel_forproyect(id_proyect):
    """Build a 5-topic LDA model from one project's comments.

    The comment bodies returned by ``get_data(id_proyect)`` are lower-cased,
    stripped of ``@mentions``, tokenized on ``\\w+`` and filtered against the
    Spanish stop-word list before the model is trained.

    Returns
    -------
    str
        The pyLDAvis HTML for the model, or an explanatory message when
        gensim raises ``ValueError`` while training.
    """
    df_comments = get_data(id_proyect)
    num_topics = 5

    # regex=True is explicit: recent pandas versions treat the pattern as a
    # literal string by default, which would leave the mentions in place.
    mention_pattern = r"@([A-Za-z0-9_]+)"
    bodies = df_comments.body.str.lower().str.replace(mention_pattern, '', regex=True)

    elements = np.array(bodies.tolist())
    tokenizer = RegexpTokenizer(r'\w+')
    # A set makes the per-token stop-word test O(1).
    es_stop = set(get_stop_words('es'))
    texts = []
    print(str(id_proyect))
    for element in elements:
        tokens = tokenizer.tokenize(element.lower())
        # remove stop words from tokens
        texts.append([token for token in tokens if token not in es_stop])
        print(element)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    try:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    except ValueError:
        return "Coleccion Vacia. Aparentemente parametros faltantes o mal ingresados."

    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    # The discarded ``pyLDAvis.display`` call was removed: only the HTML
    # string below is used by the caller.
    return pyLDAvis.prepared_data_to_html(vis_data)
示例#9
0
文件: lab4.py 项目: UppsalaIM/2IS060
def visualize_lda_model():
    """Train a 20-topic LDA model on the lemmatized data and display it.

    Tokens come from ``preprocess_to_lemmatization()``; they are flattened
    per document, lower-cased, filtered (alphabetic, longer than one
    character, not a stop word) and merged into bigrams/trigrams with
    gensim ``Phrases`` before the dictionary and corpus are built.

    Returns
    -------
    The object produced by ``pyLDAvis.display``.
    """
    data = preprocess_to_lemmatization()
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    # A set keeps the membership test in the per-token filter O(1).
    my_stopwords = set(
        stopwords.words('english') + stopwords_verbs + stopwords_other)
    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens if token.isalpha() and token.lower()
        not in my_stopwords and len(token) > 1
    ])
    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=dictionary_LDA,
                                passes=4,
                                alpha=[0.01] * num_topics,
                                eta=[0.01] * len(dictionary_LDA.keys()))
    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
def visual_lda():
    """Display the saved LDA model against the ad-issue review corpus.

    Loads the model from ``../model/lda.model`` and the tokenized reviews
    (JSON) from ``../result/ad_issue_reviews``, drops English stop words and
    tokens that occur only once, and rebuilds the dictionary and
    bag-of-words corpus.

    Returns
    -------
    The object produced by ``pyLDAvis.display``.
    """
    from collections import Counter
    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    lda = LdaMulticore.load("../model/lda.model")
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)
    # build bag-of-words, corpus
    # The stop-word list is fetched once; it used to be re-read for every
    # single word of every review.
    english_stopwords = set(stopwords.words('english'))
    reviews = [[word for word in review if word not in english_stopwords]
               for review in reviews]
    freq = Counter(token for review in reviews for token in review)
    reviews = [[token for token in review if freq[token] > 1] for review in reviews]
    dictionary = corpora.Dictionary(reviews)
    corpus = [dictionary.doc2bow(review) for review in reviews]
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    return pyLDAvis.display(vis_data)
示例#11
0
def topicmodel_allcoments():
    """Build a 5-topic LDA model from all comments and return it as JSON.

    URLs are removed from the ``body`` column of the frame returned by
    ``get_data()`` (the column is updated in place); the bodies are then
    lower-cased, stripped of ``@mentions``, tokenized on ``\\w+`` and
    filtered against the Spanish stop-word list.

    Returns
    -------
    str
        The prepared pyLDAvis data serialized with ``PreparedData.to_json``.
    """
    df_comments = get_data()

    # regex=True is explicit: recent pandas versions treat the pattern as a
    # literal string by default.
    url_pattern = r"http\S+"
    df_comments['body'] = df_comments['body'].str.replace(url_pattern, '', regex=True)

    mention_pattern = r"@([A-Za-z0-9_]+)"
    bodies = df_comments.body.str.lower().str.replace(mention_pattern, '', regex=True)

    elements = np.array(bodies.tolist())
    tokenizer = RegexpTokenizer(r'\w+')
    # A set makes the per-token stop-word test O(1).
    es_stop = set(get_stop_words('es'))
    texts = []
    for element in elements:
        tokens = tokenizer.tokenize(element.lower())
        # remove stop words from tokens
        texts.append([token for token in tokens if token not in es_stop])

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=5,
                                               id2word=dictionary,
                                               passes=20)
    import pyLDAvis.gensim
    import pyLDAvis

    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    # ``pyLDAvis.json.dumps(vis_data)`` was not a usable serializer for the
    # prepared data; ``to_json`` is the method pyLDAvis provides for it.
    return vis_data.to_json()
示例#12
0
 def display(self):
     """
     Build the pyLDAvis view of the TF-IDF LDA model and hand back its markup.
     :return: the ``data`` attribute of the object returned by ``pyLDAvis.display``
     """
     prepared = pyLDAvis.gensim.prepare(
         topic_model=self.lda_model_tfidf,
         corpus=self.tf_idf_corpus,
         dictionary=self.dictionary,
     )
     from IPython.core.display import HTML
     rendered: HTML = pyLDAvis.display(prepared)
     return rendered.data
示例#13
0
	def vectorize(self):
		'''
		Build a gensim LDA topic model of the document and export it with pyLDAvis.

		args:
			none
		output:
			serves the visualization with pyLDAvis.show, then writes
			topic_models/<name>.json and topic_models/<name>.html
		'''
		# tokenize and remove stopwords
		sentences = self.sent_detector.tokenize(self.raw.decode('utf-8').strip()) # use raw text
		texts = [[word for word in sentence.lower().split() if word not in self.stopwords] for sentence in sentences]

		# compute the frequency of each token
		frequency = defaultdict(int)
		for text in texts:
			for token in text:
				frequency[token] += 1

		# remove words that appear only once
		texts = [[token for token in text if frequency[token] > 1] for text in texts]

		# construct a gensim dictionary and corpus (bag of words)
		dictionary = corpora.Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts] # currently, "text" is a sentence in the document

		# define LDA model
		lda = models.ldamodel.LdaModel(corpus=corpus,
			id2word=dictionary,
			num_topics=10, #what should this be ???
			update_every=1,
			chunksize=10000,
			passes=1)

		# visualize the lda space
		vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
		pyLDAvis.display(vis_data)
		pyLDAvis.show(vis_data)
		# 'w' instead of 'a+': appending a second JSON/HTML document to an
		# existing file would leave it unparseable.
		with open('topic_models/'+self.name+'.json', 'w') as topic_json:
			pyLDAvis.save_json(vis_data, topic_json)
		with open('topic_models/'+self.name+'.html', 'w') as topic_html:
			pyLDAvis.save_html(vis_data, topic_html)
示例#14
0
def show_topics(corpus):
    """
    Topics visualization

    Trains a 4-topic ``LdaMulticore`` model (10 passes, 2 workers) on the
    corpus and displays it with pyLDAvis.

    Parameters
    ----------
    corpus : list
        corpus of documents, passed as-is to ``gensim.corpora.Dictionary``
        and to ``doc2bow`` (presumably each document is already a list of
        string tokens - confirm with the caller)

    Returns
    -------
    The object produced by ``pyLDAvis.display``; it is returned so the
    calling notebook cell can render it.
    """
    dic = gensim.corpora.Dictionary(corpus)
    bow_corpus = [dic.doc2bow(doc) for doc in corpus]
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=4,
                                           id2word=dic,
                                           passes=10,
                                           workers=2)

    lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_vis)
示例#15
0
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """Prepare the pyLDAvis data for an lda2vec model and display it.

    Parameters
    ----------
    data_path : str
        Location where your data is stored; ``doc_lengths.npy`` is read
        from it.
    model : Lda2Vec
        Loaded lda2vec tensorflow model.
    idx_to_word : dict
        index to word mapping dictionary
    freqs : list
        Frequencies of each token.
    vocab_size : int
        Total size of your vocabulary (not used by this function).

    Returns
    -------
    The object produced by ``pyLDAvis.display``; it is returned so the
    calling notebook cell can render it.
    """

    doc_embed = model.sesh.run(model.mixture.doc_embedding)
    topic_embed = model.sesh.run(model.mixture.topic_embedding)
    word_embed = model.sesh.run(model.w_embed.embedding)

    # Words in the iteration order of idx_to_word.
    # NOTE(review): this only matches index order 0..vocab_size if the dict
    # was filled in that order - confirm.
    vocabulary = list(idx_to_word.values())

    # Read in document lengths
    doc_lengths = np.load(data_path + "/doc_lengths.npy")

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)

    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    return pyLDAvis.display(prepared_vis_data)
示例#16
0
def showPyLDAvisNB(allDict, numTopics=30):
    """Show the pyLDAvis panel for ``allDict`` inside a notebook.

    The first three items returned by ``preparePyLDAvisData`` are passed to
    ``pyLDAvis.gensim.prepare``. Returns ``None``.
    """
    # TODO: see if we can get ngrams into pyLDAvis

    prepared_inputs = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    vis_data = pyLDAvis.gensim.prepare(prepared_inputs[0],
                                       prepared_inputs[1],
                                       prepared_inputs[2])
    output_notebook()
    pyLDAvis.enable_notebook(True)
    panel = pyLDAvis.display(vis_data, template_type='general')
    plt.tight_layout()

    display(panel)
示例#17
0
def textTopicmodel(n_topics=2):
    """Fit an sklearn LDA model on the segmented words and show it.

    The items returned by ``segWord()`` are converted to strings, those
    shorter than two characters are dropped, and the rest are vectorized
    with ``CountVectorizer``. The document-topic matrix and the topic-word
    matrix are printed before the pyLDAvis view is shown.

    Parameters
    ----------
    n_topics : int
        Number of LDA components.
    """
    segment = segWord()
    segment = [str(w) for w in segment if len(str(w)) >= 2]
    corpus = [''.join(one) for one in segment]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=1,
                                    max_features=1500,
                                    stop_words=None)
    tf = tf_vectorizer.fit_transform(corpus)
    # The unused ``words = tf_vectorizer.get_feature_names()`` lookup was
    # removed: the method no longer exists in recent scikit-learn releases
    # and its result was never read.
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    learning_offset=50,
                                    random_state=0)
    docres = lda.fit_transform(tf)
    print('============================')
    print(docres)
    print('==========================')
    print(lda.components_)
    # pyLDAvis.enable_notebook()
    visualisation = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
    # pyLDAvis.save_html(visualisation,'visualisation.html')
    pyLDAvis.display(visualisation)
    pyLDAvis.show(visualisation)
示例#18
0
 def evaluate_pyldavis(self, model=None, use_jupyter=None):
     """
     Method for a visual evaluation of the LDA topic model using pyldavis.
     :param model: LDA model that is to be evaluated. If 'None', it will use the last model that has been saved
     within the class.
     :param use_jupyter: set how the pyldavis panel is displayed. If default (None), it will try to find out if run
     from jupyter and set the method accordingly
     :return: the object produced by ``pyLDAvis.display`` when the panel is shown in jupyter (so the notebook can
     render it), otherwise None
     :raises Exception: if no model is passed and ``self.lda_model`` is None
     """
     if model is None:
         if self.lda_model is None:
             raise Exception(
                 "Please create a LDA model for evaluation before running this method."
             )
         model = self.lda_model
     if isinstance(model, LdaMallet):
         model = malletmodel2ldamodel(model)
     panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)
     if use_jupyter is None:
         # Guess from the '_' environment variable (presumably the path of
         # the launching executable); a missing variable means "not jupyter".
         launcher = os.environ.get('_', '')
         use_jupyter = launcher.split("/")[-1] == "jupyter-notebook"
     if use_jupyter:
         pyLDAvis.enable_notebook()
         return pyLDAvis.display(panel)
     pyLDAvis.show(panel)
def visuzalization(ldamodel, corpus, dictionary, num_words):
    """Plot one word cloud per topic, then return the pyLDAvis display object.

    The topics plotted are the keys of ``topic_items(ldamodel, 15)``; each
    cloud is fitted on ``ldamodel.show_topic(key, num_words)`` and titled
    with the key plus one.
    """
    prepared = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    legend = topic_items(ldamodel, 15)

    for topic_key, _ in legend.items():
        plt.figure()
        topic_terms = ldamodel.show_topic(topic_key, num_words)
        cloud = WordCloud(background_color="white").fit_words(topic_terms)
        plt.imshow(cloud)
        plt.axis("off")
        plt.title("Topic #" + str(topic_key + 1))
        plt.show()

    return pyLDAvis.display(prepared)
示例#20
0
def showPyLDAvis(allDict, notebook=True, numTopics=30):
    """Show the pyLDAvis panel for ``allDict`` in a notebook or via ``pyLDAvis.show``.

    With ``notebook == True`` the panel is displayed inline; otherwise
    ``output_file("pyDAVis.html")`` is selected and ``pyLDAvis.show`` is
    used. Returns ``None``.
    """
    # TODO: see if we can get ngrams into pyLDAvis

    prepared_inputs = preparePyLDAvisData(allDict, limit=None, numTopics=numTopics)
    vis_data = pyLDAvis.gensim.prepare(prepared_inputs[0],
                                       prepared_inputs[1],
                                       prepared_inputs[2])
    if not (notebook == True):
        output_file("pyDAVis.html")
        # displays in own window combined with output_file
        panel = pyLDAvis.show(vis_data)
        show(panel)
        return

    output_notebook()
    pyLDAvis.enable_notebook(True)
    panel = pyLDAvis.display(vis_data, template_type='general')
    display(panel)
示例#21
0
 def py_lda_vis(column,
                lib,
                lda_models,
                dtm=None,
                vectorizer=None,
                corpus=None,
                dictionary=None):
     """Display the pyLDAvis panel of the model stored under ``column``.

     With ``lib == 'sklearn'`` the panel is prepared from ``dtm[column]``
     and ``vectorizer[column]``; for any other value it is prepared from
     ``corpus[column]`` and ``dictionary[column]``. Topics are left
     unsorted in both cases.
     """
     model = lda_models[column]
     if lib == 'sklearn':
         document_terms = np.asmatrix(dtm[column])
         vis_data = pyLDAvis.sklearn.prepare(model,
                                             document_terms,
                                             vectorizer[column],
                                             sort_topics=False)
     else:
         vis_data = pyLDAvis.gensim.prepare(model,
                                            corpus[column],
                                            dictionary[column],
                                            sort_topics=False)
     panel = pyLDAvis.display(vis_data)
     display(panel)
 def fit(
     self,
     num_topics,
     alpha="symmetric",
     beta=None,
     passes=2,
     random_state=9,
     tuning=False,
     predict_training_samples=False,
 ):
     """Train the LDA model on ``self.bow_corpus`` and record its scores.

     ``beta`` is passed to gensim as ``eta``. After training, the
     perplexity and coherence scores are stored on the instance.

     With ``tuning`` set, no visualization is built: a placeholder message
     is stored in ``visualize_topics_`` and the coherence score is
     returned. Otherwise the pyLDAvis display object is stored in
     ``visualize_topics_`` and, if ``predict_training_samples`` is set, the
     training texts are predicted; nothing is returned in that case.
     """
     self.model = models.ldamodel.LdaModel(
         self.bow_corpus,
         id2word=self.dictionary,
         num_topics=num_topics,
         alpha=alpha,
         eta=beta,
         passes=passes,
         random_state=random_state,
     )
     # perplexity score (the lower the better)
     self.perplexity_score_ = self.model.log_perplexity(self.bow_corpus)
     # coherence score, overall and per topic (the higher the better)
     coherence = self.score(self.docs, return_per_topic=True)
     self.coherence_score_, self.coherence_score_per_topic_ = coherence

     if tuning:
         self.visualize_topics_ = 'Set tuning parameter in fit function to "False" to visualize LDA result!'
         return self.coherence_score_

     pyLDAvis.enable_notebook()
     prepared = pyLDAvis.gensim.prepare(self.model, self.bow_corpus,
                                        self.dictionary)
     self.visualize_topics_ = pyLDAvis.display(prepared)

     if predict_training_samples:
         predictions = self.predict(self.texts, True)
         (
             self.training_samples_predict_proba_,
             self.training_samples_prediction_,
         ) = predictions
示例#23
0
    def visualize_lda_to_html(
            self,
            target_topic_num,
            top_n=10,
            r_normalized=False,
            relevence_lambda_val=.6,
            workers_n=2,
            random_seed=1,
            savepath='./',
            filename_affix='lda',
            # save_type='html',  # {'html', 'json'}
            save_relevent_terms_ok=True,
            save_html_ok=True,
            display_ok=False,
            ):
        """
        Run `pyLDAvis.prepare` & get adjusted scores (using saliency & relevance) of terms by each topic.

        Parameters
        ----------

        target_topic_num: int
            A topic number of LDA model to visualize.

        top_n: int (default: `10`)
            A number of the most relevant terms in a topic.

        r_normalized: bool (default: `False`)
            Use normalized probabilities when it is `True`. (not recommended in most cases.)

        relevence_lambda_val: float (default: `.6`).
            A lambda value (ratio) to calculate relevance.

        workers_n: int (default: `2`)
            A number of CPU cores to calculate (`pyLDAvis.prepare`)

        random_seed: int (default: `1`)
            A random seed number.

        savepath: str (default: `'./'`)
            A dirpath to save `pyLDAvis` or other `pandas.DataFrame`s.
            It is created when something is saved.

        filename_affix: str (default: `'lda'`)
            An affix of filename to save `pyLDAvis` html or json.

        save_relevent_terms_ok: bool (default: `True`)
            An option to save `pandas.DataFrame` of `top_relevant_terms_df` as csv.

        save_html_ok: bool (default: `True`)
            An option to save html.

        display_ok: bool (default: `False`)
            Call `pyLDAvis.display` when it is `True`.

        Returns
        -------

        The object produced by `pyLDAvis.display` when `display_ok` is `True`
        (so that a notebook can render it), otherwise `None`.

        Raises
        ------

        KeyError
            When `target_topic_num` is not a key of `self.lda_model_dict`.

        References
        ----------

        Saliency:
            `Chuang, J., 2012. Termite: Visualization techniques for assessing textual topic models`

        Relevance:
            `Sievert, C., 2014. LDAvis: A method for visualizing and interpreting topics`

        Example
        -------

        >>> import unipy_nlp.analyze.topic_modeling as utpm
        >>> tpm = utpm.TopicModeler(sentence_list, tokenized)
        >>> tpm.pick_best_lda_topics(
        ...     num_topic=5,
        ...     workers_n=8,
        ...     random_seed=1,
        ... )
        >>> tpm.visualize_lda_to_html(
        ...     7,
        ...     top_n=10,
        ...     r_normalized=False,
        ...     relevence_lambda_val=.6,
        ...     workers_n=8,
        ...     random_seed=1,
        ...     savepath='data/_tmp_dump/topic_modeling',
        ...     filename_affix='lda',
        ...     save_relevent_terms_ok=True,
        ...     save_html_ok=True,
        ...     display_ok=False,
        ... )

        """
        if target_topic_num not in self.lda_model_dict:
            raise KeyError("Model doesn't exist. Select a proper number.")

        self.selected_topic_num = target_topic_num
        self.selected_model = self.lda_model_dict[target_topic_num]['model']

        (self.vis_prepared,
         self.total_terms_df,
         self.top_relevant_terms_df,
         self.r_adj_score_df,
         self.bow_score_list) = self._get_terminfo_table(
            self.selected_model,
            corpus=self.bow_corpus_doc,
            dictionary=self.corpora_dict,
            doc_topic_dists=None,
            use_gensim_prepared=True,
            top_n=top_n,
            r_normalized=r_normalized,
            relevence_lambda_val=relevence_lambda_val,
            workers_n=workers_n,
            random_seed=random_seed,
        )

        # One directory creation covers both kinds of output.
        if save_html_ok or save_relevent_terms_ok:
            os.makedirs(savepath, exist_ok=True)

        if save_html_ok:
            ldavis_filename_html_str = os.path.join(
                savepath,
                f'{filename_affix}_topics-{target_topic_num}.html',
            )
            pyLDAvis.save_html(
                self.vis_prepared,
                ldavis_filename_html_str,
            )
            print(f"LDAVIS HTML Saved: '{ldavis_filename_html_str}'")

        if save_relevent_terms_ok:
            ldavis_filename_rdf_str = os.path.join(
                savepath,
                '_'.join([
                    f'{filename_affix}',
                    f'topics-{target_topic_num}',
                    f'top{top_n}_relevent_terms_df.csv',
                ]),
            )
            self.top_relevant_terms_df.to_csv(
                ldavis_filename_rdf_str,
                index=True,
                header=True,
                encoding='utf-8',
            )
            print(f"LDAVIS DF Saved: '{ldavis_filename_rdf_str}'")

        if display_ok:
            # Returned rather than discarded, otherwise nothing is rendered.
            return pyLDAvis.display(self.vis_prepared, local=False)
示例#24
0
def pyLDAvisData(lda, num_topics, len_vocab, corpus, text, dictionary_tokens):
    """Gather the keyword arguments for ``pyLDAvis.prepare`` into one dict.

    Each entry is computed by the helper of the matching name
    (``topic_term_dists``, ``doc_topic_dists``, ``doc_lengths``,
    ``get_vocabularyAlpha``, ``get_term_frequency``).
    """
    prepared_inputs = {}
    prepared_inputs['topic_term_dists'] = topic_term_dists(lda, num_topics, len_vocab)
    prepared_inputs['doc_topic_dists'] = doc_topic_dists(corpus, lda)
    prepared_inputs['doc_lengths'] = doc_lengths(text)
    prepared_inputs['vocab'] = get_vocabularyAlpha(dictionary_tokens)
    prepared_inputs['term_frequency'] = get_term_frequency(corpus)
    return prepared_inputs
  
# 1 - PyLDAvis
import pyLDAvis

# Collect the pyLDAvis inputs for the model `lda` (5 is passed as the number
# of topics) and prepare the visualization data from them.
data = pyLDAvisData(lda, 5, len(dictionary.token2id), corpus, texts, dictionary.token2id)
topics_model_data = data
topics_vis_data = pyLDAvis.prepare(**topics_model_data)
# NOTE(review): the result is not assigned; a notebook only renders it when
# this is the last expression of the cell.
pyLDAvis.display(topics_vis_data)


# 2 - Tendance des topics
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline 

def get_topic_apperences_year_month(data, info):
    """Group a slice of each record's first field by the record's top topic.

    For every position ``i`` the topic is the index of the first maximum of
    ``data[i]`` and the stored value is ``info[i][0][7:14]``.

    Returns a dict mapping topic index to the list of slices, in input order.
    """
    by_topic = {}
    for position, weights in enumerate(data):
        top_topic = weights.index(max(weights))
        period = info[position][0][7:14]
        by_topic.setdefault(top_topic, []).append(period)
    return by_topic

def get_topic_apperences_year(data, info):
示例#25
0
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim

# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

# Tokenize every review: one row per (business_id, review) item of
# business_reviews, with tokenize_text applied to the review column
total_review_text = pd.DataFrame(list(business_reviews.items()), 
                                 columns = ['business_id', 'review']).review.apply(tokenize_text)
# Create dictionary of words
dictionary = corpora.Dictionary(total_review_text)
# Compute the term frequency of terms in each document
corpus = [dictionary.doc2bow(review) for review in total_review_text]
# Compute LDA model (num_topics = 4, since we want to compare the topics to the previous 4 wordclouds)
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics = 4, id2word = dictionary, passes = 10)
print('The words and scores defining each topic are:')
# Not assigned or printed: a notebook shows it only as the cell's last expression.
lda_model.print_topics(num_topics = 4, num_words = 8)


# In[23]:


# Prepare the pyLDAvis data for the model trained above and show it inline.
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)


# After using the LDA algorithm to find 4 large topics, it can be observed that the topics do indeed have a number of similar words shown in wordclouds (which is created through Louvain for partitioning and TF-IDF for scoring). 
# 
# For example, topic 4 presented here is clearly showing words related to food and dessert, such as: '_salad_', '_steak_' or '_buffet_', which is very similar to the words shown in the wordcloud for community 2. 
                startrow=0)
print('LDA_result_pos 成功输出!\n')
# Negative-sentiment topic analysis: train a 10-topic LDA model on `neg`
neg_dict = corpora.Dictionary(neg)
neg_corpus = [neg_dict.doc2bow(i) for i in neg]
neg_lda = models.LdaModel(neg_corpus,
                          num_topics=10,
                          id2word=neg_dict,
                          passes=10)
# Print every topic, numbered from 1
for i in range(10):
    print('neg_topic' + ' ' + str(i + 1) + ' : ')
    print(neg_lda.print_topic(i))
# Export the top 10 words of each topic to an Excel sheet
LDA_result_neg = neg_lda.print_topics(num_topics=10, num_words=10)
df_neg = pd.DataFrame(data=LDA_result_neg)
df_neg.to_excel('LDA_result_neg.xlsx')
print('LDA_result_neg 成功输出!\n')

# ================= Topic cluster visualization ==================
# Only the positive model (pos_lda / pos_corpus / pos_dict) is visualized here.
data2 = pyLDAvis.gensim.prepare(pos_lda, pos_corpus, pos_dict)
print('以下是正面可视化参数\n')
print(data2)
pyLDAvis.save_html(data2, 'postopic.html')
pyLDAvis.display(data2)
pyLDAvis.show(data2, open_browser=True)
# data1 = pyLDAvis.gensim.prepare(neg_lda, neg_corpus, neg_dict)
# print('以下是负面可视化参数\n')
# print(data1)
# pyLDAvis.save_html(data1, 'negtopic.html')
# pyLDAvis.display(data1)
# pyLDAvis.show(data1, open_browser=True)
示例#27
0
from gensim import corpora, models
import pyLDAvis.gensim
import pyLDAvis

# Load the saved dictionary, Matrix Market corpus and LDA model from disk.
dic = corpora.Dictionary.load('data/model/newsgroups.dict')
corp = corpora.MmCorpus('data/model/newsgroups.mm')
lda = models.ldamodel.LdaModel.load('data/model/newsgroups_50.model')

# Turn model + corpus + dictionary into the data structure pyLDAvis renders.
newsgroup_data = pyLDAvis.gensim.prepare(topic_model=lda, corpus=corp, dictionary=dic)

# Build the displayable visualization.
pyLDAvis.display(newsgroup_data)

# Also write the visualization out as an HTML file.
pyLDAvis.save_html(newsgroup_data, 'data/model/newsgroup_ldavis.html')
示例#28
0
 def display(self) -> None:
     """Pass the result of ``self.get_vis()`` to ``pyLDAvis.display``.

     The value returned by ``pyLDAvis.display`` is not propagated; this
     method always returns ``None``.
     """
     prepared = self.get_vis()
     pyLDAvis.display(prepared)
示例#29
0
 def visualization(self):
     """Prepare the pyLDAvis data for the fitted model and return its display object.

     ``self.lda``, ``self.X`` and ``self.vectorizer`` are forwarded, in that
     order, to ``pyLDAvis.sklearn.prepare``; the prepared data is then handed
     to ``pyLDAvis.display`` and that call's result is returned.
     """
     return pyLDAvis.display(
         pyLDAvis.sklearn.prepare(self.lda, self.X, self.vectorizer)
     )
示例#30
0
# Tokenize every review with a ReviewNormalizer.
rn = ReviewNormalizer()
normalized_reviews = list(map(rn.tokenize, reviews))

# Show two randomly picked normalized reviews, each re-joined with spaces.
# NOTE(review): if `randint` is random.randint the upper bound is inclusive and
# the index can land one past the end; numpy.random.randint would exclude it.
# Confirm which one is imported.
pretty_print_html([
    " ".join(normalized_reviews[randint(0, len(normalized_reviews))])
    for _ in range(2)
])


# #### Training the model (this might take a while...)

# In[12]:

# Build the dictionary and bag-of-words corpus, then fit a 5-topic LDA model
# with 100 passes.
dictionary = corpora.Dictionary(normalized_reviews)
corpus = [dictionary.doc2bow(tokens) for tokens in normalized_reviews]
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=100,
)


# #### Prepare data and visualize!

# In[14]:

prepared_data = prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared_data)


# In[ ]:



示例#31
0
def get_vis(model, corpus, dictionary):
    """Prepare the pyLDAvis data for a topic model, save it as HTML and return it.

    Parameters
    ----------
    model, corpus, dictionary
        Forwarded unchanged, in this order, to ``pyLDAvis.gensim.prepare``.

    Returns
    -------
    The object produced by ``pyLDAvis.gensim.prepare``. Callers that want to
    render it (e.g. in a notebook) can pass it to ``pyLDAvis.display``
    themselves; calling ``display`` inside this function and discarding the
    result, as was done before, rendered nothing.
    """
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    # The output file name is appended directly to `configuration.lda_dir`,
    # so that setting is expected to end with a path separator.
    pyLDAvis.save_html(vis, configuration.lda_dir + 'lda_visualization_test.html')
    return vis
# Write every item of `texts` on its own line. The context manager makes sure
# the file is flushed and closed; a bare open() without close() can lose
# buffered output.
with open('tokens_after_lemmas_and_rm_stopwords.txt', 'w') as tokens_after_lemmas_and_rm_stopwords:
    for item in texts:
        # Wrap the item in a 1-tuple so a tuple-valued item is formatted as a
        # whole instead of being unpacked as %-format arguments.
        tokens_after_lemmas_and_rm_stopwords.write("%s\n" % (item,))

# Persist the dictionary, corpus and model; each file name is tagged with the
# second command-line argument.
dictionary.save_as_text('lemmas_nostopwords_with_otherdatacleaning_dictionary_' + sys.argv[2] + '.txt')

corpora.MmCorpus.serialize('lemmas_nostopwords_corpus_' + sys.argv[2] + '.mm', corpus)

joblib.dump(lda, 'ldamodel_' + sys.argv[2] + '.pkl')


# In[6]:

# Inspect a single document of the corpus (index 56).
print(corpus[56])


# In[4]:

# Reload the dictionary, corpus and model from their saved files.
dictionary = gensim.corpora.Dictionary.load_from_text(
    'lemmas_nostopwords_with_otherdatacleaning_dictionary_1000000.txt'
)
corpus = gensim.corpora.MmCorpus('lemmas_nostopwords_corpus_1000000.mm')
# NOTE(review): joblib.load unpickles the file; only load model files from a
# trusted source.
lda = joblib.load('ldamodel_1000000.pkl')

# Bare expression: shows 8 words for each of 20 topics as the cell output.
lda.print_topics(num_words=8, num_topics=20)


# In[4]:

lda_vis = pyLDAvis.gensim.prepare(topic_model=lda, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(lda_vis)

示例#33
0
                                                             1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

# Document lengths: sum each row of the document-term matrix and take the
# result as a plain list.
# NOTE(review): `.tolist()[0]` assumes the sum/flatten result is 2-D with a
# single row (matrix-style) — confirm the type of `count_data`.
doc_lengths = count_data.sum(axis=1).flatten().tolist()[0]

# Overall term frequencies: the same row sum, taken on the transposed matrix.
dtm_trans = count_data.T
total = dtm_trans.sum(axis=1).flatten().tolist()[0]

# Keyword arguments for pyLDAvis.prepare. `total` is already a list, so it is
# passed as is rather than copied.
data = {
    'topic_term_dists': model.topic_word_,
    'doc_topic_dists': model.doc_topic_,
    'doc_lengths': doc_lengths,
    'vocab': vocab,
    'term_frequency': total,
}
# prepare the data
tef_vis_data = pyLDAvis.prepare(**data)

# Display the prepared data (must run after the preparation above).
pyLDAvis.display(tef_vis_data)

pyLDAvis.save_html(tef_vis_data, './guidedldavis_prepared_250k.html')
示例#34
0
import json
import numpy as np
import pyLDAvis
# TODO for readme
# conda install -c conda-forge pyldavis
from bokeh.io import show, output_notebook, output_file


def load_R_model(filename):
    """Load a JSON model export and rename its fields for ``pyLDAvis.prepare``.

    Parameters
    ----------
    filename : str
        Path to a JSON file holding the keys ``phi``, ``theta``,
        ``doc.length``, ``vocab`` and ``term.frequency``.

    Returns
    -------
    dict
        The same values under the keys ``topic_term_dists``,
        ``doc_topic_dists``, ``doc_lengths``, ``vocab`` and
        ``term_frequency``.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from the file.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # can corrupt or reject non-ASCII vocabulary terms on some systems.
    with open(filename, 'r', encoding='utf-8') as json_file:
        data_input = json.load(json_file)
    return {
        'topic_term_dists': data_input['phi'],
        'doc_topic_dists': data_input['theta'],
        'doc_lengths': data_input['doc.length'],
        'vocab': data_input['vocab'],
        'term_frequency': data_input['term.frequency'],
    }


# Load the exported movie-review model, prepare it with pyLDAvis and pass the
# display object to show().
# NOTE(review): `output_file` and `show` are the bokeh.io names imported above;
# whether bokeh's show() accepts the object returned by pyLDAvis.display is
# not evident from this file — confirm before relying on this path.
# NOTE(review): the result of output_file(...) is bound to `f`, which is not
# used again in the visible code.
f = output_file("pyDAVis.html")
# output_notebook() # TODO for use in notebook
# pyLDAvis.enable_notebook()
movies_model_data = load_R_model('data/movie_reviews_input.json')

# The dict returned by load_R_model is expanded into keyword arguments.
movies_vis_data = pyLDAvis.prepare(**movies_model_data)
p = pyLDAvis.display(movies_vis_data)  # should use this in notebook
# p=pyLDAvis.show(movies_vis_data) # displays in own window combined with output_file
show(p)
    # list(map(load_doc, [glob('notebooks/pyLDAvis/data/20news-bydate-train/*/*')[0]]))
    # docs = pd.DataFrame(list(map(load_doc, glob('notebooks/pyLDAvis/data/20news-bydate-train/*/*')))).set_index(['group', 'id'])

    docs.head()

    # %%

    # %%
    docs = docs[docs.astype(str)["tokens"] != '[]']  # remove empty letters
    dictionary, corpus = prep_corpus(docs['tokens'])
    MmCorpus.serialize('courrier.mm', corpus)
    dictionary.save('courrier.dict')

    # %%
    num_topics = 5
    lda = models.ldamodel.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=num_topics,
                                   passes=10)

    lda.save(f'courrier_{num_topics}_lda.model')

    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    #%%

    lda.load(f'courrier_{num_topics}_lda.model')
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.display(vis_data)
def tokenize_and_stem(text):
    """Return the stems of all letter-bearing word tokens in *text*.

    The text is split into sentences first and each sentence into words, so
    that punctuation ends up as its own token. Tokens without any ASCII
    letter (e.g. numeric tokens, raw punctuation) are discarded, and the
    remaining tokens are stemmed with ``p_stemmer``.
    """
    words = [
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    ]
    lettered = [word for word in words if re.search('[a-zA-Z]', word)]
    return [p_stemmer.stem(word) for word in lettered]

from gensim import corpora, models, similarities 
# Tokenize and stem every cleaned email.
token_emails = list(map(tokenize_and_stem, clean_emails))

# Build the id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(token_emails)

# Filter extreme terms (no_below=1, no_above=0.8) and reassign ids compactly.
dictionary.filter_extremes(no_above=0.8, no_below=1)
dictionary.compactify()

# Convert each tokenized document into its bag-of-words representation.
corpus = [dictionary.doc2bow(tokens) for tokens in token_emails]

# Load the saved LDA model and display it with pyLDAvis.
final = models.ldamodel.LdaModel.load('output/final_topic10.model')
import pyLDAvis.gensim as gensimvis
import pyLDAvis
vis_data = gensimvis.prepare(topic_model=final, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(vis_data)