Example #1
def get_topics(data, max_freq, min_occurrence, num_components):
    cv = CountVectorizer(max_df=max_freq,
                         min_df=min_occurrence,
                         stop_words='english')
    dtm = cv.fit_transform(data)
    model = LatentDirichletAllocation(n_components=num_components,
                                      learning_method='online',
                                      random_state=0,
                                      n_jobs=-1)
    model.fit(dtm)
    # Return the prepared pyLDAvis data so the caller can save or display it
    # (the original discarded prepare's result).
    return s_viz.prepare(model, dtm, cv, mds='tsne')
Example #2
    def run_lda(count_data, count_vectorizer):
        # Tweak the two parameters below
        number_topics = 35
        number_words = 20
        # Create and fit the LDA model (renamed from `LDA` to avoid the
        # confusing collision with the sklearn class alias of the same name)
        lda = LDA(n_components=number_topics, n_jobs=-1)
        lda.fit(count_data)
        # Print the topics found by the LDA model
        print("Topics found via LDA:")
        MyLDA.print_topics(lda, count_vectorizer, number_words)

        LDAvis_data_filepath = './ldavis_prepared_' + str(number_topics)
        # Preparing the visualization is a bit time consuming; set the
        # condition to False to skip it and reuse the pickled data.
        if True:
            LDAvis_prepared = sklearn_lda.prepare(lda, count_data,
                                                  count_vectorizer)
            with open(LDAvis_data_filepath, 'wb') as f:
                pickle.dump(LDAvis_prepared, f)

        # load the pre-prepared pyLDAvis data from disk
        with open(LDAvis_data_filepath, 'rb') as f:
            LDAvis_prepared = pickle.load(f)
        return pyLDAvis.save_html(
            LDAvis_prepared,
            './ldavis_prepared_' + str(number_topics) + '.html')
Example #3
    def lda_model_checking(lda,
                           count_data,
                           count_vectorizer,
                           number_topics,
                           lda_html_name=None):
        """
        draw the topic distribution by group using ldavis library
        :param count_data:
        :param count_vectorizer:
        :param lda_html_name:
        :return:
        """
        if lda_html_name is None:
            lda_html_name = "./ldavis_prepared_"
        LDAvis_data_filepath = os.path.join(lda_html_name + str(number_topics))
        # # this is a bit time consuming - make the if statement True
        # # if you want to execute visualization prep yourself
        if 1 == 1:
            LDAvis_prepared = sklearn_lda.prepare(lda, count_data,
                                                  count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

        # load the pre-prepared pyLDAvis data from disk
        with open(LDAvis_data_filepath, 'rb') as f:
            LDAvis_prepared = pickle.load(f)
        pyLDAvis.save_html(LDAvis_prepared,
                           lda_html_name + str(number_topics) + '.html')
Example #4
def lda_func(string, filename):
    sns.set_style('whitegrid')
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform([string])
    plot_10_most_common_words(count_data, count_vectorizer, filename)
    warnings.simplefilter("ignore", DeprecationWarning)
    number_topics = 3
    number_words = 5
    lda = LDA(n_components=number_topics, n_jobs=-1, learning_method='online')
    lda.fit(count_data)
    print("Topics found via LDA:")
    print_topics(lda, count_vectorizer, number_words)

    from pyLDAvis import sklearn as sklearn_lda
    import pickle
    import pyLDAvis
    LDAvis_data_filepath = ('./ldavis_prepared_' +
                            str(number_topics) + filename)
    # Preparing the visualization is a bit time consuming; set the
    # condition to False to skip it and reuse the pickled data.
    if True:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data,
                                              count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(
        LDAvis_prepared,
        './ldavis_prepared_' + str(number_topics) + filename + '.html')
Example #5
def fit_ddl_lda(words_sentences, output_dir, filename_stem, number_topics):

    # Initialise the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')

    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(words_sentences)

    # Number of top words to print per topic
    number_words = 20
    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    # Print the topics found by the LDA model
    print("Topics found via LDA:")
    print_topics(lda, count_vectorizer, number_words)

    LDAvis_data_filepath = os.path.join(
        output_dir, filename_stem + '_lda_vis_prepared_' + str(number_topics))
    # Preparing the visualization is a bit time consuming; set the
    # condition to False to skip it and reuse the pickled data.
    if True:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(
        LDAvis_prepared,
        os.path.join(output_dir,
                     filename_stem + '_lda_vis_prepared_' +
                     str(number_topics) + '.html'))

    return lda
Example #6
def latent_dirichlet_allocation_topic_extraction():
    """
    Function performs topic extraction on Tweets using Scikit-Learn LDA model.

    :return: None.
    """
    from sklearn.decomposition import LatentDirichletAllocation

    # LDA can only use raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=1000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA.
    lda = LatentDirichletAllocation(n_components=20,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(tf)
    time.sleep(3)

    # Display the top words for each topic.
    lda_util.display_topics(lda, tf_feature_names, 10)

    import pyLDAvis
    from pyLDAvis import sklearn
    # pyLDAvis.enable_notebook()
    visualization = sklearn.prepare(lda_model=lda,
                                    vectorizer=tf_vectorizer,
                                    dtm=tf)
    pyLDAvis.save_html(visualization,
                       'lda_visualization-no-company-words.html')
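A note on the comment above: sklearn's LatentDirichletAllocation models integer word counts, so CountVectorizer output is the natural input, while TF-IDF weights break the model's generative assumptions. A minimal sketch on an invented two-document corpus, just to show the difference between the two matrix types:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]  # toy corpus

counts = CountVectorizer().fit_transform(docs)
tfidf = TfidfVectorizer().fit_transform(docs)

print(counts.toarray())  # integer term counts -- what LDA expects
print(tfidf.toarray())   # real-valued weights -- not what LDA assumes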
Example #7
def save_LDA_visualization(lda_tf, dtm_tf, tf_vectorizer, html_file):
    """
    Save LDA visualization as html
    """
    from pyLDAvis.sklearn import prepare
    data = prepare(lda_tf, dtm_tf, tf_vectorizer)
    from pyLDAvis import save_html
    save_html(data, html_file)
Example #8
    def generate_ldavis(self):
        params = {"mds": "pcoa"}
        try:
            LDAvis_prepared = sklearn_lda.prepare(self.nmf_model,
                                                  self.vectorized_out,
                                                  self.vectorizer, **params)
        except Exception:
            return "This visualization is currently not available."
        return LDAvis_prepared

    def explore_topics_viz(self, save_path):
        self.LDAvis_prepared = sklearn_lda.prepare(self.lda, self.count_data,
                                                   self.count_vectorizer)
        LDAvis_data_filepath = f"{save_path}_{self.task}_{self.num_topics}"
        with open(LDAvis_data_filepath + ".pkl", "wb") as fp:
            pickle.dump(self.LDAvis_prepared, fp)
        pyLDAvis.save_html(self.LDAvis_prepared,
                           LDAvis_data_filepath + '.html')
Example #10
    def visualize(self):
        """ Start local web-server and display LDA fitted model """

        self.check_model()
        show(
            prepare(self.model,
                    self.vectorized_data,
                    self.vectorizer,
                    mds='tsne'))
    def visualize_lda(self, n):
        """Visualize topic modeling results using pyLDAvis.

        Args:
            n (int): number of topics
        Returns:
            None; writes the visualization to an html file.
        """
        lda, doc2vec, tfidf = self.topic_modeling(n)
        prepared = prepare(lda, doc2vec, tfidf)
        pyLDAvis.save_html(prepared, './figure/topic_modeling.html')
Example #12
def count_and_lda(text):
    top_N = 20

    words = nltk.tokenize.word_tokenize(text)
    word_dist = nltk.FreqDist(words)

    stopwords = nltk.corpus.stopwords.words('english')
    words_except_stop_dist = nltk.FreqDist(w for w in words
                                           if w not in stopwords)

    # Keep the stop-word-filtered frequency table (the unfiltered version
    # was computed and immediately overwritten in the original).
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                        columns=['Word', 'Frequency']).set_index('Word')

    counts = Counter(words).most_common(20)

    # print counts

    # Note: LDA is defined over raw term counts; feeding it TF-IDF weights
    # (as here) is statistically questionable, though pyLDAvis still renders it.
    vectorizer = TfidfVectorizer()
    dtm_tfidf = vectorizer.fit_transform(words)
    # print(dtm_tfidf.shape)

    lda_tfidf = LatentDirichletAllocation(n_components=10,
                                          learning_offset=50,
                                          max_iter=10)
    lda_tfidf.fit(dtm_tfidf)

    data = prepare(lda_tfidf, dtm_tfidf, vectorizer)
    pyLDAvis.save_html(data, './static/data.html')

    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=500,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(words)
    vocab = tf_vectorizer.get_feature_names()

    model = lda.LDA(n_topics=20, n_iter=2000, random_state=1)
    model.fit(tf)

    topic_word = model.topic_word_
    n = 5
    topics = []
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n + 1):-1]
        # print('*Topic {}\n- {}'.format(i, ', '.join(topic_words)))
        topics.append(', '.join(topic_words))

    the_counts = []
    for count in counts:
        the_counts.append({'data': count[0], 'value': count[1]})

    return topics, the_counts
Example #13
    def visualize(self):
        """ Start local web-server to display the LDA fitted model """

        if not self.fitted:
            raise ValueError('LDA model is not fitted')

        show(
            prepare(self.lda,
                    self.vectorized_data,
                    self.vectorizer,
                    mds='tsne'))
Example #14
def fit_ea_lda(df_app, output_dir, options):
    # Helper function
    def print_topics(model, count_vectorizer, n_top_words):
        words = count_vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(model.components_):
            print("\nTopic #%d:" % topic_idx)
            print(" ".join(
                [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    df_app_tmp = df_app.copy()
    df_app_tmp.drop('ANNOTATE', axis=1, inplace=True)
    if options['CLASS'] != '':
        df_app_tmp.drop('CLASS', axis=1, inplace=True)

    df_app_words = transform_to_nlp(df_app_tmp)
    makeImage(df_app_tmp.sum(), output_dir,
              ea_decode.options_filename(options) + '_' + 'WC')
    # Initialise the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(df_app_words['Words'])

    #plot_most_common_words(count_data, count_vectorizer, 15)

    # Load the LDA model from sk-learn
    from sklearn.decomposition import LatentDirichletAllocation as LDA
    # Tweak the two parameters below
    number_topics = 10
    number_words = 20
    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    # Print the topics found by the LDA model
    print("Topics found via LDA:")
    print_topics(lda, count_vectorizer, number_words)

    file_out = ea_decode.options_filename(
        options) + '_' + 'LDA_VIS' + '_' + str(number_topics)
    LDAvis_data_filepath = os.path.join(output_dir, file_out)
    # Preparing the visualization is a bit time consuming; set the
    # condition to False to skip it and reuse the pickled data.
    if True:
        LDAvis_prepared = sklearn_lda.prepare(lda, count_data,
                                              count_vectorizer)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared,
                       os.path.join(output_dir, file_out + '.html'))
Example #15
    def generate_ldavis(self):
        # 0 = (nr_of_topics, total_nr_of_words)
        # 1 = (nr_of_articles, total_nr_of_words)
        # 2 = (nr_of_articles, nr_of_words_per_article)

        params = {"mds": "pcoa"}
        try:
            LDAvis_prepared = sklearn_lda.prepare(self.nmf_model,
                                                  self.vectorized_out,
                                                  self.vectorizer, **params)
        except Exception:
            return "This visualization is currently not available."
        return LDAvis_prepared
Example #16
def lda_topics_visualization(lda, count_vectorizer, paper_words_count_matrix,
                             number_topics):
    results_file_path = os.path.join(LDA_RESULTS_FILE_PATH,
                                     'ldavis_prepared_' + str(number_topics))
    # Preparing the visualization is a bit time consuming; set the
    # condition to False to skip it and reuse the pickled data.
    if True:
        LDAvis_prepared = sklearn_lda.prepare(lda, paper_words_count_matrix,
                                              count_vectorizer)
        with open(results_file_path, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(results_file_path, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared, results_file_path + '.html')
Example #17
def get_lda_summary(number_topics, number_words, data_series, output_name):
    from sklearn.decomposition import LatentDirichletAllocation as LDA
    from pyLDAvis import sklearn as sklearn_lda
    import pickle
    import pyLDAvis
    count_vectorizer = CountVectorizer(token_pattern=r'[^\s]+')
    count_data = count_vectorizer.fit_transform(data_series)
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data,
                                          count_vectorizer)
    with open(output_name, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    pyLDAvis.save_html(LDAvis_prepared, output_name + '.html')
Example #18
def lda_vis(df_all_tweets, num_topics):
    lda_vis_path = r"C:\Users\btier\Documents\lda_vis.html"
    # start count vector with stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(
        df_all_tweets['PROCESSED_TEXT'])
    # Create / fit LDA
    lda = LDA(n_components=num_topics, n_jobs=-1)
    lda.fit(count_data)
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    # pickle requires binary mode; the original used text mode ('w' / default 'r')
    with open(lda_vis_path, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
    with open(lda_vis_path, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    return pyLDAvis.save_html(LDAvis_prepared,
                              './ldavis_prepared_' + str(num_topics) + '.html')
Example #19
    def generate_lda_visualisation(self, text):
        '''This function will create an interactive graphic using the text provided and topic count of the object.
        Also saves an html file in the local directory.

        self = dataframe object
        text = text to analyse and group'''
        LDA_vect = CountVectorizer(stop_words='english') 
        LDA_count_data = LDA_vect.fit_transform(text)

        # Create and fit the LDA model
        lda = LDA(n_components=self.topic_count, n_jobs=-1)
        lda.fit(LDA_count_data)

        # Generate the LDA visualisation and display it
        LDAvis_prepared = sklearn_lda.prepare(lda, LDA_count_data, LDA_vect)
        path = '../lda_visualisations/ldavis_prepared_'+ str(self.topic_count) +'.html'
        pyLDAvis.save_html(LDAvis_prepared, path)
        # HTML(path) would render the path string itself; use filename= to
        # embed the saved file's contents.
        display(HTML(filename=path))
Example #20
def pyldavis_visualization(corpus,
                           topics,
                           num_docs=None,
                           ngrams=1,
                           weighting='tf',
                           min_df=0.1,
                           max_df=0.7,
                           mds='pcoa',
                           *args,
                           **kwargs):
    model, doc_term_matrix, vectorizer = build_model(corpus, topics, num_docs,
                                                     ngrams, weighting, min_df,
                                                     max_df)
    prep_data = prepare(model.model, doc_term_matrix, vectorizer, mds=mds)
    out = StringIO()
    save_html(prep_data, out)
    out.seek(0)
    return (doc_term_matrix, out.read())
Example #21
def visualize_topic_model(lda, count_data, count_vectorizer,
                          num_topics, ldavis_filename_prefix):
    from pyLDAvis import sklearn as sklearn_lda
    import pickle
    import pyLDAvis

    ldavis_data_path = os.path.join(ldavis_filename_prefix + str(num_topics))
    ldavis_html_path = ldavis_filename_prefix + str(num_topics) + '.html'
    # preparing the visualization is a bit time consuming
    ldavis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(ldavis_data_path, 'wb') as f:
        pickle.dump(ldavis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(ldavis_data_path, 'rb') as f:
        ldavis_prepared = pickle.load(f)
    pyLDAvis.save_html(ldavis_prepared, ldavis_html_path)
Example #22
    def init(self, df: pd.DataFrame, generate_visualization=False, lang="fi"):
        """
        :param df: :class:`~pandas.DataFrame` containing text columns
        :param generate_visualization: Generate visualization of LDA results. Slows down
                                       generation notably.
        :param lang: Language for :class:`~Voikko`
        """
        if self._count_vector and self._lda:
            return True

        file_words = self.instance_path() / "word.dat"
        file_lda = self.instance_path() / "lda.dat"
        file_ldavis = self.instance_path() / "ldavis.html"

        try:
            # Try loading saved lda files.
            self._count_vector = joblib.load(file_words)
            self._lda = joblib.load(file_lda)
        except FileNotFoundError as e:
            logger.exception(e)

            texts = [x for x in df.to_numpy().flatten() if x is not np.NaN]

            # Setup word count vector
            self._count_vector = CountVectorizer(
                tokenizer=self.text_tokenize,
                stop_words=self.stop_words
            )
            count_data = self._count_vector.fit_transform(texts)

            self._lda = LDA(n_components=self.number_topics, n_jobs=-1)
            self._lda.fit(count_data)

            if generate_visualization:
                logger.debug("Generating LDA visualization. This might take a while")
                from pyLDAvis import sklearn as sklearn_lda
                import pyLDAvis

                LDAvis_prepared = sklearn_lda.prepare(self._lda, count_data, self._count_vector)
                pyLDAvis.save_html(LDAvis_prepared, str(file_ldavis))

            joblib.dump(self._count_vector, file_words)
            joblib.dump(self._lda, file_lda)
Example #23
def pyldavis_run(lda_model_path, document_term_matrix_path, vectorizer_path):
    '''
    Computes the pyLDAvis visualisation of the LDA model.

    Parameters
    ----------
    lda_model_path : str
        Path of the pickle object (serialised python object) of the LDA model. This is created in the lda_tsne_model2.py module.
    document_term_matrix_path : str
        Path of the pickle object (serialised python object) of the document-term matrix, which is created using the CountVectorizer in the lda_tsne_model2.py module.
    vectorizer_path : str
        Path of the pickle object (serialised python object) of the vectorizer used to create the document-term matrix. This is usually the CountVectorizer in the lda_tsne_model2.py module.

    Returns
    ----------
    Embedded html pyLDAvis visualisation of the LDA model.
    '''

    t0 = time.time()

    # loading the pickle objects from the paths parameters.
    lda_model = pickle.load(open(lda_model_path, "rb"))
    document_term_matrix = pickle.load(open(document_term_matrix_path, "rb"))
    cvectorizer = pickle.load(open(vectorizer_path, "rb"))

    # Prepares the pyLDAvis visualisation. There is a choice of dimensionality reduction methods here; t-SNE is chosen as it is
    # consistent with the previous analysis in the lda_tsne_model2.py module and has been shown to yield better results than other available methods.
    prepared_data = prepare(lda_model,
                            document_term_matrix,
                            cvectorizer,
                            mds='tsne',
                            plot_opts={
                                'xlab': '',
                                'ylab': ''
                            })

    html = pyLDAvis.prepared_data_to_html(prepared_data)

    t1 = time.time()
    print("time for pyldavis: " + str(t1 - t0), file=sys.stdout)

    return html
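For context, a minimal sketch of how the three pickled inputs to pyldavis_run might be produced. The lda_tsne_model2.py module referenced in the docstring is not shown here, so the corpus and file names below are invented for illustration:

import pickle
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

documents = ["apples and oranges", "oranges and bananas"]  # placeholder corpus

cvectorizer = CountVectorizer(stop_words='english')
document_term_matrix = cvectorizer.fit_transform(documents)
lda_model = LatentDirichletAllocation(n_components=2).fit(document_term_matrix)

# Serialise the three objects that pyldavis_run() expects as paths.
for obj, path in [(lda_model, 'lda_model.pkl'),
                  (document_term_matrix, 'document_term_matrix.pkl'),
                  (cvectorizer, 'vectorizer.pkl')]:
    with open(path, 'wb') as f:
        pickle.dump(obj, f)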
Example #24
def wordcloud_visualization(corpus,
                            topics,
                            num_docs=None,
                            min_df=0.1,
                            ngrams=1,
                            weighting='tf',
                            max_df=0.7,
                            mds='pcoa',
                            *args,
                            **kwargs):
    font = pkg_resources.resource_filename(__name__,
                                           "fonts/ZillaSlab-Medium.ttf")
    print(font)
    model, doc_term_matrix, vectorizer = build_model(corpus, topics, num_docs,
                                                     ngrams, weighting, min_df,
                                                     max_df)
    prep_data = prepare(model.model, doc_term_matrix, vectorizer, mds=mds)
    ti = prep_data.topic_info
    topic_labels = ti.groupby(['Category']).groups.keys()

    plt.clf()
    topics = []
    for label in topic_labels:
        out = StringIO()
        df = ti[ti.Category == label].sort_values(by='Total',
                                                  ascending=False)[:20]
        tf = dict(df[['Term', 'Total']].to_dict('split')['data'])

        wc = wordcloud.WordCloud(font_path=font,
                                 width=600,
                                 height=300,
                                 background_color='white')
        wc.fit_words(tf)
        plt.imshow(wc)
        plt.axis('off')
        # StringIO is a text buffer, so write the figure as SVG; the default
        # PNG output would require a binary (BytesIO) buffer.
        plt.savefig(out, format='svg')
        out.seek(0)
        topics.append((label, out.read()))

    return topics
    """
Example #25
    def __init__(self, corpus, numTopics=20, load=True, language='en'):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            print("Instantiating Latent Dirichlet Allocation (Topic Modeling)")
            if not load:
                # self.SP = SentenceProcessor(language=language)
                # self.countVectorizer = CountVectorizer(stop_words=self.SP.stopWords, lowercase=True, strip_accents='ascii')
                # self.countData = self.countVectorizer.fit_transform( [self.SP.getProcessedSentence(doc.text) for doc in corpus.docList] )
                # with open("./modelsData/LDA/countVectorizer_59k.pkl", 'wb') as f: pickle.dump(self.countVectorizer, f)
                # with open("./modelsData/LDA/countData_59k.pkl", 'wb') as f: pickle.dump(self.countData, f)

                with open("./modelsData/LDA/countVectorizer_59k.pkl",
                          'wb') as f:
                    self.countVectorizer = pickle.load(f)
                with open("./modelsData/LDA/countData_59k.pkl", 'wb') as f:
                    self.countData = pickle.load(f)

                self.lda = SkLearnLDA(n_components=numTopics,
                                      n_jobs=3,
                                      max_iter=100,
                                      verbose=1,
                                      random_state=0)
                self.lda.fit(self.countData)
                with open("./modelsData/LDA/SKLearnLDAModel.pkl", 'wb') as f:
                    pickle.dump(self.lda, f)
                self.ldaModel = sklearn_lda.prepare(self.lda, self.countData,
                                                    self.countVectorizer)
                with open("./modelsData/LDA/SKLearnLDA.pkl", 'wb') as f:
                    pickle.dump(self.ldaModel, f)
            else:
                with open("./modelsData/LDA/SKLearnLDA_59k_100it.pkl",
                          "rb") as f:
                    self.ldaModel = pickle.load(f)
                with open("./modelsData/LDA/SKLearnLDAModel_59k_100it.pkl",
                          'rb') as f:
                    self.lda = pickle.load(f)
Example #26
number_topics = 3
random_seed = 2

# Create and fit the LDA model
lda = LDA(n_components=number_topics, random_state=random_seed, verbose=1)
lda.fit(count_data)

# Define the word list
number_words = 10
words = count_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("\nTopic #%d" % topic_idx)
    print(" ".join([words[i] for i in topic.argsort()[:-number_words:-1]]))
    
pyLDAvis.enable_notebook()
sklearn_lda.prepare(lda, count_data, count_vectorizer)

# Calculate the topic distributions for all articles in the training and test sets
X_train = lda.transform(count_data)
X_test = lda.transform(count_test)

# Find the topic given the topic distribution in the training set
Topic_train = np.argmax(X_train, axis=1)
Topic_train_df = pd.DataFrame(Topic_train, columns=["Topic"])
# Find the topic given the topic distribution in the test set
Topic_test = np.argmax(X_test, axis=1)
Topic_test_df = pd.DataFrame(Topic_test, columns=["Topic"])
# Reset the index of the df for Topic_test
Topic_test_df.index += df_test.index[0]

# Allocate the topic to the original dataframe
Example #27
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

# %%time  (Jupyter cell magic in the original notebook)
from pyLDAvis import sklearn as sklearn_lda
import pickle
import pyLDAvis
LDAvis_data_filepath = './ldavis_prepared_' + str(number_topics)
# Preparing the visualization is a bit time consuming; set the
# condition to False to skip it and reuse the pickled data.
if True:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    # pickle needs binary mode ('wb'/'rb'), not text mode
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared,
                   './ldavis_prepared_' + str(number_topics) + '.html')




from wordcloud import WordCloud

wordcloud = WordCloud(background_color="white", 
	max_words=5000, contour_width=3, 
Example #28
def buildsklearnselectedworks(so: SearchObject, bagsofsentences: list):
    """
    see:
        http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    see also:

        https://nlpforhackers.io/topic-modeling/

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.


    see:
        https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

    max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

        max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        max_df = 25 means "ignore terms that appear in more than 25 documents".

    The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

    min_df is used for removing terms that appear too infrequently. For example:

        min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
        min_df = 5 means "ignore terms that appear in less than 5 documents".

    The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

    notes:
        maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.
        maxfreq of

    on the general issue of graphing see also:
        https://speakerdeck.com/bmabey/visualizing-topic-models
        https://de.dariah.eu/tatom/topic_model_visualization.html

    on the axes:
        https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

    """

    activepoll = so.poll
    vv = so.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    activepoll.statusis('Running the LDA vectorizer')
    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                    min_df=settings['minfreq'],
                                    max_features=settings['maxfeatures'])

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    ldamodel = LatentDirichletAllocation(n_components=settings['components'],
                                         max_iter=settings['iterations'],
                                         learning_method='online',
                                         learning_offset=50.,
                                         random_state=0)

    ldamodel.fit(ldavectorized)

    visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
    # pyLDAvis.save_html(visualisation, 'ldavis.html')

    ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
    storevectorindatabase(so, ldavishtmlandjs)

    return ldavishtmlandjs
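The max_df / min_df semantics quoted in the docstring above are easy to verify directly; a minimal sketch with an invented four-document corpus and thresholds chosen purely for illustration:

from sklearn.feature_extraction.text import CountVectorizer

# 'common' appears in all four documents, 'gamma' in only one.
docs = ["common alpha", "common beta", "common alpha beta", "common gamma"]

# max_df=0.75: ignore terms in more than 75% of documents ('common').
# min_df=2: ignore terms in fewer than 2 documents ('gamma').
cv = CountVectorizer(max_df=0.75, min_df=2)
cv.fit(docs)
print(sorted(cv.vocabulary_))  # ['alpha', 'beta']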
Example #29
def _lda4(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=5,
          num_topic_word=10,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[[
        'index', 'vocabularies_weights', 'vocabularies', 'weights'
    ]]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [
        doc_topic[i].argmax() + 1 for i in range(len(corpus))
    ]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood,
               perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example #30
    def show_pyLDAvis_plots(self):
        from pyLDAvis.sklearn import prepare
        # Return the prepared data so a notebook can render it; the original
        # discarded prepare()'s return value.
        return prepare(self.model['lda_tf'], self.model['dtm_tf'],
                       self.model['tf_vectorizer'])