示例#1
0
def pretrained_doc2vec(texts,
                       labels=[],
                       pretrained_emb="saved_models/apnews_dbow/doc2vec.bin",
                       epochs=10,
                       workers=3,
                       lr_reduce=0.002,
                       rm_training_data=False,
                       save_model=True,
                       save_dir='saved_models',
                       filename='',
                       save_as_word2vec=True,
                       **kwargs):
    """
    Train a Doc2Vec model initialised from pretrained embeddings.

    `texts`:            documents handed to `LabeledLineSentence`
    `labels`:           document labels handed to `LabeledLineSentence`
    `pretrained_emb`:   str, forwarded to `Doc2Vec(pretrained_emb=...)`
    `epochs`:           int, number of single-epoch training passes
    `workers`:          int, forwarded to `Doc2Vec`
    `lr_reduce`:        float, amount subtracted from the model's `alpha`
                        after every pass
    `rm_training_data`: bool, call `delete_temporary_training_data` once
                        training is finished
    `save_model`:       bool, save the trained model to disk
    `save_dir`:         str, folder passed to `save_folder_file`
    `filename`:         str, model file name; a default is derived from
                        `epochs` when empty
    `save_as_word2vec`: bool, additionally export in word2vec format
                        (only when `save_model` is true)
    `**kwargs`:         forwarded to the `Doc2Vec` constructor

    Returns the trained Doc2Vec model.
    """
    corpus_iter = LabeledLineSentence(texts, labels)

    model = Doc2Vec(pretrained_emb=pretrained_emb, workers=workers, **kwargs)
    model.build_vocab(corpus_iter)

    for _ in range(epochs):
        # One pass at the current learning rate ...
        model.train(corpus_iter,
                    total_examples=model.corpus_count,
                    epochs=1,
                    start_alpha=model.alpha)
        # ... then lower the rate and reshuffle the corpus for the next pass.
        model.alpha -= lr_reduce
        texts, labels = shuffle(texts, labels)
        corpus_iter = LabeledLineSentence(texts, labels)

    if rm_training_data:
        print(
            'Deleting training data - keeping doctag vectors and inference...')
        model.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)

    if not save_model:
        return model

    if len(filename) == 0:
        filename = 'pretrained_d2v_{}epochs_'.format(epochs)

    model_path = save_folder_file(save_dir,
                                  filename,
                                  ext='.model',
                                  optional_folder='WordEmbeddings')

    if save_as_word2vec:
        w2v_name = 'pretrained_d2v_to_w2v_{}epochs_'.format(epochs)
        w2v_path = save_folder_file(save_dir,
                                    w2v_name,
                                    ext='.word2vec',
                                    optional_folder='WordEmbeddings')
        model.save_word2vec_format(w2v_path)
    model.save(model_path)

    return model
    def LSI(self, num_topics=10, 
                  print_params=True, 
                  save_model=True,
                  save_dir='saved_models',
                  filename='',
                  **kwargs):
        '''
        Topic modeling with Latent Semantic Indexing.

        Builds `models.LsiModel` from `self.bow` and `self.gensim_dict`.

         `num_topics`:   int, number of topics requested from the model
         `print_params`: bool, print the parameters used
         `save_model`:   bool, save the model to disk
         `save_dir`:     str, folder passed to `save_folder_file`
         `filename`:     str, file name; a default built from `num_topics`
                         and `self.tfidf` is used when empty
         `**kwargs`:     forwarded to `models.LsiModel`

        Returns the LSI model.
        '''
        fitted = models.LsiModel(self.bow,
                                 id2word=self.gensim_dict,
                                 num_topics=num_topics,
                                 **kwargs)
        print('Running LSI model...\n')

        if print_params:
            print('Parameters used in model:')
            summary = 'Number of topics: {}\nTFIDF transformation: {}\n'
            print(summary.format(num_topics, self.tfidf))

        if not save_model:
            return fitted

        if len(filename) == 0:
            filename = 'LSI_Params_NT{}_TFIDF{}_'.format(num_topics, self.tfidf)

        target = save_folder_file(save_dir, filename, ext='.model', optional_folder='LSI')
        fitted.save(target)
        print('Saving LSI model to: \n{}\n'.format(target))

        return fitted
    def top_texts_per_topic(self, df_dominant_topic,
                                   save_output=True,
                                   save_dir='results',
                                   filename=''):
        '''
        Most representative statement for each topic.
        Helps to make sense of each topic (for labeling).

         `df_dominant_topic`: DataFrame with at least the columns
                              'Dominant_Topic' and 'Percent_Contribution'
         `save_output`:       bool, write the resulting table to a csv file
         `save_dir`:          str, folder passed to `save_folder_file`
         `filename`:          str, file name; 'top_texts_per_topic' when empty

        Returns a DataFrame holding, for every 'Dominant_Topic' value, the
        single row with the largest 'Percent_Contribution' (index reset).
        An input without any group yields an empty DataFrame.
        '''
        # Collect the best row of every topic first and concatenate once;
        # growing a frame inside the loop copies all previous rows each time.
        best_rows = [
            grp.sort_values('Percent_Contribution', ascending=False).head(1)
            for _, grp in df_dominant_topic.groupby('Dominant_Topic')
        ]

        if best_rows:
            sent_topics_sorteddf = pd.concat(best_rows, axis=0)
        else:
            # pd.concat refuses an empty list, so fall back to an empty frame
            sent_topics_sorteddf = pd.DataFrame()

        sent_topics_sorteddf = sent_topics_sorteddf.reset_index(drop=True)

        if save_output:
            if len(filename) == 0:
                filename = 'top_texts_per_topic'

            full_path = save_folder_file(save_dir, filename, ext='.csv')
            print('Saving the table to: {}'.format(full_path))
            sent_topics_sorteddf.to_csv(full_path, index=False)

        return sent_topics_sorteddf
    def toolkit_cv_plot(self, varying_params, 
                         constant_params,
                         save_plot=True,
                         save_dir='results/model_validation',
                         filename='',
                         ext='.pdf', 
                         size=(20, 15),
                         **kwargs):
        '''
        Using tmtoolkit for parameter tuning based on a wider variety of measures.

         `varying_params`:  parameter sets to evaluate, passed to
                            `tm_gensim.evaluate_topic_models`
         `constant_params`: parameters shared by every evaluated model
         `save_plot`:       bool, save the evaluation figure
         `save_dir`:        str, folder passed to `save_folder_file`
         `filename`:        str, file name; 'tmtoolkit_CV_' when empty
         `ext`:             str, file extension of the saved figure
         `size`:            tuple, figure size passed to `plot_eval_results`
         `**kwargs`:        forwarded to `tm_gensim.evaluate_topic_models`

        Returns the evaluation results grouped by 'num_topics'.
        '''
        # NOTE: this installs a process-wide filter, it is not undone on exit.
        warnings.filterwarnings("ignore", category=UserWarning)

        print('evaluating {} topic models'.format(len(varying_params)))
        eval_results = tm_gensim.evaluate_topic_models(
            (self.gensim_dict, self.bow),
            varying_params,
            constant_params,
            coherence_gensim_texts=self.text,
            **kwargs)

        results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
        plot_eval_results(results_by_n_topics, xaxislabel='num topics',
                          title='Evaluation results', figsize=size)

        if save_plot:
            # Only fall back to the default name when the caller gave none
            if len(filename) == 0:
                filename = 'tmtoolkit_CV_'
            full_path = save_folder_file(save_dir, filename, ext=ext,
                                         optional_folder='convergence_plots')

            plt.savefig(full_path)

        return results_by_n_topics
    def LDAvis(self, model,
                     save_plot=True,
                     save_dir='results',
                     filename='',
                     ext='.html',
                     show_plot=True,
                     is_notebook=True,
                     mds='mds',
                     sort_topics=False,
                     **kwargs):
        '''
        Use pyLDAvis to visualize clustering.

         `model`:       topic model passed to `gensimvis.prepare`
         `save_plot`:   bool, save the visualization
         `save_dir`:    str, folder passed to `save_folder_file`
         `filename`:    str, file name; 'LDAvis_plot_' when empty
         `ext`:         str, file extension; only '.html' is written, any
                        other value prints a notice and saves nothing
         `show_plot`:   bool, display the visualization
         `is_notebook`: bool, when showing, return the prepared object for
                        notebook rendering instead of calling `pyLDAvis.show`
         `mds`:         forwarded to `gensimvis.prepare`
         `sort_topics`: forwarded to `gensimvis.prepare`
         `**kwargs`:    forwarded to `gensimvis.prepare`

        Returns the prepared visualization when `show_plot` and `is_notebook`
        are both true, otherwise None.
        '''

        print('Rendering visualization...')

        vis = gensimvis.prepare(model, self.bow, self.gensim_dict,
                                mds=mds, sort_topics=sort_topics, **kwargs)

        if save_plot:
            if len(filename) == 0:
                filename = 'LDAvis_plot_'
            # The path must be resolved for caller-supplied file names too,
            # otherwise `full_path` is undefined when saving.
            full_path = save_folder_file(save_dir, filename, ext=ext,
                                         optional_folder='LDAvis_plots')
            if ext == '.html':
                pyLDAvis.save_html(vis, full_path)
            else:
                print('File extension not supported')

        if show_plot:
            if is_notebook:
                return vis  # rendered by the notebook
            pyLDAvis.show(vis)
    def HDP(self, print_params=True, 
                  save_model=True,
                  save_dir='saved_models',
                  filename='',
                  **kwargs):
        '''
        Estimate a 'good' number of topics to set, based on the data.

        Builds `models.HdpModel` from `self.bow` and `self.gensim_dict`.

         `print_params`: bool, print the parameters used
         `save_model`:   bool, save the model to disk
         `save_dir`:     str, folder passed to `save_folder_file`
         `filename`:     str, file name; a default built from `self.tfidf`
                         is used when empty
         `**kwargs`:     forwarded to `models.HdpModel`

        Returns the HDP model.
        '''
        fitted = models.HdpModel(self.bow, id2word=self.gensim_dict, **kwargs)

        print('Inferring number of topics with Hierarchical Dirichlet Process...\n')

        if print_params:
            print('Parameters used in model:')
            print('TFIDF transformation: {}\n'.format(self.tfidf))

        if not save_model:
            return fitted

        if len(filename) == 0:
            filename = 'HDP_Params_TFIDF{}_'.format(self.tfidf)

        target = save_folder_file(save_dir, filename, ext='.model', optional_folder='HDP')
        fitted.save(target)
        print('Saving HDP model to: \n{}\n'.format(target))

        return fitted
    def format_topics_sentences(self,
                                save_output=True,
                                save_dir='results',
                                filename=''):
        '''
        Find the dominant topic in each statement:
        the topic with the highest percentage contribution in each statement.

        Reads `self.model`, `self.corpus` and `self.texts`.

         `save_output`: bool, write the resulting table to a csv file
         `save_dir`:    str, folder passed to `save_folder_file`
         `filename`:    str, file name; 'dominant_topic_per_text_' when empty

        Returns a DataFrame with the columns 'index', 'Dominant_Topic'
        (1-based topic number, stored as an integer), 'Percent_Contribution'
        (rounded to 4 decimals) and 'Important_Keywords', followed by the
        contents of `self.texts`.
        '''
        column_names = ['Dominant_Topic', 'Percent_Contribution', 'Important_Keywords']

        # Rows are gathered in a plain list and turned into a frame once;
        # DataFrame.append is gone from pandas 2.x and was quadratic anyway.
        rows = []
        for doc_result in self.model[self.corpus]:
            # The first element of each result holds the (topic, probability) pairs
            topic_probs = doc_result[0]
            if len(topic_probs) == 0:
                continue  # nothing to report for this document

            # Highest probability wins; on ties the earliest pair is kept
            topic_num, prop_topic = max(topic_probs, key=lambda pair: pair[1])
            wp = self.model.show_topic(topic_num)
            topic_keywords = ", ".join(word for word, _ in wp)
            rows.append((int(topic_num) + 1, round(prop_topic, 4), topic_keywords))

        sent_topics_df = pd.DataFrame(rows, columns=column_names)

        # Add original text to the end of the output
        sent_topics = pd.concat([sent_topics_df, self.texts], axis=1)

        topics_df = sent_topics.reset_index()

        if save_output:
            if len(filename) == 0:
                filename = 'dominant_topic_per_text_'

            full_path = save_folder_file(save_dir, filename, ext='.csv')
            print('Saving the table to: {}'.format(full_path))
            topics_df.to_csv(full_path, index=False)

        return topics_df
    def gensimBOW(self, gensim_dict,
                        save_matrix=True,
                        save_dir='data/corpus_data',
                        filename=''):
        '''
        Make a gensim Bag-of-Words representation matrix.

         `gensim_dict`: object whose `doc2bow` is applied to every entry of
                        `self.data`
         `save_matrix`: bool, serialize the corpus to a .mm file
         `save_dir`:    str, folder passed to `save_folder_file`
         `filename`:    str, file name; 'BOWmat' when empty

        Returns the list of `doc2bow` results, one per entry of `self.data`.
        '''
        bow_corpus = list(map(gensim_dict.doc2bow, self.data))

        if not save_matrix:
            return bow_corpus

        if len(filename) == 0:
            filename = 'BOWmat'

        target = save_folder_file(save_dir, filename, ext='.mm')
        # keep a copy on disk for later use
        corpora.MmCorpus.serialize(target, bow_corpus)
        print('Saving .mm matrix to {}\n'.format(target))

        return bow_corpus
    def topic_distribution(self, df_dominant_topic,
                                 top_text_topic,
                                 save_output=True,
                                 save_dir='results',
                                 filename=''):
        '''
        Topic distribution across statements:
        how many documents fall under each topic, and which share of the total.

         `df_dominant_topic`: DataFrame with a 'Dominant_Topic' column
         `top_text_topic`:    DataFrame with the columns 'Dominant_Topic'
                              and 'Important_Keywords'
         `save_output`:       bool, write the resulting table to a csv file
         `save_dir`:          str, folder passed to `save_folder_file`
         `filename`:          str, file name; 'doc_distribution_in_topics'
                              when empty

        Returns a DataFrame with the columns 'Dominant_Topic',
        'Important_Keywords', 'Num_Documents' and 'Perc_Documents'.
        '''
        # Documents per topic, and the same counts as a rounded share of the total
        docs_per_topic = df_dominant_topic['Dominant_Topic'].value_counts()
        share_per_topic = round(docs_per_topic / docs_per_topic.sum(), 5)

        # Side-by-side table; the topic number moves from the index into a column
        stats = pd.concat([docs_per_topic, share_per_topic], axis=1)
        stats = stats.reset_index(level=0)
        stats.columns = ['Dominant_Topic', 'Num_Documents', 'Perc_Documents']

        # Attach the statistics to each topic's keywords
        keywords = top_text_topic[['Dominant_Topic', 'Important_Keywords']]
        df_dominant_topics = keywords.merge(stats, on='Dominant_Topic', how='left')

        if save_output:
            if len(filename) == 0:
                filename = 'doc_distribution_in_topics'

            target = save_folder_file(save_dir, filename, ext='.csv')
            print('Saving the table to: {}'.format(target))
            df_dominant_topics.to_csv(target, index=False)

        return df_dominant_topics
    def gensimDict(self, min_word_len=3,
                         prop_docs=0.8,
                         compact=True,
                         save_dict=True,
                         save_dir='data/corpus_data',
                         filename='',
                         keep_n = None):
        '''
        Build, filter and optionally save a gensim dictionary of `self.data`.

         `min_word_len`: int, forwarded to `filter_extremes` as `no_below`
         `prop_docs`:    float (0 to 1), forwarded to `filter_extremes` as `no_above`
         `compact`:      bool, call `compactify()` to close gaps in the id sequence
         `save_dict`:    bool, save the dictionary to disk
         `save_dir`:     str, folder passed to `save_folder_file`
         `filename`:     str, file name; a default built from `min_word_len`
                         and `prop_docs` is used when empty
         `keep_n`:       int, forwarded to `filter_extremes` (None keeps all)

        Returns the filtered dictionary.

        NOTE(review): the name and the printed message describe `min_word_len`
        as a word length, but it is passed as `no_below`, which gensim
        documents as a minimum document count - confirm which is intended.
        '''
        dict_words = corpora.Dictionary(self.data)

        notice = ('Removing words of less than {} characters, and '
                  'words present in at least {}% of documents\n')
        print(notice.format(min_word_len, prop_docs))
        dict_words.filter_extremes(no_below=min_word_len,
                                   no_above=prop_docs,
                                   keep_n=keep_n)

        if compact:
            # close the gaps left in the id sequence by the removed words
            dict_words.compactify()
            print('Removing gaps in indices caused by preprocessing...\n')

        if not save_dict:
            return dict_words

        if len(filename) == 0:
            filename = 'Gensim_dict_Params_MWL{}_PD{}_'.format(min_word_len, prop_docs)

        target = save_folder_file(save_dir, filename, ext='.dict')
        dict_words.save(target)  # kept on disk for future reference
        print('Saving gensim dictionary to {}\n'.format(target))

        return dict_words
    def convergence_plot(self, log_file,
                               eval_every=5,
                               save_plot=True,
                               save_dir='results/model_validation',
                               filename='',
                               ext='.pdf', 
                               size=(12, 9),
                               show_plot=True):
        '''
        Plot the perplexity values found in a training log file, to see
        whether the model converged.

         `log_file`:   str, path of the log file to scan
         `eval_every`: int, spacing between two logged evaluations, used to
                       label the x axis
         `save_plot`:  bool, save the figure
         `save_dir`:   str, folder passed to `save_folder_file`
         `filename`:   str, file name; 'perplex_convergence_plot_' when empty
         `ext`:        str, file extension of the saved figure
         `size`:       tuple, figure size
         `show_plot`:  bool, display the figure; it is closed otherwise
        '''
        pattern = re.compile(r'(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity')

        # Only the first match of a line is used; the context manager makes
        # sure the log file handle is released.
        perplexity = []
        with open(log_file) as log:
            for line in log:
                found = pattern.search(line)
                if found:
                    perplexity.append(float(found.group(2)))

        iterations = list(range(0, len(perplexity) * eval_every, eval_every))

        plt.figure(figsize=size)
        plt.plot(iterations, perplexity)
        plt.ylabel("Perplexity", fontsize=15)
        plt.xlabel("Iteration", fontsize=15)
        plt.title("Topic Model Convergence", fontsize=20)
        plt.grid()

        if save_plot:
            # Only fall back to the default name when the caller gave none
            if len(filename) == 0:
                filename = 'perplex_convergence_plot_'
            full_path = save_folder_file(save_dir, filename, ext=ext,
                                         optional_folder='convergence_plots')

            plt.savefig(full_path)

        if show_plot:
            plt.show()
        else:
            plt.close()
    def plotTSNE(
            self,
            n_top_words=8,  # number of keywords we show
            save_dir='visualization',
            filename='',
            ext='.html'):
        '''
        Dimension reduction plot using T-SNE.
        Automatically saves - the plot is not displayed automatically.
        Output is a html file with the plot.

        Reads `self.X_topics`, `self.num_example`, `self.tsne_model`,
        `self.model`, `self.tf_vectorizer`, `self.cleaned`,
        `self.n_components` and `self.threshold`.

         `n_top_words`: int, number of keywords shown for each topic
         `save_dir`:    str, folder passed to `save_folder_file`
         `filename`:    str, file name; a descriptive default is built when empty
         `ext`:         str, file extension of the saved plot
        '''

        # 20 colors
        # NOTE(review): indexed by topic number below, so a topic index of 20
        # or more would fall outside this array.
        colormap = np.array([
            "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a",
            "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
            "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d",
            "#17becf", "#9edae5"
        ])

        X_topics = self.X_topics
        num_example = self.num_example
        tsne_model = self.tsne_model
        topic_word = self.model.components_  # all topic words
        vocab = self.tf_vectorizer.get_feature_names()
        cleaned = self.cleaned

        # Index of the largest entry in each row of X_topics
        # (presumably the dominant topic of each document - confirm)
        _model_keys = []
        for i in range(X_topics.shape[0]):
            _model_keys.append(X_topics[i].argmax())

        topic_summaries = []
        for i, topic_dist in enumerate(topic_word):
            # get topic keywords and append
            # (the n_top_words highest-weighted words, highest first)
            topic_words = np.array(vocab)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            topic_summaries.append(' '.join(topic_words))

        # Limited to the first self.num_example entries
        dict_df = {
            'content': cleaned[:num_example],
            'topic_key': _model_keys[:num_example]
        }
        df = pd.DataFrame(data=dict_df)

        # NOTE(review): `source` is not referenced again in this method, so the
        # hover tooltip fields may not resolve - confirm.
        source = bp.ColumnDataSource(df)

        # From here on num_example is the number of rows in X_topics, not
        # self.num_example; the title, the colors and the default file name
        # all use this rebound value.
        num_example = len(X_topics)

        # plot
        title = "[t-SNE visualization of LDA model trained on {} statements, " \
                "{} topics, thresholding at {} topic probability, ({} data " \
                "points and top {} words)".format(X_topics.shape[0],
                                                  self.n_components,
                                                  self.threshold,
                                                  num_example,
                                                  n_top_words)

        plot_lda = bp.figure(
            plot_width=1400,
            plot_height=1100,
            title=title,
            tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
            x_axis_type=None,
            y_axis_type=None,
            min_border=1)

        # One point per row of tsne_model, colored by its key topic
        plot_lda.scatter(x=tsne_model[:, 0],
                         y=tsne_model[:, 1],
                         color=colormap[_model_keys][:num_example])

        # Use the coordinate of the first text assigned to each topic as the
        # position of that topic's keywords. The array starts out all-NaN and
        # the loop stops as soon as every topic has a coordinate; a topic that
        # is never a key topic keeps its NaN coordinate.
        topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
        for topic_num in _model_keys:
            if not np.isnan(topic_coord).any():
                break
            topic_coord[topic_num] = tsne_model[_model_keys.index(topic_num)]

        # plot crucial words
        for i in range(X_topics.shape[1]):
            plot_lda.text(topic_coord[i, 0], topic_coord[i, 1],
                          [topic_summaries[i]])

        # hover tools
        hover = plot_lda.select(dict(type=HoverTool))
        hover.tooltips = {"content": "@content - topic: @topic_key"}

        # Default file name mirrors the values shown in the plot title
        if len(filename) == 0:
            filename = "{}_statements_" \
                       "{}_topics_{}_topic_prob_threshold_" \
                       "{}_data_pts_and_top_{}_words".format(X_topics.shape[0],
                                                             self.n_components,
                                                             self.threshold,
                                                             num_example,
                                                             n_top_words)

        full_path = save_folder_file(save_dir, filename, ext=ext)
        print('T-SNE html output saved to `{}`.\n'.format(full_path))

        # save the plot
        save(plot_lda, full_path)
示例#13
0
def plot_groups_w2v(w2v,
                    size=(18, 10),
                    n_clusters=4,
                    max_iter=100,
                    init='k-means++',
                    max_idx=200,
                    title='2D Rendition of Keywords, by Category',
                    random_state=0,
                    with_adjust_text=False,
                    group_color_list=None,
                    **kwargs):
    """
    Cluster word vectors with K-means and plot a 2D PCA projection of a
    random sample of them, colored by cluster.

    `w2v`:              word-vector model exposing `vocab` and `w2v[word]`
    `size`:             tuple, figure size
    `n_clusters`:       int, number of K-means clusters
    `max_iter`:         int, forwarded to `KMeans`
    `init`:             str, forwarded to `KMeans`
    `max_idx`:          int, maximum number of words drawn; capped at the
                        vocabulary size (None keeps every word)
    `title`:            str, plot title
    `random_state`:     int, seed used when sampling the words to draw
    `with_adjust_text`: bool, let `adjust_text` reposition the word labels
    `group_color_list`: sequence with one color per cluster; random colors
                        are drawn when not given
    `**kwargs`:         forwarded to `KMeans`

    The figure is saved through `save_folder_file` under
    'results/model_validation' and then shown.
    """
    # Words and their vectors, kept in the same order
    words_label = list(w2v.vocab.keys())
    words_np = [w2v[word] for word in words_label]
    print('Added {} words. Shape {}'.format(len(words_np), np.shape(words_np)))

    # Apply K-means clustering on the model (a single fit is enough: the
    # labels are read from the fitted estimator)
    kmeans_model = KMeans(n_clusters=n_clusters,
                          init=init,
                          max_iter=max_iter,
                          **kwargs)
    kmeans_model.fit(words_np)
    labels = kmeans_model.labels_.tolist()

    # Sampling without replacement cannot return more rows than there are,
    # so never ask for more words than the vocabulary holds.
    n_samples = None if max_idx is None else min(max_idx, len(words_np))
    words_np, labels, words_label = shuffle(words_np,
                                            labels,
                                            words_label,
                                            n_samples=n_samples,
                                            random_state=random_state)

    datapoint = PCA(n_components=2).fit(words_np).transform(words_np)

    if group_color_list is None or len(group_color_list) == 0:
        # default to Tableau colors; switch to the larger CSS4 table when
        # there are more clusters than Tableau colors
        if n_clusters < 11:
            color_list = list(mcolors.TABLEAU_COLORS.values())
        else:
            color_list = list(mcolors.CSS4_COLORS.values())
        group_color_list = np.random.choice(color_list,
                                            n_clusters,
                                            replace=False)

    color = [group_color_list[lab] for lab in labels]
    plt.figure(figsize=size)
    texts = []
    for index, vec in enumerate(datapoint):
        x, y = vec[0], vec[1]
        plt.scatter(x, y, s=100, c=color[index], edgecolors='#000000')
        if with_adjust_text:
            # collected so that adjust_text can move them apart afterwards
            texts.append(plt.annotate(words_label[index], xy=(x, y), size=15))
        else:
            plt.annotate(words_label[index], xy=(x, y), size=25)

    plt.tick_params(labelsize=15)
    plt.xticks(rotation=45)
    plt.title(title, fontsize=20)

    if with_adjust_text:
        adjust_text(texts)

    filename = 'class_w2v'

    full_path = save_folder_file('results/model_validation',
                                 filename,
                                 ext='.pdf',
                                 optional_folder='CV_score_plots')

    plt.savefig(full_path)
    plt.show()
    def score_plot(self, tuning_df,
                         save_plot=True,
                         save_dir='results/model_validation',
                         ext='.pdf', 
                         size=(12, 5),
                         is_notebook=True,
                         tune_params=['eta', 'decay'],
                         score = ['coherence', 'perplexity'],
                         pref = ['higher', 'lower']):
        '''
        Plots showing the validation scores vs. number of topics
        used in the tuning process, one panel per score and one line per
        combination of the two tuning parameters.

         `tuning_df`:   DataFrame with a 'topic_num' column, the two columns
                        named in `tune_params` and one column per `score`
         `save_plot`:   bool, save the figure
         `save_dir`:    str, folder passed to `save_folder_file`
         `ext`:         str, file extension of the saved figure
         `size`:        tuple, figure size
         `is_notebook`: bool, call `plt.show()`
         `tune_params`: two column names used to group `tuning_df`
         `score`:       column names to plot, one panel each
         `pref`:        for each score, the word shown in the panel title
                        ('higher' or 'lower' is better)

        When saving, the file name is built from `self.start`,
        `self.max_num_topics` and `self.step`.
        '''
        # squeeze=False always yields an array of axes, even for one score
        fig, axes = plt.subplots(1, len(score), sharex=True, figsize=size,
                                 squeeze=False)
        panels = axes.flatten()

        params = []
        for param1_name, param1_df in tuning_df.groupby(tune_params[0]):
            for param2_name, param2_df in param1_df.groupby(tune_params[1]):
                for panel, score_name in zip(panels, score):
                    panel.plot(param2_df["topic_num"], param2_df[score_name])

                params.append('{}: {}, {}: {}'.format(tune_params[0], param1_name,
                                                      tune_params[1], param2_name))

        # Panel decoration does not depend on the plotted lines: do it once
        for panel, score_name, better in zip(panels, score, pref):
            panel.set_xlabel('Number of Topics', fontsize=15)
            panel.set_ylabel('{}'.format(score_name), fontsize=15)

            panel.spines["top"].set_visible(False)
            panel.spines["right"].set_visible(False)

            panel.tick_params(axis='both', which='major', labelsize=15)
            panel.set_title('{} ({} is better)'.format(score_name, better))
            panel.grid(True)

        fig.text(0.5, -0.03,
                 'Note the different y axes',
                 ha='center', va='center',
                 fontsize=14)

        panels[0].legend(params,
                         loc='upper center',
                         bbox_to_anchor=(1.1, 1.35),
                         shadow=True,
                         ncol=4)

        plt.suptitle('Validation score plots', fontsize=20)

        # Save before showing: the figure may already be closed once
        # plt.show() returns, which would leave an empty file.
        if save_plot:
            filename = 'validation_from{}_to{}_by{}_'.format(self.start,
                                                             self.max_num_topics,
                                                             self.step)

            full_path = save_folder_file(save_dir, filename,
                                         ext=ext,
                                         optional_folder='CV_score_plots')

            plt.savefig(full_path)

        if is_notebook:
            plt.show()
    def compare_scores(self, max_num_topics = 20, 
                             start = 2, 
                             step = 2, 
                             etas = ['auto'],
                             decays = [0.7],
                             random_state=919,
                             save_output=True,
                             save_dir='results/model_validation',
                             print_params = False,
                             eval_every = 5,
                             **kwargs):
        """
        Compute c_v coherence and perplexity for various number of topics

          `max_num_topics` :  int, upper bound (exclusive) of the number of topics to test
          `start`:            int, Min number of topics to test
          `step`:             int, increased by stepsize
          `etas`:             list, `eta` values to try
          `decays`:           list, `decay` values to try
          `random_state`:     int, seed to reproduce
          `save_output`:      bool, save output?
          `save_dir`:         str, folder to save the results, child of the current dir
                                  will be created if it doesn't exists
          `print_params`:     bool, whether to output details
          `eval_every`:       int, calculates perplexity every _ iterations (small num -> slow)
          `**kwargs`:         forwarded to `self.LDA`

        The tuning range (`start`, `max_num_topics`, `step`) as well as
        `print_params` and `eval_every` are stored on the instance.

        Returns (in this order):
          `model_list` :  list of LDA topic models used for tuning
          `score_df`:     DataFrame with a column for each tuning parameter
                          ('eta', 'decay', 'topic_num'), 'coherence' and 'perplexity'
          `score_dict`:   dict with {`key`: value}:
                          `coherence` :  c_v coherence of each model
                          `perplexity`:  value returned by `log_perplexity(self.bow)`
                                         for each model
        """
        # NOTE: this installs a process-wide filter, it is not undone on exit.
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        self.start = start
        self.max_num_topics = max_num_topics
        self.step = step
        self.print_params = print_params
        self.eval_every = eval_every

        model_list = []
        eta_list = []
        decay_list = []
        num_topics_list = []
        p_score = []  # perplexity, one entry per model
        c_score = []  # coherence, one entry per model

        print('\nTesting topics {} to {} for:\n'.format(start,
                                                       (max_num_topics - step)))

        for eta in etas:
            for decay in decays:

                print('\n {} eta and {} decay...\n'.format(eta,
                                                           decay))
                for num_topics in range(start, max_num_topics, step):
                    model = self.LDA(print_params=self.print_params,
                                     num_topics=num_topics,
                                     eta=eta,
                                     decay=decay,
                                     eval_every=self.eval_every,
                                     save_model=False,
                                     random_state=random_state,
                                     **kwargs)

                    model_list.append(model)

                    coherencemodel = CoherenceModel(model=model,
                                                    corpus=self.bow,
                                                    texts=self.text,
                                                    coherence='c_v')

                    coherent = coherencemodel.get_coherence()
                    perplex = model.log_perplexity(self.bow)

                    eta_list.append(eta)
                    decay_list.append(decay)
                    num_topics_list.append(num_topics)
                    # Each score goes to its own list, so that the columns
                    # and dictionary keys below carry the right values.
                    c_score.append(coherent)
                    p_score.append(perplex)

        score_df = pd.DataFrame({'eta': eta_list, 'decay': decay_list, 'topic_num': num_topics_list,
                                 'coherence': c_score, 'perplexity': p_score})

        # A None tuning value is stored as the string 'none'
        score_df.replace(to_replace=[None], value='none', inplace=True)

        if save_output:
            filename = 'Coherence_Perplexity_from{}_to{}_by{}'.format(start,
                                                                      max_num_topics,
                                                                      step)
            full_path = save_folder_file(save_dir, filename, ext='.csv', optional_folder='scores')
            score_df.to_csv(full_path, index=False)

        score_dict = {'perplexity': p_score, 'coherence': c_score}

        return model_list, score_df, score_dict
    def freq_plot(
            self,
            top_n=50,
            width=1.0,
            c_scale='Portland',
            title='Top word frequencies (after cleanup and lemmatization)',
            plotname='word_count_bar',
            image_format='png',
            save_plot=True,
            save_dir='visualization',
            filename='',
            is_notebook=True,
            **kwargs):
        """
        Draw an interactive bar chart of the `top_n` most frequent words.

        The entries come from `self.order_count()`; index 0 of each entry is
        used as the bar label and index 1 as the bar height (and bar colour).

        `top_n`:        int, number of leading entries of the ordered count to plot
        `width`:        float, width of the outline drawn around each bar
        `c_scale`:      str, colour scale name handed to the bar marker
        `title`:        str, title to display on the figure
        `plotname`:     str, passed to `iplot` as its `filename` for notebook display
        `image_format`: str, image extension such as 'png' or 'pdf' - NO leading dot
                             (the dot is added when the output path is built)
        `save_plot`:    bool, write a still image of the figure to disk?
        `save_dir`:     str, folder to save the plot in, handed to `save_folder_file`
                             NOTE: orca must be installed to save a still image of the plot
        `filename`:     str, file name for the still image; when empty a default
                             name containing `top_n` is used
        `is_notebook`:  bool, display the figure inline with `iplot`?
        `**kwargs`:     extra keyword arguments forwarded to `go.Figure`

        Returns nothing; the figure is displayed and/or written to disk.
        """

        # Slice once, then split each (word, frequency) entry into the two axes.
        top_counts = self.order_count()[:top_n]
        words = [entry[0] for entry in top_counts]
        frequencies = [entry[1] for entry in top_counts]

        # Bars are coloured by their own frequency and outlined in black.
        bar_outline = dict(color='rgb(0,0,0)', width=width)
        bar_marker = dict(colorscale=c_scale,
                          color=frequencies,
                          line=bar_outline)
        bars = go.Bar(x=words,
                      y=frequencies,
                      marker=bar_marker,
                      text='Word count')

        fig = go.Figure(data=[bars], layout=go.Layout(title=title), **kwargs)

        if is_notebook:
            iplot(fig, filename=plotname, image=image_format)

        if not save_plot:
            return

        if len(filename) == 0:
            filename = 'word_frequency_barplot_top{}_words_'.format(top_n)

        full_path = save_folder_file(save_dir,
                                     filename,
                                     ext='.' + image_format)

        print('Pyplot word frequency bar chart saved to `{}`.\n'.format(
            full_path))

        pio.write_image(fig, full_path)
    def cloud_plot(self,
                   size=(9, 6),
                   background_color="black",
                   max_words=1000,
                   max_font_size=60,
                   min_font_size=5,
                   collocations=False,
                   colormap="coolwarm",
                   plot_title="Most common words",
                   plot_fontsize=30,
                   interpolation='lanczos',
                   save_plot=True,
                   save_dir='visualization',
                   filename='',
                   image_format='.png',
                   is_notebook=True,
                   **kwargs):
        '''
        Render a word cloud from the word counts held in `self.count_dict`.

        The words yielded by `self.count_dict.elements()` are joined into one
        space-separated string, stored on `self.text_cloud`, and fed to a
        `WordCloud`, which is then drawn on a matplotlib figure.

         `size`:             tuple of ints, figure size passed to `plt.figure`
         `background_color`: str, colour name
         `max_words`:        int, maximum number of words to plot
         `max_font_size`:    int, maximum font size
         `min_font_size`:    int, minimum font size
         `collocations`:     bool, * set to False * to avoid duplicates
         `colormap`:         str, colour scheme for letters (see matplotlib colours)
         `plot_title`:       str, title drawn above the cloud
         `plot_fontsize`:    int, font size of the title
         `interpolation`:    str, smoother used by `plt.imshow`, for example:
                                  'nearest', 'bilinear', 'hamming', 'quadric', 'lanczos'
         `save_plot`:        bool, is the plot saved
         `save_dir`:         str, folder to save the plot in, handed to `save_folder_file`
         `filename`:         str, file name for the still image; 'wordcloud_plot_'
                                  is used when empty
         `image_format`:     str, extension, of the form '.png', '.pdf', etc (WITH dot)
         `is_notebook`:      bool, is this displayed on a notebook?
         `**kwargs`:         extra keyword arguments forwarded to the `WordCloud`
                                  constructor

        Returns nothing; the figure is displayed and/or written to disk.
        '''

        self.text_cloud = " ".join(self.count_dict.elements())

        fig = plt.figure(figsize=size)

        # Extra options belong to the constructor: `WordCloud.generate` only
        # accepts the text, so forwarding **kwargs there raised TypeError.
        wc = WordCloud(background_color=background_color,
                       max_words=max_words,
                       max_font_size=max_font_size,
                       min_font_size=min_font_size,
                       collocations=collocations,
                       colormap=colormap,
                       **kwargs)
        wc.generate(self.text_cloud)

        plt.title(plot_title, fontsize=plot_fontsize)
        plt.margins(x=0.5, y=0.25)
        plt.axis('off')
        plt.imshow(wc, interpolation=interpolation)

        if save_plot:
            if not filename:
                filename = 'wordcloud_plot_'

            full_path = save_folder_file(save_dir, filename, ext=image_format)

            # Save BEFORE showing: once `plt.show()` returns the figure may be
            # closed, and a later savefig would write an empty canvas. The
            # figure is written once through its own handle; the previous
            # extra `wc.to_file` targeted the same path and was immediately
            # overwritten by the figure anyway.
            fig.savefig(full_path)
            print('Wordcloud plot saved to `{}`.\n'.format(full_path))

        if is_notebook:
            plt.show()
    def LDA(self, num_topics=10,
                  update_every=1,
                  chunksize=100,
                  full_data_chunk=True,
                  iterations=10000,
                  passes=10,
                  eval_every=5,
                  alpha='auto',
                  eta='auto',
                  decay=0.8,
                  minimum_probability = 0.05,
                  minimum_phi_value = 0.02,
                  per_word_topics=True,
                  print_params=True,
                  save_model=True,
                  save_dir='saved_models',
                  filename='',
                  random_state=919,
                  **kwargs):
        '''
        Train a gensim `LdaModel` on `self.bow`, optionally print and save it.

         `num_topics`:    int, Number of latent topics (clusters) extracted from training corpus (bow)
         `update_every`:  int, Number of chunks to process prior to moving
                               onto the M step of EM.
         `chunksize`:     int, Number of documents to load into memory at a time
                               and process E step of EM
         `full_data_chunk`:  bool, Overrides `chunksize` with `len(self.bow)`,
                                   i.e. loads all docs into memory at once
         `iterations`:       int, Maximum number of training iterations through the corpus.
         `passes`:           int, Number of passes through the entire corpus for training
         `eval_every`:       int, the smaller the number, the finer grained is convergence plot
         `alpha`:            str, a-priori belief for each topic's probability.
                                  Choices: 'auto': Learns an asymmetric prior from the corpus.
                                           'asymmetric': Fixed normalized asymmetric prior.
         `eta`:   prior on word probability, can be:
                          scalar for a symmetric prior over topic/word probability,
                          vector of length num_words for user defined prob for each word,
                          matrix (num_topics x num_words) to assign prob to word-topic combinations,
                          or str 'auto' to learn the asymmetric prior from the data.
         `decay`:               float, Number between (0.5, 1] how much past documents are forgotten when new document is seen
         `minimum_probability`: float, Topics with a prob lower than this are filtered out.
         `minimum_phi_value`:   float, lower bound on the term probabilities (when `per_word_topics` = True)
         `per_word_topics`:     bool, also compute, for each word, its topics sorted from most to least likely
         `print_params`:        bool, are the parameters printed?
         `save_model`:          bool, save the model and a text file listing its parameters?
         `save_dir`:            str, folder to save the model in, handed to `save_folder_file`
         `filename`:            str, base file name. If empty, a name is built from
                                     `num_topics`, `self.tfidf` and `per_word_topics`
         `random_state`:        int, seed to reproduce
         `**kwargs`:            extra keyword arguments forwarded to `models.LdaModel`

        Returns the trained `LdaModel`.
        '''
        # remove deprecation warnings (NOTE: this changes the process-wide filter)
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Fixed typo: the override used to be stored in an unused `chuncksize`
        # variable, so `full_data_chunk` never had any effect.
        if full_data_chunk:
            chunksize = len(self.bow)

        lda_model = models.LdaModel(self.bow,
                                    id2word=self.gensim_dict,
                                    num_topics=num_topics,
                                    update_every=update_every,
                                    chunksize=chunksize,
                                    iterations=iterations,
                                    passes=passes,
                                    alpha=alpha,
                                    eta=eta,
                                    decay=decay,
                                    minimum_probability=minimum_probability,
                                    minimum_phi_value=minimum_phi_value,
                                    per_word_topics=per_word_topics,
                                    eval_every=eval_every,
                                    random_state=random_state,
                                    **kwargs)

        # Built unconditionally: the summary is needed both for printing and
        # for the parameters file, and used to be undefined (NameError) when
        # print_params=False while save_model=True.
        model_pars = 'Number of topics: {}\nTFIDF transformation: {}\n'\
                     'Number of iterations: {}\nBatch size: {}\n' \
                     'Update every {} pass\nNumber of passes: {}\n' \
                     'Topic inference per word: {}\nAlpha: {}\n'\
                     'Eta: {}\nDecay: {}\nMinimum probability: {}\n' \
                     'Minimum_phi_value: {}\nEvaluate every: {}\n' \
                     'Random seed: {}\n'.format(num_topics,
                                                self.tfidf,
                                                iterations,
                                                chunksize,
                                                update_every,
                                                passes,
                                                per_word_topics,
                                                alpha,
                                                eta,
                                                decay,
                                                minimum_probability,
                                                minimum_phi_value,
                                                eval_every,
                                                random_state)

        if print_params:
            print('Parameters used in model: ')
            print(model_pars)

        if save_model:
            if not filename:
                filename = 'LDA_Params_NT{}_TFIDF{}'\
                           'Per_word_topic{}'.format(num_topics,
                                                     self.tfidf,
                                                     per_word_topics)

            full_path = save_folder_file(save_dir, filename, ext='.model', optional_folder='LDA')
            full_path_txt = save_folder_file(save_dir, filename + '_parameters', ext='.txt', optional_folder='LDA')

            print('Saving LDA model to: \n{}'.format(full_path))
            lda_model.save(full_path)

            # write down corresponding parameters; `with` closes the file even
            # if the write fails
            with open(full_path_txt, 'w') as f:
                f.write(model_pars)

        return lda_model