예제 #1
0
    def on_done(self, corpus):
        """Publish the finished corpus and refresh the topic view and scores.

        Sends ``corpus`` on the corpus output, shows the model in the topic
        description view (re-applying a pending selection, if any), raises the
        "less topics found" warning when the model's actual topic count
        differs from the requested one, stores the formatted perplexity (only
        for the model named "Latent Dirichlet Allocation") and the formatted
        c_v coherence, and finally sends the table of all topics.

        Args:
            corpus: The processed corpus; its ``ngrams_corpus`` and ``tokens``
                attributes are read for the perplexity and coherence scores.
        """
        self.Outputs.corpus.send(corpus)

        has_pos_tags = self.corpus.pos_tags is not None
        self.topic_desc.show_model(self.model, pos_tags=has_pos_tags)

        # Restore a selection that was requested before the model was ready.
        if self.__pending_selection:
            self.topic_desc.select(self.__pending_selection)
            self.__pending_selection = None

        if self.model.actual_topics != self.model.num_topics:
            self.Warning.less_topics_found()

        wrapped_model = self.model
        if wrapped_model.name == "Latent Dirichlet Allocation":
            # log_perplexity returns a per-word bound; 2 ** (-bound) is the
            # perplexity value displayed to the user.
            bound = wrapped_model.model.log_perplexity(corpus.ngrams_corpus)
            perplexity = np.exp2(-bound)
            self.perplexity = "{:.5f}".format(perplexity)

        scorer = CoherenceModel(
            model=wrapped_model.model,
            texts=corpus.tokens,
            corpus=corpus,
            coherence="c_v",
        )
        self.coherence = "{:.5f}".format(scorer.get_coherence())

        self.Outputs.all_topics.send(self.model.get_all_topics_table())
예제 #2
0
def find_optimum_topics(corpus, final_documents, word_dict):
    """Return the topic count with the highest c_v coherence and that score.

    One LDA model is trained for every topic count in
    ``range(1, TOPICS_LIMIT)`` and scored against ``final_documents``.

    Args:
        corpus: Bag-of-words corpus used for training.
        final_documents: Tokenized texts used for the coherence computation.
        word_dict: Gensim dictionary (id -> word mapping).

    Returns:
        Tuple ``(num_topics, coherence)`` of the best-scoring model, or
        ``(-1, -1)`` when no model scored above -1.
    """

    def _coherence_for(topic_count):
        # Train and score a single model with the given number of topics.
        trained = ldamodel.LdaModel(corpus=corpus,
                                    random_state=100,
                                    id2word=word_dict,
                                    passes=NUM_PASSES,
                                    num_topics=topic_count)
        scorer = CoherenceModel(model=trained,
                                texts=final_documents,
                                dictionary=word_dict,
                                coherence='c_v')
        return scorer.get_coherence()

    scores_by_count = {}
    for topic_count in range(1, TOPICS_LIMIT):
        scores_by_count[topic_count] = _coherence_for(topic_count)

    # Strictly-greater comparison: on ties the smallest topic count wins.
    best_count, best_score = -1, -1
    for topic_count, value in scores_by_count.items():
        if value > best_score:
            best_count, best_score = topic_count, value

    return best_count, best_score
예제 #3
0
def get_topics(num, corpus, id2word, output_dir, all_sentences):
    """Train a Mallet LDA model with ``num`` topics, save it, and score it.

    Writes the per-topic keyword lists to ``<output_dir>/<num>_words.json``
    and saves the model to ``<output_dir>/<num>``. The Mallet location is
    read from the module-level ``args.mallet_dir``.

    Args:
        num: Number of topics.
        corpus: Bag-of-words corpus used for training.
        id2word: Gensim dictionary.
        output_dir: Directory for Mallet working files and outputs.
        all_sentences: Tokenized texts used for the coherence computation.

    Returns:
        The c_v coherence of the trained model.
    """
    print(num)

    base_path = output_dir + "/" + str(num)
    mallet_model = LdaMallet(args.mallet_dir,
                             corpus=corpus,
                             num_topics=num,
                             prefix=base_path,
                             workers=4,
                             id2word=id2word,
                             iterations=1000,
                             random_seed=42)

    scorer = CoherenceModel(model=mallet_model,
                            texts=all_sentences,
                            dictionary=id2word,
                            coherence='c_v')
    score = scorer.get_coherence()
    print('\nCoherence Score: ', score)

    # Map every topic id to a comma-separated list of its words.
    keywords = {}
    for topic_id in range(mallet_model.num_topics):
        words = [word for word, _ in mallet_model.show_topic(topic_id)]
        keywords[topic_id] = ", ".join(words)

    with open(base_path + '_words.json', 'w') as f:
        f.write(json.dumps(keywords))
    mallet_model.save(base_path)
    return score
예제 #4
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """Train one LDA model per topic count and score each with c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus the models are trained on
    texts : List of tokenized input texts used for the coherence computation
    limit : Upper bound (exclusive) of the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train on the ``corpus`` argument. The previous code read a global
        # ``doc_term_matrix`` here, silently ignoring the parameter.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                random_state=2,
                                                id2word=dictionary,
                                                iterations=10)
        model_list.append(model)
        coherence_model = CoherenceModel(model=model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
    return model_list, coherence_values
    """
예제 #5
0
def calcCoherence(lemmatizedTexts, passes=100, nTopics=5, workers = 1):
    """Fit an LdaMulticore model on the texts and return its c_v coherence.

    Args:
        lemmatizedTexts: Tokenized documents; used to build the dictionary,
            the bag-of-words corpus, and as the coherence reference texts.
        passes: Number of training passes.
        nTopics: Number of topics.
        workers: Number of worker processes handed to LdaMulticore.

    Returns:
        The c_v coherence score of the fitted model.
    """
    vocabulary = Dictionary(lemmatizedTexts)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in lemmatizedTexts]

    lda_settings = dict(
        corpus=bow_corpus,
        id2word=vocabulary,
        num_topics=nTopics,
        passes=passes,
        random_state=100,
        per_word_topics=False,
        alpha=0.01,
        eta=0.9,
        workers=workers,
    )
    topic_model = gensim.models.LdaMulticore(**lda_settings)

    scorer = CoherenceModel(model=topic_model,
                            texts=lemmatizedTexts,
                            dictionary=vocabulary,
                            coherence='c_v',
                            processes=0)
    return scorer.get_coherence()
예제 #6
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Fit a Mallet LDA model for each topic count and record its c_v coherence.

    Every model is saved to ``models/ldamodel_<num_topics>.lda`` right after
    it has been fitted. The Mallet binary is expected at the relative path
    ``mallet-2.0.8/bin/mallet``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values, one per model, in the same order
    """
    mallet_binary = 'mallet-2.0.8/bin/mallet'
    fitted_models = []
    scores = []

    for topic_count in range(start, limit, step):
        lda = gensim.models.wrappers.LdaMallet(mallet_binary,
                                               corpus=corpus,
                                               num_topics=topic_count,
                                               id2word=dictionary)
        lda.save('models/ldamodel_' + str(topic_count) + '.lda')
        fitted_models.append(lda)

        scorer = CoherenceModel(model=lda,
                                texts=texts,
                                dictionary=dictionary,
                                coherence='c_v')
        scores.append(scorer.get_coherence())

    return fitted_models, scores
 def compute_coherence_values(dictionary,
                              corpus,
                              texts,
                              limit,
                              start=2,
                              step=3):
     """Fit an LdaMulticore model per topic count and score it with c_v.

     Topic counts are taken from ``range(start, limit, step)``.

     Returns:
         Tuple ``(model_list, coherence_values)``: the fitted models and
         their coherence scores, in matching order.
     """
     results = []
     for topic_count in range(start, limit, step):
         lda = models.LdaMulticore(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=topic_count,
                                   chunksize=10000,
                                   passes=30,
                                   iterations=100)
         scorer = CoherenceModel(model=lda,
                                 texts=texts,
                                 dictionary=dictionary,
                                 coherence='c_v')
         results.append((lda, scorer.get_coherence()))

     # Split the (model, score) pairs into the two parallel lists callers expect.
     model_list = [lda for lda, _ in results]
     coherence_values = [score for _, score in results]
     return model_list, coherence_values
예제 #8
0
def ret_top_model():
    """Train three 20-topic LDA models and return the one with the best topic.

    Reads the module-level ``corpus``, ``dictionary`` and ``train_texts``.
    For every model each topic is scored on its own with ``CoherenceModel``
    (``window_size=10``); a model is ranked by the score of its most coherent
    topic.

    Returns:
        out_lm: The model whose best topic scored highest, or ``None`` when no
            round scored above 0.0.
        top_topics: List of ``(topic_id, coherence)`` tuples in decreasing
            order of coherence, belonging to ``out_lm``. When no model was
            selected, the ranking of the last round is returned instead.
        high: The best top-topic coherence seen (0.0 when no model was
            selected).
    """
    total_rounds = 3
    high = 0.0
    out_lm = None
    best_topics = None
    top_topics = [(0, 0)]
    for rounds in range(1, total_rounds + 1):
        lm = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            words = [word for word, _ in topic]
            cm = CoherenceModel(topics=[words], texts=train_texts, dictionary=dictionary, window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)
        if high < top_topics[0][1]:
            high = top_topics[0][1]
            out_lm = lm
            # Keep the ranking together with the model it was computed for;
            # previously the ranking of the *last* round was returned even
            # when an earlier round produced the selected model.
            best_topics = top_topics
        print('round ',rounds,':',top_topics[0][1])
    if best_topics is None:
        best_topics = top_topics
    return out_lm, best_topics, high
def compute_eval_values(dictionary, corpus, texts, limit, start=2, step=5):
    """
        Compute c_v coherence and log-perplexity for various number of topics

        Parameters:
        ----------
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics (exclusive)
        start : First number of topics to try
        step : Increment between successive numbers of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        perplexity_values : Values returned by ``model.log_perplexity(corpus)`` for each model
    """
    coherence_values = []
    perplexity_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the ``dictionary`` argument as the id->word mapping. The
        # previous code referenced a global ``id2word`` that is not defined
        # in this function, so the model and the coherence computation could
        # end up using different vocabularies (or fail with a NameError).
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        perplexity_values.append(model.log_perplexity(corpus))

    return model_list, coherence_values, perplexity_values
예제 #10
0
def find_optimal_topic(data_corpus, corp_dictionary, start, end, cleaned):
    """Return the number of topics in ``range(start, end)`` with the best c_v.

    Args:
        data_corpus: Bag-of-words corpus used for training.
        corp_dictionary: Gensim dictionary.
        start: First number of topics to try.
        end: Upper bound (exclusive) of the number of topics.
        cleaned: Tokenized texts used for the coherence computation.

    Returns:
        The topic count whose model reached the highest coherence.

    Raises:
        ValueError: If ``range(start, end)`` is empty.
    """
    scores = {}
    for topic_count in range(start, end):
        lda = gensim.models.ldamodel.LdaModel(corpus=data_corpus,
                                              id2word=corp_dictionary,
                                              num_topics=topic_count,
                                              random_state=100,
                                              update_every=1,
                                              chunksize=350,
                                              passes=10,
                                              alpha='auto',
                                              per_word_topics=True)
        scorer = CoherenceModel(model=lda,
                                texts=cleaned,
                                dictionary=corp_dictionary,
                                coherence='c_v')
        scores[topic_count] = scorer.get_coherence()

    # Pick the (topic_count, score) pair with the largest score.
    best_count, _ = max(scores.items(), key=lambda pair: pair[1])
    return best_count
예제 #11
0
def basic_lda(total_topics,corpus,dictionary,docs,score=False):
    """Train an LDA model and optionally compute its c_v coherence.

    Args:
        total_topics: Number of topics.
        corpus: Bag-of-words corpus used for training.
        dictionary: Gensim dictionary.
        docs: Tokenized texts; only used when ``score`` is true.
        score: When true, also compute and print the coherence.

    Returns:
        The trained model, or ``(model, coherence)`` when ``score`` is true.
    """
    print('Training for {} documents ......'.format(len(corpus)))

    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=total_topics,
        alpha='auto',
        eta='auto',
        random_state=2,
    )

    # Without scoring there is nothing more to do.
    if not score:
        return lda

    print('calculating coherence socre for {} documents ......'.format(len(docs)))
    scorer = CoherenceModel(model=lda, texts=docs, dictionary=dictionary, coherence='c_v')
    coherence_lda = scorer.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    return lda, coherence_lda
예제 #12
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Train an LDA model for each topic count and score it with c_v coherence.

    Training one model per candidate can take a long time.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values, one per model, in the same order
    """
    lda_class = gensim.models.ldamodel.LdaModel
    trained_models = []
    scores = []

    for topic_count in range(start, limit, step):
        lda = lda_class(corpus,
                        random_state=2,
                        num_topics=topic_count,
                        id2word=dictionary,
                        iterations=10)
        trained_models.append(lda)

        score = CoherenceModel(model=lda,
                               texts=texts,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
        scores.append(score)

    return trained_models, scores
예제 #13
0
    def compute_coherence_values(self,
                                 dictionary,
                                 corpus,
                                 texts,
                                 limit,
                                 start=2,
                                 step=3):
        """
        Fit a Mallet LDA model per topic count and score it with c_v coherence.

        The Mallet location is taken from ``self.mallet_path``.

        Args:
            dictionary : Gensim dictionary
            corpus : Gensim corpus
            texts : List of input texts
            limit : Max num of topics (exclusive)
            start : First number of topics to try
            step : Increment between successive numbers of topics

        Returns:
            model_list : List of LDA topic models
            coherence_values : Coherence values, one per model, in the same order
        """
        fitted_models = []
        scores = []
        for topic_count in range(start, limit, step):
            mallet_model = gensim.models.wrappers.LdaMallet(
                self.mallet_path,
                corpus=corpus,
                num_topics=topic_count,
                id2word=dictionary,
            )
            fitted_models.append(mallet_model)

            scorer = CoherenceModel(
                model=mallet_model,
                texts=texts,
                dictionary=dictionary,
                coherence='c_v',
            )
            scores.append(scorer.get_coherence())

        return fitted_models, scores
예제 #14
0
파일: model.py 프로젝트: madhugraj/sptm
    def optimum_topic(self, start=10, limit=100, step=11):
        """Find the number of topics with the highest c_v coherence.

        Trains one Mallet LDA model for every topic count in
        ``range(start, limit, step)`` using the parameters stored on the
        instance, and scores each one against ``self.tokens``.

        if you want to change the parameters of the model while training,
        call Model.params() first as it uses the same parameters.

        NOTE: You cannot compute the coherence score of a saved model.

        Args:
            start: First number of topics to try.
            limit: Max num of topics (exclusive).
            step: Increment between successive numbers of topics.

        Returns:
            Dictionary ``{"num_topics": best_count, "c_v": score}`` for the
            best-scoring topic count, with the score rounded to 4 decimals.
            An empty dictionary when the range contains no topic counts.
        """
        best_count = None
        best_score = None
        for num_topics in range(start, limit, step):
            model = Wrappers.LdaMallet(self.mallet_path,
                corpus=self.corpus, num_topics=num_topics,
                alpha=self.alpha, id2word=self.id2word,
                workers=self.workers, prefix=self.prefix,
                optimize_interval=self.optimize_interval,
                iterations=self.iterations,
                topic_threshold=self.topic_threshold)
            coherencemodel = CoherenceModel(model=model,
                texts=self.tokens, dictionary=self.id2word,
                coherence='c_v')
            score = coherencemodel.get_coherence()
            # Keep the best candidate. The previous loop overwrote the result
            # on every iteration and therefore always reported the *last*
            # topic count instead of the optimum. Ties keep the smaller count.
            if best_score is None or score > best_score:
                best_count, best_score = num_topics, score

        out = dict()
        if best_count is not None:
            out["num_topics"] = best_count
            out["c_v"] = round(best_score, 4)
        return out
def calculate_scores(dictionary, corpus,  texts, limit, output_path, start=2, step=3):
    """
    Compute c_v coherence for a wide range of topic numbers.
    Adapted from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

    The Mallet location is read from the module-level ``mallet_path``. A line
    plot of coherence against number of topics, with a red vertical line at
    the best-scoring topic count, is saved as ``broad_topic_k_search.png`` in
    ``output_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts (doc_clean)
    limit : Max num of topics (exclusive)
    output_path : Directory the plot is written to
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    coherence_df : DataFrame with columns ``Num_topics`` and ``Coherence_score``
    """
    coherence_dict = dict()

    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        coherencemodel1 = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_dict[num_topics] = coherencemodel1.get_coherence()

    coherence_df = pd.DataFrame(pd.Series(coherence_dict)).reset_index()

    coherence_df.columns = ['Num_topics','Coherence_score']

    # Show graph
    fig, ax = plt.subplots(figsize=(12,10))
    ax.plot(coherence_df['Num_topics'], coherence_df['Coherence_score'])
    ax.set_xlabel("No. of topics", fontweight='bold')
    ax.set_ylabel("Cv Coherence score", fontweight='bold')

    # axvline expects a single x position. The previous code passed a list of
    # every topic count that reached the maximum score. Mark the first
    # best-scoring topic count, and skip the marker when there is no usable
    # score (empty range or all-NaN scores).
    scores = coherence_df['Coherence_score']
    if scores.notna().any():
        best_num_topics = coherence_df.loc[scores.idxmax(), 'Num_topics']
        ax.axvline(best_num_topics, color='red')
    fig.savefig(os.path.join(output_path, 'broad_topic_k_search.png'), format='png',dpi=300)

    return coherence_df
    def mallet_coherence_values(data, limit, start=2, step=1):
        """
        Compute c_v coherence of Mallet LDA models for a range of topic counts.

        The dictionary, bag-of-words corpus and reference texts are all built
        from ``data['tokenized_text']``. The models are trained with
        ``alpha=1`` and ``optimize_interval=10`` rather than Mallet's default
        alpha of 50/n, which (per the original author's note) contributed to
        too many latent topics per document; the optimization interval lets
        the Dirichlet alpha and beta hyperparameters be learned from the data.

        Parameters:
        ----------
        data : dataframe consisting of the lemmatized and tokenized text
        limit : Max num of topics (exclusive)
        start : First number of topics to try
        step : Increment between successive numbers of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values, one per model, in the same order
        """
        tokenized = data['tokenized_text']
        dictionary = corpora.Dictionary(tokenized)
        corpus = [dictionary.doc2bow(doc) for doc in data['tokenized_text']]
        texts = list(data['tokenized_text'])
        mallet_binary = 'C:/new_mallet/mallet-2.0.8/bin/mallet'

        fitted_models = []
        scores = []
        for topic_count in range(start, limit, step):
            mallet_model = gensim.models.wrappers.LdaMallet(
                mallet_binary,
                corpus=corpus,
                num_topics=topic_count,
                optimize_interval=10,
                alpha=1,
                id2word=dictionary,
                iterations=2000,
                random_seed=456,
            )
            fitted_models.append(mallet_model)

            scorer = CoherenceModel(
                model=mallet_model,
                texts=texts,
                dictionary=dictionary,
                coherence='c_v',
            )
            scores.append(scorer.get_coherence())

        return fitted_models, scores
예제 #17
0
def select_k(corpus, dictionary, texts, limit, start=3, step=2):
    """
    Train LDA models with k topics each and score them with c_v coherence,
    to facilitate selecting the best k.

    Parameters:
    ----------
    corpus : Gensim corpus
    dictionary : Gensim dictionary
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First k to try
    step : Increment between successive values of k

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values, one per model, in the same order
    """
    lda_class = gensim.models.ldamodel.LdaModel
    # Training settings shared by every candidate model.
    shared_settings = dict(
        alpha='auto',
        eta='auto',
        passes=10,
        iterations=400,
        eval_every=1,
        chunksize=20,
    )

    trained_models = []
    scores = []
    for k in range(start, limit, step):
        lda = lda_class(corpus=corpus,
                        id2word=dictionary,
                        num_topics=k,
                        **shared_settings)
        trained_models.append(lda)

        scorer = CoherenceModel(model=lda,
                                texts=texts,
                                dictionary=dictionary,
                                coherence='c_v')
        scores.append(scorer.get_coherence())
        print("finished training topic_number: ", k)

    return trained_models, scores
예제 #18
0
def compute_coherence_values_topic_num(data, limit, start=2, step=1):
    """
    Compute u_mass coherence for various number of topics

    The pre-processed documents are unpickled from ``data``, reduced to their
    top TF-IDF terms, filtered by word frequency, and converted to a TF-IDF
    corpus before one LDA model per topic count is trained and scored.

    Parameters:
    ----------
    data : Path of the pickle file holding the pre-processed documents
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file; only call this with files produced by a trusted pipeline.
    with open(data, 'rb') as file:
        # read the data as binary data stream
        print("... Reading the pre-processed data from local binary file...")
        documents = pickle.load(file)

    documents = extract_important_words_tfidf(
        documents, 0.60)  # extracting top 60% (TF-IDF) terms per document
    documents = remove_low_high_frequent_words(documents, 0.03, 1.0)

    # Build the TF-IDF representation once so that the corpus and the
    # dictionary are guaranteed to come from the same computation (it was
    # previously computed twice, once per key).
    tfidf = get_tfidf(documents)
    corpus = tfidf["corpus_tfidf"]
    dictionary = tfidf["index2word"]

    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        model = models.ldamodel.LdaModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=num_topics,
                                         eta=0.3)
        model_list.append(model)

        coherencemodel = CoherenceModel(model=model,
                                        dictionary=dictionary,
                                        corpus=corpus,
                                        coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
예제 #19
0
def compute_coherence_values(texts, dictionary, corpus, limit, start=2, step=3):
    '''
    Train an LDA model per topic count and score it with c_v coherence.

    The raw ``texts`` are tokenized with ``word_tokenize`` before being used
    as reference texts for the coherence computation.

    Parameters:
    ----------
    texts: list of input texts (untokenized strings)
    dictionary: gensim dictionary
    corpus: gensim corpus
    limit: max number of topics (exclusive)
    start: first number of topics to try
    step: increment between successive numbers of topics

    Returns:
    -------
    model_list: list of LDA topic models
    coherence_values: coherence values, one per model, in the same order
    '''
    tokenized_texts = list(map(word_tokenize, texts))

    trained_models = []
    scores = []
    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            random_state=0,
            passes=10,
            update_every=1,
            chunksize=100,
            alpha='auto',
            per_word_topics=True,
        )
        trained_models.append(lda)

        scorer = CoherenceModel(
            model=lda,
            texts=tokenized_texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        scores.append(scorer.get_coherence())

    return trained_models, scores
예제 #20
0
    def set_coherence(self):
        '''
        Score the fitted LDA model with Gensim's u_mass coherence.

        Stores the CoherenceModel in ``self.lda_coherence_model`` and the
        score, reduced to two decimals, in ``self.lda_coherence_score``.
        Prints the score and the time the computation took.
        '''
        print('\nDetermining Coherence measure from fit model.')
        started_at = datetime.now()

        coherence_settings = dict(
            model=self.lda_model,
            corpus=self.corpus_cards,
            dictionary=self.built_corpus.vocabulary_,
            coherence='u_mass',
        )
        self.lda_coherence_model = CoherenceModel(**coherence_settings)

        # Format to two decimals first, then convert back to a float.
        raw_score = self.lda_coherence_model.get_coherence()
        self.lda_coherence_score = float('{0:.2f}'.format(raw_score))

        summary = '   ** Coherence Score for {} topics and {} cards: {} **'
        print(summary.format(self.n_topics,
                             self.built_corpus.total_samples,
                             self.lda_coherence_score))

        elapsed = datetime.now() - started_at
        print("   Time taken: {}".format(elapsed))
예제 #21
0
def build_lda_models(corpus, id2word, texts, num_topics_range, num_trials):
    """Train TF-IDF based LDA models over several trials and score each one.

    For every trial, one LDA model is trained on the TF-IDF transformed corpus
    for each topic count in ``num_topics_range`` and scored with c_v
    coherence.

    Args:
        corpus: Bag-of-words corpus.
        id2word: Gensim dictionary.
        texts: Tokenized texts used for the coherence computation.
        num_topics_range: Sequence of topic counts to try (its last element
            is reported in the start-up message).
        num_trials: Number of times the whole range is trained.

    Returns:
        Tuple ``(models, coherence_values)``; both are lists with one inner
        list per trial, ordered like ``num_topics_range``.
    """
    print("\n* Running %s trials up to %s topics each to determine optimal number of topics." %(num_trials,num_topics_range[-1]))
    tic = time.time()

    # The TF-IDF weighting depends only on ``corpus`` and ``id2word``, so fit
    # it once instead of refitting it for every single LDA model. (Training on
    # the plain bag-of-words corpus was reported to score lower.)
    tfidf_model = gensim.models.TfidfModel(corpus=corpus,id2word=id2word)
    tfidf_corpus = tfidf_model[corpus]

    coherence_values = [[] for _ in range(num_trials)]
    models = [[] for _ in range(num_trials)]
    for i in range(num_trials):
        print("* Running Trial %s ..." %(i+1))
        for num_topics in num_topics_range:
            model = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics)
            models[i].append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
            coherence_values[i].append(coherencemodel.get_coherence())
            # Reported time is measured from the start of the whole run.
            toc = time.time()
            print("* -> Computed for %s topics. Time taken to compute: %.2fs" %(num_topics,toc - tic))
    return models, coherence_values
예제 #22
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """Train one LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: Gensim dictionary.
        corpus: Gensim corpus.
        texts: List of tokenized input texts.
        limit: Max num of topics (exclusive).
        start: First number of topics to try.
        step: Increment between successive numbers of topics.

    Returns:
        Tuple ``(model_list, coherence_values, num_topics_list)`` of three
        parallel lists: the models, their coherence scores, and the topic
        counts they were trained with.
    """
    coherence_values = []
    model_list = []
    num_topics_list = []

    for num_topics in range(start, limit, step):
        # Use the ``dictionary`` argument as the id->word mapping; the
        # previous code referenced a global ``id2word`` that this function
        # does not define.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence="c_v")
        coherence_values.append(coherencemodel.get_coherence())
        num_topics_list.append(num_topics)

    return model_list, coherence_values, num_topics_list
예제 #23
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """Fit one Mallet LDA model per topic count and score each with c_v.

    The Mallet location is read from the module-level ``mallet``.

    Args:
        dictionary: Gensim dictionary.
        corpus: Gensim corpus.
        texts: List of tokenized input texts.
        limit: Max num of topics (exclusive).
        start: First number of topics to try.
        step: Increment between successive numbers of topics.

    Returns:
        Tuple ``(model_list, coherence_values)``: the fitted models and their
        coherence scores, in matching order.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print('Calculating {}-topic model'.format(num_topics))
        # Use the ``dictionary`` argument as the id->word mapping; the
        # previous code referenced a global ``id2word`` that this function
        # does not define.
        model = gensim.models.wrappers.LdaMallet(mallet,
                                                 corpus=corpus,
                                                 num_topics=num_topics,
                                                 id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
예제 #24
0
def model_lda(clean_doc, dictionary, doc_term_matrix):
    """Train a 25-topic LDA model, print its topics and c_v coherence.

    Args:
        clean_doc: Tokenized texts used for the coherence computation.
        dictionary: Gensim dictionary.
        doc_term_matrix: Bag-of-words corpus used for training.

    Returns:
        The trained LDA model.
    """
    training_settings = dict(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=25,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=25,
        alpha='auto',
        per_word_topics=True,
    )
    lda_model = gensim.models.ldamodel.LdaModel(**training_settings)

    print("Topics generated with the in-built LDA model are:\n")
    pprint(lda_model.print_topics())
    print("----------------------------------------------------")

    scorer = CoherenceModel(model=lda_model,
                            texts=clean_doc,
                            dictionary=dictionary,
                            coherence='c_v')
    coherence_lda = scorer.get_coherence()
    print(f"coherence score: {coherence_lda}")

    return lda_model
예제 #25
0
def compute_coherence_values(id2word, corpus, texts, maximum, start=2, step=3):
    """
    Fit Mallet LDA models for a range of topic counts and score each with
    c_v coherence, to help find the best number of topics.

    The Mallet location is read from the module-level ``mallet_path``.

    Returns:
    model_list - list of models for the given topic number range
    coh_values - values of coherences for these models
    """
    scored_models = []
    for topic_count in range(start, maximum, step):
        mallet_model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                        corpus=corpus,
                                                        num_topics=topic_count,
                                                        id2word=id2word)
        scorer = CoherenceModel(model=mallet_model,
                                texts=texts,
                                dictionary=id2word,
                                coherence='c_v')
        scored_models.append((mallet_model, scorer.get_coherence()))

    # Split the (model, score) pairs into the two parallel lists returned.
    model_list = [mallet_model for mallet_model, _ in scored_models]
    coh_values = [score for _, score in scored_models]
    return model_list, coh_values
예제 #26
0
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaModel is built for every topic count in
    ``range(start, limit, step)`` (``limit`` is exclusive).

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus the models are trained on
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    # Fresh accumulators per call; the result must not depend on (or leak
    # into) module-level lists of the same name.
    model_list = []
    coherence_values = []
    # Honour the caller's start/limit/step instead of a fixed range.
    for num_topics in range(start, limit, step):
        # Train on the corpus argument rather than an out-of-scope global.
        model = gensim.models.ldamodel.LdaModel(corpus,
                                                num_topics=num_topics,
                                                random_state=2,
                                                id2word=dictionary,
                                                iterations=10)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence="c_v")
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
예제 #27
0
    def save_this_to_use_properly(self, language_processed_data: str):
        """
        Rebuild an LDA model until its best topic reaches a coherence of 0.97.

        Every pass builds a new LdaModel from ``self.essentials.corpus`` and
        ``self.essentials.dictionary.id2token``, scores each topic on its own
        with a CoherenceModel (``window_size=10``), and ranks the topics by
        that score. The loop repeats while the top-ranked score is below 0.97.

        NOTE(review): nothing here bounds the number of passes, so the loop
        runs until a model crosses the threshold.

        Parameters:
        ----------
        language_processed_data : passed unchanged as ``texts`` to
            CoherenceModel (annotated ``str`` - TODO confirm the intended type)

        Returns:
        -------
        lm : the last model built
        top_topics : list of (topic id, coherence) tuples, highest coherence first
        """
        ranked = [(0, 0)]  # placeholder score of 0 forces the first pass
        model = None
        while ranked[0][1] < 0.97:
            model = LdaModel(
                corpus=self.essentials.corpus,
                id2word=self.essentials.dictionary.id2token,
            )
            # Score each topic individually, using only its words (weights dropped).
            per_topic_score = {
                topic_id: CoherenceModel(
                    topics=[[term for term, _ in weighted_terms]],
                    texts=language_processed_data,
                    dictionary=self.essentials.dictionary,
                    window_size=10,
                ).get_coherence()
                for topic_id, weighted_terms in model.show_topics(
                    num_topics=-1, formatted=False
                )
            }
            ranked = sorted(
                per_topic_score.items(),
                key=operator.itemgetter(1),
                reverse=True,
            )
        return model, ranked
예제 #28
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaModel is built for every topic count in
    ``range(start, limit, step)`` (``limit`` is exclusive).

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus the models are trained on
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    topic_list : No. of topics chosen
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    topic_list = []
    for num_topics in range(start, limit, step):
        # Train on the corpus argument; the function previously ignored it
        # and read an out-of-scope global instead.
        model = gensim.models.ldamodel.LdaModel(corpus,
                                                random_state=0,
                                                num_topics=num_topics,
                                                id2word=dictionary,
                                                iterations=10)
        topic_list.append(num_topics)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values
예제 #29
0
def compute_coherence_values(texts, start=2, stop=30, step=3):
    """
    Compute c_v coherence for various number of topics

    The id2word mapping and the corpus are derived from ``texts`` through
    ``create_id2word`` and ``create_corpus``; one LdaMallet model is then
    built for every topic count in ``range(start, stop, step)``.

    Parameters:
    ----------
    texts : List of input texts
    start : First number of topics to try
    stop : Exclusive upper bound on the number of topics
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of (num_topics, model) tuples
    coherence_values : Coherence values corresponding to the models in
                       model_list, in the same order
    id2word : Result of create_id2word(texts)
    corpus : Result of create_corpus(id2word, texts)
    """
    id2word = create_id2word(texts)
    corpus = create_corpus(id2word, texts)

    labelled_models, scores = [], []
    for topic_count in range(start, stop, step):
        print('Calculating {}-topic model'.format(topic_count))
        # mallet_path is a module-level name defined outside this function.
        mallet_lda = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=topic_count,
            id2word=id2word,
        )
        labelled_models.append((topic_count, mallet_lda))

        scorer = CoherenceModel(
            model=mallet_lda,
            texts=texts,
            dictionary=id2word,
            coherence='c_v',
        )
        scores.append(scorer.get_coherence())

    return labelled_models, scores, id2word, corpus
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Compute c_v coherence and log-perplexity for various number of topics

    One LdaMallet model is built for every topic count in
    ``range(start, limit, step)`` (``limit`` is exclusive).

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    perplexity_values : Values returned by ``log_perplexity(corpus)`` on each
                        model after conversion with ``malletmodel2ldamodel``
    """
    coherence_values = []
    perplexity_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the dictionary argument for id2word; no name `id2word` exists
        # in this function's scope.
        model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=corpus,
                                                 num_topics=num_topics,
                                                 id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        # log_perplexity is called on the converted LdaModel, not on the
        # Mallet wrapper itself.
        mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
            model)
        perplexity_values.append(mallet_model.log_perplexity(corpus))
    return model_list, coherence_values, perplexity_values