Example #1
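# Setup sketch (an assumption, not part of the original snippet): the loop below
# expects a hyperparameter grid plus an accumulator list, roughly like this.
# `tokens_tfidf` (a TF-IDF-weighted corpus), `id2token` (a gensim Dictionary),
# `out_path` and `filename_out` are assumed to have been defined earlier in the script.
import itertools

param_grid = {'num_topics': [10, 20, 30],
              'chunksize': [500, 1000],
              'passes': [10],
              'iterations': [100]}
parameters_combinations = [dict(zip(param_grid, values))
                           for values in itertools.product(*param_grid.values())]
coherence_scores = []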
for parameters in parameters_combinations:
    lda_model = gensim.models.LdaModel(corpus=tokens_tfidf,
                                       num_topics=parameters.get('num_topics'),
                                       id2word=id2token,
                                       chunksize=parameters.get('chunksize'),
                                       passes=parameters.get('passes'),
                                       iterations=parameters.get('iterations'),
                                       random_state=1332,
                                       alpha="auto",
                                       eta="auto")

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         corpus=tokens_tfidf,
                                         coherence='u_mass')

    coherence_score = coherence_model_lda.get_coherence()

    coherence_score_out = dict()
    coherence_score_out['num_topics'] = parameters.get('num_topics')
    coherence_score_out['chunksize'] = parameters.get('chunksize')
    coherence_score_out['passes'] = parameters.get('passes')
    coherence_score_out['iterations'] = parameters.get('iterations')
    coherence_score_out['coherence_u_mass'] = coherence_score

    coherence_scores.append(coherence_score_out)

    # Export coherence scores
    if not os.path.isfile(os.path.join(out_path, filename_out)):
        with open(os.path.join(out_path, filename_out), 'w',
                  encoding='utf-8') as f:
            f.write(str(coherence_score_out) + "\n")
    texts.append(stemmed_tokens)

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)
#print(dictionary)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
#print(corpus)

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=60)

topics = []
for topic in ldamodel.print_topics(num_topics=30,num_words=12):
    topics.append(topic)
print(topics)


df_topics = pd.DataFrame(topics)

with open (r"E:\Helen\FinalProject_INFO5731\ALL_OUTPUTS\LDAgensim_wholeDS.csv", 'w',  newline="",
         encoding='utf-8') as file:
    df_topics.to_csv(file)


# Compute model perplexity and coherence score: these metrics judge how well the model performed,
# especially the coherence score
# Compute Coherence Score
coherence_ldamodel = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_ldamodel.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Example #3
def spatial_lda(adata,
                x_coordinate='X_centroid',
                y_coordinate='Y_centroid',
                phenotype='phenotype',
                method='radius',
                radius=30,
                knn=10,
                imageid='imageid',
                num_motifs=10,
                random_state=0,
                subset=None,
                label='spatial_lda',
                **kwargs):
    """
Parameters:
    adata : AnnData object

    x_coordinate : string, required  
        Column name containing the x-coordinates values.

    y_coordinate : string, required  
        Column name containing the y-coordinates values.

    phenotype : string, required  
        Name of the column containing the phenotype information. 
        It could also be any categorical assignment given to single cells.

    method : string, optional  
        Two options are available: a) 'radius', b) 'knn'.  
        a) radius - Identifies the neighbours within a given radius for every cell.  
        b) knn - Identifies the K nearest neighbours for every cell.  

    radius : int, optional  
        The radius used to define a local neighbourhood.

    knn : int, optional  
        Number of cells considered for defining the local neighbourhood.

    imageid : string, optional  
        Name of the column containing the image id.

    subset : string, optional  
        imageid of a single image to be subsetted for analysis.

    num_motifs : int, optional  
        The number of requested latent motifs to be extracted from the training corpus.

    random_state : int, optional  
        Either a randomState object or a seed to generate one. Useful for reproducibility.

    label : string, optional  
        Key for the returned data, stored in `adata.uns`.

Returns:
    adata : AnnData object  
        Updated AnnData object with the results stored in `adata.uns['spatial_lda']`.
    
Example:
```python
    # Running the radius method
    adata = sm.tl.spatial_lda(adata, num_motifs=10, radius=100)
```
    """

    # Function
    def spatial_lda_internal(adata_subset, x_coordinate, y_coordinate,
                             phenotype, method, radius, knn, imageid):

        # Print which image is being processed
        print('Processing: ' + str(np.unique(adata_subset.obs[imageid])))

        # Create a DataFrame with the necessary information
        data = pd.DataFrame({
            'x': adata_subset.obs[x_coordinate],
            'y': adata_subset.obs[y_coordinate],
            'phenotype': adata_subset.obs[phenotype]
        })

        # Identify neighbourhoods based on the method used
        # a) KNN method
        if method == 'knn':
            print("Identifying the " + str(knn) +
                  " nearest neighbours for every cell")
            tree = BallTree(data[['x', 'y']], leaf_size=2)
            ind = tree.query(data[['x', 'y']], k=knn, return_distance=False)

        # b) Local radius method
        if method == 'radius':
            print("Identifying neighbours within " + str(radius) +
                  " pixels of every cell")
            kdt = BallTree(data[['x', 'y']], leaf_size=2)
            ind = kdt.query_radius(data[['x', 'y']],
                                   r=radius,
                                   return_distance=False)

        # Map phenotype
        phenomap = dict(zip(list(range(len(ind))),
                            data['phenotype']))  # Used for mapping
        for i in range(len(ind)):
            ind[i] = [phenomap[letter] for letter in ind[i]]

        # return
        return ind

    # Subset a particular image if needed
    if subset is not None:
        adata_list = [adata[adata.obs[imageid] == subset]]
    else:
        adata_list = [
            adata[adata.obs[imageid] == i]
            for i in adata.obs[imageid].unique()
        ]

    # Apply function to all images
    # Create lambda function
    r_spatial_lda_internal = lambda x: spatial_lda_internal(
        adata_subset=x,
        x_coordinate=x_coordinate,
        y_coordinate=y_coordinate,
        phenotype=phenotype,
        method=method,
        radius=radius,
        knn=knn,
        imageid=imageid)
    all_data = list(map(r_spatial_lda_internal, adata_list))  # Apply function

    # combine all the data into one
    texts = np.concatenate(all_data, axis=0).tolist()

    # LDA pre-processing
    print('Pre-Processing Spatial LDA')
    # Create Dictionary
    id2word = corpora.Dictionary(texts)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Build LDA model
    print('Training Spatial LDA')
    try:
        lda_model = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_motifs,
            random_state=random_state,
            **kwargs)
    except Exception:
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_motifs,
                                                    random_state=random_state,
                                                    **kwargs)

    # Compute Coherence Score
    print('Calculating the Coherence Score')
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # isolate the latent features
    print('Gathering the latent weights')
    topic_weights = []
    for row_list in lda_model[corpus]:
        tmp = np.zeros(num_motifs)
        for i, w in row_list:
            tmp[i] = w
        topic_weights.append(tmp)
    # convert to dataframe
    arr = pd.DataFrame(topic_weights, index=adata.obs.index).fillna(0)
    arr = arr.add_prefix('Motif_')

    # isolate the weights of phenotypes
    pattern = r'(\d\.\d+)."(.*?)"'
    cell_weight = pd.DataFrame(index=np.unique(adata.obs[phenotype]))
    for i in range(0, len(lda_model.print_topics())):
        level1 = lda_model.print_topics()[i][1]
        tmp = pd.DataFrame(re.findall(pattern, level1))
        tmp.index = tmp[1]
        tmp = tmp.drop(columns=1)
        tmp.columns = ['Motif_' + str(i)]
        cell_weight = cell_weight.merge(tmp,
                                        how='outer',
                                        left_index=True,
                                        right_index=True)
    # fill zeros
    cell_weight = cell_weight.fillna(0).astype(float)

    # save the results in anndata object
    adata.uns[label] = arr  # save the weight for each cell
    adata.uns[str(label) +
              '_probability'] = cell_weight  # weights of each cell type
    adata.uns[str(label) + '_model'] = lda_model

    # return
    return adata
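# Usage sketch for the 'knn' neighbourhood option (a hedged example mirroring the
# radius call in the docstring above; `adata` and the scimap `sm.tl` namespace are assumed):
#     adata = sm.tl.spatial_lda(adata, method='knn', knn=10, num_motifs=10)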
def main():
    print('LSI model with gensim')
    print('1) 1 gram, 2) 2 gram, 3) 3 gram')
    op = input()
    op = int(op)
    lsReturn = []
    lsDocuments = []
    lsSubject = []
    #Get the information into a list of documents
    lsReturn = mlf.getRawTextToList()
    lsDocuments = lsReturn[0]
    lsSubject = lsReturn[1]
    lsDocuments_NoSW = [[
        word for word in simple_preprocess(str(doc)) if word not in sw
    ] for doc in lsDocuments]
    if (op == 1):
        print('LSI model with gensim for 1 gram')

    if (op == 2):
        print('LSI model with gensim for 2 gram')
        bigram = gensim.models.Phrases(lsDocuments_NoSW,
                                       min_count=5,
                                       threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        lsDocBiGram = [bigram_mod[doc] for doc in lsDocuments_NoSW]
        lsDocuments_NoSW.clear()
        lsDocuments_NoSW = [[
            word for word in simple_preprocess(str(doc)) if word not in sw
        ] for doc in lsDocBiGram]

    if (op == 3):
        print('LSI model with gensim for 3 gram')
        bigram = gensim.models.Phrases(lsDocuments_NoSW,
                                       min_count=5,
                                       threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram = gensim.models.Phrases(bigram[lsDocuments_NoSW],
                                        threshold=100)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        lsDocTrigram = [trigram_mod[doc] for doc in lsDocuments_NoSW]
        lsDocuments_NoSW.clear()
        lsDocuments_NoSW = [[
            word for word in simple_preprocess(str(doc)) if word not in sw
        ] for doc in lsDocTrigram]
        """
        print('Getting bigrams list...')
        for doc in lsDocuments_NoSW:
            for word in doc:
                mlf.appendInfoToFile(pathtohere,'\\trigrams.txt',word+'\n')
        """

    print('LSI Model starting...')
    # Create Dictionary
    id2word = corpora.Dictionary(lsDocuments_NoSW)
    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in lsDocuments_NoSW]
    # Build LSI model
    lsi_model = gensim.models.LsiModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=20)

    df = pd.DataFrame()
    #df=mlf.getDominantTopicDataFrame(lsi_model,corpus,lsDocuments_NoSW,lsSubject)
    #mlf.generateFileSeparatedBySemicolon(df,'LSI_trigram_csv.txt')

    lsi_cm = CoherenceModel(model=lsi_model,
                            corpus=corpus,
                            dictionary=id2word,
                            texts=lsDocuments_NoSW)
    print('LSI Coherence:', lsi_cm.get_coherence())
Example #5
    def compute_coherence_values(self, limit=11, start=4, step=1):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        limit : Max num of topics
        start : Least num of topics
        step  : Step-size

        Returns:
        -------
        model_list : List of LDA topic models (not now)
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics (not now)
        optim_k : optimal number of Topics
        """
        coherence_values = []
        perplexity_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            model = gensim.models.ldamulticore.LdaMulticore(
                corpus=self.corpus,
                id2word=self.dictionary,
                num_topics=num_topics,
                passes=10,
                alpha='asymmetric',
                eta='auto',
                random_state=42,
                iterations=500,
                per_word_topics=True,
                eval_every=None)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=self.texts, dictionary=self.dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence()) 
            perplexity_values.append(model.log_perplexity(self.corpus))

        plot_val = [-1 * i / j for i, j in zip(coherence_values, perplexity_values)]
        x = np.array(coherence_values)
        z = (x-min(x))/(max(x)-min(x))
        scaled_coherence = z.tolist()
        scaled_subset = [i for i in scaled_coherence if i >= max(scaled_coherence)*0.7]
        scaled_subset_index = [scaled_coherence.index(i) for i in scaled_subset]

        try:
            scaled_subset_index.remove(4)
            if len(scaled_subset_index) == 0:
                scaled_subset_index = [4]
        except ValueError:
            pass
        finally:
            best_model = model_list[min(scaled_subset_index)]

        optim_k = best_model.get_topics().shape[0]

        x = range(start, limit, step)
        plt.plot(x, coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend(("coherence_values"), loc='best')
        plt.show()

        x = range(start, limit, step)
        plt.plot(x, perplexity_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Perplexity score")
        plt.legend(("perplexity_values"), loc='best')
        plt.show()
        
        self.num_topics = optim_k
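    # Usage sketch (hedged; assumes `tm` is an instance of the surrounding class with
    # `corpus`, `dictionary` and `texts` already populated):
    #     tm.compute_coherence_values(start=4, limit=11, step=1)
    #     print('Optimal number of topics:', tm.num_topics)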
############ (7) Mallet Model ##############
# Use the Mallet model to improve the LDA results and find the optimal number of topics
mallet_path = '/Users/elsayedissa/Desktop/Topic_Modeling/mallet-2.0.8/bin/mallet'  # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                             corpus=corpus,
                                             num_topics=100,
                                             id2word=dictionary)
# Show Topics
#pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                           texts=lemmatized_data,
                                           dictionary=dictionary,
                                           coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
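# Optional: to reuse the Mallet topics with gensim-native tooling (e.g. pyLDAvis),
# gensim 3.x ships a converter for the wrapper; a sketch, assuming that version:
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
gensim_ldamallet = malletmodel2ldamodel(ldamallet)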


############# (9) Finding the Optimal Number of Topics for the LDA Model ########################
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
Example #7
                                       num_topics=40,
                                       id2word=dictionary,
                                       workers=3,
                                       alpha=0.2,
                                       eta=0.03)

#measure performance of LDA

from gensim.models.coherencemodel import CoherenceModel

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=tokenize,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Compute Coherence Score using UMass
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=tokenize,
                                     dictionary=dictionary,
                                     coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Save model to disk.
temp_file = datapath("maude_ventilator_lda_gensim")
lda_model.save(temp_file)

# Load a potentially pretrained model from disk.
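# Completion sketch (the original snippet is truncated here); the `workers`
# argument above suggests LdaMulticore, whose load() restores the saved model.
lda_model = gensim.models.LdaMulticore.load(temp_file)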
Example #8
def label_documents(documents: List, LVL, SET):
    OPTIMAL_TOPICS = 10
    PREPROCESSINGs = ["lemmed"]
    for PREPROCESSING in PREPROCESSINGs:
        # texts = clean_documents([str(doc) for doc in documents], PREPROCESSING)
        # pickle.dump(texts, open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "wb"))
        texts = pickle.load(
            open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "rb"))
        dictionary = Dictionary(texts)
        print('Number of unique tokens: %d' % len(dictionary))
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        print('Number of unique tokens after filtering: %d' % len(dictionary))

        logger.info("Clean complete")
        logger.info("Dictionary complete")
        corpus = [dictionary.doc2bow(text) for text in texts]
        logger.info("Corpus complete")
        # model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=1000, num_topics=OPTIMAL_TOPICS, workers=3)

        # model.save("{}_{}_{}_model_cv_coherence_{}".format(LVL, OPTIMAL_TOPICS, PREPROCESSING))

        model = LdaMulticore.load("{}_{}_1000_model_cv_coherence_{}".format(
            LVL, OPTIMAL_TOPICS, PREPROCESSING))
        coherence_model = CoherenceModel(model=model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')
        print("Coherence {}".format(coherence_model.get_coherence()))
        topics = []
        corpus = [dictionary.doc2bow(text) for text in texts]
        for document in corpus:
            topic = model.get_document_topics(document)
            topics.append(topic)

        with open("topics_results_{}_{}_{}.csv".format(SET, LVL,
                                                       PREPROCESSING),
                  "wt",
                  encoding="utf8",
                  newline="") as outf:
            writer = csv.writer(outf)
            for document_topics in topics:
                sorted_topics = sorted(document_topics, key=lambda x: -x[1])
                if sorted_topics:
                    best = [sorted_topics[0][0]]
                else:
                    best = [-1]
                    print("NONE ERROR")
                writer.writerow(best)

        x = model.show_topics(num_topics=OPTIMAL_TOPICS,
                              num_words=10,
                              formatted=True)
        # The code below prints topics and words
        with open("topics_keywords_{}_{}_{}".format(SET, LVL, PREPROCESSING),
                  "wt",
                  encoding="utf8",
                  newline="") as outf:
            writer = csv.writer(outf)
            for t in x:
                topic = t[0]
                words = [(word_score.split("*")[1].strip()[1:-1],
                          float(word_score.split("*")[0].strip()))
                         for word_score in t[1].split("+")]
                sort_words = sorted(words, key=lambda z: z[1])
                print(str(topic) + " " + str(sort_words))
                words = [w for w, s in sort_words]
                score = [s for w, s in sort_words]
                topics_word_bar(words, score, t[0])
                writer.writerow([topic] + [sort_words])

    return
Example #9
        tfidf = gensim.models.TfidfModel(bowCorpus)
        bowCorpus = tfidf[bowCorpus]
        extraDict['tfidf'] = True
        
    #   End of the data preparation
    #   Training of the model
    
    ##  Initialises the training phase for the algorithm. The parameters should come from extraDict, otherwise defaults are used.
    lsiModel = gensim.models.LsiModel(bowCorpus, num_topics=numberTopics, id2word=dictionary, power_iters=extraDict['power_iters'])
     
    #   End of the training
    #   Validation

    ## Computes the u_mass coherence score
    cm = CoherenceModel(model=lsiModel, corpus=bowCorpus, coherence='u_mass')
    coherence = cm.get_coherence() 

    ## Builds a list and a string with the generated topics
    topicString = ""
    topicList = []
    for idx, topic in lsiModel.print_topics(-1):
        topicList.append(topic)
        topicString += '\nTopic: {} \nWords: {}'.format(idx, topic) 
    
    ## Stores the results and the model type in a dictionary
    resultDict = {'modelType': modelType,
                 'topicN': numberTopics,
                 'coherence': coherence,
                 'topicList':topicList}
    
    ## Computation and output of the c_v coherence score - time-consuming
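    ## Completion sketch (the original snippet is truncated here; `texts`, the
    ## tokenised documents required by c_v, is assumed to exist elsewhere in the script):
    cmCV = CoherenceModel(model=lsiModel, texts=texts, dictionary=dictionary, coherence='c_v')
    resultDict['coherence_cv'] = cmCV.get_coherence()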
Example #10
def ldaStemmed(corpus,
               NUM_TOPICS=4,
               fechaMin='1-1',
               fechaMax='1-1',
               random_state=100,
               update_every=1,
               chunksize=100,
               passes=10,
               alpha='auto',
               per_word_topics=True,
               iterations=50,
               eval_every=1):

    fechaMin = fechaMin.split('-')
    fechaMax = fechaMax.split('-')

    diaMin = int(fechaMin[0])
    diaMax = int(fechaMax[0])

    fechaMin = datetime.datetime(2020, int(fechaMin[1]), diaMin)
    fechaMax = datetime.datetime(2020, int(fechaMax[1]), diaMax)

    data_words = []

    if fechaMin.month == fechaMax.month:
        corpus = corpus[meses[fechaMin.month -
                              1]].query('dia >= @diaMin and dia <= @diaMax')

    else:
        p1 = corpus[meses[fechaMin.month - 1]].query('dia >= @diaMin')
        p2 = pd.DataFrame()
        for i in range(fechaMin.month, fechaMax.month - 1):
            p2 = p2.append(corpus[meses[i]])

        p3 = corpus[meses[fechaMax.month - 1]].query('dia <= @diaMax')

        corpus = pd.concat([p1, p2, p3])

    corpus = corpus[corpus.lexemas.values == corpus.lexemas.values]
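    # (The line above keeps only rows where 'lexemas' is not NaN, since NaN != NaN.)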

    arrayTokens = corpus.lexemas.values
    for tweet in arrayTokens:
        d = nlp(tweet)
        data_words.append([
            str(t) for t in d
            if t.pos_ == 'ADJ' or t.pos_ == 'PROPN' or t.pos_ == 'NOUN'
        ])

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        data_words, min_count=5,
        threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    data_words_bigrams = make_bigrams(data_words)

    id2word = corpora.Dictionary(data_words_bigrams)
    text = data_words_bigrams
    corpus = [id2word.doc2bow(text) for text in data_words_bigrams]

    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=NUM_TOPICS,
        random_state=random_state,
        update_every=update_every,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        per_word_topics=per_word_topics,
        iterations=iterations)

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_bigrams,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score with ' + str(NUM_TOPICS) + ' topics and ' + str(iterations)  + ' iterations: ' + str(coherence_lda))

    return [lda_model, coherence_lda]
Example #11
def ldaMulticore(corpus,
                 workers=1,
                 NUM_TOPICS=4,
                 fechaMin='1-1',
                 fechaMax='1-1',
                 random_state=100,
                 chunksize=100,
                 passes=10,
                 alpha='symmetric',  # LdaMulticore does not support alpha='auto'
                 per_word_topics=True,
                 iterations=50):

    fechaMin = fechaMin.split('-')
    fechaMax = fechaMax.split('-')

    diaMin = int(fechaMin[0])
    diaMax = int(fechaMax[0])

    fechaMin = datetime.datetime(2020, int(fechaMin[1]), diaMin)
    fechaMax = datetime.datetime(2020, int(fechaMax[1]), diaMax)

    data_words = []

    if fechaMin.month == fechaMax.month:
        corpus = corpus[meses[fechaMin.month -
                              1]].query('dia >= @diaMin and dia <= @diaMax')

    else:
        p1 = corpus[meses[fechaMin.month - 1]].query('dia >= @diaMin')
        p2 = pd.DataFrame()
        for i in range(fechaMin.month, fechaMax.month - 1):
            p2 = p2.append(corpus[meses[i]])

        p3 = corpus[meses[fechaMax.month - 1]].query('dia <= @diaMax')

        corpus = pd.concat([p1, p2, p3])

    arrayTokens = corpus.tokens.values
    for tweet in arrayTokens:
        d = ' '.join(tweet)
        d = nlp(d)
        data_words.append([
            str(t) for t in d
            if t.pos_ == 'ADJ' or t.pos_ == 'PROPN' or t.pos_ == 'NOUN'
        ])

    id2word = corpora.Dictionary(data_words)
    text = data_words
    corpus = [id2word.doc2bow(text) for text in data_words]

    lda_model = gensim.models.ldamulticore.LdaMulticore(
        workers=workers,
        corpus=corpus,
        id2word=id2word,
        num_topics=NUM_TOPICS,
        random_state=random_state,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        per_word_topics=per_word_topics,
        iterations=iterations)

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score with ' + str(NUM_TOPICS) + ' topics: ' +
          str(coherence_lda))

    return lda_model
Example #12
def lda(corpus,
        NUM_TOPICS=4,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
        iterations=50,
        eval_every=1):

    data_words = []

    arrayTokens = corpus.tokens.values
    for tweet in arrayTokens:
        d = ' '.join(tweet)
        d = nlp(d)
        data_words.append([
            str(t) for t in d
            if t.pos_ == 'ADJ' or t.pos_ == 'PROPN' or t.pos_ == 'NOUN'
        ])

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        data_words, min_count=5,
        threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    data_words_bigrams = make_bigrams(data_words)

    id2word = corpora.Dictionary(data_words_bigrams)
    text = data_words_bigrams
    corpus = [id2word.doc2bow(text) for text in data_words_bigrams]

    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=NUM_TOPICS,
        random_state=random_state,
        update_every=update_every,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        per_word_topics=per_word_topics,
        iterations=iterations)

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_bigrams,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score with ' + str(NUM_TOPICS) + ' topics and ' + str(iterations)  + ' iterations: ' + str(coherence_lda))

    return [lda_model, corpus, id2word, coherence_lda]
def compute_all_scores(model_name,
                       corpus,
                       id2word,
                       texts,
                       measure,
                       max_topics,
                       start=5,
                       step=5,
                       random_state=100,
                       update_every=1,
                       chunksize=100,
                       passes=10,
                       alpha='auto',
                       per_word_topics=True):
    """
    This module computes coherence scores for increasing number of topics.

    Arguments:
        model_name {str} -- Name of the model
        corpus {list} -- List of corpus created from preprocessed texts
        id2word {Dictionary} -- Dictionary of ids with mapped words
        texts {list} -- List of preprocessed input words
        measure {str} -- Name of the coherence score you want to use. Possible measures: c_v, c_uci, c_npmi, u_mass
        max_topics {int} -- Maximum number of topics used in computing various coherence scores

    Keyword Arguments:
        start {int} -- Start value for x-axis values (default: {5})
        step {int} -- Steps for values on x-axis (default: {5})

    Returns:
        list, list -- List of computed models, List of coherence scores for respective computed models
    """
    def compute_model(model_name, corpus, id2word, num_topics):
        """
        This module computes various topic models from the gensim library.

        Arguments:
            model_name {str} -- Name of the model
            corpus {list} -- List of corpus created from preprocessed texts
            id2word {Dictionary} -- Dictionary of ids with mapped words

        Keyword Arguments:
            num_topics {int} -- Number of topics as a parameter to gensim model computation (default: {5})

        Returns:
            gensim.models.MODEL -- Return the computed model
        """

        # LSI Model
        if model_name == 'LsiModel' or model_name == 'lsimodel' or model_name == 'lsi':
            model = gensim.models.lsimodel.LsiModel(corpus=corpus,
                                                    num_topics=num_topics,
                                                    id2word=id2word,
                                                    chunksize=chunksize)
        # LDA Model
        elif model_name == 'LdaModel' or model_name == 'ldamodel' or model_name == 'lda':
            model = gensim.models.ldamodel.LdaModel(
                corpus=corpus,
                id2word=id2word,
                num_topics=num_topics,
                random_state=random_state,
                update_every=update_every,
                chunksize=chunksize,
                passes=passes,
                alpha=alpha,
                per_word_topics=per_word_topics)
        # LDAMallet Model
        elif model_name == 'LdaMallet' or model_name == 'ldamallet':
            model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                     corpus=corpus,
                                                     num_topics=num_topics,
                                                     id2word=id2word)
        else:
            print('Invalid model!')
            return None

        return model

    # Models list and coherence scores list
    model_list, coherence_values = [], []
    # Iterator for max_topics
    iterator = range(start, max_topics + 1, step)

    for num_topics in iterator:
        # Compute models for given max_topics
        computed_model = compute_model(model_name=model_name,
                                       corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
        # Append above computed to a list
        model_list.append(computed_model)

        # Compute coherence score for above model
        coherencemodel = CoherenceModel(model=computed_model,
                                        texts=texts,
                                        dictionary=id2word,
                                        coherence=measure)
        # Append above coherence score to a list
        coherence_values.append(coherencemodel.get_coherence())

    # Print all scores to the console
    print('\nCalculating various coherence scores...\n')
    for number_of_topics, score in zip(iterator, coherence_values):
        print(model_name, ": Num Topics =", number_of_topics,
              " Coherence Value :", round(score, 4))

    return model_list, coherence_values
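# Usage sketch (hedged; `corpus`, `id2word` and `texts` are the preprocessed inputs
# described in the docstring above):
#     models, scores = compute_all_scores('lda', corpus, id2word, texts,
#                                         measure='c_v', max_topics=30)
#     best_model = models[scores.index(max(scores))]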
Example #14
    def most_similar_texts(
        self, X, num_examples, text_column_name, num_topics=None, chosen_stopwords=set()
    ):
        """
        Uses NMF clustering to create n topics based on adjusted word frequencies

        Parameters
        --------
        X: DataFrame
        num_examples: int
        text_column_name: str
        num_topics: int
            Optional - if none algorithm will determine best number

        Returns
        --------
        topic_words_df: DataFrame
            Top 15 words/phrases per topic
        combined_df: DataFrame
            Original text with topic number assigned to each

        """
        X = X[~X[text_column_name].isna()]
        X = X[X[text_column_name] != ""]
        X = X[X[text_column_name] != " "]
        X = X[X[text_column_name] != "NA"]
        X = X[X[text_column_name] != "n/a"]
        X = X[X[text_column_name] != "N/A"]
        X = X[X[text_column_name] != "na"]

        all_stop_words = (
            set(ENGLISH_STOP_WORDS)
            | set(["-PRON-"])
            | set(string.punctuation)
            | set([" "])
            | chosen_stopwords
        )

        ct = CleanText()
        vectorizer = TfidfVectorizer(
            tokenizer=ct.lematize,
            ngram_range=(1, 3),
            stop_words=all_stop_words,
            min_df=5,
            max_df=0.4,
        )
        vectors = vectorizer.fit_transform(X[text_column_name]).todense()

        # Add the word/phrase frequencies back into the dataset (so we can inspect feature importances later)
        vocab = vectorizer.get_feature_names()
        vector_df = pd.DataFrame(vectors, columns=vocab, index=X.index)

        if X.shape[0] < 20:
            return "Too few examples to categorize."

        if not num_topics:

            # In case 1, add 1 to get at least 2
            # The rest are based on eyeballing numbers
            min_topics = ceil(X.shape[0] * 0.01) + 1
            max_topics = ceil(X.shape[0] * 0.2)
            step = ceil((max_topics - min_topics) / 5)

            topic_nums = list(np.arange(min_topics, max_topics, step))

            texts = X[text_column_name].apply(ct.lematize)

            # In gensim a dictionary is a mapping between words and their integer id
            dictionary = Dictionary(texts)

            # Filter out extremes to limit the number of features
            dictionary.filter_extremes(no_below=2, no_above=0.85, keep_n=5000)

            # Create the bag-of-words format (list of (token_id, token_count))
            corpus = [dictionary.doc2bow(text) for text in texts]

            coherence_scores = []

            for num in topic_nums:
                model = nmf.Nmf(
                    corpus=corpus,
                    num_topics=num,
                    id2word=dictionary,
                    chunksize=2000,
                    passes=5,
                    kappa=0.1,
                    minimum_probability=0.01,
                    w_max_iter=300,
                    w_stop_condition=0.0001,
                    h_max_iter=100,
                    h_stop_condition=0.001,
                    eval_every=10,
                    normalize=True,
                    random_state=42,
                )

                cm = CoherenceModel(
                    model=model, texts=texts, dictionary=dictionary, coherence="u_mass"
                )

                coherence_scores.append(round(cm.get_coherence(), 5))

            scores = list(zip(topic_nums, coherence_scores))
            chosen_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]
        else:
            chosen_num_topics = num_topics

        model = NMF(n_components=chosen_num_topics, random_state=42)
        model.fit(vectors)
        component_loadings = model.transform(vectors)

        top_topics = pd.DataFrame(
            np.argmax(component_loadings, axis=1), columns=["top_topic_num"]
        )

        top_topic_loading = pd.DataFrame(
            np.max(component_loadings, axis=1), columns=["top_topic_loading"]
        )

        X.reset_index(inplace=True, drop=False)
        vector_df.reset_index(inplace=True, drop=True)

        # Fix for duplicate text_column_name
        vector_df.columns = [x + "_vector" for x in vector_df.columns]

        combined_df = pd.concat([X, vector_df, top_topics, top_topic_loading], axis=1)

        combined_df.sort_values(by="top_topic_loading", ascending=False, inplace=True)

        combined_df = pd.concat([X, vector_df, top_topics], axis=1)

        topic_words = {}
        sample_texts_lst = []
        for topic, comp in enumerate(model.components_):
            word_idx = np.argsort(comp)[::-1][:num_examples]
            topic_words[topic] = [vocab[i] for i in word_idx]
            sample_texts_lst.append(
                list(
                    combined_df[combined_df["top_topic_num"] == topic][
                        text_column_name
                    ].values[:num_examples]
                )
            )

        topic_words_df = pd.DataFrame(
            columns=[
                "topic_num",
                "num_in_category",
                "top_words_and_phrases",
                "sample_texts",
            ]
        )

        topic_words_df["topic_num"] = [k for k, _ in topic_words.items()]
        topic_words_df["num_in_category"] = (
            combined_df.groupby("top_topic_num").count().iloc[:, 0]
        )
        topic_words_df["top_words_and_phrases"] = [x for x in topic_words.values()]
        topic_words_df["sample_texts"] = sample_texts_lst

        topic_words_explode = pd.DataFrame(
            topic_words_df["sample_texts"].tolist(), index=topic_words_df.index,
        )

        topic_words_explode.columns = [
            "example{}".format(num) for num in range(len(topic_words_explode.columns))
        ]

        concated_topics = pd.concat(
            [
                topic_words_df[
                    ["topic_num", "num_in_category", "top_words_and_phrases"]
                ],
                topic_words_explode,
            ],
            axis=1,
        )

        print("Topics created with top words & example texts:")
        print(concated_topics)

        original_plus_topics = combined_df[list(X.columns) + ["index", "top_topic_num"]]
        original_with_keywords = pd.merge(
            original_plus_topics,
            concated_topics[["topic_num", "top_words_and_phrases"]],
            left_on="top_topic_num",
            right_on="topic_num",
            how="left",
        ).drop("top_topic_num", axis=1)

        return (
            concated_topics,
            original_with_keywords,
            model,
        )
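    # Usage sketch (hedged; assumes `analyzer` is an instance of the surrounding class
    # and `df` a DataFrame with a "text" column):
    #     topics_df, labeled_df, nmf_model = analyzer.most_similar_texts(
    #         df, num_examples=5, text_column_name="text")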
Example #15
File: main.py  Project: davidhin/rmsc2020
def get_coherence(model, texts, dictionary):
    coherence_model_ldamallet = CoherenceModel(model=model,
                                               texts=texts,
                                               dictionary=dictionary,
                                               coherence='c_v')
    return coherence_model_ldamallet.get_coherence()
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Print the keywords in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

else:
    print(
        "Topic number NOT given. Training multiple models. Optimal one will return at the end..."
    )
    print("______________ \n \n ")
    # this may take a long time - every model in the list will be evaluated based on coherence
    coherence_values = []
    model_list = []
    for num_topics in range(5, 20, 1):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
Example #17
    def evaluate_allmodels(self, file_extension, bool_allmodels):
        print('Number of topics: %d' % self.topics_n)
        print('Number of documents: %d' % len(self.corpus))
        print('Number of unique tokens: %d' % len(self.dictionary))
        topicmodel_list = ['JST', 'VODUM', 'TAM', 'LAM']
        coherence_list = []
        randindex_list = []
        output_list = []

        for topic_number in range(self.min_topics_n, self.topics_n + 1):
            print(topic_number)
            lda_model = lda.fit_lda_model(topic_number, self.corpus,
                                          self.dictionary)
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics = [[word for word, prob in topic]
                          for topicid, topic in lda_topics]
            lda_cm = CoherenceModel(topics=lda_topics,
                                    texts=self.tokenized_data,
                                    corpus=self.corpus,
                                    dictionary=self.dictionary,
                                    coherence='c_v')
            lda_coherence = lda_cm.get_coherence()
            results_lda_df = evalmodels.evaluate_LDA(
                ldafoldername=file_extension,
                corpus=self.corpus,
                ldamodel=lda_model,
                top_topics_n=1,
                data=self.data,
                perspectives=self.perspectives)
            lda_randindex = evalmodels.rand_index(
                results_lda_df['perspective'], results_lda_df['sum_topic'])
            print('LDA : \n', 'Adjust. Rand index: ', lda_randindex,
                  '\n Topic coherence: ', lda_coherence)
            print(lda_topics)

        if bool_allmodels:
            '''JST'''
            jstfile = '../data/models/JST/' + file_extension + '/final.twords'
            if os.path.isfile(jstfile):
                jst_topic = fm.read_jst_toptopics(jstfile)
                jst_results_model_df = evalmodels.evaluate_JST(
                    file_extension, self.data, self.perspectives)
                coherence_list.append(
                    CoherenceModel(topics=jst_topic,
                                   texts=self.tokenized_data,
                                   corpus=self.corpus,
                                   dictionary=self.dictionary,
                                   coherence='c_v').get_coherence())
                randindex_list.append(
                    evalmodels.rand_index(jst_results_model_df['perspective'],
                                          jst_results_model_df['sum_topic']))
                output_list.append(jst_topic)
            '''VODUM'''
            vodum_topic = fm.read_vodum_toptopics(file_extension)
            vodum_results_model_df = evalmodels.evaluate_VODUM(
                vodum_foldername=file_extension,
                data=self.data,
                perspectives=self.perspectives,
                topics_list=vodum_topic)
            coherence_list.append(
                CoherenceModel(topics=vodum_topic,
                               texts=self.tokenized_data,
                               corpus=self.corpus,
                               dictionary=self.dictionary,
                               coherence='c_v').get_coherence())
            randindex_list.append(
                evalmodels.rand_index(vodum_results_model_df['perspective'],
                                      vodum_results_model_df['sum_topic']))
            output_list.append(vodum_topic)
            '''TAM'''
            tamfile = '../data/models/TAM/' + file_extension + '/output_topwords_tokenized_data.txt'
            tam_topic = fm.read_tam_toptopics(tamfile)
            tam_results_model_df = evalmodels.evaluate_TAM(
                tam_foldername=file_extension,
                data=self.data,
                perspectives=self.perspectives,
                topics_list=tam_topic)
            coherence_list.append(
                CoherenceModel(topics=tam_topic,
                               texts=self.tokenized_data,
                               corpus=self.corpus,
                               dictionary=self.dictionary,
                               coherence='c_v').get_coherence())
            randindex_list.append(
                evalmodels.rand_index(tam_results_model_df['perspective'],
                                      tam_results_model_df['sum_topic']))
            output_list.append(tam_topic)
            '''LAM'''
            lamfile = '../data/models/LAM/' + file_extension + '/lam.out'
            lam_topic_dict, lam_topic = fm.read_lam_toptopics(lamfile)
            lam_results_model_df = evalmodels.evaluate_LAM(
                lam_foldername=file_extension,
                data=self.data,
                perspectives=self.perspectives,
                topics_dict=lam_topic_dict)
            coherence_list.append(
                CoherenceModel(topics=lam_topic,
                               texts=self.tokenized_data,
                               corpus=self.corpus,
                               dictionary=self.dictionary,
                               coherence='c_v').get_coherence())
            randindex_list.append(
                evalmodels.rand_index(lam_results_model_df['perspective'],
                                      lam_results_model_df['sum_topic']))
            output_list.append(lam_topic)
            '''RANDOM TF-IDF'''
            random_model, top_tokens = randommodel.fit_tfidf(
                self.tokenized_data)
            print('RANDOM MODEL results: \n', random_model)

            for i in range(0, len(topicmodel_list)):
                print('\n ----------', topicmodel_list[i],
                      ' ------------- \n Adjust. Rand index: ',
                      randindex_list[i], '\n Topic coherence: ',
                      coherence_list[i])
                print('Model output:')
                for topicelement in output_list[i]:
                    print(topicelement)
Example #18
def main():

    t = time()
    tree = ET.parse(r'WSD_Training_Corpora\SemCor\semcor.data.xml')
    root = tree.getroot()
    """
    root          - corpus
    root[0]       - text (document)
    root[0][0]    - sentence 
    root[0][0][0] - word (token)

    """

    documents = [0] * len(root)
    i = 0

    for text in root:
        document = ''
        result = []
        for sentence in text:
            for word in sentence:
                document = document + " " + word.text

        # one document
        document = simple_preprocess(document)
        for token in document:
            if token not in STOPWORDS and len(token) > 3:
                result.append(PorterStemmer().stem(
                    WordNetLemmatizer().lemmatize(token, pos='v')))

        documents[i] = result
        print(i)
        i += 1
    """ ------------------------------------------------- Bag of words -------------------------------------------------------- """

    dictionary = gensim.corpora.Dictionary(documents)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
    """ ---------------------------------------- Coherence Values and Num Topics Graph ---------------------------------------- """
    def compute_coherence_values(dictionary,
                                 corpus,
                                 texts,
                                 limit,
                                 start=2,
                                 step=3):

        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            print("Working on next model, num_topics =", num_topics, "...")
            model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                               num_topics=num_topics,
                                               id2word=dictionary,
                                               passes=10,
                                               workers=3)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model,
                                            texts=documents,
                                            dictionary=dictionary,
                                            coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values

    # Can take a long time to run.
    print("Computing coherence values...")
    model_list, coherence_values = compute_coherence_values(
        dictionary=dictionary,
        corpus=bow_corpus,
        texts=documents,
        start=2,
        limit=40,
        step=6)

    # Show graph
    limit = 40
    start = 2
    step = 6
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    """ --------------------------------------------------- LDA -------------------------------------------------------------- """

    print("\nWorking on simple LDA num_topics=16, passes=10...")

    lda_model_bow = gensim.models.LdaModel(bow_corpus,
                                           num_topics=16,
                                           id2word=dictionary,
                                           passes=10)

    for idx, topic in lda_model_bow.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

    print('\nPerplexity: ', lda_model_bow.log_perplexity(bow_corpus))

    coherence_model_lda = CoherenceModel(model=lda_model_bow,
                                         texts=documents,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    print('\nworking on topic visualization')

    vis = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
    pyLDAvis.save_html(vis, 'LDA_visualized.html')

    print('Time for this WHOLE thing: {} mins'.format(
        round((time() - t) / 60, 2)))
    plt.show()
Example #19
File: LDA.py  Project: strath-ace/smart-nlp
def ldaModelGeneration(train_corpus, test_corpus, topic_number, value_save_model):
    '''
    LDA model training, visualisation and evaluation

    Inputs:
    - train_corpus: 80% of the wikipedia corpus to be used for training of LDA model
    - test_corpus: 20% of the wikipedia corpus to be used for final evaluation of trained model(perplexity+coherence)
    - topic_number: number of latent topics to be found by model
    - value_save_model: if True, will save model, model dictionary and pyldavis visualisation

    Output:
    - perplexity value: evaluation of perplexity of trained model over unseen documents (test_corpus) --> common LDA
    evaluation metrics
    - coherence value: coherence score of trained model over unseen documents (test_corpus) --> less reliable
    '''

    fileDir = os.path.dirname(os.path.abspath(__file__))  # Directory of this file
    parentDir = os.path.dirname(fileDir)  # Directory containing the module directory

    # ------------------------------------------------------------------------------------------------------------
    # LDA MODEL - GENERATION/VISUALISATION WITH TRAINING CORPUS
    # ------------------------------------------------------------------------------------------------------------
    # Choose LDA model name for this iteration
    model_name = 'model_' + str(topic_number)

    # Create model dictionary
    dictionary = corpora.Dictionary(train_corpus)
    dictionary.filter_extremes(no_below=0.2)
    print('\n LDA Model Inputs:\n Dictionary Size:', dictionary)

    # Create Document-Term matrix
    corpus = [dictionary.doc2bow(tokens) for tokens in train_corpus]

    # Generate LDA model
    ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=topic_number, passes=300)

    if value_save_model == True:

        # Visualise topics: words and their weights
        print("LDA Topics:")
        for i in ldamodel.show_topics(formatted=False, num_topics=ldamodel.num_topics, num_words=20):
            print(i)

        # Save model
        dictionary.save(parentDir + '/TopicModeling/LDAmodels/new_unsupervised/dic_' + str(model_name) + '.dict')
        ldamodel.save(parentDir + '/TopicModeling/LDAmodels/new_unsupervised/' + str(model_name))
        print('LDA model generated and saved')

        # Save pyldavis (usually takes a few minutes to generate)
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics = False)
        pyLDAvis.save_html(vis, parentDir + '/TopicModeling/LDAmodels/new_unsupervised/LDA_Visualization_' + str(topic_number)+'.html')

    # ------------------------------------------------------------------------------------------------------------
    #                                LDA MODEL - EVALUATION
    # ------------------------------------------------------------------------------------------------------------

    # Use same dictionary as the model was trained with to transform unseen data into Document-Term matrix
    corpusTest = [dictionary.doc2bow(tokens) for tokens in test_corpus]

    # Model Perplexity - must be minimised
    perplexity = ldamodel.log_perplexity(corpusTest)
    perplexityExp = math.exp(perplexity)

    # Topic Coherence
    cm = CoherenceModel(model=ldamodel, corpus=corpusTest, coherence='u_mass')
    coherence = cm.get_coherence()  # get coherence value

    return perplexityExp, coherence
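# Usage sketch (hedged; `train_corpus` and `test_corpus` are tokenised document lists
# as described in the docstring):
#     perplexity, coherence = ldaModelGeneration(train_corpus, test_corpus,
#                                                topic_number=20, value_save_model=False)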
Example #20
def get_coherence(lda_model, data_lemmatized, id2word):
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_lemmatized,
                                         dictionary=id2word,
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()
def calculate_cv(model, data, id2word):
    coherence_model_lda = CoherenceModel(model=model,
                                         texts=data,
                                         dictionary=id2word,
                                         coherence='c_v')
    return coherence_model_lda.get_coherence()
Example #22
bag_of_words = [dictionary.doc2bow(abstract) for abstract in abstracts]

print('- Read and preprocessed the dataset!')

########################## DYNAMIC TOPIC MODELING ##########################

#Build the model
print('- Training the model')
start_time = time.time()  # Start timing
ldaseq = ldaseqmodel.LdaSeqModel(corpus=bag_of_words,
                                 id2word=dictionary,
                                 time_slice=time_slices_2years_interval,
                                 num_topics=8)
print('- Model finished running in', round((time.time() - start_time) / 60),
      'min(s)')
#Save the model
path = datapath('dynamic_model_code')
ldaseq.save(path)
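# Reload sketch (an assumption, not in the original snippet): the saved model can be
# restored later with the matching load call.
# ldaseq = ldaseqmodel.LdaSeqModel.load(path)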

########################## EVALUATION ##########################
coherence = ldaseq.dtm_coherence(time=0)
temp = CoherenceModel(topics=coherence,
                      corpus=bag_of_words,
                      dictionary=dictionary,
                      coherence='u_mass')
print("u_mass = ", temp.get_coherence())
temp = CoherenceModel(topics=coherence,
                      texts=abstracts,
                      dictionary=dictionary,
                      coherence='c_v')
print("c_v = ", temp.get_coherence())