예제 #1
0
 def testPersistenceCompressed(self):
     """A model saved under a ``.gz`` file name reloads with the same coherence."""
     gz_path = testfile() + '.gz'
     original = CoherenceModel(
         topics=self.topics1,
         corpus=self.corpus,
         dictionary=self.dictionary,
         coherence='u_mass',
     )
     original.save(gz_path)
     restored = CoherenceModel.load(gz_path)

     expected = original.get_coherence()
     actual = restored.get_coherence()
     self.assertTrue(expected == actual)
    def EvaluateCoherence(self, model, coherence='c_v'):
        """
        Compute one or more coherence scores for ``model`` and record them.

        Every supported measure is scored with a ``CoherenceModel`` built from
        ``self.tokenizedDocs``, ``self.corpus`` and ``self.id2token`` (``topn=10``).
        Each score is appended to the matching ``self.<measure>_list`` run log
        and written to the module logger.

        :param model: topic model handed to ``CoherenceModel``.
        :param coherence: one measure name, or a list of measure names. Names
            other than ``u_mass``, ``c_v``, ``c_uci`` and ``c_npmi`` are skipped
            with a warning.
        :return: dict mapping each evaluated measure name to its score.
        :raises ValueError: if ``coherence`` is neither a ``str`` nor a ``list``.
        """
        log = logging.getLogger(__name__)

        if isinstance(coherence, str):
            requested = [coherence]
        elif isinstance(coherence, list):
            requested = coherence
        else:
            raise ValueError("The coherence method should be either a list or a specific type")

        supported = set(['u_mass', 'c_v', 'c_uci', 'c_npmi'])
        scores = dict()
        for measure in requested:
            if measure not in supported:
                log.warning("Coherence evaluation for type %s is not supported, ignored. Only support types %s" % (measure, str(supported)))
                continue

            scorer = CoherenceModel(model=model, texts=self.tokenizedDocs, corpus=self.corpus,
                                    dictionary=self.id2token, coherence=measure, topn=10)
            score = scorer.get_coherence()
            scores[measure] = score

            # Each supported measure has a run log attribute named "<measure>_list".
            getattr(self, measure + '_list').append(score)
            log.info("Coherence type: %s, coherence value = %.6f" % (measure, score))
        return scores
 def testPersistence(self):
     """A saved u_mass model reloads with the same coherence value."""
     tmp_path = get_tmpfile('gensim_models_coherence.tst')
     original = CoherenceModel(
         topics=self.topics1,
         corpus=self.corpus,
         dictionary=self.dictionary,
         coherence='u_mass',
     )
     original.save(tmp_path)
     restored = CoherenceModel.load(tmp_path)

     expected = original.get_coherence()
     actual = restored.get_coherence()
     self.assertTrue(expected == actual)
예제 #4
0
 def testAccumulatorCachingTopicSubsets(self):
     """Shrinking the topics to subsets, then restoring them, keeps the accumulator."""
     shared = {
         'corpus': self.corpus,
         'dictionary': self.dictionary,
         'coherence': 'u_mass',
     }
     cm = CoherenceModel(topics=self.topics1, **shared)
     cm.estimate_probabilities()
     cached = cm._accumulator
     self.assertIsNotNone(cached)

     # First two words of every topic only.
     cm.topics = [words[:2] for words in self.topics1]
     self.assertEqual(cached, cm._accumulator)

     cm.topics = self.topics1
     self.assertEqual(cached, cm._accumulator)
예제 #5
0
    def check_coherence_measure(self, coherence):
        """Assert that ``topics1`` scores higher than ``topics2`` under ``coherence``.

        Measures listed in ``BOOLEAN_DOCUMENT_BASED`` are fed the bag-of-words
        corpus; all others are fed the tokenized texts.
        """
        kwargs = {'dictionary': self.dictionary, 'coherence': coherence}
        if coherence in BOOLEAN_DOCUMENT_BASED:
            kwargs['corpus'] = self.corpus
        else:
            kwargs['texts'] = self.texts

        better = CoherenceModel(topics=self.topics1, **kwargs)
        worse = CoherenceModel(topics=self.topics2, **kwargs)
        self.assertGreater(better.get_coherence(), worse.get_coherence())
예제 #6
0
 def testAccumulatorCachingWithModelSetting(self):
     """Assigning a model replaces the topics and clears the cached accumulator."""
     shared = {
         'corpus': self.corpus,
         'dictionary': self.dictionary,
         'coherence': 'u_mass',
     }
     cm = CoherenceModel(topics=self.topics1, **shared)
     cm.estimate_probabilities()
     self.assertIsNotNone(cm._accumulator)

     cm.model = self.ldamodel
     # Expected topics: the top-n word ids of every row of the model's lambda.
     expected_topics = [
         argsort(weights, topn=cm.topn, reverse=True)
         for weights in self.ldamodel.state.get_lambda()
     ]
     self.assertTrue(np.array_equal(expected_topics, cm.topics))
     self.assertIsNone(cm._accumulator)
예제 #7
0
    def testAccumulatorCachingWithTopnSettingGivenModel(self):
        """With a model, lowering ``topn`` keeps the cache and raising it still works."""
        settings = {
            'corpus': self.corpus,
            'dictionary': self.dictionary,
            'topn': 5,
            'coherence': 'u_mass',
        }
        cm = CoherenceModel(model=self.ldamodel, **settings)
        cm.estimate_probabilities()
        self.assertIsNotNone(cm._accumulator)

        cached = cm._accumulator
        stored_topics = cm._topics

        cm.topn = 3
        self.assertEqual(cached, cm._accumulator)
        self.assertEqual(3, len(cm.topics[0]))
        self.assertEqual(stored_topics, cm._topics)

        # should be able to expand given the model
        cm.topn = 6
        self.assertEqual(6, len(cm.topics[0]))
예제 #8
0
 def testPersistenceAfterProbabilityEstimationUsingTexts(self):
     """A c_v model saved after estimation reloads with its accumulator and score."""
     tmp_path = testfile()
     original = CoherenceModel(
         topics=self.topics1,
         texts=self.texts,
         dictionary=self.dictionary,
         coherence='c_v',
     )
     original.estimate_probabilities()
     original.save(tmp_path)

     restored = CoherenceModel.load(tmp_path)
     self.assertIsNotNone(restored._accumulator)

     expected = original.get_coherence()
     actual = restored.get_coherence()
     self.assertTrue(expected == actual)
 def testPersistenceAfterProbabilityEstimationUsingCorpus(self):
     """A u_mass model saved after estimation reloads with its accumulator and score."""
     tmp_path = get_tmpfile('gensim_similarities.tst.pkl')
     original = CoherenceModel(
         topics=self.topics1,
         corpus=self.corpus,
         dictionary=self.dictionary,
         coherence='u_mass',
     )
     original.estimate_probabilities()
     original.save(tmp_path)

     restored = CoherenceModel.load(tmp_path)
     self.assertIsNotNone(restored._accumulator)

     expected = original.get_coherence()
     actual = restored.get_coherence()
     self.assertTrue(expected == actual)
예제 #10
0
def checkCoherenceMeasure(topics1, topics2, coherence):
    """Return whether ``topics1`` is strictly more coherent than ``topics2``.

    Uses the module-level ``corpus``, ``texts`` and ``dictionary``. Measures in
    ``boolean_document_based`` are scored on the corpus, the rest on the texts.
    """
    if coherence in boolean_document_based:
        source = {'corpus': corpus}
    else:
        source = {'texts': texts}

    # Build both models before scoring either of them.
    first, second = [
        CoherenceModel(topics=topic_set, dictionary=dictionary, coherence=coherence, **source)
        for topic_set in (topics1, topics2)
    ]
    return first.get_coherence() > second.get_coherence()
 def testAccumulatorCachingSameSizeTopics(self):
     """Re-assigning the same topics keeps the accumulator; different topics clear it."""
     shared = {
         'corpus': self.corpus,
         'dictionary': self.dictionary,
         'coherence': 'u_mass',
     }
     cm = CoherenceModel(topics=self.topics1, **shared)
     cm.estimate_probabilities()
     cached = cm._accumulator
     self.assertIsNotNone(cached)

     cm.topics = self.topics1
     self.assertEqual(cached, cm._accumulator)

     cm.topics = self.topics2
     self.assertEqual(None, cm._accumulator)
예제 #12
0
def evaluation(randomint, num_topics, n_iter, n_partition, result_path,
               result_folder, data_path, par):
    """Print and return the c_v coherence of previously exported topics.

    Reads the per-topic important words and their probabilities from two CSV
    files under ``result_path + result_folder``, rebuilds each topic as a
    word -> probability mapping, and scores the topics' word lists against the
    documents in the ``nofreq`` column of the CSV at ``data_path``.

    Parameters
    ----------
    randomint, n_partition
        Unused; kept for interface compatibility.
    num_topics, n_iter
        Used to build the result file names.
    result_path, result_folder
        Concatenated to locate the result files.
    data_path
        CSV file with a ``nofreq`` column of stringified token lists.
    par
        Mapping whose ``'word_token'`` entry supplies the vocabulary.

    Returns
    -------
    The c_v coherence value computed by ``CoherenceModel``.

    Raises
    ------
    ValueError
        If the two result files do not contain the same number of rows.
    """

    def _read_rows(path):
        # Rows exactly as csv.reader yields them with newline as the delimiter.
        with open(path) as handle:
            return list(csv.reader(handle, delimiter='\n'))

    def _tokens(row):
        # Flatten a stringified row into bare whitespace-separated tokens.
        text = str(row)
        for junk in ('[', ']', '"', "'"):
            text = text.replace(junk, '')
        return text.replace(',', ' ').split()

    tempdf = pd.read_csv(data_path)
    mydictionary = copy.deepcopy(par['word_token'])

    stem = str(num_topics) + 'topics' + str(n_iter) + 'iteration_'
    words_file = (result_path + result_folder + stem
                  + 'topic_important_words_sep.csv')
    # NOTE(review): unlike the words file, this path inserts a '/' after
    # result_folder; both spellings are kept as written - confirm which is right.
    probs_file = (result_path + result_folder + '/' + stem
                  + 'topic_important_words_probs_sep.csv')

    word_rows = _read_rows(words_file)
    # The original assigned this path to a misspelled name and re-read the
    # words file, so the probabilities were never loaded.
    prob_rows = _read_rows(probs_file)
    if len(word_rows) != len(prob_rows):
        raise ValueError(
            'word and probability files differ in row count: %d vs %d'
            % (len(word_rows), len(prob_rows)))

    all_dic = []
    for word_row, prob_row in zip(word_rows, prob_rows):  # one row per topic
        words = _tokens(word_row)
        probs = np.array(_tokens(prob_row)).astype(float)
        all_dic.append(dict(zip(words, probs)))

    id2word = corpora.Dictionary([list(mydictionary)])
    my_data = tempdf.nofreq.apply(
        lambda x: x.replace('[', '').replace(']', '').replace('"', '').replace(
            "'", '').replace(' ', '').replace(',', ' ').split()).values
    my_topics_ls = [list(topic.keys()) for topic in all_dic]

    cm = CoherenceModel(topics=my_topics_ls,
                        texts=my_data,
                        dictionary=id2word,
                        coherence='c_v')
    coherence = cm.get_coherence()
    print('Coherence Score: ', coherence)
    return coherence
예제 #13
0
def compute_coherence_values(dictionary,
                             bow_corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Train LDA models for several topic counts and pick the most coherent one.

    One ``LdaMulticore`` model is trained for every topic count in
    ``range(start, limit, step)`` and scored with c_v coherence. The model
    whose score (rounded to 4 decimals) is highest wins; on ties the earliest
    one is kept.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    bow_corpus : Gensim bag-of-words corpus
    texts : List of tokenized input texts
    limit : Exclusive upper bound on the number of topics
    start : First number of topics to try
    step : Increment between tried topic counts

    Returns:
    -------
    best_model : LDA model with the highest coherence (None if nothing was trained)
    best_topic : Number of topics of ``best_model`` (0 if nothing was trained)
    coherence_values : Unrounded coherence value of every trained model, in order
    """
    coherence_values = []
    # Start from "no winner" rather than a score of 0 so the return statement
    # never hits an unbound best_model (empty range, or no score above 0).
    best_model = None
    best_topic = 0
    best_score = None
    for num_topics in range(start, limit, step):
        print('Training with ', num_topics, ' Topic')
        lda_model = gensim.models.LdaMulticore(
            bow_corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=15,
            workers=8,
            minimum_probability=0.04,
            random_state=50,
            alpha=1e-2,
            chunksize=4000,
            eta=0.5e-2,
        )
        coherencemodel = CoherenceModel(model=lda_model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')

        # Score once and reuse the value.
        coherence = coherencemodel.get_coherence()
        rounded = round(coherence, 4)
        print("Num Topics =", num_topics, " coherence = ", rounded)

        if best_score is None or best_score < rounded:
            best_score = rounded
            best_model = lda_model
            best_topic = num_topics

        coherence_values.append(coherence)

    return best_model, best_topic, coherence_values
예제 #14
0
    def testCompareCoherenceForModels(self):
        """Comparing the same LDA model twice gives matching, self-consistent scores."""
        candidates = [self.ldamodel, self.ldamodel]
        cm = CoherenceModel.for_models(
            candidates, dictionary=self.dictionary, texts=self.texts, coherence='c_v')
        self.assertIsNotNone(cm._accumulator)

        # Accumulator should have all relevant IDs.
        for candidate in candidates:
            cm.model = candidate
            self.assertIsNotNone(cm._accumulator)

        first, second = cm.compare_models(candidates)
        per_topic1, overall1 = first
        per_topic2, overall2 = second

        # The overall score is the mean of the per-topic scores.
        self.assertAlmostEqual(np.mean(per_topic1), overall1, 4)
        self.assertAlmostEqual(np.mean(per_topic2), overall2, 4)
        self.assertAlmostEqual(overall1, overall2, places=4)
    def testCompareCoherenceForTopics(self):
        """Comparing two topic sets ranks ``topics1`` above ``topics2``."""
        topic_sets = [self.topics1, self.topics2]
        cm = CoherenceModel.for_topics(
            topic_sets, dictionary=self.dictionary, texts=self.texts, coherence='c_v')
        self.assertIsNotNone(cm._accumulator)

        # Accumulator should have all relevant IDs.
        for topic_set in topic_sets:
            cm.topics = topic_set
            self.assertIsNotNone(cm._accumulator)

        first, second = cm.compare_model_topics(topic_sets)
        per_topic1, overall1 = first
        per_topic2, overall2 = second

        # The overall score is the mean of the per-topic scores.
        self.assertAlmostEqual(np.mean(per_topic1), overall1, 4)
        self.assertAlmostEqual(np.mean(per_topic2), overall2, 4)
        self.assertGreater(overall1, overall2)
예제 #16
0
    def testCompareCoherenceForTopics(self):
        """Comparing two topic sets ranks ``topics1`` above ``topics2``."""
        topic_sets = [self.topics1, self.topics2]
        cm = CoherenceModel.for_topics(
            topic_sets, dictionary=self.dictionary, texts=self.texts, coherence='c_v')
        self.assertIsNotNone(cm._accumulator)

        # Accumulator should have all relevant IDs.
        for topic_set in topic_sets:
            cm.topics = topic_set
            self.assertIsNotNone(cm._accumulator)

        first, second = cm.compare_model_topics(topic_sets)
        per_topic1, overall1 = first
        per_topic2, overall2 = second

        # The overall score is the mean of the per-topic scores.
        self.assertAlmostEqual(np.mean(per_topic1), overall1, 4)
        self.assertAlmostEqual(np.mean(per_topic2), overall2, 4)
        self.assertGreater(overall1, overall2)
    def testCompareCoherenceForModels(self):
        """Comparing the same LDA model twice gives matching, self-consistent scores."""
        candidates = [self.ldamodel, self.ldamodel]
        cm = CoherenceModel.for_models(
            candidates, dictionary=self.dictionary, texts=self.texts, coherence='c_v')
        self.assertIsNotNone(cm._accumulator)

        # Accumulator should have all relevant IDs.
        for candidate in candidates:
            cm.model = candidate
            self.assertIsNotNone(cm._accumulator)

        first, second = cm.compare_models(candidates)
        per_topic1, overall1 = first
        per_topic2, overall2 = second

        # The overall score is the mean of the per-topic scores.
        self.assertAlmostEqual(np.mean(per_topic1), overall1, 4)
        self.assertAlmostEqual(np.mean(per_topic2), overall2, 4)
        self.assertAlmostEqual(overall1, overall2, places=4)
예제 #18
0
def getCoherency(d, corp, topics=10, coherence='u-mass', varyTopics=False):
    """Return the u_mass coherence of an LDA model trained on ``corp``.

    :param d: id-to-word mapping, passed to ``LdaModel`` as its third argument.
    :param corp: bag-of-words corpus used for both training and scoring.
    :param topics: number of topics when ``varyTopics`` is false.
    :param coherence: accepted for interface compatibility but ignored; the
        measure is always ``'u_mass'``.
    :param varyTopics: when true, train one model for each topic count from 5
        to 15 and return the best (maximum) coherence among them.
    :return: the coherence value, or the maximum over the sweep.
    """

    def _u_mass(num_topics):
        # Train a fresh model and score it on the training corpus.
        lda = LdaModel(corp, num_topics, d)
        scorer = CoherenceModel(model=lda, corpus=corp, coherence='u_mass')
        return scorer.get_coherence()

    if varyTopics:
        # The original also trained a `topics`-sized model here and threw it
        # away; that wasted training run is no longer performed.
        return np.max([_u_mass(num_topics) for num_topics in range(5, 16)])
    return _u_mass(topics)
예제 #19
0
def fit_model(corpora, dictionary, topicNum, beta):
    """Fit an LDA model, record its coherence metrics and return the gensim model.

    :param corpora: tokenized documents (one token list per document).
    :param dictionary: gensim dictionary used to build the bag-of-words corpus.
    :param topicNum: number of topics.
    :param beta: value passed to ``LdaTransformer`` as ``eta``.
    :return: the fitted transformer's ``gensim_model``.

    The u_mass, c_uci and c_npmi scores are best-effort: if any of them cannot
    be computed, all three are recorded as ``"Invalid"`` and the failure is
    logged. The configuration is saved via ``saveModelConfigs`` to the
    module-level ``config_path`` either way.
    """
    import logging

    corpus = [dictionary.doc2bow(text) for text in corpora]

    model = LdaTransformer(id2word=dictionary, num_topics=topicNum, alpha='auto', eta=beta, iterations=100, random_state=2019)
    lda = model.fit(corpus)
    coherence = evaluateModel(lda.gensim_model, corpora)

    try:
        u_mass = CoherenceModel(
            model=lda.gensim_model, corpus=corpus, dictionary=dictionary, coherence='u_mass',
        ).get_coherence()
        c_uci = CoherenceModel(
            model=lda.gensim_model, texts=corpora, coherence='c_uci',
        ).get_coherence()
        c_npmi = CoherenceModel(
            model=lda.gensim_model, texts=corpora, coherence='c_npmi',
        ).get_coherence()
    except Exception:
        # Catch Exception, not everything: a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        logging.getLogger(__name__).exception(
            "Coherence computation failed; recording metrics as Invalid")
        u_mass = c_uci = c_npmi = "Invalid"

    saveModelConfigs(lda, coherence, u_mass, c_uci, c_npmi, config_path)
    return lda.gensim_model
예제 #20
0
def find_best_coherence(tokens, range_num_topics):
    """Return the topic count in ``1..range_num_topics`` with the best c_v coherence.

    One model is trained per topic count with ``train_LSIModel`` and scored
    against ``tokens``. On ties the smallest topic count wins.

    :param tokens: tokenized documents (one token list per document).
    :param range_num_topics: largest number of topics to try.
    :return: the number of topics of the best-scoring model.
    :raises ValueError: if ``range_num_topics`` is smaller than 1.
    """
    if range_num_topics < 1:
        raise ValueError("range_num_topics must be at least 1")

    dct = corpora.Dictionary(tokens)
    # Build the list directly: the original assigned by index into an empty
    # list, which raises IndexError on the first iteration.
    coherences = [
        CoherenceModel(model=train_LSIModel(tokens, num_topics),
                       texts=tokens,
                       dictionary=dct,
                       coherence='c_v').get_coherence()
        for num_topics in range(1, range_num_topics + 1)
    ]

    # max() returns the first maximal position; position p holds the score of
    # the model with p + 1 topics, including the very first one.
    best_position = max(range(len(coherences)), key=coherences.__getitem__)
    max_index = best_position + 1
    maximum = coherences[best_position]
    # str() is required: concatenating a float to a str raises TypeError.
    print(str(max_index) + " has coherence " + str(maximum))
    return max_index
예제 #21
0
파일: model.py 프로젝트: bachmitre/ntm
    def get_coherence(self,
                      docs=None,
                      dictionary=None,
                      corpus=None,
                      n_terms=10):
        """Return the c_npmi coherence of this model's topics.

        :param docs: tokenized documents, passed to ``CoherenceModel`` as
            ``texts``; also used to build the dictionary and corpus when
            neither is supplied.
        :param dictionary: optional dictionary.
        :param corpus: optional bag-of-words corpus.
        :param n_terms: number of terms requested from ``self.get_topics``.
        :return: the coherence value.
        """
        topic_terms = self.get_topics(n_terms=n_terms)

        # Derive both from the documents only when neither was supplied.
        if not (dictionary or corpus):
            dictionary = Dictionary(docs)
            corpus = [dictionary.doc2bow(doc) for doc in docs]

        scorer = CoherenceModel(
            topn=self.n_components,
            texts=docs,
            topics=topic_terms.values,
            corpus=corpus,
            dictionary=dictionary,
            coherence='c_npmi',
        )
        return scorer.get_coherence()
예제 #22
0
 def create_models(self):
     """Score one model per entry of ``self.num_topics`` with c_v and export the results.

     The scores are written to a CSV file under ``self.folder_path`` and handed
     to the private plotting helper.
     """
     csv_path = (self.folder_path + self.algorithm + '/'
                 + get_range_file_name() + ".csv")

     scores = []
     for topic_count in self.num_topics:
         print(topic_count)
         scorer = CoherenceModel(model=self.get_model(topic_count),
                                 texts=self.dataset,
                                 corpus=self.corpus_tfidf,
                                 coherence='c_v')
         scores.append(scorer.get_coherence())

     frame = pd.DataFrame({'num_topics': self.num_topics, 'c_v': scores})
     frame.to_csv(csv_path)
     self.__plot_coherence_scores(scores, "c_v")
     print("models created")
예제 #23
0
    def check_coherence_measure(self, coherence):
        """Assert that ``topics1`` scores higher than ``topics2`` under ``coherence``.

        Measures listed in ``BOOLEAN_DOCUMENT_BASED`` are fed the bag-of-words
        corpus; all others are fed the tokenized texts.
        """
        kwargs = {'dictionary': self.dictionary, 'coherence': coherence}
        if coherence in BOOLEAN_DOCUMENT_BASED:
            kwargs['corpus'] = self.corpus
        else:
            kwargs['texts'] = self.texts

        better = CoherenceModel(topics=self.topics1, **kwargs)
        worse = CoherenceModel(topics=self.topics2, **kwargs)
        self.assertGreater(better.get_coherence(), worse.get_coherence())
    def testAccumulatorCachingWithTopnSettingGivenModel(self):
        """With a model, lowering ``topn`` keeps the cache and raising it still works."""
        settings = {
            'corpus': self.corpus,
            'dictionary': self.dictionary,
            'topn': 5,
            'coherence': 'u_mass',
        }
        cm = CoherenceModel(model=self.ldamodel, **settings)
        cm.estimate_probabilities()
        self.assertIsNotNone(cm._accumulator)

        cached = cm._accumulator
        stored_topics = cm._topics

        cm.topn = 3
        self.assertEqual(cached, cm._accumulator)
        self.assertEqual(3, len(cm.topics[0]))
        self.assertEqual(stored_topics, cm._topics)

        # should be able to expand given the model
        cm.topn = 6
        self.assertEqual(6, len(cm.topics[0]))
예제 #25
0
class MyLda:
    """LDA topic model over the bag-of-words documents held by ``myDictionary``.

    ``myDictionary`` must expose ``doc2bows`` (the corpus) and ``dictionary``
    (passed to ``LdaModel`` as ``id2word``).
    """

    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        """Train the model and build the topic <-> document index mappings."""
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(
            self.myDictionary.doc2bows,
            id2word=self.myDictionary.dictionary,
            num_topics=num_topics,
        )
        self.topic2ids, self.id2topics = self.get_mappings()
        # Built on first use by get_coherence().
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        """Return ``(topic -> document indices, document index -> topics)``.

        For each document, the first topic reported by the model is always
        kept; later ones only when their probability reaches
        ``self.topic_threshold``.
        """
        docs_by_topic = defaultdict(list)
        topics_by_doc = defaultdict(list)
        for doc_index, bow in enumerate(self.myDictionary.doc2bows):
            doc_topics = self.model.get_document_topics(bow)
            for rank, (topic, prob) in enumerate(doc_topics):
                if prob >= self.topic_threshold or rank == 0:
                    docs_by_topic[topic].append(doc_index)
                    topics_by_doc[doc_index].append(topic)
        return docs_by_topic, topics_by_doc

    def get_topic_terms(self, topic):
        """Return the model's terms for ``topic``."""
        return self.model.get_topic_terms(topic)

    def get_top_topic(self):
        """Return the model's top topics and their score averaged over ``num_topics``."""
        ranked = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        total = sum(entry[1] for entry in ranked)
        return ranked, total / self.num_topics

    def get_perplexity(self):
        """Return the model's log perplexity on the training corpus."""
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        """Return the u_mass coherence, creating the coherence model on first use."""
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(
                model=self.model,
                corpus=self.myDictionary.doc2bows,
                dictionary=self.myDictionary.dictionary,
                coherence='u_mass',
            )
        return self.coherenceModel.get_coherence()
예제 #26
0
def getCoherenceScores(nTopics):
    """Train and save a DTM with ``nTopics`` topics; return u_mass coherence per time slice.

    Relies on the module-level ``path_to_dtm_binary``, ``corpus``,
    ``dictionary`` and ``timeSlice``. The result holds one coherence value for
    every entry of ``timeSlice``, in order.
    """
    dtm = DtmModel(path_to_dtm_binary,
                   corpus=corpus,
                   num_topics=nTopics,
                   id2word=dictionary,
                   time_slices=timeSlice)
    dtm.save(f'./Models/model{nTopics}Topics')

    # Word representation of the topics for each time slice.
    topics_by_slice = [
        dtm.dtm_coherence(time=slice_index)
        for slice_index in range(len(timeSlice))
    ]
    scorers = [
        CoherenceModel(topics=slice_topics,
                       corpus=corpus,
                       dictionary=dictionary,
                       coherence='u_mass')
        for slice_topics in topics_by_slice
    ]
    return [scorer.get_coherence() for scorer in scorers]
    def testAccumulatorCachingWithTopnSettingGivenTopics(self):
        """With explicit topics, ``topn`` may shrink and regrow up to 5 but not beyond."""
        settings = {
            'corpus': self.corpus,
            'dictionary': self.dictionary,
            'topn': 5,
            'coherence': 'u_mass',
        }
        cm = CoherenceModel(topics=self.topics1, **settings)
        cm.estimate_probabilities()
        self.assertIsNotNone(cm._accumulator)

        cached = cm._accumulator
        stored_topics = cm._topics

        cm.topn = 3
        self.assertEqual(cached, cm._accumulator)
        self.assertEqual(3, len(cm.topics[0]))
        self.assertEqual(stored_topics, cm._topics)

        # Topics should not have been truncated, so topn settings below 5 should work
        cm.topn = 4
        self.assertEqual(cached, cm._accumulator)
        self.assertEqual(4, len(cm.topics[0]))
        self.assertEqual(stored_topics, cm._topics)

        # can't expand topics any further without model
        with self.assertRaises(ValueError):
            cm.topn = 6
예제 #28
0
def evaluate_graph(dictionary, corpus, texts, begin, end, steps):
    """
    Train LDA models over a range of topic counts and record their coherence.

    For every topic count in ``range(begin, end, steps)`` an ``LdaMulticore``
    model is trained and scored with both c_v and u_mass coherence. The scores
    are printed and written, one value per line, to ``c_v.txt`` and
    ``u_mass.txt`` in the current working directory.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : Tokenized texts used for the c_v measure
    begin : First number of topics to try
    end : Exclusive upper bound on the number of topics
    steps : Increment between tried topic counts

    Returns:
    -------
    lm_list : List of LDA topic models
    """
    u_mass = []
    c_v = []
    lm_list = []
    for num_topics in range(begin, end, steps):
        lm = LdaMulticore(corpus=corpus,
                          num_topics=num_topics,
                          workers=24,
                          id2word=dictionary,
                          eval_every=10,
                          eta='auto',
                          passes=20)
        lm_list.append(lm)
        cm_umass = CoherenceModel(model=lm,
                                  corpus=corpus,
                                  dictionary=dictionary,
                                  coherence='u_mass')
        cm_cv = CoherenceModel(model=lm,
                               texts=texts,
                               dictionary=dictionary,
                               coherence='c_v')
        c_v.append(cm_cv.get_coherence())
        u_mass.append(cm_umass.get_coherence())

    # Context managers make sure both files are flushed and closed; the
    # original left them open.
    print(c_v)
    with open('c_v.txt', 'w') as file_1:
        file_1.writelines("%s\n" % item for item in c_v)

    print(u_mass)
    with open('u_mass.txt', 'w') as file_2:
        file_2.writelines("%s\n" % item for item in u_mass)

    return lm_list
예제 #29
0
    def testAccumulatorCachingWithTopnSettingGivenTopics(self):
        """With explicit topics, ``topn`` may shrink and regrow up to 5 but not beyond."""
        settings = {
            'corpus': self.corpus,
            'dictionary': self.dictionary,
            'topn': 5,
            'coherence': 'u_mass',
        }
        cm = CoherenceModel(topics=self.topics1, **settings)
        cm.estimate_probabilities()
        self.assertIsNotNone(cm._accumulator)

        cached = cm._accumulator
        stored_topics = cm._topics

        cm.topn = 3
        self.assertEqual(cached, cm._accumulator)
        self.assertEqual(3, len(cm.topics[0]))
        self.assertEqual(stored_topics, cm._topics)

        # Topics should not have been truncated, so topn settings below 5 should work
        cm.topn = 4
        self.assertEqual(cached, cm._accumulator)
        self.assertEqual(4, len(cm.topics[0]))
        self.assertEqual(stored_topics, cm._topics)

        # can't expand topics any further without model
        with self.assertRaises(ValueError):
            cm.topn = 6
예제 #30
0
    def topic_coherence(self):
        """Return the ``(c_v, u_mass)`` coherence scores of the LDA model.

        The model is fitted first via ``self.fit()`` if ``self.lda_model`` is
        still ``None``. Both scores are computed from ``self.docs`` and
        ``self.dictionary`` and logged at INFO level.
        """
        if self.lda_model is None:
            self.fit()

        # Compute Coherence Score using c_v
        coherence_model_lda = CoherenceModel(model=self.lda_model,
                                             texts=self.docs,
                                             dictionary=self.dictionary,
                                             coherence='c_v')
        coherence_lda_CV = coherence_model_lda.get_coherence()
        # A %s placeholder is required: without it logging fails to format the
        # record and the score never reaches the log.
        log.info('\nCoherence Score CV method: %s', coherence_lda_CV)

        # Compute Coherence Score using UMass
        coherence_model_lda = CoherenceModel(model=self.lda_model,
                                             texts=self.docs,
                                             dictionary=self.dictionary,
                                             coherence="u_mass")
        coherence_lda_umass = coherence_model_lda.get_coherence()
        log.info('\nCoherence Score: %s', coherence_lda_umass)

        return coherence_lda_CV, coherence_lda_umass
def check_coherence(listcorpus, vectorcorpus, model, numtopics, resultsfolder):
    """Write whole-model and per-topic coherence scores to tab-separated files.

    ``coherences-model.csv`` receives one score per measure (c_v, c_npmi,
    u_mass, c_uci). ``coherences-topics.csv`` receives the c_v score of each
    topic, sorted from most to least coherent. Both go into ``resultsfolder``.
    """
    print("check_coherence")

    def build(measure):
        # All coherence models share the same inputs; only the measure varies.
        return CoherenceModel(texts=listcorpus,
                              model=model,
                              corpus=vectorcorpus,
                              coherence=measure,
                              processes=3)

    # coherence for the entire model, using several measures
    measures = ["c_v", "c_npmi", "u_mass", "c_uci"]
    model_scores = {measure: build(measure).get_coherence() for measure in measures}
    model_frame = pd.DataFrame.from_dict(model_scores,
                                         orient='index',
                                         columns=["score"])
    model_path = join(resultsfolder, "coherences-model.csv")
    with open(model_path, "w", encoding="utf8") as outfile:
        model_frame.to_csv(outfile, sep="\t")

    # coherence of each topic, using one measure only
    per_topic = build("c_v").get_coherence_per_topic()
    topic_rows = list(zip(range(0, numtopics), per_topic))
    topic_frame = pd.DataFrame(topic_rows, columns=["topic", "score"])
    topic_frame = topic_frame.sort_values(by="score", ascending=False)
    topic_path = join(resultsfolder, "coherences-topics.csv")
    with open(topic_path, "w", encoding="utf8") as outfile:
        topic_frame.to_csv(outfile, sep="\t")
예제 #32
0
파일: LDA.py 프로젝트: BigGold0202/Module2
# Train the reference 5-topic LDA model on the bag-of-words corpus.
# NOTE(review): `rec_corpus` and `rec_dict` are not defined in this snippet;
# presumably they are built earlier in the file - confirm.
lda_model = gensim.models.LdaModel(rec_corpus,
                                   num_topics=5,
                                   id2word=rec_dict,
                                   passes=2)

# Compare different numbers of topics: train one model for each of 3, 5 and 10
# topics and collect its u_mass coherence in `cm_score` (same order).
cm_score = []
for i in [3, 5, 10]:
    lda_model0 = gensim.models.LdaModel(rec_corpus,
                                        num_topics=i,
                                        iterations=100,
                                        id2word=rec_dict,
                                        passes=2)
    cm = CoherenceModel(model=lda_model0,
                        corpus=rec_corpus,
                        coherence='u_mass')
    cm_score.append(cm.get_coherence())

# Print every topic of the 5-topic reference model (not of the sweep models).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


def lda_ana(rev_text, n, iterations=100):
    """Tokenize stringified review token lists and build a filtered BoW corpus.

    ``rev_text`` is a pandas-style object with ``apply``. Each element is a
    string holding a Python list literal of tokens.

    NOTE(review): as visible here the function ends after building the corpus
    and returns ``None``; ``n`` and ``iterations`` are not used.
    """
    # Parse each literal, join its tokens with spaces, then split on single
    # spaces again (so tokens that themselves contain spaces are broken up).
    token_lists = rev_text.apply(
        lambda raw: " ".join(ast.literal_eval(raw))
    ).apply(
        lambda joined: joined.split(' ')
    )

    vocabulary = gensim.corpora.Dictionary(token_lists)
    vocabulary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_docs = [vocabulary.doc2bow(tokens) for tokens in token_lists]
예제 #33
0
# Apply the preprocessing function to every review to get one token list per
# document, then build the dictionary and the bag-of-words corpus from them.
# NOTE(review): `preprocess` and `reviews` are defined outside this snippet;
# the original comment says the tokens are stemmed - confirm against `preprocess`.
docs = list(map(preprocess, reviews))
dictionary = gensim.corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=10)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# Choose the number of topics via the c_v coherence score: train one model for
# each topic count from 2 to 20 and record its score in `cs` (same order).
cs = []
for i in range(2, 21):
    model = gensim.models.LdaModel(bow_corpus,
                                   num_topics=i,
                                   id2word=dictionary,
                                   passes=10)

    cv = CoherenceModel(model=model, texts=list(docs),
                        coherence='c_v').get_coherence()
    cs.append(cv)
    print(f'{i} topics, Coherence Score = {cv: .3f}')

# Build the final 6-topic model (this rebinds `model`, replacing the last
# model of the sweep above).
model = gensim.models.LdaModel(bow_corpus,
                               num_topics=6,
                               id2word=dictionary,
                               passes=50)

# Save the topics: for each of the 6 topics write a header line followed by
# the word of every term returned by get_topic_terms, one per line.
with open('topics.txt', 'w') as f:
    for l in range(6):
        f.write(f'TOPIC {l}:\n')
        for i in model.get_topic_terms(l):
            f.write(f'{dictionary[i[0]]}\n')
def coherence_score(model, tokens_lst, dictionary):
    """Print and return the c_v coherence of ``model``.

    :param model: trained topic model.
    :param tokens_lst: tokenized documents used as the reference texts.
    :param dictionary: gensim dictionary matching ``model``.
    :return: the c_v coherence value.
    """
    # Score against the texts passed in; the original ignored this parameter
    # and read a module-level `data['tokens']` instead.
    coherence_model_lda = CoherenceModel(model=model, texts=tokens_lst, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    return coherence_lda
예제 #35
0
def evaluate(docs):
    """Train an LDA model on ``docs`` and report its topic coherence.

    Pipeline: preprocess the documents, append detected bigram/trigram
    tokens, build the dictionary and bag-of-words corpus, train a 20-topic
    ``LdaModel``, print its c_v and u_mass coherence, then sweep several
    topic counts with ``compute_coherence_values`` and plot the result.

    NOTE: everything after preprocessing only runs when this module is
    executed as a script (``__name__ == "__main__"``); otherwise the
    function returns ``None``.

    Args:
        docs: raw documents accepted by ``docs_preprocessor``; the result is
            presumably a list of mutable token lists - confirm against
            ``docs_preprocessor``.

    Returns:
        The u_mass coherence of the 20-topic model, or ``None`` when the
        module is imported rather than run as a script.
    """
    # Perform function on our document
    docs = docs_preprocessor(docs)
    # Create Bigram & Trigram Models
    from gensim.models import Phrases
    if __name__ != "__main__":
        return None

    # Add bigrams and trigrams to docs; min_count=10 keeps only phrases that
    # appear 10 times or more.
    bigram = Phrases(docs, min_count=10)
    trigram = Phrases(bigram[docs])

    for doc in docs:
        # Tokens containing '_' are treated as detected phrases and appended
        # to the document. The lists are materialized before extending so
        # the document is never mutated while it is being read.
        doc.extend([token for token in bigram[doc] if '_' in token])
        doc.extend([token for token in trigram[doc] if '_' in token])

    # Remove rare & common tokens
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    # Create dictionary and corpus required for Topic Modeling
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    print(corpus[:1])

    # Set parameters.
    num_topics = 20
    chunksize = 500
    passes = 20
    iterations = 400
    eval_every = 1

    # Make a index to word dictionary.
    _ = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                         alpha='auto', eta='auto',
                         iterations=iterations, num_topics=num_topics,
                         passes=passes, eval_every=eval_every)
    # Print the keywords of the topics
    print(lda_model.print_topics())

    # Compute Coherence Score using c_v
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=docs,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Compute Coherence Score using UMass (this is the value returned)
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=docs,
                                         dictionary=dictionary,
                                         coherence="u_mass")
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Sweep the number of topics. compute_coherence_values is defined
    # elsewhere; it is expected to return the trained models and one
    # coherence value per topic count in range(start, limit, step).
    start = 2
    limit = 40
    step = 6
    model_list, coherence_values = compute_coherence_values(
        dictionary=dictionary,
        corpus=corpus,
        texts=docs,
        start=start,
        limit=limit,
        step=step)

    # Show graph
    import matplotlib.pyplot as plt

    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # The trailing comma matters: a bare ("coherence_values") is just a
    # string, and legend() would then label the line with its first letter.
    plt.legend(("coherence_values",), loc='best')
    plt.show()

    return coherence_lda
예제 #36
0
def model_creation(file):
    """Train and persist a 20-topic LDA model for one source CSV.

    Reads ``<cwd>/Src Files/<file>.csv``, preprocesses its ``Body`` column
    with ``pp.pre_process``, builds a filtered dictionary and a TF-IDF
    weighted bag-of-words corpus, trains the LDA model, prints its u_mass
    coherence and writes these artifacts below the current working
    directory:

    * ``Dictionaries/<file>/dictionary.pkl`` - pickled dictionary
    * ``Models/<file>/models`` - saved LDA model
    * ``Topics/<file>/topic.txt`` - one "<index>, <topic>" line per topic
    * ``Named Topics/<file>/topic.csv`` - same topics plus a placeholder name

    Paths are built with ``pathlib`` so the function also works on
    platforms where ``\\`` is not the path separator.

    Args:
        file: base name (without ``.csv``) of the input file; also used as
            the sub-directory name for every artifact.
    """
    base_dir = Path(os.getcwd())

    print("Importing Data")
    csv_path = base_dir / "Src Files" / (file + ".csv")
    try:
        # pandas >= 1.3 spelling of "warn about malformed rows and skip them".
        data = pd.read_csv(csv_path, on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know ``on_bad_lines``; use the legacy flag.
        data = pd.read_csv(csv_path, error_bad_lines=False)
    print("Data import is completed")

    print("Running pre Processing on Documents")
    processed_body = data['Body'].map(pp.pre_process)
    print("Pre Processing is completed")
    print("Creation dictionary with documents")
    dictionary = corpora.Dictionary(processed_body)
    dictionary.filter_extremes(no_below=100, no_above=0.5, keep_n=200000)
    print("Saving dictionary")

    print("Creating dictionary directory")
    dict_dir = base_dir / "Dictionaries" / file
    dict_dir.mkdir(parents=True, exist_ok=True)

    print("Creating Topic directory")
    topics_dir = base_dir / "Topics" / file
    topics_dir.mkdir(parents=True, exist_ok=True)

    named_topics_dir = base_dir / "Named Topics" / file
    named_topics_dir.mkdir(parents=True, exist_ok=True)

    print("Saving Dictionary")
    with open(dict_dir / "dictionary.pkl", 'wb') as d:
        pickle.dump(dictionary, d, protocol=pickle.HIGHEST_PROTOCOL)

    print("Creating Corpus")
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_body]
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    print("Creating LDA Model")
    lda = gensim.models.ldamodel.LdaModel(corpus_tfidf,
                                          num_topics=20,
                                          id2word=dictionary,
                                          passes=4)
    coherence = CoherenceModel(model=lda,
                               corpus=corpus_tfidf,
                               dictionary=dictionary,
                               coherence='u_mass')
    coherenceLda = coherence.get_coherence()
    print("Coherence: " + str(coherenceLda))

    print("Creating model directory")
    model_dir = base_dir / "Models" / file
    model_dir.mkdir(parents=True, exist_ok=True)

    print("Saving model")
    lda.save(str(model_dir / "models"))

    print("Saving Topics to file")
    topicList = lda.show_topics(num_topics=20,
                                num_words=15,
                                log=False,
                                formatted=True)

    with open(topics_dir / "topic.txt", 'w') as f:
        for x, item in topicList:
            f.write(str(x) + ", " + item + "\n")

    with open(named_topics_dir / "topic.csv", 'w') as f:
        f.write("index,Body,Topic\n")
        for x, item in topicList:
            f.write(str(x) + ", " + item + ",Replace Topic Name" + "\n")
예제 #37
0
파일: lda.py 프로젝트: Soumithri/Thesis
def run_lda(corpus,
            dictionary,
            texts,
            num_topics=10,
            passes=20,
            iterations=100):
    """Train one LdaMulticore model and collect its evaluation metrics.

    Args:
        corpus: bag-of-words corpus used for training, perplexity and the
            u_mass coherence.
        dictionary: id-to-word mapping for ``corpus``.
        texts: tokenized documents used by the c_uci, c_v and c_npmi
            coherence measures.
        num_topics: number of topics to train.
        passes: training passes; also embedded in the metric column names.
        iterations: training iterations; also embedded in the column names.

    Returns:
        A ``model`` namedtuple ``(lda_model, eval_frame)`` where
        ``eval_frame`` is a one-row DataFrame holding the topic count, the
        log perplexity and the four coherence values.
    """
    suffix = '_P_{0}_I_{1}'.format(passes, iterations)
    column_names = ['Num_Topics', 'Log_Perplexity' + suffix]
    column_names += [
        'Topic_Coherence({0})'.format(measure) + suffix
        for measure in ('u_mass', 'c_uci', 'c_v', 'c_npmi')
    ]
    eval_frame = pd.DataFrame(columns=column_names)

    logging.debug('******* RUNNING LDA *************')
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             passes=passes,
                             iterations=iterations,
                             chunksize=2500)

    # u_mass is computed from the corpus; the other measures use the texts.
    umass_scorer = CoherenceModel(model=lda_model,
                                  corpus=corpus,
                                  dictionary=dictionary,
                                  coherence='u_mass')
    text_scorers = [
        CoherenceModel(model=lda_model, texts=texts, coherence=measure)
        for measure in ('c_uci', 'c_v', 'c_npmi')
    ]

    row = [
        num_topics,
        lda_model.log_perplexity(corpus),
        umass_scorer.get_coherence(),
    ]
    for scorer in text_scorers:
        row.append(scorer.get_coherence())
    eval_frame.loc[len(eval_frame)] = row

    model = namedtuple('model', ['lda_model', 'eval_frame'])
    return model(lda_model, eval_frame)
예제 #38
0
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _topic_proportion_matrix(models, dictionaries, topic_counts, texts, n_rows):
    """Return the per-model topic weights of ``texts`` joined column-wise.

    For every (model, dictionary, topic count) triple a zero matrix of shape
    ``(n_rows, topic count)`` is filled with the topic weights the model
    reports for each text; the matrices are then concatenated along axis 1.
    """
    blocks = []
    for model, dictionary, num_topics in zip(models, dictionaries, topic_counts):
        block = np.zeros((n_rows, num_topics))
        for row, text in enumerate(texts):
            # Topics the model does not report for this text stay at 0.
            for topic_idx, weight in model[dictionary.doc2bow(text)]:
                block[row, topic_idx] = weight
        blocks.append(block)
    return np.concatenate(blocks, axis=1)


def build_array():
    """Build LDA topic-proportion features and hand them to ``buildmodel``.

    One LDA model is trained per entry of the pickled ``topic_superset``.
    Every training document (``doc_set``) and every test document (a fixed
    1000-document sample of the 2018 data) is then described by the
    concatenated topic weights of all models. The arrays are dumped with
    ``arraydump`` and passed to ``buildmodel`` together with the labels.
    Progress and timings are printed along the way; nothing is returned.
    """
    start = time.time()
    print("start ---------------------------------------------------")

    # load test set
    test_year = dictload(2018)

    # load the rest
    intermediate_path = "../Data/Intermediate/"
    doc_set = _load_pickle(os.path.join(intermediate_path, 'doc_set.p'))
    label_set = _load_pickle(os.path.join(intermediate_path, 'label_set.p'))
    topic_superset = _load_pickle(
        os.path.join(intermediate_path, 'topic_superset.p'))

    time_load = time.time()
    print("It took", time_load - start, "seconds to load")
    print("training ------------------------------------------------")

    doc_texts = tokenize(doc_set)

    print("tokenized")

    # build individual lda
    lda_superset = []
    num_topics_list = []
    dictionary_set = []

    for i, topic_set in enumerate(topic_superset):
        topic_texts = tokenize(topic_set)

        # turn our tokenized documents into a id - term dictionary
        dictionary = corpora.Dictionary(topic_texts)
        dictionary_set.append(dictionary)

        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in topic_texts]

        # number of topics is modified logarithmic: 15*rounded(log_2())-140
        # NOTE(review): this is not positive for small topic sets - confirm
        # that every set is large enough.
        num_topics = 15 * (round(math.log2(len(topic_set)))) - 140
        print(str(i) + ' ' + "number of topics: " + str(num_topics))
        num_topics_list.append(num_topics)
        ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                                   num_topics=num_topics,
                                                   id2word=dictionary,
                                                   passes=20)
        lda_superset.append(ldamodel)
        # print lda topics
        print(ldamodel.print_topics())

        # Compute Perplexity
        print('\nPerplexity: ', ldamodel.log_perplexity(corpus))

        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=ldamodel,
                                             texts=topic_texts,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    print("all LDA built")

    # build the full training feature matrix
    training_array = _topic_proportion_matrix(
        lda_superset, dictionary_set, num_topics_list,
        doc_texts, len(doc_texts))

    print("training matrix built")
    time_train = time.time()
    print("It took", time_train - time_load, "seconds to train")
    print("---------------------------------------------------------")
    print("testing")

    # test on new data 1000 documents split by proportion of training data;
    # labels are 1-based in the order listed here
    test_split = [
        ('astro', 144), ('cond', 145), ('cs', 125), ('hep', 113),
        ('math', 257), ('physics', 134), ('qbio', 13), ('qfin', 7),
        ('quant', 45), ('stat', 17),
    ]
    test_set = []
    test_label = []
    for label, (category, count) in enumerate(test_split, start=1):
        test_set += test_year[category][0:count]
        test_label += [label] * count

    test_texts = tokenize(test_set)

    # build the full test feature matrix (one row per test label)
    test_array = _topic_proportion_matrix(
        lda_superset, dictionary_set, num_topics_list,
        test_texts, len(test_label))

    arraydump('modifiedlog_', training_array, test_array)

    x_train, x_test, y_train, y_test = training_array, test_array, label_set, test_label

    print("training_array length: " + str(len(training_array)))
    print("test_array length: " + str(len(test_array)))
    print("training_label length: " + str(len(label_set)))
    print("test_label length: " + str(len(test_label)))
    print("---------------------------------------------------------")

    # choose model via a list
    model_names = ["knn3"]
    buildmodel(model_names, x_train, y_train, x_test, y_test)

    time_end = time.time()
    print("total time is ", time_end - start)
예제 #39
0
파일: LDA.py 프로젝트: BigGold0202/Module2
def lda_score(model, rec_corpus):
    """Return the u_mass coherence of ``model`` measured on ``rec_corpus``."""
    return CoherenceModel(
        model=model,
        corpus=rec_corpus,
        coherence='u_mass',
    ).get_coherence()
예제 #40
0
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

#%%Set up two topic models
# Both models share everything except the number of iterations.
shared_lda_args = dict(corpus=corpus, id2word=dictionary, num_topics=2)
goodLdaModel = LdaModel(iterations=50, **shared_lda_args)
badLdaModel = LdaModel(iterations=1, **shared_lda_args)

#%% Using U_Mass Coherence
umass_args = dict(corpus=corpus, dictionary=dictionary, coherence='u_mass')
goodcm = CoherenceModel(model=goodLdaModel, **umass_args)
badcm = CoherenceModel(model=badLdaModel, **umass_args)

# Print the coherence of the 50-iteration model, then of the 1-iteration one
# (printing a CoherenceModel itself would show its pipeline parameters)
#print(goodcm)
print(goodcm.get_coherence())

#print(badcm)
print(badcm.get_coherence())

#%% check how many topics - coherence = 'u_mass'
예제 #41
0
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    """Train a ProdLDA autoencoder and report the u_mass coherence of its topics.

    Args:
        cuda: when truthy, move the autoencoder to the GPU and train there.
        batch_size: batch size forwarded to ``train``.
        epochs: number of epochs forwarded to ``train``.
        top_words: number of highest-weighted words taken per topic.
        testing_mode: when falsy, the encoder's first-layer weights are also
            written to TensorBoard as word embeddings.
        verbose_mode: when truthy, topics and their coherence are printed on
            every training callback; otherwise the callback logs a coherence
            of 0.
    """
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    # The corpus and vocabulary never change, so the dictionary is built once
    # instead of on every coherence evaluation.
    coherence_dictionary = Dictionary.from_corpus(corpus, reverse_vocab)
    writer = SummaryWriter()  # create the TensorBoard object

    def report_coherence(model):
        """Print each topic of ``model`` with its coherence; return the overall value."""
        decoder_weight = model.decoder.linear.weight.detach().cpu()
        # Indices of the top_words largest decoder weights per topic,
        # mapped back to the words they stand for.
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        cm = CoherenceModel(
            topics=topics,
            corpus=corpus,
            dictionary=coherence_dictionary,
            coherence="u_mass",
        )
        coherence = cm.get_coherence()
        coherences = cm.get_coherence_per_topic()
        for index, topic in enumerate(topics):
            print(
                str(index) + ":" + str(coherences[index]) + ":" +
                ",".join(topic))
        print(coherence)
        return coherence

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        coherence = report_coherence(autoencoder) if verbose_mode else 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    # Final report on the trained model, independent of verbose_mode.
    report_coherence(autoencoder)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
예제 #42
0
from gensim.models.coherencemodel import CoherenceModel
import pickle

# Load the saved bundle; the code below reads its 'model', 'texts', 'corpus'
# and 'dictionary' entries. The file is closed as soon as it has been read.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('../output/lda_model', 'rb') as model_file:
    lda = pickle.load(model_file)

# Compute Coherence Score using u_mass
coherence_model_lda = CoherenceModel(model=lda['model'],
                                     texts=lda['texts'],
                                     corpus=lda['corpus'],
                                     dictionary=lda['dictionary'],
                                     coherence='u_mass')
lda['coherence'] = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', lda['coherence'])
예제 #43
0
    random_state=10)  #tune parameters as needed

# Pretty-print the keywords of the model's topics
pprint(lda_model.print_topics())

# Apply the trained model to the corpus (per-document topic output)
doc_lda = lda_model[corpus]

################### perplexity and coherence metrics #####################

# Perplexity-related score of the model on the corpus.
# NOTE(review): log_perplexity returns a log-scale bound rather than the
# perplexity itself - confirm which direction counts as "better".
print('\nPerplexity: ', lda_model.log_perplexity(corpus, total_docs=10000))

# Compute the c_v coherence score (higher is better)
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=texts,
                                     dictionary=id2word,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)  # higher is better

################### OPTIONAL: Tuning with c_v #####################
## Ran on the VPN eng computer; took 7 hrs to get to 41% complete.


# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=k,
                                           random_state=10,