Example #1
    def testTransform(self):
        passed = False
        # sometimes, LDA training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(5):  # restart at most 5 times
            # create the transformation model
            model = ldamulticore.LdaMulticore(id2word=dictionary,
                                              num_topics=2,
                                              passes=100)
            model.update(corpus)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(
                transformed,
                2)  # convert to dense vector, for easier equality tests
            expected = [0.13, 0.87]
            passed = numpy.allclose(
                sorted(vec), sorted(expected),
                atol=1e-2)  # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning(
                "LDA failed to converge on attempt %i (got %s, expected %s)" %
                (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed)
Example #2
    def testTopicSeeding(self):
        passed = False
        for topic in range(2):
            # seed each of the two topics in turn; the same topics should
            # come out, with their order determined by which topic was seeded
            for i in range(5): # restart at most 5 times

                eta = numpy.ones((2, len(dictionary))) * 0.5
                system = dictionary.token2id[u'system']

                # aggressively seed the word 'system', in one of the
                # two topics, 10 times higher than the other words
                eta[topic, system] *= 10

                model = ldamulticore.LdaMulticore(id2word=dictionary, num_topics=2, passes=200, eta=eta)
                model.update(corpus)

                topics = [dict((word, p) for p, word in model.show_topic(j)) for j in range(2)]

                # check that the word 'system' got a high weight in the topic
                # we seeded, and the word 'trees' (the main word in the other
                # topic) a low weight -- and vice versa for the other topic
                # (which we didn't seed with 'system')
                result = [[topics[topic].get(u'system',0), topics[topic].get(u'trees',0)],
                          [topics[1-topic].get(u'system',0), topics[1-topic].get(u'trees',0)]]
                expected = [[0.385, 0.022],
                            [0.025, 0.157]]
                passed = numpy.allclose(result, expected, atol=1e-2)
                if passed:
                    break
                logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                                (i, result, expected))
            self.assertTrue(passed)
Example #3
def topic_model(tokenized_docs,
                num_topics=10,
                iterations=50,
                passes=10,
                chunksize=2000,
                workers=DEFAULT_WORKERS,
                **kwargs):
    id2word, corpus = create_id2word(tokenized_docs)

    model = ldamulticore.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        workers=workers,
        iterations=iterations,
        passes=passes,
        chunksize=chunksize,
        eval_every=10,  # Setting this to one slows down training by ~2x
        per_word_topics=True)

    # computing perplexity and coherence
    perplexity = model.log_perplexity(corpus)
    coherence_model = CoherenceModel(model=model,
                                     texts=tokenized_docs,
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    return model, corpus, coherence, perplexity
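A minimal usage sketch (hypothetical toy input; create_id2word, DEFAULT_WORKERS and the gensim imports come from the snippet's own module):

docs = [["graph", "trees", "minors"], ["human", "machine", "interface"]] * 10
model, corpus, coherence, perplexity = topic_model(docs, num_topics=2)
print("c_v coherence:", coherence, "log perplexity:", perplexity)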
Example #4
def train():
    global config
    config = AwsTrain()

    logger.info('MODE: ' + config.dictionary_label)
    visual_matrix = loadVisualMatrix(config)
    imgid2wordscoretuple = prepareTexts()

    # uncomment to load a previously saved dictionary, and comment out the next line
    # dictionary = corpora.Dictionary().load(getLastDictFileName())
    dictionary = createDictionary()
    config.dict_size = len(dictionary)
    logger.info('Dict read')

    # comment this out if loading from a previously serialised corpus (much quicker)
    bow = BOW(dictionary=dictionary,
              input=MyCorpus(visual_matrix, imgid2wordscoretuple))
    corporaFname = 'data/corpora' + config.dictionary_label
    gensim.corpora.MmCorpus.serialize(corporaFname, bow)
    bow = gensim.corpora.MmCorpus(corporaFname)
    logger.info('Corpora read')

    topics = config.lda_topics
    passes = config.lda_passes
    # start training
    lda = models.LdaMulticore(corpus=bow,
                              id2word=dictionary,
                              num_topics=topics,
                              passes=passes,
                              chunksize=config.chunksize,
                              workers=4)
    modelFname = config.model_folder + 'lda_%i_topics_%i_passes_%s.%s.model' % (
        topics, passes, config.dictionary_label, pretty_current_time())
    # persist the model for later
    lda.save(modelFname)
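A later session can reload the persisted model with gensim's standard load (using the same filename string as modelFname above):

lda = models.LdaMulticore.load(modelFname)  # models saved via .save() load back with the matching .load()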
Example #5
def latent_dirichlet_allocation(corpus_fname,
                                output_fname,
                                tokenizer_name="mecab"):
    make_save_path(output_fname)
    documents, tokenized_corpus = [], []
    tokenizer = get_tokenizer(tokenizer_name)
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for document in f:
            tokens = list(set(tokenizer.morphs(document.strip())))
            documents.append(document)
            tokenized_corpus.append(tokens)
    dictionary = corpora.Dictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    LDA = ldamulticore.LdaMulticore(corpus,
                                    id2word=dictionary,
                                    num_topics=30,
                                    minimum_probability=0.0,
                                    workers=4)
    # return a document only when some topic's probability exceeds 0.5;
    # since the probabilities sum to 1, that topic is necessarily the
    # highest-probability topic for the document
    all_topics = LDA.get_document_topics(corpus,
                                         minimum_probability=0.5,
                                         per_word_topics=False)
    with open(output_fname + ".results", 'w', encoding='utf-8') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                f.writelines(documents[doc_idx].strip() + "\u241E" +
                             ' '.join(tokenized_corpus[doc_idx]) + "\u241E" +
                             str(topic_id) + "\u241E" + str(prob) + "\n")
    LDA.save(output_fname + ".model")
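The ".results" file separates fields with U+241E (the record-separator symbol), so it can be read back with a plain split; a sketch, assuming the same output_fname as above:

with open(output_fname + ".results", encoding='utf-8') as f:
    for line in f:
        document, tokens, topic_id, prob = line.rstrip("\n").split("\u241E")
        print(topic_id, float(prob))  # prob > 0.5 by construction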
Example #6
    def getOptimalTopicNum(self):
        dictionary = corpora.Dictionary(self.corpus)
        corpus = [dictionary.doc2bow(text) for text in self.corpus]

        com_nums = [1] + list(range(10, 100, 10))  # candidate topic counts: 1, 10, 20, ..., 90

        coherence_list = []

        for i in com_nums:
            # lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
            #                                       id2word=dictionary,
            #                                       num_topics=i,
            #                                       iterations=100,
            #                                       alpha='auto',
            #                                       random_state=100,
            #                                       update_every=1,
            #                                       chunksize=10,
            #                                       passes=20,
            #                                       per_word_topics=True)
            lda = ldamulticore.LdaMulticore(corpus=corpus,
                                            id2word=dictionary,
                                            passes=20,
                                            num_topics=i,
                                            workers=4,
                                            iterations=100,
                                            alpha='symmetric',
                                            gamma_threshold=0.001)
            coh_model_lda = CoherenceModel(model=lda,
                                           corpus=corpus,
                                           dictionary=dictionary,
                                           coherence='u_mass')
            coherence_value = coh_model_lda.get_coherence()

            # coh = lda.log_perplexity(corpus)
            coherence_list.append(coherence_value)
            print('k = {}  coherence value = {}'.format(
                str(i), str(coherence_value)))

        coh_dict = dict(zip(com_nums, coherence_list))
        sorted_coh_dict = sorted(coh_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)

        plt.plot(com_nums, coherence_list)
        plt.xlabel('topic')
        plt.ylabel('coherence value')
        plt.draw()
        fig = plt.gcf()
        fig.savefig(self.model_path + '/coherence.png')
        best_ind = int(np.argmax(coherence_list))  # u_mass coherence: higher is better
        self.num_topics = com_nums[best_ind]  # map the index back to the actual topic count
        print('optimal topic number = ', str(com_nums[best_ind]))
        return sorted_coh_dict[0][0]
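Because sorted_coh_dict is ordered by coherence in descending order, the returned value is simply the topic count with the highest coherence; an equivalent one-line sketch over coh_dict (built above):

best_k = max(coh_dict, key=coh_dict.get)  # topic count with the highest u_mass coherence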
Example #7
def train_lda(train_corpus4):
    lda_train4 = ldamulticore.LdaMulticore(
        corpus=train_corpus4,
        num_topics=50,
        passes=50,
        eval_every=1,
        per_word_topics=True)
    return lda_train4  # assumed completion; the original snippet is cut off here
Example #8
def trainLDA(docRep, dictionary, save=False, name=""):
    ''' Train and return an LdaMulticore model. Expects a sparse term-document matrix as input '''
    corpus = Sparse2Corpus(docRep, documents_columns=False)
    ldamodel = ldamulticore.LdaMulticore(
        corpus, num_topics=20, id2word=dictionary, workers=4, passes=4)
    if save:
        saveData(ldamodel, 'ldamodel-' + name)
    return ldamodel
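A sketch of the expected input (documents_columns=False above means each row of the sparse matrix is one document; dictionary is a gensim corpora.Dictionary over the same vocabulary):

import scipy.sparse
docRep = scipy.sparse.csr_matrix([[2, 0, 1, 0],   # term counts of document 0
                                  [0, 3, 0, 1]])  # term counts of document 1
# ldamodel = trainLDA(docRep, dictionary)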
Example #9
    def testPersistenceCompressed(self):
        fname = testfile() + '.gz'
        model = ldamulticore.LdaMulticore(self.corpus, num_topics=2)
        model.save(fname)
        model2 = ldamulticore.LdaMulticore.load(fname, mmap=None)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
Example #10
    def testLargeMmapCompressed(self):
        fname = testfile() + '.gz'
        model = ldamulticore.LdaMulticore(self.corpus, num_topics=2)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # mmap-loading the separately stored arrays must fail for a
        # gzip-compressed file
        self.assertRaises(IOError, ldamulticore.LdaModel.load, fname, mmap='r')
Example #11
    def testPersistence(self):
        model = ldamulticore.LdaMulticore(self.corpus, num_topics=2)
        model.save(testfile())
        model2 = ldamulticore.LdaMulticore.load(testfile())
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
        tstvec = []
        self.assertTrue(numpy.allclose(
            model[tstvec], model2[tstvec]))  # try projecting an empty vector
Example #12
    def train(self):
        if self.workers > 1:
            self.model = ldamulticore.LdaMulticore(
                self.corpus,
                **dict(self.model_params,
                       id2word=self.corpus.dictionary,
                       workers=self.workers))
        else:
            self.model = ldamodel.LdaModel(
                self.corpus,
                **dict(self.model_params, id2word=self.corpus.dictionary))
        self.save_article_topics()
        self.save_topic_words()
Example #13
    def testLargeMmap(self):
        fname = testfile()
        model = ldamulticore.LdaMulticore(self.corpus, num_topics=2)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        model2 = ldamulticore.LdaModel.load(fname, mmap='r')
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(isinstance(model2.expElogbeta, numpy.memmap))
        self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
Example #14
def ldaTrain(f_inputs, model_output, corpus_name, **args):
    """
    Input Args
        f_input
            training corpus, one document a line with ' ' seperated
        model_input
            model name
        corpus_name
            convert training corpus to gensim format corpus
        **args
            contain 'num_topics', 'alpha', 'eta'
    """
    lda_corpus, id2word = genCorpus(corpus_name=corpus_name, f_inputs=f_inputs)
    if CPU_NUM > 1:
        lda_model = ldamulticore.LdaMulticore(lda_corpus, workers=CPU_NUM, id2word=id2word, **args)
    else:
        lda_model = ldamodel.LdaModel(lda_corpus, id2word=id2word, **args)
    lda_model.save(model_output, ignore=['state', 'dispatcher'])
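A hypothetical call (genCorpus and CPU_NUM come from the surrounding module; the keyword arguments are forwarded unchanged to gensim's LDA constructors):

ldaTrain(["corpus_part1.txt", "corpus_part2.txt"],
         model_output="lda.model",
         corpus_name="my_corpus",
         num_topics=20, alpha="symmetric", eta="auto")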
Example #15
    def saveLDAModel(self, model_path):
        print(' ...start to build lda model...')
        dictionary = corpora.Dictionary(self.corpus)
        corpus = [dictionary.doc2bow(text) for text in self.corpus]

        # lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
        #                                           id2word=dictionary,
        #                                           num_topics=self.num_topics,
        #                                           iterations=100,
        #                                           alpha='auto',
        #                                           random_state=100,
        #                                           update_every=1,
        #                                           chunksize=10,
        #                                           passes=20,
        #                                           per_word_topics=True)

        lda_model = ldamulticore.LdaMulticore(corpus=corpus,
                                              id2word=dictionary,
                                              passes=20,
                                              num_topics=self.num_topics,
                                              workers=4,
                                              iterations=100,
                                              alpha='symmetric',
                                              gamma_threshold=0.001)

        all_topics = lda_model.get_document_topics(corpus,
                                                   minimum_probability=0.5,
                                                   per_word_topics=False)

        documents = self.documents
        with open(model_path + '/lda.results', 'w', -1, 'utf-8') as f:
            for doc_idx, topic in enumerate(all_topics):
                if len(topic) == 1:
                    topic_id, prob = topic[0]
                    f.writelines(documents[doc_idx].strip() + "\u241E" +
                                 ' '.join(self.corpus[doc_idx]) + "\u241E" +
                                 str(topic_id) + "\u241E" + str(prob) + '\n')
        lda_model.save(model_path + '/lda.model')
        with open(model_path + '/model.dictionary', 'wb') as f:
            pickle.dump(dictionary, f)

        return lda_model
Example #16

    def saveLDAModel(self):
        print(' ...start to build lda model...')
        # dictionary = corpora.Dictionary(self.corpus)
        # corpus = [dictionary.doc2bow(text) for text in self.corpus]
        # tfidf = TfidfModel(corpus)
        # corpus_tfidf = tfidf[corpus]

        lda_model = ldamulticore.LdaMulticore(corpus=self.corpus_tfidf,
                                              id2word=self.dictionary,
                                              passes=20,
                                              num_topics=self.num_topics,
                                              workers=4,
                                              iterations=100,
                                              alpha='symmetric',
                                              gamma_threshold=0.001)
        with open(self.model_path + self.data_name + '_lda_model.pickle',
                  'wb') as f:
            pickle.dump(lda_model, f)

        all_topics = lda_model.get_document_topics(self.corpus_tfidf,
                                                   minimum_probability=0.5,
                                                   per_word_topics=False)

        documents = self.documents

        with open(self.model_path + self.data_name + '_lda.results', 'w', -1,
                  'utf-8') as f:
            for doc_idx, topic in enumerate(all_topics):
                print(doc_idx, ' || ', topic)
                if len(topic) == 1:
                    topic_id, prob = topic[0][0], topic[0][1]
                    f.writelines(
                        str(doc_idx) + "\u241E" + documents[doc_idx].strip() +
                        "\u241E" + ' '.join(self.corpus[doc_idx]) + "\u241E" +
                        str(topic_id) + "\u241E" + str(prob) + '\n')
        lda_model.save(self.model_path + self.data_name + '_lda.model')
        with open(self.model_path + self.data_name + '_model.dictionary',
                  'wb') as f:
            pickle.dump(self.dictionary, f)

        return lda_model
Example #17

    def getOptimalTopicNum(self):
        # dictionary = corpora.Dictionary(self.corpus)
        # corpus = [dictionary.doc2bow(text) for text in self.corpus]
        # tfidf = TfidfModel(corpus)
        # self.corpus_tfidf = tfidf[corpus]
        # self.dictionary = corpora.Dictionary(self.corpus_tfidf)
        com_nums = list(range(10, 60, 10))  # candidate topic counts: 10, 20, ..., 50

        coherence_list = []

        for i in com_nums:
            # lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
            #                                       id2word=dictionary,
            #                                       num_topics=i,
            #                                       iterations=100,
            #                                       alpha='auto',
            #                                       random_state=100,
            #                                       update_every=1,
            #                                       chunksize=10,
            #                                       passes=20,
            #                                       per_word_topics=True)
            lda = ldamulticore.LdaMulticore(corpus=self.corpus_tfidf,
                                            id2word=self.dictionary,
                                            passes=20,
                                            num_topics=i,
                                            workers=4,
                                            iterations=100,
                                            alpha='symmetric',
                                            gamma_threshold=0.001)
            coh_model_lda = CoherenceModel(model=lda,
                                           corpus=self.corpus_tfidf,
                                           dictionary=self.dictionary,
                                           coherence='u_mass')
            coherence_value = coh_model_lda.get_coherence()

            # coh = lda.log_perplexity(corpus)
            coherence_list.append(coherence_value)

            print('k = {}  coherence value = {}'.format(
                str(i), str(coherence_value)))

        # for co_value in coherence_list:
        df = pd.DataFrame({'num': com_nums, 'co_value': coherence_list})
        df['delta'] = df['co_value'].diff()
        find = df['delta'] == df['delta'].max()
        df_find = df[find]
        optimal_value = 0
        if coherence_list[0] >= df_find['delta'].tolist()[0]:
            optimal_value = coherence_list[0]
            optimal_num = com_nums[0]
        else:
            optimal_value = df_find['delta'].tolist()[0]
            optimal_num = df_find['num'].tolist()[0]

        print('==== coherence values =====')
        print(df, end='\n')
        print('==== final values =====')
        print(df_find)

        df.to_csv(self.model_path + self.data_name + '_coherence_delta.csv',
                  mode='w',
                  encoding='utf-8')

        coh_dict = dict(zip(com_nums, coherence_list))
        sorted_coh_dict = sorted(coh_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
        plt.plot(com_nums, coherence_list, marker='o')
        plt.xlabel('topic')
        plt.ylabel('coherence value')
        plt.draw()
        fig = plt.gcf()
        fig.savefig(self.model_path + self.data_name + '_coherence.png')
        # self.num_topics = sorted_coh_dict[0][0]
        print('optimal topic number = ', optimal_num)
        return optimal_num
Example #18
# neg_corpus = [neg_dict.doc2bow(i) for i in ne['2']]
# neg_lda = ldamulticore.LdaMulticore(neg_corpus,num_topics = 3,id2word = neg_dict, workers=48)

# positive-sentiment topic analysis
pos_dict = corpora.Dictionary(po)
pos_corpus = [pos_dict.doc2bow(i) for i in po]
joblib.dump(pos_dict, args.model_file + ".dic")
joblib.dump(pos_corpus, args.model_file + ".cps")
# pos_lda = ldamulticore.LdaMulticore(pos_corpus,num_topics= 3,id2word =pos_dict, workers=1)
score_dic = {}
lda_modes = []

for n in range(1, 5):
    if platform == "linux" or platform == "linux2":
        pos_lda = ldamulticore.LdaMulticore(pos_corpus,
                                            num_topics=n * 5,
                                            id2word=pos_dict,
                                            workers=4)
        goodcm = CoherenceModel(model=pos_lda,
                                texts=po,
                                dictionary=pos_dict,
                                coherence='c_v',
                                processes=4)
    elif platform == "win32":
        pos_lda = ldamodel.LdaModel(pos_corpus,
                                    num_topics=n * 5,
                                    id2word=pos_dict)
        goodcm = CoherenceModel(model=pos_lda,
                                texts=po,
                                dictionary=pos_dict,
                                coherence='c_v',
                                processes=1)
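    # hedged continuation sketch: the original snippet is cut off here, but
    # score_dic and lda_modes are initialized above and otherwise unused, so
    # presumably each run's coherence score and model were collected like this
    score_dic[n * 5] = goodcm.get_coherence()
    lda_modes.append(pos_lda)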