Example #1
    def train(self, arg_fname, is_pre=True, method='lsi', **params):
        self.fname = arg_fname
        self.method = method
        self._generate_conf()
        if is_pre:
            self.docs, self.dictionary, corpus = self._preprocess()
        else:
            self.docs = pickle.load(open(self.conf['fname_docs'], 'rb'))
            self.dictionary = corpora.Dictionary.load(self.conf['fname_dict'])
            corpus = corpora.MmCorpus(self.conf['fname_corpus'])

        if params is None:
            params = {}

        logger.info("training TF-IDF model")
        self.tfidf = models.TfidfModel(corpus, id2word=self.dictionary)
        corpus_tfidf = self.tfidf[corpus]

        if method == 'lsi':
            logger.info("training LSI model")
            self.lsi = models.LsiModel(corpus_tfidf,
                                       id2word=self.dictionary,
                                       **params)
            self.lsi.print_topics(-1)
            self.lsi_similarity_index = similarities.MatrixSimilarity(
                self.lsi[corpus_tfidf])
            self.para = self.lsi[corpus_tfidf]
        elif method == 'lda_tfidf':
            logger.info("training LDA model")
            # try 6 workers here instead of original 8
            self.lda_tfidf = models.LdaMulticore(corpus_tfidf,
                                                 id2word=self.dictionary,
                                                 workers=6,
                                                 **params)
            self.lda_tfidf.print_topics(-1)
            self.lda_tfidf_similarity_index = similarities.MatrixSimilarity(
                self.lda_tfidf[corpus_tfidf])
            self.para = self.lda_tfidf[corpus_tfidf]
        elif method == 'lda':
            logger.info("training LDA model")
            # try 6 workers here instead of original 8
            self.lda = models.LdaMulticore(corpus,
                                           id2word=self.dictionary,
                                           workers=6,
                                           **params)
            self.lda.print_topics(-1)
            self.lda_similarity_index = similarities.MatrixSimilarity(
                self.lda[corpus])
            self.para = self.lda[corpus]
        elif method == 'logentropy':
            logger.info("training a log-entropy model")
            self.logent = models.LogEntropyModel(corpus,
                                                 id2word=self.dictionary)
            self.logent_similarity_index = similarities.MatrixSimilarity(
                self.logent[corpus])
            self.para = self.logent[corpus]
        else:
            msg = "unknown semantic method %s" % method
            logger.error(msg)
            raise NotImplementedError(msg)
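
A minimal usage sketch for the LSI branch above, assuming `engine` is an instance of the surrounding class that has already run train(method='lsi'); the query string is illustrative:

    query_bow = engine.dictionary.doc2bow("machine learning".split())
    query_lsi = engine.lsi[engine.tfidf[query_bow]]   # bow -> tf-idf -> LSI space
    scores = engine.lsi_similarity_index[query_lsi]   # cosine similarity to every training doc
    top5 = sorted(enumerate(scores), key=lambda t: -t[1])[:5]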
Example #2
def get_LDA_model_multi_cores(paths,
                              corpus,
                              id2word,
                              num_topics,
                              passes,
                              a=None,
                              b=None):
    if a is None and b is None:
        lda_model = models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        passes=passes,
                                        num_topics=num_topics,
                                        workers=4,
                                        chunksize=100,
                                        per_word_topics=True,
                                        minimum_probability=0.0)
    else:
        lda_model = models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        passes=passes,
                                        num_topics=num_topics,
                                        workers=4,
                                        alpha=a,
                                        eta=b,
                                        chunksize=100,
                                        per_word_topics=True,
                                        minimum_probability=0.0)
    save_lda_model(paths,
                   lda_model,
                   num_topics=num_topics,
                   passes=passes,
                   alpha=a,
                   beta=b)
    return lda_model
Example #3
 def createModel(self, corpus, dictionary, info):
     logging.basicConfig(format='%(asctime)s: %(levelname)s : %(message)s',
                         level=logging.INFO)
     path = 'TopicModel/' + info.data + '_' + info.identifier
     if not type(corpus) == list:
         corpus = matutils.Sparse2Corpus(corpus, documents_columns=False)
     if not os.path.exists(path):
         if self.name == 'LDA':
             if info.multicore:
                 self.model = models.LdaMulticore(
                     corpus,
                     num_topics=info.numberTopics,
                     id2word=dictionary,
                     passes=info.passes,
                     iterations=info.iterations,
                     batch=0)
             else:
                 self.model = models.LdaModel(corpus,
                                              num_topics=info.numberTopics,
                                              id2word=dictionary,
                                              passes=info.passes,
                                              iterations=info.iterations,
                                              update_every=info.online,
                                              chunksize=info.chunksize)
         elif self.name == 'LSI':
             self.model = models.LsiModel(corpus, info.numberTopics,
                                          dictionary)
             self.info = str(self.model)
         else:
             print('Unknown Model type')
         print('save Model')
         self.model.save(path)
     else:
         print('Load Model')
         self.model = models.LdaModel.load(path)
Example #4
    def infer(self):
        courses = [
            list(set(stop_words(item).remove()))
            for item in [w.split() for w in self.Courses]
        ]
        classes = list(set(stop_words(self.File_class).remove()))

        dictionary = corpora.Dictionary(courses)
        feature_cnt = len(dictionary.token2id)
        corpus = [dictionary.doc2bow(text) for text in courses]
        tfidf = models.TfidfModel(corpus)
        kw_vector = dictionary.doc2bow(classes)
        index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                    num_features=feature_cnt)
        sim = index[tfidf[kw_vector]]

        course_rec = dict(zip(sim, self.Names))
        course_sort = sorted(course_rec.items(), reverse=True)

        lda_model = models.LdaMulticore(tfidf[corpus],
                                        num_topics=10,
                                        id2word=dictionary,
                                        passes=2,
                                        workers=2)

        for idx, topic in lda_model.print_topics(-1):
            print('Topic: {} \nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model[tfidf[kw_vector]],
                                   key=lambda tup: -1 * tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(
                score, lda_model.print_topic(index, 10)))

        return course_sort
Example #5
def genlda(textlist, n):
    # ticks = str(time.time()).replace('.','')[-6:-1]
    nn = str(n)
    dictionary = corpora.Dictionary(textlist)
    corpus = [dictionary.doc2bow(text) for text in textlist]

    lda = models.LdaMulticore(corpus=corpus,
                              id2word=dictionary,
                              num_topics=n,
                              passes=100,
                              workers=3)
    doc_topic = [a for a in lda[corpus]]

    topics_r = lda.print_topics(num_topics=n, num_words=20)
    k = 0
    LDAlabel = []
    for i in lda.get_document_topics(corpus)[:]:
        listj = []
        for j in i:
            listj.append(j[1])
        bz = listj.index(max(listj))
        iiilabel = k, i[bz][0], i[bz][1], listj, listj.index(max(listj))
        LDAlabel.append(iiilabel)
        k = k + 1

    return LDAlabel
Example #6
def compute_coherence_values(dictionary, corpus, limit, start=2, step=3):
    """
    Compute u_mass coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics, workers=2)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
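
A short usage sketch for the helper above; u_mass coherence values are negative, and values closer to zero usually indicate more coherent topics:

    model_list, coherence_values = compute_coherence_values(dictionary, corpus, limit=40, start=2, step=6)
    best = model_list[coherence_values.index(max(coherence_values))]
    print('best num_topics:', best.num_topics)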
Example #7
def lda(name):
    if os.path.exists('./corpora_dicts/{}.dict'.format(name)):
        dictionary = corpora.Dictionary.load(
            './corpora_dicts/{}.dict'.format(name))
        corpus = corpora.MmCorpus('./corpus/{}.mm'.format(name))

        print("Loaded!!!")
        print(corpus)
    else:
        print("Error!!!!")
        return  # bail out: dictionary and corpus are unavailable

    lda = models.LdaMulticore(corpus,
                              id2word=dictionary,
                              num_topics=15,
                              passes=2,
                              workers=2)
    """
        To save the model
            temp_file = datapath("model")
            lda.save(temp_file)
        To load a saved model
            lda = LdaModel.load(temp_file)

    """

    # print('----------------END-----------------')
    # s = re.findall(something, str(lda.print_topics()))
    # print(s)

    for idx, topics in lda.print_topics(-1):
        print("Topic: {} ------------>".format(idx))
        print(topics)
Example #8
 def train_lda(self, num_topics, chunksize=1000, passes=4, workers=4):
     self.model = models.LdaMulticore(corpus=self.corpus_tfidf,
                                      num_topics=num_topics,
                                      id2word=self.dictionary,
                                      workers=workers,
                                      chunksize=chunksize,
                                      passes=passes)
Example #9
def main(text_dir):
    topics = list(range(10, 101, 10)) + list(range(120, 201, 20)) + list(range(250, 451, 50))
    #topics = range(10, 21, 10)
    #corpus = DocCorpus(text_dir)
    #dictionary = corpus.dictionary
    corpus = MmCorpus('../twitter_LDA_topic_modeling/simple-wiki.mm')
    dictionary = Dictionary.load(
        '../twitter_LDA_topic_modeling/simple-wiki.dict')
    print('Building LDA models')
    lda_models = [
        models.LdaMulticore(corpus=corpus,
                            id2word=dictionary,
                            num_topics=i,
                            passes=5) for i in tqdm(topics)
    ]
    print('Generating coherence models')
    texts = [[dictionary[word_id] for word_id, freq in doc] for doc in corpus]
    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
    func = partial(build_coherence_models,
                   corpus=corpus,
                   dictionary=dictionary,
                   texts=texts)
    coherence_models = pool.map(func, lda_models)
    pool.close()
    #    print('Extracting data from models')
    #    model_data = [extract_data(model, corpus, dictionary) for model in tqdm(lda_models)]
    #    d = defaultdict(list)
    #    print('Generating output data')
    #    for i, data in tqdm(enumerate(model_data)):
    #        d['num_topics'].append(data['num_topics'])
    #        d['cao_juan_2009'].append(cao_juan_2009(data['topic_term_dists'], data['num_topics']))
    #        d['arun_2010'].append(arun_2010(data['topic_term_dists'], data['doc_topic_dists'], data['doc_lengths'], data['num_topics']))
    #        d['deveaud_2014'].append(deveaud_2014(data['topic_term_dists'], data['num_topics']))
    #        d['u_mass_coherence'].append(data['u_mass_coherence'])
    d = defaultdict(list)
    print('Generating output data')
    for data in tqdm(coherence_models):
        d['num_topics'].append(data['num_topics'])
        d['u_mass'].append(data['u_mass'])
        d['c_v'].append(data['c_v'])
        d['c_uci'].append(data['c_uci'])
        d['c_npmi'].append(data['c_npmi'])
    df = pd.DataFrame(d)
    df = df.set_index('num_topics')
    df.to_csv('coherence_simple_wiki', sep='\t')
    df.plot(xticks=df.index, style=['bs-', 'yo-', 'r^-', 'gx-'])
    ax1 = df.plot(xticks=df.index, style='bs-', grid=True, y='u_mass')
    ax2 = df.plot(xticks=df.index, style='yo-', grid=True, y='c_v', ax=ax1)
    ax3 = df.plot(xticks=df.index, style='r^-', grid=True, y='c_npmi', ax=ax2)
    df.plot(xticks=df.index, style='gx-', grid=True, y='c_uci', ax=ax3)
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.17),
               fancybox=True,
               shadow=True,
               ncol=4,
               fontsize=9)
    plt.subplots_adjust(bottom=0.2)
    plt.xticks(df.index, rotation=45, ha='right', fontsize=8)
    plt.savefig('coherence_simple_wiki')
    plt.close()
Example #10
    def fit(self, list_toks):
        utils.verbose('start training lda dictionary')
        self.dict = corpora.Dictionary(list_toks)

        utils.verbose('start building lda corpus')
        self.corpus = [self.dict.doc2bow(toks) for toks in list_toks]

        utils.verbose('start training lda model')
        self.model = models.LdaMulticore(self.corpus,
                                         self.vec_dim,
                                         id2word=self.dict)

        utils.verbose('start saving lda dictionary and model')
        self.model.save(self.paths['model'])
        self.dict.save(self.paths['dict'])

        utils.verbose('start vectorization for lda')
        self.ann = AnnoyIndex(self.vec_dim, 'angular')  # metric argument is required by recent annoy versions
        for n, toks in enumerate(list_toks):
            if not n % 10000 and n:
                utils.verbose('vectorizing {} lines for lda'.format(n))
            vec = self.get(toks)
            self.ann.add_item(n, vec)

        utils.verbose('start building lda ann')
        self.ann.build(self.num_trees)
        self.ann.save(self.paths['ann'])
        utils.verbose('dump lda annoy into {}'.format(self.paths['ann']))
Example #11
def lda_tfidf(num_topics, tfidf, text, dictionary, random_state, cluster_ID,
              data_path):
    coherence_ldas = []
    LDA_models = []
    topics = []
    for num_topic in num_topics:
        lda_tfidfmodel = models.LdaMulticore(tfidf,
                                             num_topics=num_topic,
                                             id2word=dictionary,
                                             passes=2,
                                             workers=2,
                                             random_state=random_state)
        coherence_model_lda = CoherenceModel(model=lda_tfidfmodel,
                                             texts=text,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        topics.append(num_topic)
        LDA_models.append(lda_tfidfmodel)
        coherence_ldas.append(coherence_lda)

    plt.figure(figsize=(20, 10))
    plt.plot(num_topics, coherence_ldas, marker='o', markersize=10)
    plt.savefig(f"{data_path}{cluster_ID}.png")
    plt.close()

    best_index = coherence_ldas.index(max(coherence_ldas))
    # get the best result
    # return the models, their topic counts, and the index of the best one
    return LDA_models, topics, best_index
Example #12
def main():
    nltk.download(['punkt', 'stopwords'])
    data = Data('test')
    corpus = data.get_corpus()
    tfidf = models.TfidfModel(corpus)
    print('Building TF-IDF index...')
    t_index = similarities.MatrixSimilarity(tfidf[corpus], num_best=10)
    print('Building LDA index...')
    lda = models.LdaMulticore(corpus, id2word=data.dictionary, num_topics=40)
    l_index = similarities.MatrixSimilarity(lda[corpus], num_best=10)
    print('Indexes built')

    out = 'test_missing_with_predictions.txt'
    print('Saving output to {!r}'.format(out))
    with open(out, 'w') as f:
        for miss in data.missing:
            res = defaultdict(float)
            vector = data.dictionary.doc2bow(data.clean(miss))
            q = tfidf[vector]
            ql = lda[vector]
            for i, p in t_index[q]:
                res[i] += p
            for i, p in l_index[ql]:
                res[i] += p
            rating = sorted(res, key=res.get, reverse=True)
            id = data.dmap[rating[0]]
            line = '{} +++$+++ {}\n'.format(id, ' '.join(miss))
            f.write(line)
Example #13
def lda(corpus_of_text):
    """
    Compare documents by Latent Dirichlet Allocation (LDA).

    :param corpus_of_text: list of documents, where each document is a sublist
    of tokenized strings.
    :return model: set of words that are most associated with each topic.
    """
    # Create a dictionary and corpus for the LDA model
    lda_dict = corpora.Dictionary(corpus_of_text)
    lda_corpus = [lda_dict.doc2bow(line) for line in corpus_of_text]

    # Train the model
    lda_model = models.LdaMulticore(corpus=lda_corpus,
                                    id2word=lda_dict,
                                    random_state=100,
                                    num_topics=4,
                                    passes=10,
                                    chunksize=1000,
                                    batch=False,
                                    alpha='asymmetric',
                                    decay=0.5,
                                    offset=64,
                                    eta=None,
                                    eval_every=0,
                                    iterations=100,
                                    gamma_threshold=0.001,
                                    per_word_topics=True)

    # Save the model
    # lda_model.save('lda_model.model')

    return lda_model.print_topics(-1)  # See the topics
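
A minimal call sketch for the function above with a toy pre-tokenized corpus; the documents are illustrative:

    docs = [['cats', 'purr', 'softly'],
            ['dogs', 'bark', 'loudly'],
            ['stocks', 'rose', 'sharply']]
    for topic_id, words in lda(docs):
        print(topic_id, words)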
Example #14
 def __init__(self, corpus, dictionary):
     self.similar_index = 0
     self.lda = models.LdaMulticore(corpus,
                                    id2word=dictionary,
                                    workers=8,
                                    num_topics=50)
     self.corpus = self.lda[corpus]
Example #15
def lda_model(corpus, dictionary, number_of_topics=20, save_path='saved_models/lda_bow'):
    if not isfile(save_path):
        lda_model = models.LdaMulticore(corpus, num_topics=number_of_topics, id2word=dictionary, passes=2, workers=2)
        lda_model.save(save_path)
    else: 
        lda_model = models.LdaMulticore.load(save_path)
    return lda_model
Example #16
def generate_lda_sub_topic(topicid):
    # load the dictionary
    dictionary = corpora.Dictionary.load(topic_dict_path % topicid)
    print("finished loading the dictionary for topic %d" % topicid)

    # load the corpus
    texts = get_topic_texts(topicid)

    begin = datetime.datetime.now()
    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    # corpora.MmCorpus.serialize('./nanfang.mm', corpus)
    # single-core
    # LDA = models.LdaModel(corpus, id2word=dictionary, num_topics=200, update_every=1, minimum_probability=0.1, passes=5)
    # multi-core
    # models.ldamulticore.LdaMulticore(corpus, num_topics=200, id2word=dictionary, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
    print("start training the sub-topics of topic %d, %d in total" % (topicid, num_sub_topic))
    LDA = models.LdaMulticore(corpus,
                              num_topics=num_sub_topic,
                              id2word=dictionary,
                              workers=4,
                              chunksize=2000,
                              passes=1)
    end = datetime.datetime.now()
    print("training time:", end - begin)

    # save the LDA model
    path = "%s%d" % (lda_model_topic, topicid)
    LDA.save(path)
    print("the model for topic %d has been saved to %s\n" % (topicid, path))
Example #17
    def fitTopics(self, topic_ct, passes):
        start = datetime.datetime.now()        
        self.topic_ct = topic_ct
        self.passes = passes
        
        self.verboseMsg('worp===>%d topics, %d passes: start ' %(topic_ct,passes))
        self.lda = models.LdaMulticore(
            self.corpus,
            num_topics=self.topic_ct,
            passes=passes,
            id2word=self.vocab,
            workers=4,
            iterations=2500,
            eval_every=100,
            chunksize=2000
        )
        self.verboseMsg('worp===>%d topics, %d passes: lda model complete ' %(topic_ct,passes))
        
        self.topic_vectors = self.lda.print_topics(num_topics=self.topic_ct, num_words=8)

        self.topic_proba = []
        for x in self.corpus:
            local = self.lda.get_document_topics(x)
            row = { x:float(0) for x in range(self.topic_ct)}
            for y in local:
                row[y[0]] = y[1]
            self.topic_proba.append(row)

        self.verboseMsg('worp===>%d topics, %d passes: creating probabilities in dataframe ' %(topic_ct,passes))
        
        self.topic_proba_df = pd.DataFrame(self.topic_proba)
    
        self.verboseMsg('worp===>%d topics, %d passes: complete ' %(topic_ct,passes))
        print(datetime.datetime.now() - start)
Example #18
def fit_lda(num_topics, corpus, id2word, passes, multicore=0, save=True):
    """
    Fits a gensim LDA model on the corpus. Allows for easy switching between single-core and
    multicore implementations.
    :param num_topics: Number of topics to model
    :param corpus: gensim's Sparse2Corpus object
    :param id2word: mapping from word ids to words, e.g. a gensim Dictionary
    :param passes: number of passes over the corpus during training
    :param multicore: number of worker processes; 0 falls back to the single-core LdaModel
    :return:
    """
    if multicore:
        lda_fit = models.LdaMulticore(corpus=corpus,
                                      num_topics=num_topics,
                                      id2word=id2word,
                                      workers=multicore,
                                      passes=passes)
    else:
        lda_fit = models.LdaModel(corpus=corpus,
                                  num_topics=num_topics,
                                  id2word=id2word,
                                  passes=passes)

    if save:
        lda_fit.save(f'../outputs/lda_{num_topics}_topics.mdl')
    return lda_fit
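
A usage sketch for fit_lda, assuming a bag-of-words corpus and gensim dictionary are already in scope; save=False avoids writing into the hard-coded ../outputs path:

    # single-core LdaModel
    lda_single = fit_lda(num_topics=10, corpus=corpus, id2word=dictionary, passes=2, save=False)
    # the same fit spread over 3 worker processes via LdaMulticore
    lda_multi = fit_lda(num_topics=10, corpus=corpus, id2word=dictionary, passes=2, multicore=3, save=False)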
Example #19
    def create_documents_view(self, corpus, ir_mode):
        dictionary, pdocs = self.create_dictionary(corpus)
        bow = self.docs2bows(corpus, dictionary, pdocs)
        loaded_corpus = corpora.MmCorpus('vsm_docs.mm')  # Recover the corpus

        if ir_mode == 1:
            model = [[(w[0], 1 + np.log2(w[1])) for w in v]
                     for v in bow]  # TF model
        elif ir_mode == 2:
            model = models.TfidfModel(loaded_corpus)  # TF IDF model
        elif ir_mode == 3:
            model = models.LdaModel(loaded_corpus)  # LDA model
        elif ir_mode == 4:
            model = models.LdaMulticore(loaded_corpus)  # LDA Multicore model
        elif ir_mode == 5:
            model = models.LsiModel(loaded_corpus)  # LSI model
        elif ir_mode == 6:
            model = models.RpModel(loaded_corpus)  # RP model
        elif ir_mode == 7:
            model = models.LogEntropyModel(
                loaded_corpus)  # LogEntropyModel model

        # tf = corpora.MmCorpus('vsm_docs.mm') # Recover the corpus

        return model, dictionary
Example #20
 def train_predict(self):
     bow_corpus, dictionary = self._feature_preparations()
     corpus_tfidf = models.TfidfModel(bow_corpus)[bow_corpus]
     lda_model_tfidf = \
         models.LdaMulticore(corpus_tfidf, num_topics=self.nb_topics, id2word=dictionary, passes=2, workers=2)
     for idx, topic in lda_model_tfidf.print_topics(-1):
         print('Topic: {} Word: {}'.format(idx, topic))
Example #21
def createTopics(words):

    dictionary = corpora.Dictionary(words)
    dictionary.save("dictionary_" + str(i) + "_" + str(notopics) +
                    "topics.dict")

    global global_dict
    global_dict = dictionary

    #step 2
    #convert to bag of words
    corpus = [dictionary.doc2bow(doc) for doc in words]
    corpora.MmCorpus.serialize(
        "corpus_" + str(i) + "_" + str(notopics) + "topics.mm", corpus)
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lda = models.LdaMulticore(corpus_tfidf,
                              id2word=dictionary,
                              num_topics=notopics,
                              workers=4)
    # The following line of code gets topic probability distribution for a document
    # corpus_lda = lda[corpus_tfidf]

    global global_lda
    global_lda = lda

    #step 3
    #save topic models
    #These are models that you use to make topic inferences about documents
    lda.save("model_" + str(i) + "_" + str(notopics) + "topics.lda")
    # pickle.dump(corpus_lda, open("corpus_lda.pck","wb"))
    pickle.dump(tfidf, open("tfidf.pck", "wb"))
    print("done")
Example #22
def generate_lda_topic():
    # load the dictionary
    dictionary = corpora.Dictionary.load(dict_path)
    print("finished loading the dictionary")

    # load the corpus
    texts = get_texts()

    begin = datetime.datetime.now()
    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    # corpora.MmCorpus.serialize('./nanfang.mm', corpus)
    # single-core
    # LDA = models.LdaModel(corpus, id2word=dictionary, num_topics=200, update_every=1, minimum_probability=0.1, passes=5)
    # multi-core
    # models.ldamulticore.LdaMulticore(corpus, num_topics=200, id2word=dictionary, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
    print("start training the first-level LDA model, %d topics in total" % num_topic)
    LDA = models.LdaMulticore(corpus,
                              num_topics=num_topic,
                              id2word=dictionary,
                              workers=4,
                              chunksize=2000,
                              passes=5)
    level1 = datetime.datetime.now()
    print("first-level LDA model trained, time elapsed:", level1 - begin)

    # split out the data for each topic
    # topic_text = [[] for i in range(num_topic)]
    # print("start splitting the data for each topic")
    # for i in range(len(texts)):
    #     if (i % 10000 == 0):
    #         print("processing line %d" % i)
    #     # get the topics for each text
    #     topics = LDA.get_document_topics(corpus[i])
    #     # take only the Top-1 here
    #     if (len(topics) < 1):
    #         continue
    #     # print len(topics), topics[0]
    #     topic_text[topics[0][0]].append(" ".join(texts[i]))

    # write out each topic's data
    # for i in range(num_topic):
    #     print("writing the (tokenized) user queries of topic %d" % i)
    #     with codecs.open("%s%d" %(topic_query, i), "w+", "utf-8") as f:
    #         for line in topic_text[i]:
    #             if (len(line) > 1):
    #                 f.write(line+"\n")

    # write out the sub-topics
    # for i in range(num_topic):
    #     generate_topic_dict(i) # build the dictionary
    #     generate_sub_topic(i) # build the sub-topics

    end = datetime.datetime.now()
    print("LDA processing time:", end - begin)

    # save the LDA model
    LDA.save(lda_model)
    print("the model has been saved to %s" % lda_model)
Example #23
    def train_topic(
        self,
        num_topics,
        no_below=1,
        no_above=0.9,
        keep_n=None,
        keep_tokens=None,
        remove_most_freq_n=None,
        bad_tokens=None,
        model="ldamulticore",
        bigrams=True,
        **kwargs,
    ):
        """
        no_below (int|None) – Keep tokens which are contained in at least
        no_below documents.
        no_above (float|None): Keep tokens which are contained in no
        more than no_above documents (fraction of total corpus size,
        not an absolute number).
        keep_n (int|None) – Keep only the first keep_n most frequent
        tokens.
        keep_tokens (iterable of str) – Iterable of tokens that must stay in
        dictionary after filtering.
        remove_most_freq_n (int|None): Remove n most frequent tokens
        model ('ldamulticore'|'lda'|'ldamallet')
        """
        if bigrams is True:
            phrases = models.Phrases(self.tokenlists, delimiter=b" ")
            phraser = models.phrases.Phraser(phrases)
            self.tokenlists = [phraser[tl] for tl in self.tokenlists]

        dictionary = corpora.Dictionary(self.tokenlists)

        if remove_most_freq_n:
            dictionary.filter_n_most_frequent(remove_most_freq_n)
        dictionary.filter_extremes(
            no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=keep_tokens
        )

        bows = [dictionary.doc2bow(tl) for tl in self.tokenlists]

        if bad_tokens:
            dictionary.filter_tokens(
                bad_ids=[dictionary.token2id[tok] for tok in bad_tokens]
            )

        self.bows = bows
        self.dictionary = dictionary
        if model == "ldamulticore":
            self.model = models.LdaMulticore(
                bows, num_topics=num_topics, id2word=dictionary, **kwargs
            )
        if model == "lda":
            self.model = models.LdaModel(
                bows, num_topics=num_topics, id2word=dictionary, **kwargs
            )

        if model == "ldamallet":
            raise ValueError("mallet is not yet implemented")
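
A hedged usage sketch for train_topic, assuming `tm` is an instance whose tokenlists attribute is already populated; extra keyword arguments such as passes and workers are forwarded to the gensim model through **kwargs:

    tm.train_topic(num_topics=20, no_below=5, no_above=0.5,
                   remove_most_freq_n=10, model='ldamulticore',
                   passes=2, workers=2)
    print(tm.model.print_topics(5))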
Example #24
 def calculate_lda(self, params):
     if params.get('cpu_cores', 1) > 1:
         lda = models.LdaMulticore(self, id2word=self.dictionary, num_topics=params.get('topics', n_topics),
                                   workers=params.get('cpu_cores', n_cpu_cores))
     else:
         lda = models.LdaModel(self, id2word=self.dictionary, num_topics=params.get('topics', n_topics))
     lda.save(DOCUMENT_PATH + self.filename + '.lda')
     return lda
Example #25
 def make_lda(self):
     """Create LDA model object."""
     lda_model = models.LdaMulticore(self.bow_corpus,
                                     num_topics=10,
                                     id2word=self.dictionary,
                                     passes=2,
                                     workers=2)
     return lda_model
Example #26
def lda_model_tfidf(tfidf_corpus, dictionary, number_of_topics=20, save_path='saved_models/lda_tfidf'):
    if not isfile(save_path):
        lda_model_tfidf = models.LdaMulticore(tfidf_corpus, num_topics=number_of_topics, \
                                                     id2word=dictionary, passes=2, workers=4)
        lda_model_tfidf.save(save_path)
    else: 
        lda_model_tfidf = models.LdaMulticore.load(save_path)
    return lda_model_tfidf
Example #27
def fit_LDA_gensim(file_path, num_topics=10, passes=1, chunksize=2000):
    """
    train and return LDA model

    Parameters:
    file_path   : path to the text file containing tfidf-filtered tokenized chats,
                  one chat per line, tokens separated by whitespace
    num_topics  : number of topics to extract
    passes      : number of passes over the corpus during training
    chunksize   : number of documents in each training chunk

    #====================================================================================#
    # Configure messages sent to the terminal
    if verbose == 'yes': level = logging.INFO
    else: level = PROGRESS_NUM
    logging.basicConfig(format='%(levelname)s : %(message)s', level=level)
    #====================================================================================#

    #====================================================================================#
    logging.log(PROGRESS_NUM, 'create a Gensim dictionary from the texts')

    dictionary = corpora.Dictionary(\
    line.split() for line in codecs.open(file_path,'r','utf-8'))
    #====================================================================================#

    #====================================================================================#
    logging.log(PROGRESS_NUM, 'convert chats to a bag of words corpus')
    chats = text_stream(file_path)
    # creates corpus object without loading the whole document in RAM
    corpus = corpus_stream(file_path, dictionary)
    ## creates corpus object loading the whole document in RAM
    #corpus = [dictionary.doc2bow(text.split()) for text in chats]
    #====================================================================================#

    #====================================================================================#
    logging.log(PROGRESS_NUM, 'Training LDA')

    if multicore == 'yes':
        lda = models.LdaMulticore(corpus, id2word=dictionary,\
        num_topics=num_topics, passes=passes,chunksize=chunksize)
    else:
        lda = models.LdaModel(corpus, id2word=dictionary, \
        num_topics=num_topics, passes=passes,chunksize=chunksize)

    lda.show_topics()
    if verbose == 'yes': lda.print_topics(num_topics)
    #====================================================================================#

    #====================================================================================#
    # creates corpus object loading the whole document in RAM
    # needed to plot with pyLDAvis
    corpus = [dictionary.doc2bow(text.strip().split()) for text in chats]
    #====================================================================================#

    return lda, corpus, dictionary
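
Since fit_LDA_gensim rebuilds the in-memory corpus specifically so it can be plotted, a follow-up sketch using pyLDAvis (the module path is pyLDAvis.gensim before 3.x and pyLDAvis.gensim_models afterwards; the input file name is illustrative):

    import pyLDAvis
    import pyLDAvis.gensim_models as gensimvis  # pyLDAvis >= 3.x
    lda, corpus, dictionary = fit_LDA_gensim('chats.txt', num_topics=10, passes=2)
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, 'lda_vis.html')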
Example #28
def try_LDA(corpus, id2word, num_topics):
    count = 1
    print(count)
    for n in range(31, num_topics + 1):
        lda = models.LdaMulticore(corpus=corpus, num_topics=n, id2word=id2word,
                                  passes=5, random_state=42)
        with open(f'LDA/LDA_model_{n}.pkl', 'wb') as f:
            pickle.dump(lda, f)
        count +=1
        print(count)
Example #29
 def train_predict(self):
     bow_corpus, dictionary = self._feature_preparations()
     lda_model = models.LdaMulticore(bow_corpus,
                                     num_topics=self.nb_topics,
                                     id2word=dictionary,
                                     passes=2,
                                     workers=2)
     for idx, topic in lda_model.print_topics(-1):
         print('Topic: {} \nWords: {}'.format(idx, topic))
Example #30
 def make_lda_tfidf(self):
     """Running LDA using TF-IDF."""
     lda_model_tfidf = models.LdaMulticore(
         self.tfidf_model[self.bow_corpus],
         num_topics=20,
         id2word=self.dictionary,
         passes=2,
         workers=4)
     return lda_model_tfidf