def train_lda(recipe_file, num_topics, output_file):
    corpus = RecipeCorpus(recipe_file)
    corpora.MmCorpus.serialize(output_file + '.corpus.mm', corpus)
    lda = LdaModel(corpus, id2word=corpus.dictionary,
                   num_topics=int(num_topics), distributed=False)
    lda.save(output_file)
    return lda
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """
    Function to convert a Mallet model to a gensim LdaModel. This works by copying the
    training model weights (alpha, beta...) from a trained Mallet model into the gensim model.

    Args:
        mallet_model : Trained Mallet model
        gamma_threshold : To be used for inference in the new LdaModel.
        iterations : Number of iterations to be used for inference in the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel
    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus,
                             num_topics=self._topics,
                             id2word=self._corpus.dictionary,
                             passes=self._passes)
        self._dictionary = self._corpus.dictionary
        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        return np.array([
            v for k, v in self._lda.__getitem__(
                self._dictionary.doc2bow(common.filter(words)), eps=0)
        ])
def update(self, corpus=[[]]):
    """Online update: refresh the model incrementally on top of an existing one.

    Args:
        corpus - list of documents used for the update
    """
    if not self._model and len(corpus) > 0:
        # Build the dictionary: every word is assigned an integer index
        self._common_dictionary = Dictionary(corpus)
        corpus_data = [
            self._common_dictionary.doc2bow(sentence) for sentence in corpus
        ]
        # corpus_data: bag-of-words corpus; topics: number of topics;
        # id2word: maps indices back to words; passes: number of training passes
        self._model = LdaModel(corpus_data,
                               self._topics,
                               id2word=self._common_dictionary,
                               passes=50)
        # self._model = LdaModel(corpus_data, self._topics)
    elif self._model and len(corpus) > 0:
        self._common_dictionary.add_documents(corpus)
        new_corpus_data = [
            self._common_dictionary.doc2bow(sentence) for sentence in corpus
        ]
        self._model.update(new_corpus_data)
def main(): docs = get_train( 'D:/ByResearch/基于文本的原油油价预测/20200615code/code/SeaNMF-master/data/wedata.txt' ) docs = [s.strip().split() for s in docs] # Create a dictionary representation of the documents. dictionary = Dictionary(docs) dictionary.filter_extremes(no_below=10, no_above=0.2) corpus = [dictionary.doc2bow(doc) for doc in docs] # Make a index to word dictionary. temp = dictionary[0] # only to "load" the dictionary. id2word = dictionary.id2token PMI = [] for i in range(2, 11): print(i) lda_model = LdaModel(corpus=corpus, id2word=id2word, iterations=100, num_topics=i) # Print the Keyword in the 5 topics print(lda_model.print_topics()) coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_uci') coherence_lda = coherence_model_lda.get_coherence() print('\nCoherence Score: ', coherence_lda) del lda_model PMI.append(coherence_lda) print(PMI)
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`.

    This works by copying the training model weights (alpha, beta...) from a trained
    mallet model into the gensim model.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Training method for LDA.

    `documents` is a list of lists of words/tokens. It is used to construct a
    dictionary and a corpus, from which the topics for LDA are inferred.
    """
    # Construct dictionary of words if it's not passed
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    word2idx_dictionary = dict([(w, idx) for (idx, w) in id2word_dictionary.items()])

    # Construct corpus for model
    if documents and not corpus:
        corpus = [id2word_dictionary.doc2bow(document) for document in documents]

    # Cluster the documents into topics using LDA. The number of topics is given by n_topics
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word_dictionary,
                         num_topics=n_topics,
                         update_every=1,
                         chunksize=10000,
                         passes=1)

    # Default value for topn (number of top words to show by probability) is 10.
    # A high enough value should return the words covering most or all of the probability mass.
    topics = [lda_model.show_topic(idx, topn=50000) for idx in range(0, n_topics)]

    return lda_model, id2word_dictionary, word2idx_dictionary, topics
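# A minimal usage sketch for the train_lda helper above, run on a toy corpus. It assumes the
# gensim imports that train_lda itself relies on (corpora, LdaModel) are already present in
# this module; the toy documents and topic count are illustrative only.
if __name__ == '__main__':
    toy_docs = [['cat', 'dog', 'pet'], ['python', 'code', 'bug'], ['dog', 'bark', 'pet']]
    model, id2word, word2idx, topics = train_lda(n_topics=2, documents=toy_docs)
    for idx, topic in enumerate(topics):
        print(idx, topic[:5])  # top (word, probability) pairs for each topic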
def runlda(filetopicwords, fileinput, NUMTOPICS=30, NUMPASSES=10, NUMITERATIONS=10):
    print('runlda...')
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    import numpy as np
    docs, word2freqtopics = [], {}
    fr = open(fileinput, 'r')
    for line in fr:
        words = line.strip('\r\n').split(' ')
        docs.append(words)
        for word in words:
            if word not in word2freqtopics:
                word2freqtopics[word] = [0, [0. for i in range(NUMTOPICS)]]
            word2freqtopics[word][0] += 1
    fr.close()
    V = len(word2freqtopics)
    dct = Dictionary(docs)
    model = LdaModel(corpus=[dct.doc2bow(doc) for doc in docs], id2word=dct,
                     num_topics=NUMTOPICS, passes=NUMPASSES, iterations=NUMITERATIONS)
    fw = open(filetopicwords, 'w')
    for topicid in range(NUMTOPICS):
        s = 'topic ' + str(topicid)
        wordscores = []
        for (wordid, score) in model.get_topic_terms(topicid, topn=V):
            if score < 1e-6:
                break
            wordscores.append([dct[wordid], score])
        scoresum = sum([x[1] for x in wordscores])
        for [word, score] in wordscores:
            s += ',' + word + ':' + str(np.round(score / scoresum, 6))
            word2freqtopics[word][1][topicid] = score
        fw.write(s + '\n')
    fw.close()
def trainModel(self): if self.toweight: self.model = LdaModel(self.tfidf[self.corpus], num_topics=self.num_topics) self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]]) else: self.model = LdaModel(self.corpus, num_topics=self.num_topics) self.index = MatrixSimilarity(self.model[self.corpus])
def lda_model(self, num_topics: [int, None] = 10, passes: [int, None] = 1, seed: [int, None] = None): """ Construct LDA topic models for each year in a corpus, given a set of parameters. """ if self.word_to_id is None or self.corpora is None: self.build_dictionaries_and_corpora() results = num_dict(self.year_list) if seed is None: for year in self.year_list[:-1]: results[year] = \ LdaModel(corpus=self.corpora[year], id2word=self.word_to_id[year], num_topics=num_topics, passes=passes) else: rand = RandomState(seed) for year in self.year_list[:-1]: results[year] = \ LdaModel(corpus=self.corpora[year], id2word=self.word_to_id[year], num_topics=num_topics, passes=passes, random_state=rand) return TopicResults(results, self.num_docs, self.name)
def vwmodel2ldamodel(vw_model, iterations=50): """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to :class:`~gensim.models.ldamodel.LdaModel`. This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel into the gensim model. Parameters ---------- vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` Trained Vowpal Wabbit model. iterations : int Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`. Returns ------- :class:`~gensim.models.ldamodel.LdaModel`. Gensim native LDA. """ model_gensim = LdaModel(num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold, dtype=numpy.float32) model_gensim.expElogbeta[:] = vw_model._get_topics() return model_gensim
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    for idioma in self.output()['langs'].iterkeys():
        dicc_path = self.input()['dict']['langs'][idioma].path
        corp_path = self.input()['corp']['langs'][idioma].path

        print '=============================='
        print 'Running LDA for %s with cleaning level %s' % (idioma, kind)
        print '=============================='

        # Load dictionary and corpus
        dicc = corpora.Dictionary.load(dicc_path)
        corpus = corpora.MmCorpus(corp_path)

        # Run LDA for this language, once per requested number of topics
        for n_topics in self.output()['langs'][idioma].iterkeys():
            print 'Number of topics: ' + str(n_topics)
            if self.by_chunks:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics,
                               update_every=self.update_e,
                               chunksize=self.chunk_size, passes=self.n_passes)
            else:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, passes=1)
            lda.save(self.output()['langs'][idioma][n_topics].path)
def train_model(texts, **kwargs):
    # parse args
    filter_stopwords = kwargs.get('filter_stopwords', True)
    normalizer = kwargs.get('normalizer', 'porter')
    tfidf = kwargs.get('tfidf', True)
    num_topics = kwargs.get('num_topics', 20)
    min_freq = kwargs.get('min_freq', 2)
    use_pickle = kwargs.get('use_pickle', True)
    update_pickle = kwargs.get('update_pickle', True)
    report = kwargs.get('report', True)
    distributed = kwargs.get('distributed', False)

    # build corpus or read it in from pickle
    if use_pickle:
        print "INFO: loading pickled corpus and word hash"
        corpus = pickle.load(open("pickles/corpus.p", "rb"))
        id2word = pickle.load(open("pickles/id2word.p", "rb"))
    else:
        print "INFO: processing text and building corpus..."
        corpus, id2word = process_texts(
            texts=texts,
            filter_stopwords=filter_stopwords,
            normalizer=normalizer,
            min_freq=min_freq
        )
        if update_pickle:
            # pickle files
            print "INFO: updating pickled corpus and word hash"
            pickle.dump(corpus, open("pickles/corpus.p", "wb"))
            pickle.dump(id2word, open("pickles/id2word.p", "wb"))

    # optional tfidf transformation
    if tfidf:
        print "INFO: applying tfidf transformation..."
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]

    # fit model
    print "INFO: fitting model..."
    lda = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        distributed=distributed
    )

    # report
    if report:
        perplexity = lda.bound(corpus)
        print "RESULTS:"
        print "\nperplexity: ", perplexity, "\n"
        topics = lda.show_topics(num_topics)
        for i, t in enumerate(topics):
            print "topic %d:" % i
            print t

    return lda, corpus, id2word
def draw_cluster_key_word(cluster: list):
    """Extract the key words of one cluster.

    :param cluster: list of tuple(7), a cluster produced by the clustering in Question 2
    :return: list of words, the extracted key words
    """
    stop = fetch_default_stop_words()  # stop word list
    stop.extend(["", " ", "\n", "\t", "*"])  # append a few extra stop words
    sents = [
        jieba.lcut(row[2] + "。" + row[4], cut_all=True) for row in cluster
    ]  # tokenize
    sents = [[word for word in sent if word not in stop] for sent in sents]  # remove stop words
    dictionary = corpora.Dictionary(sents)  # build the dictionary
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sents]  # document-term matrix
    # Train the LDA model
    lda_model = LdaModel(doc_term_matrix, num_topics=1, id2word=dictionary, passes=1)
    # Pick the six highest-probability words out of the topic
    key_words = [
        word
        for index, word in enumerate(lda_model.show_topics()[0][1].split("\""))
        if index in [1, 3, 5, 7, 9, 11]
    ]
    return key_words
class LMDL_LDA(): def __init__(self): self.lmdl = LMDL_Corpus() self.texts = self.lmdl.get_corpus_texts_words() self.dictionary = Dictionary(self.texts) self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] self.lda = LdaModel(self.corpus, num_topics=LDA_NUM_TOPICS, id2word=self.dictionary) def print_topics(self): return self.lda.print_topics(LDA_NUM_TOPICS) def get_document_topics(self, document_name): document_tokens = self.lmdl.token_list_processed(document_name) topics = self.lda.get_document_topics( self.dictionary.doc2bow(document_tokens), minimum_probability=None, minimum_phi_value=None, per_word_topics=False) show_topics_list = [] for topic in topics: lda_topic = self.lda.show_topic(topic[0], topn=10) show_topics_list.append(lda_topic) return show_topics_list def top_topics(self): return self.lda.top_topics(corpus=self.corpus, texts=self.texts, dictionary=self.dictionary, window_size=None, coherence='u_mass', topn=20, processes=-1)
def generate_topics(journal, num_topics, num_words, passes):
    # num_words: number of words we want to see from each topic (default is 10)
    # passes: times to go over the data. 1 can be used for a large corpus
    filename = '{}_article_titles.txt'.format(journal)
    with open(filename) as f:
        documents = f.readlines()

    texts = [[
        word for word in document.lower().split() if word not in STOPWORDS
    ] for document in documents]

    stemmer = SnowballStemmer('english')
    texts_stemmed = [[stemmer.stem(word) for word in text] for text in texts]

    dictionary = corpora.Dictionary(texts_stemmed)
    corpus = [dictionary.doc2bow(text) for text in texts]  # bow means bag of words

    # LDA model
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics, passes=passes)

    for topic in lda.print_topics(num_words=num_words):
        topicNumber = topic[0]
        print(topicNumber, ':', sep='')
        listOfTerms = topic[1].split('+')
        for term in listOfTerms:
            listItems = term.split('*')
            print(' ', listItems[1], '(', listItems[0], ')', sep='')
def makeLDA(path, num_topics, num_words, passes):
    num_topics = num_topics  # number of topics to look for in the model
    num_words = num_words    # how many words to show from each topic
    passes = passes          # how many times to go over the data
    with open(path, encoding='utf-8') as f:
        documents = f.readlines()
    texts = [[
        word for word in document.lower().split()
        if word not in STOPWORDS and word.isalnum()
    ] for document in documents]
    # print(texts)
    # Create a dictionary and a corpus from the word lists
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics, passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words=num_words))

    unseenText = '../../../data/LDA_data/lkmlSingleNewEmail.txt'
    with open(unseenText, encoding='utf-8') as fenw:
        newdoc = fenw.read()
    newcorpus = dictionary.doc2bow(
        newword for newword in newdoc.lower().split()
        if newword not in STOPWORDS and newword.isalnum())
    # Pass the new corpus through the existing LDA model
    pp.pprint(lda[newcorpus])
def get_topics(df, num_topics): df_temp = df.sample(frac=0.2) text_dict = corpora.Dictionary(df_temp['topic_modeling_text']) tweets_bow = [ text_dict.doc2bow(tweet) for tweet in df_temp['topic_modeling_text'] ] tweets_lda = LdaModel(tweets_bow, num_topics=num_topics, id2word=text_dict, random_state=1, passes=5) words = [re.findall(r'"([^"]*)"', t[1]) for t in tweets_lda.print_topics()] topics = [' '.join(t[0:10]) for t in words] # Getting the coherence score - st.write(' ') coherence_model = CoherenceModel(model=tweets_lda, texts=df_temp['topic_modeling_text'], dictionary=text_dict, coherence='c_v') coherence_lda = coherence_model.get_coherence() return topics, coherence_lda
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))

    # Build the word <-> id mapping dictionary (id: word)
    dictionary = corpora.Dictionary(documents)
    # Turn each document into a list of (id, count) tuples (bag of words)
    ds_df = [dictionary.doc2bow(document) for document in documents]
    # Build the tfidf model from the corpus term frequencies
    tfidf_model = TfidfModel(ds_df)
    # Get the tfidf representation of each document
    ds_tfidf = tfidf_model[ds_df]
    # Number of topics for the documents
    n = 60
    # Build the LDA model on the documents' tfidf vectors, with the chosen number of topics
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)
    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0
    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1
    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path, index=0)
def main(): collection_name = "nips" years = xrange(2010, 2015) # 10 ~ 14 n_topics = 10 corpus_paths = map(lambda y: "data/{}-{}.dat".format(collection_name, y), years) all_corpus = [] year2corpus = {} for year, path in zip(years, corpus_paths): corpus = list(load_line_corpus(path)) all_corpus.append(proc_corpus(corpus)) year2corpus[year] = corpus all_corpus = list(itertools.chain.from_iterable(all_corpus)) dictionary = Dictionary(all_corpus) all_corpus = [dictionary.doc2bow(doc) for doc in all_corpus] import pdb pdb.set_trace() # print all_corpus model = LdaModel(all_corpus, num_topics=n_topics, id2word=dictionary, eval_every=10, passes=100) print model.show_topics()
def create_LDA(comment_dict, num_topics=20, chunk_size=50, max_iter=20, from_db=True, get_data_func=None): lda = None text_gen = data_preprocessor(max_iter=max_iter, from_db=from_db, get_data_func=get_data_func) corpus = [] for _, stemmed_text, _ in text_gen: if len(stemmed_text) != 0: corpus.append(comment_dict.doc2bow(stemmed_text)) if len(corpus) == chunk_size: if lda is None: lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=comment_dict, per_word_topics=1, passes=10) else: lda.update(corpus=corpus) corpus = [] return lda
def create_model(self, doc_matrix, term_dictionary, model_path, save_model=True, language='language_na'): """ Creates an LDA model based on a set of documents :param model_path: :param doc_matrix: :param term_dictionary: :param save_model: :param language: :return LDA model: """ self.language = language start = time() self.ldamodel = LdaModel(doc_matrix, num_topics=self.num_categories, id2word=term_dictionary, passes=50) if save_model: self.save_model(model_path=os.path.join( model_path, 'models', self.language, '%s_%s_category_lda.model' % (language, str(self.num_categories)))) logging.info('Training lasted: {:.2f}s'.format(time() - start)) return self.ldamodel
def find_topic(self, condition=None, n_topics=10, n_words=10, topic_model='lda', vec_model='tf', show=True, **kwargs):
    '''Topic model. Similar to the function above; prefer this one.

    parameter
    ---------
    condition: boolean mask over the corpus, e.g. to run the topic decomposition
        only on positive or negative reviews
    n_topics: number of topics
    n_words: number of words to output per topic
    vec_model: vectorization method, default is tf
    '''
    if condition is not None:
        texts = self.texts_seg[condition]
    else:
        texts = self.texts_seg
    if topic_model in ['lda', 'LDA']:
        dictionary = corpora.Dictionary([doc.split(' ') for doc in texts])
        corpus = [dictionary.doc2bow(text.split(' ')) for text in texts]
        if vec_model in ['idf', 'tfidf']:
            tfidf = models.TfidfModel(corpus)
            corpus = tfidf[corpus]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
        topics_keywords = lda.show_topics(num_topics=n_topics, num_words=n_words, formatted=False)
        if show:
            print('\n'.join(['Topic {}: {}'.format(i, ' | '.join([k[0] for k in topic[1]]))
                             for i, topic in enumerate(topics_keywords)]))
        return topics_keywords
def vwmodel2ldamodel(vw_model, iterations=50): """ Function to convert vowpal wabbit model to gensim LdaModel. This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel into the gensim model. Args: vw_model : Trained vowpal wabbit model. iterations : Number of iterations to be used for inference of the new LdaModel. Returns: model_gensim : LdaModel instance; copied gensim LdaModel. """ model_gensim = LdaModel(num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold, dtype=numpy.float32) model_gensim.expElogbeta[:] = vw_model._get_topics() return model_gensim
def trainModel(): """ Train a model """ if args.mode == 'Random': return args.topics, 0 # need to train on dump files = [ f"{args.input}/{f}" for f in os.listdir(args.input) if os.path.isfile(os.path.join(args.input, f)) ] if args.mode == 'LDA': # create dictionary with open(files[0], "r", encoding='utf-8') as f: dct = Dictionary([' '.join(f.readlines()).split()]) for filename in files[1:]: with open(filename, "r", encoding='utf-8') as f: dct.add_documents([' '.join(f.readlines()).split()]) # create corpus corpus = [] for filename in files: with open(filename, "r", encoding='utf-8') as f: corpus.append(dct.doc2bow(' '.join(f.readlines()).split())) lda = LdaModel(corpus, num_topics=args.topics) lda.save("./models/LDAdump.model") dct.save("./models/LDAdump.dct") return lda, dct if args.mode == 'loadLDA': return LdaModel.load("./models/LDAdump.model"), Dictionary.load( "./models/LDAdump.dct")
def get_topics(candidate, day): start_time = datetime.strptime(day, "%Y-%m-%d").date() start_time = int(start_time.strftime('%s'))*1000 end_time = start_time + 86399999 try: client = MongoClient() tweets = client.fletcher.tweets tweets = tweets.aggregate([ {"$match":{"$text":{"$search":candidate_search[candidate_slugs[candidate]]}}}, {"$match":{"timestamp_ms":{"$gte":start_time,"$lt":end_time}}}]) documents = [] pattern = re.compile("[^a-zA-Z ]") for tweet in tweets: documents.append(pattern.sub('', tweet['text'])) stoplist = set(candidate_stop_words[candidate_slugs[candidate]] + stopwords) texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents] frequency = defaultdict(int) for text in texts: for token in text: frequency[token] += 1 texts = [[token for token in text if frequency[token] > 1] for text in texts] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, update_every=1, chunksize=10000, passes=10) return lda.print_topics(3) except: return None
def display_perplexity_on_topic_num(start, step, limit): model_list = [] pplxty_list = [] names = locals() for num_topics in range(start, limit, step): print("############### current num:", num_topics, "###############") model_path = os.getcwd() + "\\Model\\topic_num_" + str( num_topics) + ".model" if not os.path.exists(model_path): # Modeling!!!!! print("Modeling in progress...") names['model' + str(num_topics)] = LdaModel( pubs_corpus, num_topics=num_topics, id2word=pubs_dictionary, passes=10, eval_every=1) names['model' + str(num_topics)].save(model_path) else: print("Model already exists.") names['model' + str(num_topics)] = LdaModel.load(model_path) model_list.append(names['model' + str(num_topics)]) pplxty_value = perplexity(names['model' + str(num_topics)], pubs_corpus, pubs_dictionary, len(pubs_dictionary.keys()), num_topics) pplxty_list.append(pplxty_value) return model_list, pplxty_list
def getLdaFeature(documents, topicNum):
    '''
    Function: generate lda features by training an lda model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output:
        lda features (DataFrame format)
    '''
    # get corpus
    # LogInfo('   Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
    # LogInfo('   Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers=8, num_topics=topicNum, chunksize=8000, passes=10, random_state=12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000, passes=10, random_state=12)

    # generate lda features
    LogInfo('   Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for t in topic:
            ldaFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns=colName)
    return ldaFeature
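# Possible invocation of getLdaFeature above (toy sentences; kept as comments because the LogInfo
# and getColName helpers referenced inside the function live elsewhere in the original module):
# lda_features = getLdaFeature(['the cat sat here', 'dogs bark loudly', 'cats and dogs play'], topicNum=2)
# print(lda_features.shape)  # expected shape: (3, 2)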
def __init__(self): self.lmdl = LMDL_Corpus() self.texts = self.lmdl.get_corpus_texts_words() self.dictionary = Dictionary(self.texts) self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] self.lda = LdaModel(self.corpus, num_topics=LDA_NUM_TOPICS, id2word=self.dictionary)
def convertldaMalletToldaGen(mallet_model):
    model_gensim = LdaModel(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha)  # original function has 'eta=0' argument
    model_gensim.state.sstats[...] = mallet_model.wordtopics
    model_gensim.sync_state()
    return model_gensim
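# Hedged usage sketch for convertldaMalletToldaGen above. The mallet path and the corpus/dictionary
# names are placeholders; this needs a local MALLET install plus gensim's (pre-4.0) LdaMallet wrapper.
# from gensim.models.wrappers import LdaMallet
# mallet_lda = LdaMallet('/path/to/mallet', corpus=corpus, num_topics=20, id2word=dictionary)
# gensim_lda = convertldaMalletToldaGen(mallet_lda)
# print(gensim_lda.show_topics(num_topics=5))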
def find_topic(self, num_topics, num_words=2, passes=20):
    dic = Dictionary(self.texts)
    corpus = [dic.doc2bow(text) for text in self.texts]
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dic, passes=passes)
    return lda.top_topics(topn=2, dictionary=dic, corpus=corpus)
def ldacreator(t, topics=5, stopwords=[]):
    texts = [[
        word for word in t.lower().split()
        if word not in list(STOPWORDS) + stopwords and word.isalnum()
    ]]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus, id2word=dictionary, num_topics=topics, passes=10)
    return (lda.print_topics())
def train(self, common_texts, num_topics):
    self.common_dictionary = Dictionary(common_texts)
    common_corpus = [
        self.common_dictionary.doc2bow(text) for text in common_texts
    ]
    self.model = LdaModel(common_corpus,
                          num_topics=num_topics,
                          alpha='auto',
                          eval_every=5)
def lda_gensim(texts, num_topics=10):
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    for top in lda.print_topics():
        print(top)
    lda_corpus = lda[corpus]
    X_lda = corpus2csc(lda_corpus).todense().T
    return X_lda
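# A small self-contained example for lda_gensim above (toy corpus; assumes the Dictionary,
# LdaModel and corpus2csc imports that the function relies on are available in this module):
if __name__ == '__main__':
    toy_texts = [['apple', 'banana', 'fruit'], ['car', 'engine', 'wheel'], ['fruit', 'juice', 'apple']]
    X = lda_gensim(toy_texts, num_topics=2)
    print(X.shape)  # roughly (number of documents, number of topics)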
def plottopicpop():
    internet = [0 for i in range(10)]
    developing = [0 for i in range(10)]
    habr = [0 for i in range(10)]
    n = 0
    for year in range(2006, 2016):
        articles, numberofarticles = getarticlesbyyear(year)
        print("Got articles for", str(year))
        # Normalize texts
        i = 0
        for article in articles:
            article = replacesymbols(article)
            articles[i] = normalaisestr(article.lower())
            i += 1
        print('Normalized')
        # Remove unnecessary words
        texts = [[word for word in article if word not in stoplist]
                 for article in articles]
        print('Deleted stopwords')
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('Starting training')
        # RAM-friendly mode: train on chunks of 100 articles at a time
        for i in range(numberofarticles // 100):
            begin = 100 * i
            end = 100 * (i + 1)
            if end > numberofarticles:
                end = numberofarticles
            lda = LdaModel(corpus[begin:end:], id2word=dictionary, num_topics=end - begin)
            for j in range(lda.num_topics):
                topics = lda.get_topic_terms(j, 15)
                # print(topics)
                for topic in topics[0]:
                    top = dictionary.get(topic)
                    # print(top)
                    if "интернет" == top:
                        internet[n] += 1
                    if "разработка" == top:
                        developing[n] += 1
                    if "хабра" == top:
                        habr[n] += 1
            del lda
        n += 1
    print(internet, '\n', developing, '\n', habr)
    plt.title('Population of 3 topics.')
    plt.xlabel('Year 2006 - 2015')
    plt.ylabel('Number of articles')
    plt.plot(internet, label="Интернет")
    plt.plot(developing, label="Разработка")
    plt.plot(habr, label="Хабра")
    plt.legend()
    plt.show()
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # assumption: number of topics grows with the log of the corpus size
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
def fetch_model(dictionary): print "Fetching LDA Model... ", try: lda = LdaModel.load('Topic/lda.tm') print "LDA Model loaded!" except IOError: print "Model not found, building LDA..." corpus=MyCorpus() #lda = LdaModel(corpus,num_topics=50,update_every=1,chunksize=1000,passes=15) lda = LdaModel(corpus,num_topics=50,id2word=dictionary,update_every=1,chunksize=1000,passes=50) print "LDA Built!" lda.save('Topic/lda.tm') return lda
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'

    if not os.path.exists(self.res_dir):
        print 'Creating results folder...'
        os.mkdir(self.res_dir)

    # Apply each model
    for idioma, modelos in self.input()['lda']['langs'].iteritems():
        corp_path = self.input()['corp']['langs'][idioma].path
        corpus = corpora.MmCorpus(corp_path)
        for n_topics, modelo in modelos.iteritems():
            model_path = modelo.path
            model = LdaModel.load(model_path)
            classification = []
            for doc in corpus:
                topic = model.get_document_topics(doc)
                classification.append(topic)
            print '--------------------------------------'
            print 'USER INFO: Classifying texts in %s with cleaning level "%s" and %d topics' % (idioma, kind, n_topics)
            model.print_topics(len(corpus), 5)
            with self.output()['langs'][idioma][n_topics]['doc_topics'].open('w') as f:
                pickle.dump(classification, f)
            with self.output()['langs'][idioma][n_topics]['topics'].open('w') as f:
                pickle.dump(model.print_topics(n_topics, 5), f)
                # the 5 is a tunable parameter (number of words to show per topic)
def train(self): self._corpus = SentenceDocCorpus(self._corpus_file) self._lda = LdaModel(self._corpus, num_topics = self._topics, id2word = self._corpus.dictionary, passes = self._passes) self._dictionary = self._corpus.dictionary self._lda.save(self._model_file) self._dictionary.save(self._dict_file)
def __init__(self, topics=10, worker=3, pretrained_model=None, dictionary=None):
    """Initialize LDA model training.

    Args:
        topics -- number of topics
        worker -- parallelism parameter, usually the number of cores minus one
        pretrained_model -- a pretrained model; online updating is supported,
            so the model from a previous training run can be loaded
        dictionary -- words are mapped to IDs during training, so the model is
            paired with an ID-mapping dictionary
    Example:
        >>> lda = LDA(topics = 20, worker = 2, pretrained_model = model_file, dictionary = dictionary_file)
        >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
        >>> lda.update(corpus)
        >>> lda.save(model_file, dictionary_file)
        >>> topics = lda.inference(['word5', 'word6'])
    """
    self._topics = topics
    self._workers = worker
    self._model = None
    self._common_dictionary = None
    if pretrained_model and dictionary:
        self._model = LdaModel.load(pretrained_model)
        self._common_dictionary = Dictionary.load(dictionary)
def make_clouds(files, n_words=20): # set locations base_model_name = os.path.splitext(os.path.basename(files.model))[0] output_d = '../browser/clouds/' + base_model_name + '/' if not os.path.exists(output_d): os.makedirs(output_d) # create wordcloud generator wc = WordCloud(width=1000, height=500, background_color='white') print('Loading model') model = LdaModel.load(files.model) beta = model.expElogbeta print('Normalizing by topics, and by words') pTW = normalize(beta, axis=0) pWT = normalize(beta, axis=1) # load bug<->id map, then invert to id<-> bug bug_to_id = json.loads(open(files.replacements).read()) id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k} for i in range(len(beta)): # compute RAR t_rar = np.sqrt(pTW[i] * pWT[i]) top_word_ids = t_rar.argsort()[:-1 - n_words:-1] top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids] top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words] wc.fit_words(zip(top_words, t_rar[top_word_ids])) wc.to_file(output_d + str(i) + '.png')
def __init__(self, fnames, model=None, corpus=None, dictionary=None):
    """`fnames` is an array of files for [lda_model, distribution]"""
    self.reviews = open('data/electronics_topics_in.txt').readlines()

    print "Loading topic model..."
    if model is not None:
        print "Using argument model"
        self.lda = model
    else:
        self.lda = LdaModel.load(fnames[0])

    if corpus is not None:
        print "Using argument corpus and dictionary"
        self.corpus = corpus
        self.dictionary = dictionary
    else:
        print "Loading corpus and dictionary from file"
        self.corpus = load("data/models/electronics_tfidf_corpus.pkl")
        self.dictionary = load("data/models/electronics_dict.pkl")

    print "Loading review-topic distribution..."
    self.review_dist = [l for l in self.lda[self.corpus]]
    tmp = lambda dist: sorted(dist, key=lambda arr: arr[1], reverse=True)
    self.review_dist = map(lambda dist: tmp(dist), self.review_dist)

    print "processing topics"
    tmp = map(lambda t: re.sub("(\d*\.\d*\*)", "", t), self.lda.show_topics(-1))
    self.topics = map(lambda ts: re.sub("\\s\+", ",", ts), tmp)
def train_lda(self, corpus, dictionary):
    """
    PRIVATE: train_lda
    ------------------
    given a corpus and a dictionary, this fits parameters for self.lda_model
    and fills self.lda_model_topics with the per-topic word distributions
    """
    self.lda_model = LdaModel(corpus, id2word=dictionary, num_topics=self.num_topics_lda)
    self.lda_model_topics = self.find_per_topic_word_distributions()
def analyze(self, docs): # load dictionary and model self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file")) self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file")) # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above. docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs] docTopicDistr = self.getDocumentTopics(docTermMatrix) return docTopicDistr
def __init__(self, ac): with open('../TextMining/Topic/data.loc','rb') as f: load(f) self.data = load(f) with open('../TextMining/Topic/translator.loc','rb') as f: self.translator = load(f) self.index = similarities.MatrixSimilarity.load('../TextMining/Topic/index.loc') self.lda = LdaModel.load('../TextMining/Topic/lda.loc') self.dictionary = Dictionary().load("../TextMining/Topic/dic.loc") self.ac_terms = ac
def __init__(self, jobdesc_fname, jobtitle_fname): self.es = Elasticsearch([{'host': app.config['ES_HOST'], 'port': 9200, 'timeout': 120}]) self.model = LdaModel.load(app.config['RCMDR_LDA_MODEL']) self.job_labels = { int(k):v for k, v in (line.split("=") for line in open(app.config['RCMDR_JOB_LABELS']) .read().strip().split('\n')) } self.jobdesc_fname = jobdesc_fname self.jobtitle_fname = jobtitle_fname
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """
    Function to convert mallet model to gensim LdaModel. This works by copying the
    training model weights (alpha, beta...) from a trained mallet model into the gensim model.

    Args:
        mallet_model : Trained mallet model
        gamma_threshold : To be used for inference in the new LdaModel.
        iterations : number of iterations to be used for inference in the new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel
    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        gamma_threshold=gamma_threshold)
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
def vwmodel2ldamodel(vw_model, iterations=50): """ Function to convert vowpal wabbit model to gensim LdaModel. This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel into the gensim model. Args: vw_model : Trained vowpal wabbit model. iterations : Number of iterations to be used for inference of the new LdaModel. Returns: model_gensim : LdaModel instance; copied gensim LdaModel. """ model_gensim = LdaModel( num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold ) model_gensim.expElogbeta[:] = vw_model._get_topics() return model_gensim
def __init__(self): self.dictionary = Dictionary.load(app.config["RCMDR_DICT"]) self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"]) self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"]) self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"]) self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"]) self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"]) self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"]) self.job_labels = { int(k): v for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n")) }
def AuthorTopicStd(): import nltk from gensim import corpora from gensim import matutils from gensim.models.ldamodel import LdaModel from nltk.corpus import stopwords from unidecode import unidecode TOPIC_FILE = './lda_topic.dump' LDA_FILE = './result.lda' DICTIONARY_FILE = './keywords.dict' with open(TOPIC_FILE, 'rb') as f: num_topics, topic_result = serializer.load(f) lda = LdaModel.load(LDA_FILE) dictionary = corpora.Dictionary.load(DICTIONARY_FILE) tokenizer = nltk.tokenize.RegexpTokenizer(r'[\w]{2,}') stopwords_set = set(stopwords.words()) my_topic_cache_by_aid = [None, None] def calculator(aid, pid): if my_topic_cache_by_aid[0] == aid: my_topic = my_topic_cache_by_aid[1] else: my_keywords = [] for ipid, iaid in paper_authors.get_by_aid(aid): paper = papers.get(ipid) if paper is None: continue keywords = tokenizer.tokenize(unidecode(paper[Papers.IDX_TITLE]).lower()) if not keywords: continue my_keywords.extend(keywords) my_keywords = list(filter(lambda s: s not in stopwords_set, my_keywords)) if not my_keywords: return np.nan my_topic = lda[dictionary.doc2bow(my_keywords)] my_topic_cache_by_aid[0] = aid my_topic_cache_by_aid[1] = my_topic my_topic_array = matutils.sparse2full(my_topic, num_topics) return np.std(my_topic_array) return calculator
def ldaforhabr():
    numberofarticles = 0
    articles, numberofarticles = getarticles()
    print("Got articles")
    # Normalize texts
    i = 0
    for article in articles:
        article = replacesymbols(article)
        articles[i] = normalaisestr(article.lower())
        i += 1
    print('Normalized')
    # Remove unnecessary words
    texts = [[word for word in article if word not in stoplist]
             for article in articles]
    print('Deleted stopwords')
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Starting training')
    f = open('lda.log', 'w')
    # Train on chunks of 100 articles at a time
    for i in range(numberofarticles // 100):
        begin = 100 * i
        end = 100 * (i + 1)
        if end > numberofarticles:
            end = numberofarticles
        lda = LdaModel(corpus[begin:end:], id2word=dictionary, num_topics=end - begin)
        for j in range(lda.num_topics):
            topics = lda.get_topic_terms(j, 15)
            f.write(str(begin + j) + ": ")
            # print(topics)
            for topic in topics[0]:
                top = dictionary.get(topic)
                if top is not None:
                    f.write(top + '\n')
            f.write('-----------\n')
        # i += 1
        del lda
    f.close()
def lda_topic_model(data, is_clean=False, num_of_topics=10, num_of_pass=5):
    """do the topic model for the given dataset

    input:
        data: a document or a list of words
        is_clean: use this flag to pre-process the data.
        num_of_topics: an LDA model requires the user to determine how many
            topics should be generated.
        num_of_pass: the greater the number of passes, the more accurate the
            model will be. A lot of passes can be slow on a very large corpus.
    """
    if not is_clean:
        stops = set(nltk.corpus.stopwords.words("english"))
        texts = prepare_for_lda(data, stops)
    else:
        texts = data
    dictionary = corpora.Dictionary(texts)
    print dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=num_of_topics,
                        passes=num_of_pass)
    return ldamodel.print_topics(num_topics=num_of_topics, num_words=10)
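# Example call for lda_topic_model above, passing already-tokenized text so the nltk-based cleaning
# (and the prepare_for_lda helper) is skipped; toy data, shown as comments to match the module's
# Python 2 print style:
# topics = lda_topic_model([['good', 'food', 'tasty'], ['bad', 'slow', 'service'], ['good', 'service']],
#                          is_clean=True, num_of_topics=2, num_of_pass=2)
# print topics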
def generate_model(): np.set_printoptions(precision=2) corpus = [] corpus += load_expo_cdc() corpus += load_lago() corpus += load_news() corpus += load_news_ic() corpus += load_palestras() corpus = preprocessing(corpus) dictionary = corpora.Dictionary(corpus) bow_corpus = [dictionary.doc2bow(text) for text in corpus] dictionary.save(DICT) corpora.MmCorpus.serialize(BOW_CORPUS, bow_corpus) bow2 = np.concatenate((bow_corpus, bow_corpus), axis=0) bow2 = np.concatenate((bow2, bow2), axis=0) bow2 = np.concatenate((bow2, bow2), axis=0) TOPICS = 20 model = LdaModel(bow2, id2word=dictionary, num_topics=TOPICS, iterations=100, passes=15) model.save(MODEL) lda_corpus = [model[vector] for vector in bow2] lda_dense = gensim.matutils.corpus2dense(lda_corpus, num_terms=TOPICS).transpose() """ tfidf = models.TfidfModel(bow_corpus) tfidf_corpus = [tfidf[vector] for vector in bow_corpus] tfidf_dense = gensim.matutils.corpus2dense(tfidf_corpus, num_terms=len(dictionary)).transpose() """ classifier = LogisticRegression() labels = load_labels() labels2 = labels labels2 += labels2 labels2 += labels2 labels2 += labels2 classifier.fit(lda_dense, labels2) joblib.dump(classifier, CLASSIFIER, compress=9) #print "LDA results" probs = classifier.predict_proba(lda_dense)
def SNAP_generateLDAForTopic(self, topic, numTopics = 5): if (topic == 'all'): topics = ['syria', 'ufo', 'movie', 'celebrity', 'russia'] # bieber, cyrus for t in topics: for nt in [5, 10]: self.SNAP_generateLDAForTopic(t, nt) return id2word = self.SNAP_id2word() mmPath = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'snap_data', "gensim_snap_mmcorpus_%s.mm" % topic ) outPath = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'snap_data', "gensim_snap_lda_%s_%d" % (topic, numTopics) ) mm = MmCorpus(mmPath) lda = LdaModel(corpus=mm, id2word=id2word, num_topics=numTopics, update_every=1, chunksize=10000, passes=1) lda.save(outPath) return
def update(self, docs):
    # load dictionary and model
    self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
    self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]
    numPasses = self.config.getIntConfig("train.num.pass")[0]
    self.ldaModel.update(docTermMatrix, passes=numPasses)
    docTopicDistr = self.getDocumentTopics(docTermMatrix)
    return docTopicDistr
def build_lda_model(corpus, dictionary, num_topics=10): file_name = None if corpus == None: corpus = get_corpus() if dictionary == None: dictionary = get_dictionary() if num_topics == 10: file_name = LDA_FILE_10 elif num_topics == 30: file_name = LDA_FILE_30 elif num_topics == 60: file_name = LDA_FILE_60 elif num_topics == 120: file_name = LDA_FILE_120 else: raise ValueError("bad number of topics") lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=1, chunksize=100, passes=1) lda.save(file_name) for topic in range(10): print "Topic {0}: {1}".format(topic, lda.print_topic(topic)) return lda
class LDA(BaseEstimator, TransformerMixin): def __init__(self, **params): self.params = params def fit(self, X, y=None): corpus = Sparse2Corpus(X, documents_columns=False) self.lda = LdaModel(corpus, **self.params) return self def transform(self, X, y=None): corpus = Sparse2Corpus(X, documents_columns=False) topics = np.array([map(lambda x: x[1], self.lda.__getitem__(c, eps=0)) for c in corpus]) print topics.shape return topics