def train_lda(corpus, token_dict, num_topics, update, passes, csize):
    return ldamodel.LdaModel(corpus=corpus, id2word=token_dict,
                             num_topics=num_topics, update_every=update,
                             passes=passes, chunksize=csize)
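# A minimal usage sketch for train_lda above. The toy documents are made up,
# and the imports are assumed to match the rest of this file
# (from gensim import corpora; from gensim.models import ldamodel).
docs = [['human', 'computer', 'interface'],
        ['graph', 'trees', 'minors'],
        ['human', 'system', 'computer']]
token_dict = corpora.Dictionary(docs)
corpus = [token_dict.doc2bow(doc) for doc in docs]
# update=1 trains online after every chunk; csize is documents per chunk
model = train_lda(corpus, token_dict, num_topics=2, update=1, passes=10, csize=2000)
print(model.show_topics())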
def testTopicSeeding(self):
    passed = False
    for topic in range(2):
        # try seeding it both ways round, check you get the same topics out,
        # but which way round they are depends on which way round they're seeded
        for i in range(5):  # restart at most 5 times
            eta = numpy.ones((2, len(dictionary))) * 0.5
            system = dictionary.token2id[u'system']
            trees = dictionary.token2id[u'trees']
            # aggressively seed the word 'system', in one of the
            # two topics, 10 times higher than the other words
            eta[topic, system] *= 10

            model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=200, eta=eta)
            model.update(corpus)

            topics = [dict((word, p) for p, word in model.show_topic(j)) for j in range(2)]

            # check that the word 'system' got a high weight in the topic we seeded,
            # and the word 'trees' (the main word in the other topic) a low weight --
            # and vice versa for the other topic (which we didn't seed with 'system')
            result = [[topics[topic].get(u'system', 0), topics[topic].get(u'trees', 0)],
                      [topics[1 - topic].get(u'system', 0), topics[1 - topic].get(u'trees', 0)]]
            expected = [[0.385, 0.022],
                        [0.025, 0.157]]
            passed = numpy.allclose(result, expected, atol=1e-2)
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                            (i, result, expected))
        self.assertTrue(passed)
def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods, iter_,
                  lda_inference_max_iter, chunksize):
    """
    Inference (E-step). This is used to set up the gensim LdaModel to be used
    for each time-slice. It also allows for Document Influence Model code to
    be written in.
    """
    num_topics = self.num_topics
    vocab_len = self.vocab_len
    bound = 0.0

    lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
    lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
    ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)

    model = "DTM"
    if model == "DTM":
        bound, gammas = self.inferDTMseq(
            corpus, topic_suffstats, gammas, lhoods, lda, ldapost,
            iter_, bound, lda_inference_max_iter, chunksize)
    elif model == "DIM":
        self.InfluenceTotalFixed(corpus)
        bound, gammas = self.inferDIMseq(
            corpus, topic_suffstats, gammas, lhoods, lda, ldapost,
            iter_, bound, lda_inference_max_iter, chunksize)
    return bound, gammas
def testTransform(self):
    passed = False
    # sometimes, LDA training gets stuck at a local minimum
    # in that case try re-training the model from scratch, hoping for a
    # better random initialization
    for i in range(5):  # restart at most 5 times
        # create the transformation model
        model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
        model.update(corpus)

        # transform one document
        doc = list(corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

        expected = [0.13, 0.87]
        # must contain the same values, up to re-ordering
        passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2)
        if passed:
            break
        logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                        (i, sorted(vec), sorted(expected)))
    self.assertTrue(passed)
def initialize(self, myid, dispatcher, **model_params):
    self.lock_update = threading.Lock()
    self.jobsdone = 0  # how many jobs has this worker completed?
    self.myid = myid  # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
    self.dispatcher = dispatcher
    logger.info("initializing worker #%s" % myid)
    self.model = ldamodel.LdaModel(**model_params)
def topic_modeling(self, data_table, cols, n_topics):
    # Identify the words and clean the table.
    data_table, words = self.identify_words(data_table, cols)

    # Create a dictionary based on the words we have identified per row.
    dict_topics = gensim.corpora.Dictionary(data_table[self.col_name])
    # Create a corpus containing all words.
    corpus = [dict_topics.doc2bow([word]) for word in words]

    # Apply LDA.
    model = lda.LdaModel(corpus, id2word=dict_topics, num_topics=n_topics)
    # Get the topics we found.
    topics = model.show_topics(num_topics=n_topics, num_words=10,
                               log=False, formatted=False)

    # Create columns for the topics.
    for topic in range(0, n_topics):
        data_table[f'{cols[0]}_topic_{topic}'] = 0.0

    # Score the topics per row and set the values accordingly.
    for i in range(0, len(data_table.index)):
        topic_scores = model[dict_topics.doc2bow(data_table[self.col_name][i])]
        for score in topic_scores:
            data_table.iloc[i, data_table.columns.get_loc(
                f'{cols[0]}_topic_{score[0]}')] = score[1]

    # Remove the temporary column we had created for the cleaned lists of words.
    del data_table[self.col_name]
    return data_table
def lda_mod_get(x, passes=10):
    newmod = ldamodel.LdaModel(corpus_train, id2word=dictionary_train,
                               num_topics=x, passes=passes,
                               per_word_topics=True)
    return newmod
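# A hedged sketch of how lda_mod_get might be used to pick a topic count by
# sweeping candidates and scoring each model with gensim's CoherenceModel.
# texts_train (the tokenized documents) is an assumption; corpus_train and
# dictionary_train are the same globals lda_mod_get relies on.
from gensim.models.coherencemodel import CoherenceModel

def sweep_num_topics(candidates, texts_train):
    scores = {}
    for k in candidates:
        mod = lda_mod_get(k)
        cm = CoherenceModel(model=mod, texts=texts_train,
                            dictionary=dictionary_train, coherence='c_v')
        scores[k] = cm.get_coherence()  # higher c_v coherence is better
    return scores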
def fit_LDA(self):
    """
    Fit the data with LDA, currently assuming that the number of cores
    remains constant at 1.

    Uses instance attributes (not parameters):
      self.lda_filepath {str}: where to save the LDA model.
      self.pyldavis_filepath {str}: where to save the pyLDAvis HTML.
      self.run_parameters {dict}: extra LdaModel arguments (e.g. num_topics).
    """
    self.lda = ldamodel.LdaModel(corpus=self.corpus, alpha='auto',
                                 id2word=self.dictionary, **self.run_parameters)
    lda_vis_serialized = pyLDAvis.gensim.prepare(self.lda, self.corpus,
                                                 self.dictionary, sort_topics=False)
    pyLDAvis.save_html(lda_vis_serialized, self.pyldavis_filepath)
    self.lda.save(self.lda_filepath)
def do(documents):
    # remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]

    # remove words that appear only once
    # all_tokens = sum(texts, [])
    # tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    # texts = [[word for word in text if word not in tokens_once] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # I can print out the topics for LSA
    # lsi = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
    # corpus_lsi = lsi[corpus]
    # for l, t in zip(corpus_lsi, corpus):
    #     print(l, "#", t)
    # print()
    # for top in lsi.print_topics(2):
    #     print(top)

    # I can print out the documents and which is the most probable topic for each doc.
    lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=1)
    print(lda.show_topics())
    topics_matrix = lda.show_topics(formatted=False, num_words=2)
    print(topics_matrix)
    print(np.array(topics_matrix))
def find_lda_context(train_records, test_records):
    """
    Uses the training records to create a topic model and then updates both
    the training and testing records with a vector of probabilities for each
    topic from the recently created topic model
    """
    dictionary = preprocess_records(train_records, test_records)
    corpus = [record[Constants.CORPUS_FIELD] for record in train_records]
    print(corpus)
    topic_model = ldamodel.LdaModel(
        corpus, id2word=dictionary, num_topics=num_topics,
        passes=Constants.LDA_MODEL_PASSES,
        iterations=Constants.LDA_MODEL_ITERATIONS)
    print(corpus)

    for i in range(num_topics):
        print(topic_model.show_topic(i, topn=2))

    records = train_records + test_records
    for record in records:
        document_topics = \
            topic_model.get_document_topics(record[Constants.CORPUS_FIELD])
        lda_context = [document_topic[1] for document_topic in document_topics]
        record['lda_context'] = lda_context

        # get_document_topics returns a sparse list of (topic_id, probability)
        # pairs, so map it to a dict before looking topics up by id
        topic_distribution = dict(document_topics)
        context_topics = {}
        for i in range(num_topics):
            topic_id = 'topic' + str(i)
            context_topics[topic_id] = topic_distribution.get(i, 0.0)

        record[Constants.CONTEXT_TOPICS_FIELD] = context_topics
def generateTopic(self, wordsLists, method=TopicMethod.LSI, numTopics=25):
    """Step 4: convert to topic vectors."""
    # Note: after the LDA transformation, a text-similarity comparison showed
    # the results were not good at all, so the LSI transformation is used
    # instead, which works well. Created by flx on 2018-4-7.
    bowCorpus = self.generateBow(wordsLists)
    tfidfCorpus = self.generateTfidf(bowCorpus)
    if method == TopicMethod.LDA:
        instance = ldamodel.LdaModel(tfidfCorpus, id2word=self.dictionary,
                                     num_topics=numTopics)
        CacheUtil.dumpTopicModel(instance)
    elif method == TopicMethod.LSI:
        instance = lsimodel.LsiModel(tfidfCorpus, id2word=self.dictionary,
                                     num_topics=numTopics)
        CacheUtil.dumpTopicModel(instance)
    dstCorpus = instance[tfidfCorpus]
    features = []
    # gensim's transformed format is a list of tuples, e.g.:
    #   vec = [(0, 0.12345), (2, 0.458124), (4, 0.485263), (7, 0.589542), ...]
    # where only the non-zero entries are stored.
    # We convert it to an ordinary dense vector.
    for doc in dstCorpus:
        vector = [0] * numTopics
        for pair in doc:
            vector[pair[0]] = pair[1]
        features.append(vector)
    return features
def train(self, filepath, dict_path, docs, num_topics=5, passes=100,
          chunksize=2000, alpha=0.5, eta=0.5):
    if path.exists(filepath):
        LOGGER.info('Model already exists...load model')
        self._inner_model = ldamodel.LdaModel.load(filepath)
    else:
        start = time.time()
        clean_docs = [d for d in docs]
        if path.exists(dict_path):
            LOGGER.info('Dictionary already exists...loading dictionary')
            self._dict = corpora.Dictionary.load(dict_path)
        else:
            self._dict = corpora.Dictionary(clean_docs)
            self._dict.save(dict_path)
        self.dict_time = (time.time() - start)
        corpus_dict = self._dict
        corpus = [self._dict.doc2bow(x) for x in clean_docs]
        self._inner_model = ldamodel.LdaModel(corpus, num_topics=num_topics,
                                              id2word=corpus_dict, passes=passes,
                                              chunksize=chunksize, alpha=alpha, eta=eta)
        self._inner_model.save(filepath)
        self.model_time = (time.time() - start)
    return self
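# Hypothetical usage of the train method above. The class name LdaTrainer and
# the toy token lists are assumptions, not part of the original snippet.
trainer = LdaTrainer()
docs = [['cat', 'dog', 'fish'], ['dog', 'bone'], ['fish', 'water', 'cat']]
trainer.train('lda.model', 'lda.dict', docs, num_topics=2, passes=10)
print(trainer._inner_model.show_topics())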
def compute(self):
    vec_texts = [text.split() for text in self.texts]

    if self.debug:
        write("\n " + "-> Computing the dictionary".ljust(50, '.'))
    dictionary = Dictionary(vec_texts)
    if self.debug:
        write("[OK]")

    if self.debug:
        write("\n " + "-> Creating the bag-of-words space".ljust(50, '.'))
    corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
    if self.debug:
        write("[OK]")

    if self.debug:
        write("\n " + ("-> Creating the %s space" % self.method).ljust(50, '.'))
    tfidf_space = TfidfModel(corpus)
    tfidf_corpus = tfidf_space[corpus]
    if self.method == 'TFIDF':
        self.space = tfidf_space
        self.index = MatrixSimilarity(tfidf_corpus)
    elif self.method == 'LSI':
        self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'RP':
        self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'LDA':
        self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])

    self.dictionary = dictionary
    if self.debug:
        write("[OK]\n")
def gen_V_lda(R_prime, num_topics=10, trained_lda=None, dictionary=None):
    # delimiters = R_prime['delimiters']
    # bag_o_words = R_prime['bag_o_words']
    # source = R_prime['source']
    topics = []
    topic_probability = []
    bag_o_words = OrderedDict()
    for key in R_prime.keys():
        bag_o_words[key] = sum(R_prime[key])
    words = [[key] * bag_o_words[key] for key in bag_o_words.keys()]
    # dictionary = corpora.Dictionary(words)
    # corpus = [dictionary.doc2bow(text) for text in words]
    if trained_lda is not None:
        assert isinstance(trained_lda, ldamodel.LdaModel)
        assert isinstance(dictionary, corpora.Dictionary)
        corpus = [dictionary.doc2bow(text) for text in words]
        trained_lda.update(corpus)
        topics = trained_lda.show_topics(num_topics=num_topics,
                                         num_words=len(dictionary),
                                         formatted=False)
    else:
        dictionary = corpora.Dictionary(words)
        corpus = [dictionary.doc2bow(text) for text in words]
        lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=100)
        topics = lda.show_topics(num_topics=num_topics,
                                 num_words=len(dictionary),
                                 formatted=False)
    V_prime = OrderedDict()
    for word in R_prime.keys():
        V_prime[word] = [0] * num_topics
    for i, topic in enumerate(topics):
        for entry in topic:
            V_prime[entry[1]][i] = entry[0]
    return V_prime
def gentler_lda_entire_corpus(in_folder, ofile, wordlist, num_topics=30):
    files = sorted([i for i in os.listdir(in_folder)
                    if os.path.isfile(join(in_folder, i))])
    # print(files)
    months = []
    with open(ofile, 'w') as f:
        f.write(pprint.pformat(locals()))
        for file in files:
            month = file.split('.')[-2][-8:-3]
            f.write(f'\n{"="*40}\n\nmonth {month}')
            mcorpus = textcorpus.conditionalCorpus(join(in_folder, file))  # , lines_are_documents=True)
            lda = ldamodel.LdaModel(mcorpus, num_topics=num_topics,
                                    id2word=mcorpus.dictionary)
            topics = lda.get_topics()
            f.write(pprint.pformat(lda.show_topics(num_topics, num_words=30)) + '\n')
            for w in wordlist:
                try:
                    topics = lda.get_term_topics(w)
                    f.write(f'\t{w}: {pprint.pformat(topics)}\t')
                except KeyError:  # word not in the model's vocabulary
                    f.write(f'\t{w}: outofvocab\t')
            print(f'month {month} LDA complete')
def lda_model(corpus_tfidf, dictionary, num_topics=10, num_words=10, vis='off'):
    # passing id2word lets show_topics return words directly, instead of
    # requiring a reverse lookup through dictionary.token2id afterwards
    lda_tfidf = ldamodel.LdaModel(corpus_tfidf, num_topics=num_topics,
                                  id2word=dictionary)
    lda_tfidf.save('lda_model')
    topics = lda_tfidf.show_topics(num_topics, num_words, log=False, formatted=False)
    topics_decoded = dict()
    for i in range(len(topics)):
        topic_no = 'Topic ' + str(i)
        topics_decoded[topic_no] = {}
        for word, weight in topics[i][1]:
            topics_decoded[topic_no][word] = weight
    if vis == 'on':
        pyLDAvis.enable_notebook()
        pyLDAvis.display(
            pyLDAvis.gensim.prepare(lda_tfidf, corpus_tfidf, dictionary))
    return topics_decoded
def __getitem__(self, doc):
    """
    Similar to the LdaModel __getitem__ function, it returns the topic
    proportions of the document passed.
    """
    lda_model = ldamodel.LdaModel(num_topics=self.num_topics,
                                  alpha=self.alphas, id2word=self.id2word)
    lda_model.topics = np.array(
        np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
    ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc),
                      lda=lda_model, doc=doc)

    time_lhoods = []
    for time in range(0, self.num_time_slices):
        lda_model = self.make_lda_seq_slice(lda_model, time)  # create lda_seq slice
        lhood = LdaPost.fit_lda_post(ldapost, 0, time, self)
        time_lhoods.append(lhood)

    doc_topic = ldapost.gamma / ldapost.gamma.sum()
    # should even the likelihoods be returned?
    return doc_topic
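# A usage sketch for the __getitem__ above (it mirrors gensim's LdaSeqModel
# interface). ldaseq is assumed to be an already-trained sequential model and
# dictionary an existing gensim Dictionary; the document must be a
# bag-of-words list of (word_id, count) pairs.
doc_bow = dictionary.doc2bow(['system', 'graph', 'trees'])
doc_topic = ldaseq[doc_bow]
print(doc_topic)  # per-topic proportions for this document, summing to 1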
def create_model(K):
    pre_collection = preprocess()
    dictionary = corpora.Dictionary(pre_collection)
    dictionary.filter_extremes(no_below=2, no_above=0.8, keep_n=500)
    docs_ids = [dictionary.doc2bow(doc) for doc in pre_collection]
    lda = glda.LdaModel(docs_ids, num_topics=K, id2word=dictionary)
    return lda, dictionary, docs_ids, pre_collection
def testPersistence(self):
    model = ldamodel.LdaModel(self.corpus, num_topics=2)
    model.save(testfile())
    model2 = ldamodel.LdaModel.load(testfile())
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def createModel(self, doc):
    dictionary = corpora.Dictionary(doc)
    corpus = [dictionary.doc2bow(text) for text in doc]
    model = ldamodel.LdaModel(
        corpus,
        num_topics=config["topic_modeling"]["num_topics"],
        id2word=dictionary)
    return model, dictionary
def create_models(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                            num_topics=20, passes=10)
    print(lda.show_topics())
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(vis_data)
    pyLDAvis.show(vis_data)
def testLargeMmapCompressed(self):
    fname = testfile() + '.gz'
    model = ldamodel.LdaModel(self.corpus, num_topics=2)

    # simulate storing large arrays separately
    model.save(fname, sep_limit=0)

    # test loading the large model arrays with mmap
    self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r')
def testPersistenceCompressed(self):
    fname = testfile() + '.gz'
    model = ldamodel.LdaModel(self.corpus, num_topics=2)
    model.save(fname)
    model2 = ldamodel.LdaModel.load(fname, mmap=None)
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
def get_lda(c, n_topics=4):
    """
    Get LDA model
    :param c: Corpus object
    :param n_topics: Number of topics to use in model
    :return: LDA model
    """
    lda = ldamodel.LdaModel(c.corpus, id2word=c.dic, num_topics=n_topics)
    return lda
def train_LDA(posts):
    """
    Uses gensim to train an LDA model for topic modeling.
    """
    dct = Dictionary(posts)
    dct.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dct.doc2bow(text) for text in posts]
    model = ldamodel.LdaModel(corpus, num_topics=10, id2word=dct, passes=1)
    return model, corpus
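# A minimal, hedged usage sketch for train_LDA; the example posts are made up
# (note that filter_extremes will prune most tokens of a corpus this small).
posts = [['weather', 'rain', 'storm'], ['rain', 'flood'],
         ['game', 'score', 'team'], ['team', 'win', 'game']]
model, corpus = train_LDA(posts)
for doc_bow in corpus:
    print(model.get_document_topics(doc_bow))  # sparse (topic_id, probability) pairs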
def suggested_lda_model(self):
    """
    Returns the closest corresponding LdaModel object for the current HDP model.
    The hdp_to_lda method only returns the corresponding alpha and beta values;
    this method returns a trained LdaModel. The num_topics is m_T (default 150)
    so as to preserve the matrix shapes when we assign alpha and beta.
    """
    alpha, beta = self.hdp_to_lda()
    ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha,
                             id2word=self.id2word,
                             random_state=self.random_state)
    ldam.expElogbeta[:] = beta
    return ldam
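# Usage sketch: suggested_lda_model is the method gensim's HdpModel exposes as
# above; corpus and dictionary are assumed prepared as in the other snippets.
from gensim.models import HdpModel

hdp = HdpModel(corpus, dictionary)
lda = hdp.suggested_lda_model()  # LdaModel with num_topics == hdp.m_T (150 by default)
print(lda.num_topics)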
def testPersistenceIgnore(self):
    fname = testfile('testPersistenceIgnore')
    model = ldamodel.LdaModel(self.corpus, num_topics=2)
    model.save(fname, ignore='id2word')
    model2 = ldamodel.LdaModel.load(fname)
    self.assertTrue(model2.id2word is None)

    model.save(fname, ignore=['id2word'])
    model2 = ldamodel.LdaModel.load(fname)
    self.assertTrue(model2.id2word is None)
def testPersistenceIgnore(self):
    fname = get_tmpfile('gensim_models_lda_testPersistenceIgnore.tst')
    model = ldamodel.LdaModel(self.corpus, num_topics=2)
    model.save(fname, ignore='id2word')
    model2 = ldamodel.LdaModel.load(fname)
    self.assertTrue(model2.id2word is None)

    model.save(fname, ignore=['id2word'])
    model2 = ldamodel.LdaModel.load(fname)
    self.assertTrue(model2.id2word is None)
def testTransform(self):
    # create the transformation model
    model = ldamodel.LdaModel(self.corpus, num_topics=2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = [0.0, 1.0]
    # must contain the same values, up to re-ordering
    self.assertTrue(numpy.allclose(sorted(vec), sorted(expected)))
def generateTopicModels(dictionary, bow, topicCounts):
    models = []
    alphas = []
    betas = []
    for amountTopics in topicCounts:
        ldaModel = ldamodel.LdaModel(bow, num_topics=amountTopics,
                                     id2word=dictionary, passes=20,
                                     per_word_topics=False)
        models.append(ldaModel)
        alphas.append('default')
        betas.append('default')
        print('Generated topic model with parameters: '
              + '(topics: ' + str(amountTopics)
              + '), (alpha: default), (beta: default)')
    return models, alphas, betas
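# Hedged usage sketch for generateTopicModels; the toy texts are assumptions,
# and the Dictionary/bag-of-words preparation mirrors the other snippets here.
texts = [['apple', 'banana'], ['banana', 'cherry'], ['cherry', 'apple']]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(text) for text in texts]
models, alphas, betas = generateTopicModels(dictionary, bow, topicCounts=[2, 5, 10])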