def create_topics(self, do_print, epoch):
    """
    For a given epoch, prepare topics from the corpus word list and
    extract the top words for each topic. Intermediate results are
    stored in a pyLDAvis file and the model in an HDF5 file.

    <<<< This is the LDA part >>>>

    :param do_print: print top words in an epoch
    :param epoch: index of an epoch
    :return:
    """
    j = 0
    # Prepare the topic-term distributions, document-topic
    # distributions and term frequencies using softmax.
    data = prepare_topics(weights=cuda.to_cpu(self.model.mixture.weights.W.data).copy(),
                          topic_vectors=cuda.to_cpu(self.model.mixture.factors.W.data).copy(),
                          word_vectors=cuda.to_cpu(self.model.sampler.W.data).copy(),
                          vocab=self.words,
                          doprint=False)
    # top_words = print_top_words_per_topic(data, do_print=do_print)
    # if j % 100 == 0 and j > 100 and do_print:
    #     coherence = topic_coherence(top_words)
    #     for t in range(self.n_topics):
    #         six.print_(t, coherence[(t, 'cv')])
    data['doc_lengths'] = self.doc_lengths
    data['term_frequency'] = self.term_frequency
    np.savez('topics_' + self.modelid + '.pyldavis', **data)
    for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
        self.update_per_chunk(d, epoch, f)
        j += 1
    # Save the model parameters to a file in HDF5 format.
    serializers.save_hdf5("lda2vec_" + self.modelid + ".hdf5", self.model)
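# --- Hypothetical usage sketch (not part of the original code) ---
# The .pyldavis file written above is a NumPy .npz archive (np.savez
# appends the .npz suffix) whose keys appear to line up with the
# signature of pyLDAvis.prepare(). Assuming that holds, it could be
# reloaded and visualized like so; 'modelid' stands in for the actual
# model id string.
import numpy as np
import pyLDAvis

npz = np.load('topics_modelid.pyldavis.npz')
data = {name: npz[name] for name in npz.files}
vis = pyLDAvis.prepare(**data)
pyLDAvis.save_html(vis, 'topics_modelid.html')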
model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        print_top_words_per_topic(d)
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
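# --- Assumed helper (sketch, not from the original source) ---
# utils.chunks, used by every training loop in this section, is assumed
# to shuffle the corpus once per pass and yield aligned minibatch slices
# of each array argument. A minimal version consistent with the call
# sites above:
import random

def chunks(n, *args):
    """Yield shuffled, n-sized chunks of each input array, kept aligned."""
    bounds = [(i, i + n) for i in range(0, len(args[0]), n)]
    random.shuffle(bounds)
    for a, b in bounds:
        yield [arg[a:b] for arg in args]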
serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    coherence = topic_coherence(top_words)
    for t in range(n_topics):  # 't', not 'j': keep the batch counter intact
        six.print_(t, coherence[(t, 'cv')])
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for t in range(n_topics):  # 't', not 'j': keep the batch counter intact
            six.print_(t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
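# --- Hypothetical usage sketch (not part of the original code) ---
# The per-epoch pickles written to progress.shelve above can be read
# back like this, e.g. to track 'cv' coherence over training:
import pickle
import shelve
import six

progress = shelve.open('progress.shelve')
for key in sorted(progress, key=int):
    kw = pickle.loads(progress[key])
    six.print_(kw['epoch'], kw['coherence'][(0, 'cv')])  # topic 0
progress.close()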
def infer(self, docs=None, epochs=200, update_words=False,
          update_topics=False, topic_vectors=None):
    """
    Infers the features of new documents by running the lda2vec
    algorithm again, updating only the topic distributions.
    """
    texts = docs
    docs = []
    for text in texts:
        docs.append(unicode(" ".join(word for word in text.split()
                                     if word in self.word2vec_model.vocab)))
    logging.info("preprocessing")
    self.preprocess(docs)
    logging.info('preprocessed!')
    self.infer_model = LDA2Vec(n_documents=self.n_docs,
                               n_document_topics=self.n_topics,
                               n_units=300,
                               n_vocab=self.n_vocab,
                               counts=self.term_frequency,
                               n_samples=15,
                               power=self.power,
                               temperature=self.temp)
    if self.words_pretrained:
        self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]
    self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
    if topic_vectors is not None:
        assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
            "topic vectors shape doesn't match"
        self.infer_model.mixture.factors.W.data = topic_vectors

    optimizer = O.Adam()
    optimizer.setup(self.infer_model)
    clip = chainer.optimizer.GradientClipping(5.0)
    optimizer.add_hook(clip)
    j = 0
    msgs = defaultdict(list)
    for epoch in range(epochs):
        six.print_("epoch:", epoch)
        data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                              cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                              cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                              self.words)
        top_words = print_top_words_per_topic(data)
        if j % 100 == 0 and j > 100:
            coherence = topic_coherence(top_words)
            for t in range(self.n_topics):  # 't', not 'j': keep the batch counter intact
                six.print_(t, coherence[(t, 'cv')])
            kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
            # progress[str(epoch)] = pickle.dumps(kw)
        data['doc_lengths'] = self.doc_lengths
        data['term_frequency'] = self.term_frequency
        # np.savez('topics.pyldavis', **data)
        for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
            t0 = time.time()
            optimizer.zero_grads()
            l = self.infer_model.fit_partial(d.copy(), f.copy(),
                                             update_words=update_words,
                                             update_topics=update_topics)
            prior = self.infer_model.prior()
            loss = prior * self.fraction
            loss.backward()
            optimizer.update()
            msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                   "P:{prior:1.3e} R:{rate:1.3e}")
            prior.to_cpu()
            loss.to_cpu()
            t1 = time.time()
            dt = t1 - t0
            rate = self.batchsize / dt
            msgs["E"].append(epoch)
            msgs["L"].append(float(l))
            j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j,
                        prior=float(prior.data), rate=rate)
            six.print_(msg.format(**logs))
        six.print_("\n ================================= \n")
        # serializers.save_hdf5("lda2vec.hdf5", self.model)
        msgs["loss_per_epoch"].append(float(l))
    return data, msgs
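# --- Hypothetical usage sketch (not part of the original code) ---
# 'wrapper' stands in for an instance of the class defining infer().
# With update_words and update_topics left False, only the new
# documents' topic weights are fitted:
new_docs = ["the team won the championship game",
            "the central bank raised interest rates"]
data, msgs = wrapper.infer(docs=new_docs, epochs=50,
                           update_words=False, update_topics=False)
doc_topic_dists = data['doc_topic_dists']  # one topic mixture per new doc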
def train(self, doc_ids, flattened, vocab, words,
          max_epochs=np.inf,
          verbose=False,
          # vocab & words were added so topics can be saved (npz) during training
          loss_switch_epochs=0,  # num epochs until LDA loss is switched on
          save=False,
          save_every=1000,
          outdir="./out",
          summarize=True,
          summarize_every=1000,
          metadata="metadata.tsv",
          metadata_docs="metadata.docs.tsv"):
    n_vocab = flattened.max() + 1
    # How many tokens are in each document
    doc_idx, lengths = np.unique(doc_ids, return_counts=True)
    doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
    doc_lengths[doc_idx] = lengths
    # Count all token frequencies
    tok_idx, freq = np.unique(flattened, return_counts=True)
    term_frequency = np.zeros(n_vocab, dtype='int32')
    term_frequency[tok_idx] = freq

    if save:
        try:
            os.mkdir(outdir)
        except OSError as e:  # for Python 2
            if e.errno == errno.EEXIST:
                pass
        saver = tf.train.Saver(tf.global_variables())
        outdir = os.path.abspath(self.log_dir)

    if summarize:
        try:
            self.logger.flush()
        except AttributeError:  # not yet logging
            self.logger = tf.summary.FileWriter(self.log_dir,
                                                self.sesh.graph)
        merged = self._addSummaries(metadata, metadata_docs)

    j = 0
    epoch = 0
    fraction = self.batch_size / len(flattened)  # == batch / n_corpus
    self.sesh.run(tf.assign(self.fraction, fraction))

    progress = shelve.open('progress.shelve')

    # Turn on the LDA loss after n iters of training
    iters_per_epoch = (int(len(flattened) / self.batch_size) +
                       np.ceil(len(flattened) % self.batch_size))
    n = iters_per_epoch * loss_switch_epochs
    self.sesh.run(tf.assign(self.switch_loss, n))

    now = datetime.now().isoformat()[11:]
    print("------- Training begin: {} -------\n".format(now))

    while epoch < max_epochs:
        try:
            # doc_ids, word_idxs
            for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                t0 = datetime.now()
                feed_dict = self.make_feed_dict(d, f)
                # if len(feed_dict[self.pivot_idxs]) == 0:
                #     print("Empty batch. Skipping...")
                #     continue
                fetches = [self.loss_lda, self.loss_word2vec,
                           self.loss, self.train_op]
                loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                    fetches, feed_dict=feed_dict)
                if j > 5:
                    print(loss_lda, loss_word2vec, loss)
                j += 1
                if verbose and j % 1000 == 0:
                    msg = ("J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                           "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")
                    # t0 is a datetime, so take the elapsed timedelta in seconds
                    dt = (datetime.now() - t0).total_seconds()
                    rate = self.batch_size / dt
                    logs = dict(l_word2vec=loss_word2vec, epoch=epoch, j=j,
                                l_lda=loss_lda, rate=rate)
                    print(msg.format(**logs))
                if save and j % save_every == 0:
                    outfile = os.path.join(outdir,
                                           "{}_lda2vec".format(self.datetime))
                    saver.save(self.sesh, outfile, global_step=self.step)
                if summarize and j % summarize_every == 0:
                    summary = self.sesh.run(merged, feed_dict=feed_dict)
                    self.logger.add_summary(summary, global_step=self.step)
            # if j % 100 == 0 and j > 100 and epoch > 1:
            #     coherence = topic_coherence(top_words)
            #     for t in range(n_topics):
            #         print(t, coherence[(t, 'cv')])
            #     kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
            #     progress[str(epoch)] = pickle.dumps(kw)
            epoch += 1
            a = self.mixture.W.eval(session=self.sesh)
            b = self.mixture.factors.eval(session=self.sesh)
            c = self.sampler.W.eval(session=self.sesh)
            data = prepare_topics(a, b, c, words)
            print("------- epoch: {} -------\n".format(epoch))
            top_words = print_top_words_per_topic(data)
            data['doc_lengths'] = doc_lengths
            data['term_frequency'] = term_frequency
            np.savez('topics.pyldavis', **data)
        except KeyboardInterrupt:
            break

    print("epoch", epoch)
    print("max", max_epochs)
    now = datetime.now().isoformat()[11:]
    print("------- Training end: {} -------\n".format(now))

    if save:
        outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
        saver.save(self.sesh, outfile, global_step=self.step)

    try:
        self.logger.flush()
        self.logger.close()
    except AttributeError:  # not logging
        pass
model = NSLDA(counts, n_docs, n_topics, n_units, n_vocab)
if os.path.exists('nslda.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("nslda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(50000000):
    p = cuda.to_cpu(model.proportions.W.data).copy()
    f = cuda.to_cpu(model.factors.W.data).copy()
    w = cuda.to_cpu(model.loss_func.W.data).copy()
    d = prepare_topics(p, f, w, words)
    print_top_words_per_topic(d)
    for (doc_ids, flat) in utils.chunks(batchsize, doc_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(doc_ids, flat)
        l = rec + ld * fraction * strength
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
if os.path.exists('lda2vec_hn.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda2vec_hn.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    six.print_("Story topics")
    w = cuda.to_cpu(model.mixture_stories.weights.W.data).copy()
    f = cuda.to_cpu(model.mixture_stories.factors.W.data).copy()
    v = cuda.to_cpu(model.embed.W.data).copy()
    d = prepare_topics(w, f, v, words)
    print_top_words_per_topic(d)
    six.print_("Author topics")
    w = cuda.to_cpu(model.mixture_authors.weights.W.data).copy()
    f = cuda.to_cpu(model.mixture_authors.factors.W.data).copy()
    d = prepare_topics(w, f, v, words)
    print_top_words_per_topic(d)
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
        t0 = time.time()
        l = model.fit_partial(s.copy(), a.copy(), f.copy())
        prior = model.prior()
        loss = l + prior * fraction * clambda
        optimizer.zero_grads()
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
for epoch in range(200):
    # After the first execution of the statement below, data.keys() =>
    # dict_keys(['vocab', 'doc_lengths', 'doc_topic_dists',
    #            'topic_term_dists', 'term_frequency'])
    #
    # Also, data['vocab'] is mostly <OoV>:
    # (Pdb) print(sum(x != '<OoV>' for x in data['vocab']),
    #             'out of', len(data['vocab']), 'is NOT <OoV>')
    # 27 out of 5835 is NOT <OoV>
    #
    # Debug >>>
    # (Pdb) model.mixture.weights.W.data.shape -> (11314, 20)  (weights)
    # (Pdb) model.mixture.factors.W.data.shape -> (20, 300)    (factors -> factor_vector)
    # (Pdb) model.sampler.W.data.shape         -> (5837, 300)  (word_vectors)
    # (Pdb) len(words)                         -> 5837         (vocab)
    if gpu_id >= 0:
        # Parameters live on the GPU; copy them to host memory so that
        # prepare_topics can work on numpy arrays.
        data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                              cuda.to_cpu(model.mixture.factors.W.data).copy(),
                              cuda.to_cpu(model.sampler.W.data).copy(),
                              words, normalize=False)
    else:
        data = prepare_topics(model.mixture.weights.W.data.copy(),
                              model.mixture.factors.W.data.copy(),
                              model.sampler.W.data.copy(),
                              words, normalize=False)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for t in range(n_topics):  # 't', not 'j': keep the outer counter intact
            print(t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
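# --- Assumed helper (sketch, not from the original source) ---
# Given the shapes logged above, prepare_topics is assumed to turn raw
# parameters into pyLDAvis-style distributions via softmax: document
# weights (n_docs, n_topics) become doc_topic_dists, and each topic
# vector in factors (n_topics, n_units) is scored against every word
# vector (n_vocab, n_units) to give topic_term_dists.
import numpy as np

def softmax(x, axis=-1):
    """Numerically stable softmax along the given axis."""
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def prepare_topics_sketch(weights, factors, word_vectors, vocab):
    return {'doc_topic_dists': softmax(weights, axis=1),
            'topic_term_dists': softmax(factors.dot(word_vectors.T), axis=1),
            'vocab': vocab}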
serializers.load_hdf5("lda2vec%3d.hdf5" % latest, model)
# model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
steps = flattened.shape[0] // batchsize
print('steps per epoch: %d' % steps)
for epoch in range(5000):
    ts = prepare_topics(model.mixture_sty.weights.W.data,
                        model.mixture_sty.factors.W.data,
                        model.sampler.W.data,
                        words)
    print_top_words_per_topic(ts)
    ts['doc_lengths'] = sty_len
    ts['term_frequency'] = term_frequency
    np.savez('topics.story.pyldavis', **ts)
    ta = prepare_topics(model.mixture_aut.weights.W.data,
                        model.mixture_aut.factors.W.data,
                        model.sampler.W.data,
                        words)
    print_top_words_per_topic(ta)
    ta['doc_lengths'] = aut_len
    ta['term_frequency'] = term_frequency
    np.savez('topics.author.pyldavis', **ta)
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):