# Training loop from the lda2vec example script. Assumes `model`, `batchsize`,
# `doc_ids`, `flattened`, `words`, `doc_lengths`, `term_frequency`, and
# `n_topics` are defined earlier in the script, and that `model` is already
# on the GPU.
import time
import pickle
import shelve

import numpy as np
import six
import chainer
from chainer import cuda
from chainer import optimizers as O

from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence

optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    # Snapshot the current document, topic, and word weights for pyLDAvis.
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        # Periodically score topic coherence and checkpoint it to the shelve.
        coherence = topic_coherence(top_words)
        for t in range(n_topics):
            six.print_(t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        # Scale the Dirichlet prior by the fraction of the corpus in this
        # minibatch so it is not over-counted within an epoch.
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        j += 1
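Each epoch, the loop above writes the current topic snapshot to topics.pyldavis.npz. Below is a minimal sketch of loading that file into pyLDAvis; the field names (topic_term_dists, doc_topic_dists, vocab) are the ones prepare_topics is assumed to emit, so check the archive's keys if your version differs.

# Sketch: visualize the saved snapshot with pyLDAvis. The archive keys are
# assumptions based on what prepare_topics typically produces.
import numpy as np
import pyLDAvis

npz = np.load('topics.pyldavis.npz', allow_pickle=True)
dat = {name: npz[name] for name in npz.files}
dat['vocab'] = dat['vocab'].tolist()

# prepare() wants the two distributions plus doc lengths, vocab, and term
# frequencies, in that order.
vis = pyLDAvis.prepare(dat['topic_term_dists'], dat['doc_topic_dists'],
                       dat['doc_lengths'], dat['vocab'], dat['term_frequency'])
pyLDAvis.save_html(vis, 'lda2vec_topics.html')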
# Assumes module-level imports of logging, time, defaultdict (collections),
# chainer (with cuda and optimizers as O), and the lda2vec helpers used below.
def infer(self, docs=None, epochs=200, update_words=False,
          update_topics=False, topic_vectors=None):
    """Infer the features of new documents by running the lda2vec
    algorithm again, updating only the topic distributions."""
    # Keep only words that are in the pretrained word2vec vocabulary.
    texts = docs
    docs = []
    for text in texts:
        docs.append(unicode(" ".join(word for word in text.split()
                                     if word in self.word2vec_model.vocab)))
    logging.info("preprocessing")
    self.preprocess(docs)
    logging.info("preprocessed!")
    self.infer_model = LDA2Vec(n_documents=self.n_docs,
                               n_document_topics=self.n_topics,
                               n_units=300,
                               n_vocab=self.n_vocab,
                               counts=self.term_frequency,
                               n_samples=15,
                               power=self.power,
                               temperature=self.temp)
    if self.words_pretrained:
        self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]
    # Reuse the topic vectors learned during training, unless the caller
    # supplies their own.
    self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
    if topic_vectors is not None:
        assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
            "topic vectors shape doesn't match"
        self.infer_model.mixture.factors.W.data = topic_vectors

    optimizer = O.Adam()
    optimizer.setup(self.infer_model)
    clip = chainer.optimizer.GradientClipping(5.0)
    optimizer.add_hook(clip)

    j = 0
    msgs = defaultdict(list)
    for epoch in range(epochs):
        print "epoch : ", epoch
        data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                              cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                              cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                              self.words)
        top_words = print_top_words_per_topic(data)
        if j % 100 == 0 and j > 100:
            coherence = topic_coherence(top_words)
            for t in range(self.n_topics):
                print t, coherence[(t, 'cv')]
            kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
            # progress[str(epoch)] = pickle.dumps(kw)
        data['doc_lengths'] = self.doc_lengths
        data['term_frequency'] = self.term_frequency
        # np.savez('topics.pyldavis', **data)
        for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
            t0 = time.time()
            optimizer.zero_grads()
            l = self.infer_model.fit_partial(d.copy(), f.copy(),
                                             update_words=update_words,
                                             update_topics=update_topics)
            prior = self.infer_model.prior()
            loss = prior * self.fraction
            loss.backward()
            optimizer.update()
            msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                   "P:{prior:1.3e} R:{rate:1.3e}")
            prior.to_cpu()
            loss.to_cpu()
            t1 = time.time()
            dt = t1 - t0
            rate = self.batchsize / dt
            msgs["E"].append(epoch)
            msgs["L"].append(float(l))
            j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j,
                        prior=float(prior.data), rate=rate)
            print msg.format(**logs)
        print "\n ================================= \n"
        # serializers.save_hdf5("lda2vec.hdf5", self.model)
        msgs["loss_per_epoch"].append(float(l))
    return data, msgs
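A hedged usage sketch of infer(): `trained` stands for an already-fitted instance of the wrapper class that owns this method (the class definition and training call are not part of this excerpt), and the keys read out of `data` are the ones prepare_topics is assumed to produce.

# Hypothetical usage; `trained` and the exact keys in `data` are assumptions.
new_docs = [
    "text of an unseen document",
    "another unseen document to infer topics for",
]

# Keep word vectors and topic vectors fixed so that only the per-document
# topic weights of the inference model are learned.
data, msgs = trained.infer(docs=new_docs, epochs=50,
                           update_words=False, update_topics=False)

doc_topic_dists = data['doc_topic_dists']  # assumed key from prepare_topics
loss_per_epoch = msgs['loss_per_epoch']    # one entry per epoch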