def prepare_topics():
    """Build a tiny synthetic fixture via ``topics.prepare_topics``.

    One document with two (deliberately unnormalized) topic weights,
    two topic factor vectors, and a three-word vocabulary, all embedded
    in a shared 4-dimensional space.

    Returns
    -------
    The dict produced by ``topics.prepare_topics``.
    """
    # Single document over two topics; weights are left unnormalized on purpose.
    doc_topic_weights = np.array([[0.5, -0.1]])
    # One row per topic in the 4-dimensional embedding space.
    topic_factors = np.array([[0.1, 0.1, 0.1, 5.0],
                              [5.1, 0.1, 0.1, 0.0]])
    # Word vectors for the three vocabulary items, same 4-dimensional space.
    word_vectors = np.array([[5.0, 0.1, 0.1, 0.1],
                             [0.0, 0.1, 0.1, 5.0],
                             [2.0, 0.1, 0.1, -.9]])
    vocabulary = ['a', 'b', 'c']
    return topics.prepare_topics(doc_topic_weights, topic_factors,
                                 word_vectors, vocabulary)
# --- lda2vec training driver (fragment; the batch loop continues past this view) ---
# Optionally seed the word sampler with pretrained word vectors.
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
# Adam optimizer with gradient-norm clipping at 5.0.
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0  # global batch counter, incremented in the (unseen) batch-loop body
epoch = 0
# Fraction of the corpus covered by one minibatch (likely a loss scale factor).
fraction = batchsize * 1.0 / flattened.shape[0]
# Persistent per-epoch progress log.
progress = shelve.open('progress.shelve')
no_of_epoch = 10  # NOTE(review): unused — the loop below hardcodes range(10); confirm which is intended
for epoch in range(10):
    # Snapshot model weights to CPU and derive per-topic top words.
    data = topics.prepare_topics(
        cuda.to_cpu(model.mixture.weights.W.data).copy(),
        cuda.to_cpu(model.mixture.factors.W.data).copy(),
        cuda.to_cpu(model.sampler.W.data).copy(),
        words)
    top_words = topics.print_top_words_per_topic(data)
    # Periodically (every 100 batches, after warm-up) log topic coherence.
    if j % 100 == 0 and j > 100:
        coherence = topics.topic_coherence(top_words)
        # NOTE(review): this inner loop reuses `j`, clobbering the outer
        # batch counter — probably intended to be a different variable; verify.
        for j in range(n_topics):
            print(j, coherence[(j, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    # Attach corpus statistics and dump the snapshot for pyLDAvis.
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    # Iterate minibatches of (doc ids, flattened word ids).
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        # optimizer.zero_grads()  # pre-Chainer-v2 API; cleargrads() below replaces it
        model.cleargrads()
# --- plain-LDA training driver (fragment; loop body continues past this view) ---
model = LDA(n_docs, n_topics, n_units, n_vocab)
# Resume from a previous checkpoint if one exists.
if os.path.exists('lda.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0  # batch counter (presumably incremented later in the unseen loop body)
# Fraction of the corpus covered by one minibatch.
fraction = batchsize * 1.0 / bow.shape[0]
# Effectively "train forever" — relies on external interruption / checkpointing.
for epoch in range(50000000):
    # Every 100 epochs, pull weights to CPU and print the top words per topic.
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = topics.prepare_topics(p, f, w, words)
        topics.print_top_words_per_topic(d)
    # Minibatch loop over (row indices, bag-of-words rows).
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        # optimizer.zero_grads()  # pre-Chainer-v2 API; cleargrads() below replaces it
        model.cleargrads()
        # rec = reconstruction loss, ld = prior/regularization term — TODO confirm
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        # Status-line template; formatted/printed later in the unseen remainder.
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        # Move loss variables to CPU so their .data can be read for logging.
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()