def prepare_topics():
    """Build a tiny synthetic topic model and run topics.prepare_topics on it.

    The fixture is one document with unnormalized weights over two topics,
    two topic factors and three word vectors in a 4-dimensional space, and
    a three-word vocabulary.
    """
    # Single document's (unnormalized) weights over the two topics.
    doc_weights = np.array([[0.5, -0.1]])
    # Topic factor matrix: 2 topics x 4 dimensions.
    topic_factors = np.array([[0.1, 0.1, 0.1, 5.0],
                              [5.1, 0.1, 0.1, 0.0]])
    # Word vector matrix: 3 words x 4 dimensions.
    word_vectors = np.array([[5.0, 0.1, 0.1, 0.1],
                             [0.0, 0.1, 0.1, 5.0],
                             [2.0, 0.1, 0.1, -0.9]])
    return topics.prepare_topics(doc_weights, topic_factors, word_vectors,
                                 ['a', 'b', 'c'])
Example #2
0
def prepare_topics():
    """Return topics.prepare_topics output for a small synthetic example.

    Uses one document (unnormalized weights over two topics), two topic
    factors and three word embeddings in R^4, and the vocabulary
    ['a', 'b', 'c'].
    """
    # Unnormalized topic weights for the single document.
    w = np.array([[0.5, -0.1]])
    # Each of the two topics is a point in a 4-dimensional space.
    f = np.array([[0.1, 0.1, 0.1, 5.0],
                  [5.1, 0.1, 0.1, 0.0]])
    # Word embeddings for the three vocabulary items, same 4-dim space.
    v = np.array([[5.0, 0.1, 0.1, 0.1],
                  [0.0, 0.1, 0.1, 5.0],
                  [2.0, 0.1, 0.1, -0.9]])
    words = ['a', 'b', 'c']
    return topics.prepare_topics(w, f, v, words)
Example #3
0
# Optionally seed the word-sampler embedding matrix with pretrained vectors
# (only the first n_vocab rows are copied in).
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
# Move the model to the GPU; optimize with Adam plus gradient clipping at 5.0.
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

# j counts minibatch updates; fraction is the share of the corpus per batch.
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
# Persistent on-disk store for per-epoch progress snapshots.
progress = shelve.open('progress.shelve')
# NOTE(review): no_of_epoch is assigned but never used — the loop below
# hard-codes range(10).
no_of_epoch = 10
for epoch in range(10):
    # Pull the current parameters back to the CPU and build the topic-data
    # payload; .copy() avoids aliasing the live parameter arrays.
    data = topics.prepare_topics(
        cuda.to_cpu(model.mixture.weights.W.data).copy(),
        cuda.to_cpu(model.mixture.factors.W.data).copy(),
        cuda.to_cpu(model.sampler.W.data).copy(), words)
    top_words = topics.print_top_words_per_topic(data)
    # Periodically compute topic coherence and checkpoint it to the shelf.
    if j % 100 == 0 and j > 100:
        coherence = topics.topic_coherence(top_words)
        # NOTE(review): this inner loop reuses `j`, clobbering the outer
        # minibatch counter above — looks like a bug; a distinct loop
        # index was presumably intended.
        for j in range(n_topics):
            print(j, coherence[(j, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    # Attach corpus statistics and persist the visualization payload.
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    # Iterate over minibatches of (document ids, flattened word ids).
    # (Loop body continues beyond this excerpt.)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        # optimizer.zero_grads()
        model.cleargrads()
Example #4
0
# Build the LDA model, resuming from a saved checkpoint if one exists.
model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
# Move to the GPU and optimize with Adam.
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

# j counts minibatch updates; fraction is the share of the corpus per batch.
j = 0
fraction = batchsize * 1.0 / bow.shape[0]
# Effectively an unbounded training loop (50M epochs).
for epoch in range(50000000):
    # Every 100 epochs, copy parameters to the CPU and print the current
    # top words per topic; .copy() avoids aliasing the live arrays.
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = topics.prepare_topics(p, f, w, words)
        topics.print_top_words_per_topic(d)
    # One pass over the bag-of-words matrix in minibatches of row ids.
    # (Loop body continues beyond this excerpt.)
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        # optimizer.zero_grads()
        model.cleargrads()
        # Total loss = reconstruction term + prior/regularization term.
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        # Pull loss components back to the CPU for logging.
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()