Example #1
    def create_topics(self, do_print, epoch):
        """
        For one epoch, this function prepares topics from the given corpus word list and
        extracts the top words for each topic.
        It stores intermediate results in a pyLDAvis file and the model parameters in an HDF5 file.

        <<<< This is the LDA part >>>>

        :param do_print: print top words in an epoch
        :param epoch: index of an epoch
        :return:
        """
        j=0
        # prepare the topic_term_distributions, document_topic_distributions and term_frequencies using softmax
        data = prepare_topics(weights=cuda.to_cpu(self.model.mixture.weights.W.data).copy(),
                              topic_vectors=cuda.to_cpu(self.model.mixture.factors.W.data).copy(),
                              word_vectors=cuda.to_cpu(self.model.sampler.W.data).copy(),
                              vocab=self.words, doprint=False)

        #top_words = print_top_words_per_topic(data, do_print=do_print)
        #if j % 100 == 0 and j > 100 and do_print:
        #    coherence = topic_coherence(top_words)
        #    for j in range(self.n_topics):
        #        print j, coherence[(j, 'cv')]
        data['doc_lengths'] = self.doc_lengths
        data['term_frequency'] = self.term_frequency
        np.savez('topics_' + self.modelid + '.pyldavis', **data)
        for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
            self.update_per_chunk(d, epoch, f)
            j+=1
        # save the model parameters to a file in HDF5 format
        serializers.save_hdf5("lda2vec_" + self.modelid + ".hdf5", self.model)
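For reference, below is a minimal sketch of what a prepare_topics-style helper computes from the mixture weights, topic factors, and word vectors. The key names follow the pyLDAvis convention used above; this is an assumed simplification, not the library's actual implementation.

import numpy as np

def softmax(x, axis=-1):
    # numerically stable softmax along the given axis
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def prepare_topics_sketch(weights, topic_vectors, word_vectors, vocab):
    # document-topic proportions: softmax over each document's topic weights
    doc_topic_dists = softmax(weights)
    # topic-word distributions: softmax over topic/word vector similarities
    topic_term_dists = softmax(topic_vectors.dot(word_vectors.T))
    return dict(doc_topic_dists=doc_topic_dists,
                topic_term_dists=topic_term_dists,
                vocab=vocab)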
Example #2
    def _train(self):
        """
        Train the stacked denoising autoencoders.
        """
        if 'fold' in self.hyperparameters:
            current_fold = self.hyperparameters['fold'] + 1
        else:
            current_fold = 0
        term_freq = self.abstracts_preprocessor.get_term_frequency_sparse_matrix().todense()
        self.get_cnn()
        if self._verbose:
            print("CNN is constructed...")
        error = numpy.inf
        iterations = 0
        batchsize = 2048
        for epoch in range(1, 1 + self.n_iter):
            self.document_distribution = self.predict_sdae(term_freq)
            t0 = time.time()
            self.user_vecs = self.als_step(self.user_vecs, self.item_vecs, self.train_data, self._lambda, type='user')
            self.item_vecs = self.als_step(self.item_vecs, self.user_vecs, self.train_data, self._lambda, type='item')
            t1 = time.time()
            iterations += 1
            if self._verbose:
                error = self.evaluator.get_rmse(self.user_vecs.dot(self.item_vecs.T), self.train_data)
                if current_fold == 0:
                    logs = dict(it=iterations, epoch=epoch, loss=error, time=(t1 - t0))
                    print('Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.4e} Time:{time:.3f}s'.format(**logs))
                else:
                    logs = dict(fold=current_fold, it=iterations, epoch=epoch, loss=error, time=(t1 - t0))
                    print('Fold:{fold:02d} Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.4e} '
                          'Time:{time:.3f}s'.format(**logs))

            for inp_batch, item_batch in chunks(batchsize, term_freq, self.item_vecs):
                t0 = time.time()
                loss = self.train_sdae(inp_batch, item_batch)
                t1 = time.time()
                iterations += 1
                if self._verbose:
                    if current_fold == 0:
                        msg = ('Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.3e} Time:{tim:.3f}s')
                        logs = dict(loss=float(loss), epoch=epoch, it=iterations, tim=(t1 - t0))
                        print(msg.format(**logs))
                    else:
                        msg = ('Fold:{fold:02d} Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.3e} Time:{tim:.3f}s')
                        logs = dict(fold=current_fold, loss=float(loss), epoch=epoch, it=iterations, tim=(t1 - t0))
                        print(msg.format(**logs))
            error = self.evaluator.get_rmse(self.user_vecs.dot(self.item_vecs.T), self.train_data)

        self.document_distribution = self.predict_sdae(term_freq)
        rms = self.evaluate_sdae(term_freq, self.item_vecs)

        if self._verbose:
            print(rms)
        # Garbage collection for keras
        backend.clear_session()
        if self._verbose:
            print("SDAE trained...")
        return rms
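The two als_step calls above alternate between refitting the user and item factors while the other side is held fixed. Below is a minimal sketch of the regularized least-squares update such a step typically performs; the form is an assumption (it ignores any masking of unobserved ratings the real method may apply).

import numpy as np

def als_step_sketch(latent, fixed, ratings, _lambda, type='user'):
    # Closed-form ridge-regression update for one side of the factorization:
    #   latent = ratings . fixed . (fixed^T fixed + lambda * I)^-1
    # `latent` is replaced wholesale, so only its shape matters here.
    if type == 'item':
        ratings = ratings.T                           # items become the rows being solved for
    k = fixed.shape[1]
    A = fixed.T.dot(fixed) + _lambda * np.eye(k)      # (k, k) normal matrix
    b = fixed.T.dot(ratings.T)                        # (k, n_rows_of_latent)
    return np.linalg.solve(A, b).T                    # (n_rows_of_latent, k)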
Example #3
    def _train(self):
        """
        Train the LDA2Vec model, and store the document_distribution matrix.
        """
        n_units = self.abstracts_preprocessor.get_num_units()
        # Two parallel lists of ('doc_id', 'word_id') pairs covering every word
        # in each document; 'word_id' indexes the computed dictionary 'vocab'
        doc_ids, flattened = zip(*self.abstracts_preprocessor.get_article_to_words())
        assert len(doc_ids) == len(flattened)
        flattened = numpy.array(flattened, dtype='int32')
        doc_ids = numpy.array(doc_ids, dtype='int32')

        # Word frequencies, for lda2vec_model
        n_vocab = self.abstracts_preprocessor.get_num_vocab()
        term_frequency = self.abstracts_preprocessor.get_term_frequencies()

        # Assuming that doc_ids are in the set {0, 1, ..., n - 1}
        assert doc_ids.max() + 1 == self.n_items
        # Initialize lda2vec model
        lda2v_model = LDA2Vec(n_documents=self.n_items, n_document_topics=self.n_factors,
                              n_units=n_units, n_vocab=n_vocab, counts=term_frequency)
        if self._verbose:
            print("Initialize LDA2Vec model..., Training LDA2Vec...")

        # Initialize optimizers
        optimizer = optimizers.Adam()
        optimizer.setup(lda2v_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)

        if self._verbose:
            print("Optimizer Initialized...")
        batchsize = 2048
        iterations = 0
        for epoch in range(1, self.n_iter + 1):
            for d, f in chunks(batchsize, doc_ids, flattened):
                t0 = time.time()
                if len(d) <= 10:
                    continue
                optimizer.zero_grads()
                l = lda2v_model.fit_partial(d.copy(), f.copy())
                prior = lda2v_model.prior()
                loss = prior
                loss.backward()
                optimizer.update()
                iterations += 1
                t1 = time.time()
                if self._verbose:
                    msg = "Iteration:{it:05d} Epoch:{epoch:02d} Loss:{loss:1.3e} Prior:{prior:1.3e} Time:{tim:.3f}s"
                    logs = dict(loss=float(l), epoch=epoch, it=iterations, prior=float(prior.data), tim=(t1 - t0))
                    print(msg.format(**logs))

        # Get document distribution matrix.
        self.document_distribution = lda2v_model.mixture.proportions(numpy.unique(doc_ids), True).data
        if self._verbose:
            print("LDA2Vec trained...")
Example #4
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = NVDM(n_vocab, n_units)
if os.path.exists('nvdm.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nvdm.hdf5", model)
# model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(500):
    for (batch, ) in utils.chunks(batchsize, bow):
        t0 = time.time()
        rec, kl = model.observe(batch)
        optimizer.zero_grads()
        l = rec + kl
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{kl:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        kl.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(rec=float(rec.data), kl=float(kl.data),
                    epoch=epoch, j=j, rate=rate)
        print(msg.format(**logs))
        j += 1
Example #5
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        print_top_words_per_topic(d)
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(rec=float(rec.data), ld=float(ld.data),
                    epoch=epoch, j=j, rate=rate)
        six.print_(msg.format(**logs))
        j += 1
Example #6
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    coherence = topic_coherence(top_words)
    for j in range(n_topics):
        print j, coherence[(j, 'cv')]
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print(msg.format(**logs))
        j += 1
Example #7
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for j in range(n_topics):
            six.print_(j, coherence[(j, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        six.print_(msg.format(**logs))
        j += 1
Example #8
    def train(self,
              pivot_words,
              target_words,
              doc_ids,
              data_size,
              num_epochs,
              switch_loss_epoch=0,
              save_every=1,
              report_every=1,
              print_topics_every=5,
              idx_to_word=None):
        """Train the Lda2vec Model. pivot_words, target_words, and doc_ids should be
        the same size.
        
        Args:
            pivot_words (np.array): Array of word idxs corresponding to pivot words
            target_words (np.array): Array of word idxs corresponding to target words
            doc_ids (np.array): Document IDs linking word idxs to their docs
            data_size (int): Length of pivot_words array
            num_epochs (int): Number of epochs to train model
            switch_loss_epoch (int, optional): Epoch at which to switch on the LDA loss;
                                               the LDA loss is not learned until this epoch
            save_every (int, optional): Save the model every "save_every" epochs
            report_every (int, optional): Report model metrics every "report_every" epochs
            print_topics_every (int, optional): Print the top 10 words in each topic every "print_topics_every" epochs
            idx_to_word (dict, optional): Index-to-word mapping; required to see word-topic membership
        """
        # Calculate fraction used in DL Loss calculation
        temp_fraction = self.batch_size * 1.0 / data_size
        # Assign the fraction placeholder variable with the value we calculated
        self.sesh.run(tf.assign(self.fraction, temp_fraction))

        # Calculate the number of iterations per epoch so we can figure out when to switch the loss
        iters_per_epoch = int(np.ceil(data_size / self.batch_size))  # round up for a partial final batch
        # Calculate what step we would be on @ the switch loss epoch
        switch_loss_step = iters_per_epoch * switch_loss_epoch
        # Assign the switch loss variable with the step we just calculated
        self.sesh.run(tf.assign(self.switch_loss, switch_loss_step))

        if self.save_graph_def:
            # Initialize a tensorflow Saver object
            saver = tf.train.Saver()
            # Initialize a tensorflow summary writer so we can save logs
            writer = tf.summary.FileWriter(self.logdir + '/',
                                           graph=self.sesh.graph)

        # Iterate over the number of epochs we want to train for
        for e in range(num_epochs):
            print('\nEPOCH:', e + 1)
            # Get a batch worth of data
            for p, t, d in utils.chunks(self.batch_size, pivot_words,
                                        target_words, doc_ids):

                # Create the feed dict from the batched data
                feed_dict = {self.x: p, self.y: t, self.docs: d}

                # Values we want to fetch whenever we run the model
                fetches = [
                    self.merged, self.optimizer, self.loss, self.loss_word2vec,
                    self.loss_lda, self.step
                ]

                # Run a step of the model
                summary, _, l, lw2v, llda, step = self.sesh.run(
                    fetches, feed_dict=feed_dict)

            # Prints log every "report_every" epoch
            if (e + 1) % report_every == 0:
                print('LOSS', l, 'w2v', lw2v, 'lda', llda)

            # Saves model every "save_every" epoch
            if (e + 1) % save_every == 0 and self.save_graph_def:
                writer.add_summary(summary, step)
                writer.flush()
                writer.close()
                save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')
                writer = tf.summary.FileWriter(self.logdir + '/',
                                               graph=self.sesh.graph)

            # Prints out membership of words in each topic every "print_topics_every" epoch
            if e > 0 and (e + 1) % print_topics_every == 0:
                idxs = np.arange(self.num_topics)
                words, sims = self.get_k_closest(idxs,
                                                 in_type='topic',
                                                 idx_to_word=idx_to_word,
                                                 k=10,
                                                 verbose=True)

        # Save after all epochs are finished, but only if we didn't just save
        if self.save_graph_def and (e + 1) % save_every != 0:
            writer.add_summary(summary, step)
            writer.flush()
            writer.close()
            save_path = saver.save(self.sesh, self.logdir + '/model.ckpt')
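The switch_loss bookkeeping above only converts an epoch index into a global step count; a small worked example with hypothetical sizes:

import numpy as np

data_size, batch_size, switch_loss_epoch = 100000, 2048, 5    # hypothetical values
# round up so a partial final batch still counts as one iteration
iters_per_epoch = int(np.ceil(data_size / batch_size))        # 49
switch_loss_step = iters_per_epoch * switch_loss_epoch        # 245
# the LDA part of the loss is not applied until the global step reaches switch_loss_step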
Example #9
    def infer(self, docs=None, epochs=200, update_words=False, update_topics=False, topic_vectors=None):
        """Infer the features of new documents passed in by running the
        Lda2vec algorithm again, updating only the topic distributions."""

        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split() if word in self.word2vec_model.vocab)))

        logging.info("preprocessing")
        
        self.preprocess(docs)
        
        logging.info('preprocessed!')
        
        self.infer_model = LDA2Vec(n_documents=self.n_docs,\
                        n_document_topics=self.n_topics,\
                        n_units=300,\
                        n_vocab=self.n_vocab,\
                        counts=self.term_frequency,\
                        n_samples=15,\
                        power=self.power,\
                        temperature=self.temp)
        
        
        if self.words_pretrained:
            self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]

        self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
        if topic_vectors is not None:
            assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, "topic vectors shape doesn't match"
            self.infer_model.mixture.factors.W.data = topic_vectors


        optimizer = O.Adam()
        optimizer.setup(self.infer_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)

        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch : ",epoch
            data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                for j in range(self.n_topics):
                    print j, coherence[(j, 'cv')]
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                #progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            #np.savez('topics.pyldavis', **data)
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.infer_model.fit_partial(d.copy(), f.copy(), update_words=update_words, update_topics=update_topics)
                prior = self.infer_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt

                msgs["E"].append(epoch)
                msgs["L"].append(float(l))

                j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j, prior=float(prior.data), rate=rate)
            print msg.format(**logs)
            print "\n ================================= \n"
            #serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
Example #10
 ts = prepare_topics(cuda.to_cpu(model.mixture_sty.weights.W.data).copy(),
                     cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
                     cuda.to_cpu(model.sampler.W.data).copy(),
                     words)
 print_top_words_per_topic(ts)
 ts['doc_lengths'] = sty_len
 ts['term_frequency'] = term_frequency
 np.savez('topics.story.pyldavis', **ts)
 ta = prepare_topics(cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
                     cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
                     cuda.to_cpu(model.sampler.W.data).copy(),
                     words)
 print_top_words_per_topic(ta)
 ta['doc_lengths'] = aut_len
 ta['term_frequency'] = term_frequency
 np.savez('topics.author.pyldavis', **ta)
 for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
     t0 = time.time()
     optimizer.zero_grads()
     l = model.fit_partial(s.copy(), a.copy(), f.copy())
     prior = model.prior()
     loss = prior * fraction
     loss.backward()
     optimizer.update()
     msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
            "P:{prior:1.3e} R:{rate:1.3e}")
     prior.to_cpu()
     loss.to_cpu()
     t1 = time.time()
     dt = t1 - t0
     rate = batchsize / dt
     logs = dict(loss=float(l), epoch=epoch, j=j,
                 prior=float(prior.data), rate=rate)
     print(msg.format(**logs))
     j += 1
Example #11
 ts = prepare_topics(
     cuda.to_cpu(model.mixture_sty.weights.W.data).copy(),
     cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
     cuda.to_cpu(model.sampler.W.data).copy(), words)
 print_top_words_per_topic(ts)
 ts['doc_lengths'] = sty_len
 ts['term_frequency'] = term_frequency
 np.savez('topics.story.pyldavis', **ts)
 ta = prepare_topics(
     cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
     cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
     cuda.to_cpu(model.sampler.W.data).copy(), words)
 print_top_words_per_topic(ta)
 ta['doc_lengths'] = aut_len
 ta['term_frequency'] = term_frequency
 np.savez('topics.author.pyldavis', **ta)
 for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
     t0 = time.time()
     optimizer.zero_grads()
     l = model.fit_partial(s.copy(), a.copy(), f.copy())
     prior = model.prior()
     loss = prior * fraction
     loss.backward()
     optimizer.update()
     msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
            "P:{prior:1.3e} R:{rate:1.3e}")
     prior.to_cpu()
     loss.to_cpu()
     t1 = time.time()
     dt = t1 - t0
     rate = batchsize / dt
     logs = dict(loss=float(l), epoch=epoch, j=j,
                 prior=float(prior.data), rate=rate)
     print(msg.format(**logs))
     j += 1
Example #12
    def train(
            self,
            doc_ids,
            flattened,
            max_epochs=np.inf,
            verbose=False,
            loss_switch_epochs=0,  # num epochs until LDA loss switched on
            save=False,
            save_every=1000,
            outdir="./out",
            summarize=True,
            summarize_every=1000,
            metadata="metadata.tsv",
            metadata_docs="metadata.docs.tsv"):

        if save:
            try:
                os.mkdir(outdir)
            except (FileExistsError):
                pass
            saver = tf.train.Saver(tf.global_variables())
            outdir = os.path.abspath(self.log_dir)

        if summarize:
            try:
                self.logger.flush()
            except (AttributeError):  # not yet logging
                self.logger = tf.summary.FileWriter(self.log_dir,
                                                    self.sesh.graph)
            merged = self._addSummaries(metadata, metadata_docs)

        j = 0
        epoch = 0

        fraction = self.batch_size / len(flattened)  # == batch / n_corpus
        self.sesh.run(tf.assign(self.fraction, fraction))

        # turn on LDA loss after n iters of training
        iters_per_epoch = int(np.ceil(len(flattened) / self.batch_size))  # round up for a partial final batch
        n = iters_per_epoch * loss_switch_epochs
        self.sesh.run(tf.assign(self.switch_loss, n))

        now = datetime.now().isoformat()[11:]
        print("------- Training begin: {} -------\n".format(now))

        while epoch < max_epochs:
            try:

                # doc_ids, word_idxs
                for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                    t0 = datetime.now().timestamp()

                    feed_dict = self.make_feed_dict(d, f)

                    # if len(feed_dict[self.pivot_idxs]) == 0:
                    # 	print("Empty batch. Skipping...")
                    # 	continue

                    fetches = [
                        self.loss_lda, self.loss_word2vec, self.loss,
                        self.train_op
                    ]
                    loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                        fetches, feed_dict=feed_dict)

                    j += 1

                    if verbose and j % 1000 == 0:
                        msg = (
                            "J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                            "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")

                        t1 = datetime.now().timestamp()
                        dt = t1 - t0
                        rate = self.batch_size / dt
                        logs = dict(l_word2vec=loss_word2vec,
                                    epoch=epoch,
                                    j=j,
                                    l_lda=loss_lda,
                                    rate=rate)

                        print(msg.format(**logs))

                    if save and j % save_every == 0:
                        outfile = os.path.join(
                            outdir, "{}_lda2vec".format(self.datetime))
                        saver.save(self.sesh, outfile, global_step=self.step)

                    if summarize and j % summarize_every == 0:
                        summary = self.sesh.run(merged, feed_dict=feed_dict)
                        self.logger.add_summary(summary, global_step=self.step)

                epoch += 1

            except (KeyboardInterrupt):
                break

        print("epoch", epoch)
        print("max", max_epochs)
        now = datetime.now().isoformat()[11:]
        print("------- Training end: {} -------\n".format(now))

        if save:
            outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
            saver.save(self.sesh, outfile, global_step=self.step)

        try:
            self.logger.flush()
            self.logger.close()
        except (AttributeError):  # not logging
            pass

        sys.exit(0)
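The fraction assigned near the top of train is presumably used to rescale the per-batch LDA/Dirichlet prior (as in the Chainer examples above, where loss = prior * fraction), so that summed over one pass through the corpus the prior is counted roughly once. A minimal illustration with hypothetical sizes:

batch_size, corpus_size = 2048, 1000000        # hypothetical sizes
fraction = batch_size / corpus_size            # weight applied to the prior on every batch
batches_per_epoch = corpus_size / batch_size
# over a full epoch the prior contributes about fraction * batches_per_epoch == 1.0 times
assert abs(fraction * batches_per_epoch - 1.0) < 1e-9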
Example #13
    def train(
            self,
            doc_ids,
            flattened,
            vocab,
            words,
            max_epochs=np.inf,
            verbose=False,  # added vocab & words to allow saving (npz) during training
            loss_switch_epochs=0,  # num epochs until LDA loss switched on
            save=False,
            save_every=1000,
            outdir="./out",
            summarize=True,
            summarize_every=1000,
            metadata="metadata.tsv",
            metadata_docs="metadata.docs.tsv"):

        n_vocab = flattened.max() + 1
        # How many tokens are in each document
        doc_idx, lengths = np.unique(doc_ids, return_counts=True)
        doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
        doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(flattened, return_counts=True)
        term_frequency = np.zeros(n_vocab, dtype='int32')
        term_frequency[tok_idx] = freq

        if save:
            try:
                os.mkdir(outdir)
            except OSError as e:  # Python 2 has no FileExistsError
                if e.errno != errno.EEXIST:
                    raise
            saver = tf.train.Saver(tf.global_variables())
            outdir = os.path.abspath(self.log_dir)

        if summarize:
            try:
                self.logger.flush()
            except (AttributeError):  # not yet logging
                self.logger = tf.summary.FileWriter(self.log_dir,
                                                    self.sesh.graph)
            merged = self._addSummaries(metadata, metadata_docs)

        j = 0
        epoch = 0

        fraction = self.batch_size / len(flattened)  # == batch / n_corpus
        self.sesh.run(tf.assign(self.fraction, fraction))
        progress = shelve.open('progress.shelve')

        # turn on LDA loss after n iters of training
        iters_per_epoch = int(np.ceil(len(flattened) / self.batch_size))  # round up for a partial final batch
        n = iters_per_epoch * loss_switch_epochs
        self.sesh.run(tf.assign(self.switch_loss, n))

        now = datetime.now().isoformat()[11:]
        print("------- Training begin: {} -------\n".format(now))

        while epoch < max_epochs:
            try:

                # doc_ids, word_idxs
                for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                    t0 = datetime.now().timestamp()

                    feed_dict = self.make_feed_dict(d, f)

                    # if len(feed_dict[self.pivot_idxs]) == 0:
                    # 	print("Empty batch. Skipping...")
                    # 	continue

                    fetches = [
                        self.loss_lda, self.loss_word2vec, self.loss,
                        self.train_op
                    ]
                    loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                        fetches, feed_dict=feed_dict)

                    if j > 5:
                        print(loss_lda, loss_word2vec, loss)  #py2

                    j += 1

                    if verbose and j % 1000 == 0:
                        msg = (
                            "J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                            "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")

                        t1 = datetime.now().timestamp()
                        dt = t1 - t0
                        rate = self.batch_size / dt
                        logs = dict(l_word2vec=loss_word2vec,
                                    epoch=epoch,
                                    j=j,
                                    l_lda=loss_lda,
                                    rate=rate)

                        print(msg.format(**logs))

                    if save and j % save_every == 0:
                        outfile = os.path.join(
                            outdir, "{}_lda2vec".format(self.datetime))
                        saver.save(self.sesh, outfile, global_step=self.step)

                    if summarize and j % summarize_every == 0:
                        summary = self.sesh.run(merged, feed_dict=feed_dict)
                        self.logger.add_summary(summary, global_step=self.step)

                    #if j % 100 == 0 and j > 100 and epoch > 1:
                    #    coherence = topic_coherence(top_words)
                    #    for j in range(n_topics):
                    #        print(j, coherence[(j, 'cv')])
                    #    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                    #    progress[str(epoch)] = pickle.dumps(kw)

                epoch += 1
                a = self.mixture.W.eval(session=self.sesh)
                b = self.mixture.factors.eval(session=self.sesh)
                c = self.sampler.W.eval(session=self.sesh)
                data = prepare_topics(a, b, c, words)
                print("------- epoch: {}-------\n".format(epoch))
                top_words = print_top_words_per_topic(data)
                data['doc_lengths'] = doc_lengths
                data['term_frequency'] = term_frequency
                np.savez('topics.pyldavis', **data)

            except (KeyboardInterrupt):
                break

        print("epoch", epoch)
        print("max", max_epochs)
        now = datetime.now().isoformat()[11:]
        print("------- Training end: {} -------\n".format(now))

        if save:
            outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
            saver.save(self.sesh, outfile, global_step=self.step)

        try:
            self.logger.flush()
            self.logger.close()
        except (AttributeError):  # not logging
            pass
Example #14
    print "Reloading from saved"
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        print_top_words_per_topic(d)
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(rec=float(rec.data), epoch=epoch, j=j,
                    ld=float(ld.data), rate=rate)
        print(msg.format(**logs))
        j += 1
Example #15
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = NVDM(n_vocab, n_units)
if os.path.exists('nvdm.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nvdm.hdf5", model)
# model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(500):
    for (batch,) in utils.chunks(batchsize, bow):
        t0 = time.time()
        rec, kl = model.observe(batch)
        optimizer.zero_grads()
        l = rec + kl
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{kl:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        kl.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(rec=float(rec.data), epoch=epoch, j=j,
                    kl=float(kl.data), rate=rate)
        print(msg.format(**logs))
        j += 1