Example No. 1
 def evaluate_on_split(self,
                       sess,
                       generated_captions,
                       summary_writer,
                       epoch,
                       tags,
                       split='train'):
     caps = self.data.captions[split]
     ids = self.data.video_ids[split]
     unique_ids = list(set(ids))
     num_iter = int(ceil(len(unique_ids) / float(self.batch_size)))
     while len(unique_ids) < num_iter * self.batch_size:
         unique_ids += unique_ids
     unique_ids = unique_ids[:num_iter * self.batch_size]
     all_gen_cap = np.ndarray((len(unique_ids), self.max_words),
                              dtype=np.int)
     for i in range(num_iter):
         features_batch = [
             self.data.feature(vid)
             for vid in unique_ids[i * self.batch_size:(i + 1) *
                                   self.batch_size]
         ]
         # if len(features_batch) < self.batch_size:
         #     l = len(features_batch)
         #     features_batch += [self.data.feature(vid) for vid in unique_ids[:self.batch_size - l]]
         features_batch = np.asarray(features_batch)
         feed_dict = {self.features: features_batch}
         gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
         all_gen_cap[i * self.batch_size:(i + 1) *
                     self.batch_size] = gen_cap
     all_decoded = decode_captions(all_gen_cap, self.data.vocab.idx2word)
     # create cand dict
     cand = {}
     for vid, sentence in zip(unique_ids, all_decoded):
         cand[vid] = [sentence]
     # create ref dict
     ref = {}
     for vid in unique_ids:
         ref[vid] = decode_captions(caps[ids == vid][:, 1:],
                                    self.data.vocab.idx2word)
     with open('result/cand_%s_%d.txt' % (split, epoch), 'w') as file:
         file.write(str(cand))
     with open('result/ref_%s_%d.txt' % (split, epoch), 'w') as file:
         file.write(str(ref))
     # evaluate
     scores = evaluate(ref=ref, cand=cand, get_scores=True)
     for tag in tags:
         summary = tf.Summary()
         summary.value.add(tag=split + tag, simple_value=scores[tag])
         summary_writer.add_summary(summary, epoch)
     return scores
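All of these examples lean on a decode_captions helper that turns arrays of word indices back into sentences, but its implementation never appears here. The sketch below is only an assumption inferred from how it is called (2-D index arrays plus an idx-to-word mapping); the dict-style vocabulary and the <NULL>/<START>/<END> token names are guesses, not taken from the source.

import numpy as np

def decode_captions(captions, idx_to_word):
    # Sketch only: convert an array of word indices into sentences.
    # Assumes idx_to_word behaves like a dict and that <NULL>/<START>/<END>
    # special tokens exist; both points are assumptions.
    if captions.ndim == 1:
        captions = captions[np.newaxis, :]
    decoded = []
    for row in captions:
        words = []
        for idx in row:
            word = idx_to_word.get(int(idx), '<UNK>')
            if word == '<END>':
                break
            if word not in ('<NULL>', '<START>'):
                words.append(word)
        decoded.append(' '.join(words))
    return decoded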
Example No. 2
    def val(self, epoch):
        self.atten_model.eval()

        # prepare val data: randomly choose a batch, compute the loss, and generate sentences
        features_batch, image_files, cur_captions = sample_minibatch(
            self.val_data, self.args.batch_size)
        features = Variable(torch.from_numpy(features_batch)).cuda()
        caption_in = cur_captions[:, :16]
        caption_out = cur_captions[:, 1:]
        captions_batch_in = Variable(
            torch.from_numpy(caption_in).type(torch.LongTensor)).cuda()
        captions_batch_out = torch.from_numpy(caption_out)
        mask = torch.ne(captions_batch_out, 0)
        mask = Variable(mask.type(torch.cuda.FloatTensor))
        captions_batch_out = Variable(captions_batch_out.type(
            torch.LongTensor)).cuda()

        loss = self.atten_model(captions_batch_in, captions_batch_out,
                                features, mask)

        alpha_all, betas, sample_caption = self.atten_model.build_sample(
            features)
        decoded = decode_captions(np.squeeze(np.array(sample_caption.data)),
                                  self.idx_to_word)
        alpha_all = np.squeeze(np.array(alpha_all.data))
        betas = np.squeeze(np.array(betas.data))
        cur_decoded = decode_captions(np.stack(cur_captions), self.idx_to_word)
        if epoch % (int(self.args.epochs * 0.1)) == 0:
            file_decoded = {
                image_files[i]:
                (decoded[i], cur_decoded[i], alpha_all[i], betas[i])
                for i in range(self.args.batch_size)
            }
            val_samples_path = os.path.join(
                self.args.val_samples, 'val-' + str(epoch) + '-samples.pkl')
            save_pickle(file_decoded, val_samples_path)
        val_loss = torch.sum(loss) / self.args.batch_size
        # Save the model if the validation loss is the best we've seen so far.
        if not self.best_val_loss or val_loss.data[0] < self.best_val_loss:
            torch.save(self.atten_model.state_dict(), self.args.save)
            self.best_val_loss = val_loss.data[0]
            save_pickle(self.best_val_loss, self.args.loss_log)
            print 'save train model'
        elif epoch != 0 and epoch % 100 == 0:
            self.args.lr /= 2.0

        return self.args.lr, self.best_val_loss, decoded
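val() depends on a sample_minibatch helper that is not shown. A plausible sketch, assuming the split is a dict of numpy arrays keyed 'features', 'file_names', 'captions', and 'image_idxs' (key names borrowed from the dataset layout described in Example No. 6, not confirmed for this helper):

import numpy as np

def sample_minibatch(data, batch_size):
    # Sketch only: draw a random mini-batch of (features, file names, captions).
    # Assumes data['captions'][i] belongs to the image at data['image_idxs'][i].
    idxs = np.random.choice(data['captions'].shape[0], size=batch_size, replace=False)
    captions = data['captions'][idxs]
    image_idxs = data['image_idxs'][idxs]
    features = data['features'][image_idxs]
    image_files = data['file_names'][image_idxs]
    return features, image_files, captions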
Example No. 3
    def test(self,
             save_sampled_captions=True,
             evaluate_score=True,
             generate_demo_sample=False):
        self.atten_model.eval()
        self.atten_model.load_state_dict(torch.load(self.args.save))
        self.atten_model.cuda()

        if save_sampled_captions:
            features = self.test_data['features']
            n_examples = features.shape[0]
            all_sam_cap = np.ndarray((n_examples, 20))
            test_times = int(np.ceil(float(n_examples) / self.args.batch_size))
            for t in range(test_times):
                features_batch = Variable(
                    torch.from_numpy(
                        features[t * self.args.batch_size:(t + 1) *
                                 self.args.batch_size])).cuda()
                _, _, sampled_captions = self.atten_model.build_sample(
                    features_batch)
                all_sam_cap[t * self.args.batch_size:(t + 1) *
                            self.args.batch_size] = np.array(
                                sampled_captions.data)
            decoded = decode_captions(all_sam_cap, self.idx_to_word)
            save_pickle(decoded, self.args.test_samples)
            print 'test all successful'

        if evaluate_score:
            ref = load_pickle('./data/test/test.references.pkl')
            if not save_sampled_captions:
                # fall back to previously saved captions so `decoded` is defined
                decoded = load_pickle(self.args.test_samples)
            try:
                evaluate(ref, decoded)
            except KeyboardInterrupt:
                decoded = load_pickle(self.args.test_samples)
                evaluate(ref, decoded)

        if generate_demo_sample:
            features = self.args.demo_feat
            features_batch = Variable(torch.from_numpy(features)).cuda()
            _, _, sampled_captions = self.atten_model.build_sample(
                features_batch)
            decoded = decode_captions(sampled_captions, self.idx_to_word)
            print decoded
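save_pickle and load_pickle are used in several of these examples but never defined. A minimal sketch of what such helpers typically look like (the project's real utilities may add logging or other details):

import os
import pickle

def load_pickle(path):
    # Sketch only: load a pickled object from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(data, path):
    # Sketch only: pickle an object, creating the target directory if needed.
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)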
Example No. 4
def main(argv):
    assert FLAGS.train_dir is not None, "train_dir is required"
    assert FLAGS.resnet_ckpt is not None, "resnet_ckpt is required"

    # data
    print('loading data...')
    (train_stems_list, train_stem_attrs_list, train_images, train_image2stem,
     train_stem2image) = utils.load_coco_data(config.data_root, 'train')
    (val_stems_list, val_stem_attrs_list, val_images, val_image2stem,
     val_stem2image) = utils.load_coco_data(config.data_root, 'val')

    # handling directories
    train_dir = os.path.join(config.model_root, FLAGS.train_dir)
    if not tf.gfile.IsDirectory(train_dir):
        tf.logging.info("Creating training directory: %s", train_dir)
        tf.gfile.MakeDirs(train_dir)

    log_dir = os.path.join(train_dir, 'log')
    if not tf.gfile.IsDirectory(log_dir):
        tf.logging.info("Creating log directory for training: %s", log_dir)
        tf.gfile.MakeDirs(log_dir)

    checkpoint = None
    if FLAGS.checkpoint is not None:
        checkpoint = os.path.join(config.model_root, FLAGS.checkpoint)
        assert os.path.exists(checkpoint), "checkpoint must exist if given."

    # model
    print('building model.')
    model = HierarchicalModel(config, mode=ModeKeys.TRAIN)
    loss = model.build()

    generator = CaptionGenerator(model,
                                 model.level1_word2ix,
                                 None,
                                 beam_size_1level=3,
                                 beam_size_2level=None,
                                 encourage_1level=0.0,
                                 encourage_2level=None,
                                 level2=False)

    # train_op
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optim_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='level1')
            if config.train_resnet:
                optim_vars += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='resnet')
            # deriv
            level1_grads = tf.gradients(loss, optim_vars)
            grads_and_vars = [(i, j) for i, j in zip(level1_grads, optim_vars)
                              if i is not None]
            grads_and_vars = [(tf.clip_by_value(grad, -0.1, 0.1), var)
                              for grad, var in grads_and_vars]

            # todo: here check the batch-norm moving average/var
            # if config.train_resnet:
            #     optim_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='resnet')
            #     resnet_grads = tf.gradients(model.resnet.features, optim_vars)
            #     resnet_pairs = [(i, j) for i, j in zip(resnet_grads, optim_vars) if i is not None]
            #     grads_and_vars.extend(resnet_pairs)

            batchnorm_updates = tf.get_collection('resnet_update_ops')
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            apply_gradient_op = optimizer.apply_gradients(
                grads_and_vars=grads_and_vars)
            train_op = tf.group(apply_gradient_op, batchnorm_updates_op)

    # summary op
    print('************************')
    tf.summary.scalar('batch_loss', loss)
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    # for grad, var in grads_and_vars:
    #     tf.summary.histogram(var.op.name + '/gradient', grad)
    summary_op = tf.summary.merge_all()

    # stats:
    n_examples = len(train_stems_list)
    n_examples_val = len(val_stems_list)
    n_iters_per_epoch = int(np.ceil(float(n_examples) / config.batch_size))
    n_iters_val = int(np.ceil(float(n_examples_val) / config.batch_size))

    print("The number of epoch: %d" % config.n_epochs)
    print("Data size: %d" % n_examples)
    print("Batch size: %d" % config.batch_size)
    print("Iterations per epoch: %d" % n_iters_per_epoch)

    # tf session
    config_ = tf.ConfigProto(allow_soft_placement=True)
    config_.gpu_options.per_process_gpu_memory_fraction = 0.6
    config_.gpu_options.allow_growth = True
    with tf.Session(config=config_) as sess:
        tf.global_variables_initializer().run()
        summary_writer = tf.summary.FileWriter(log_dir,
                                               graph=tf.get_default_graph())
        saver = tf.train.Saver(max_to_keep=40)

        # pretrained
        if checkpoint is not None:
            print("Start training with checkpoint..")
            saver.restore(sess, checkpoint)

        # dynamic stats
        prev_loss_epo = np.inf
        curr_loss_epo = 0
        best_loss_val = np.inf
        curr_loss_val = 0
        i_global = 0

        start_t = time.time()
        for epo in range(config.n_epochs):
            # stochastic batching
            rand_idxs = list(np.random.permutation(n_examples))

            for it in range(n_iters_per_epoch):
                # next batch
                rand_idx = sorted(rand_idxs[it * config.batch_size:(it + 1) *
                                            config.batch_size])
                stems_batch, mask_batch = utils.list2batch(
                    [train_stems_list[i] for i in rand_idx])
                img_idx = train_stem2image[rand_idx]
                img_batch = utils.crop_image(train_images[img_idx], True)
                # print(decode_captions(captions_batch, model.level1_model.idx_to_word))

                feed_dict = {
                    model.level1_model.captions: stems_batch,
                    model.level1_model.mask: mask_batch,
                    model.level1_model.resnet.images: img_batch,
                    model.level1_model.resnet.is_training: config.train_resnet,
                    model.level1_model.keep_prob: 0.5
                }
                _, l = sess.run([train_op, loss], feed_dict)
                curr_loss_epo += l
                # print 'batch norm beta:', sess.run(test1)[:10]
                # print 'batch norm gamma:', sess.run(test2)[:10]

                # global iteration counts
                i_global += 1

                # write summary for tensorboard visualization
                if it % config.log_freq == 0:
                    summary = sess.run(summary_op, feed_dict)
                    summary_writer.add_summary(summary,
                                               epo * n_iters_per_epoch + it)

                # periodical display
                if it % config.print_freq == 0:
                    print(
                        "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f"
                        % (epo + 1, it + 1, l))
                    ground_truths = stems_batch[0]
                    decoded = utils.decode_captions(
                        ground_truths, model.level1_model.idx_to_word)
                    for j, gt in enumerate(decoded):
                        print("Ground truth %d: %s" % (j + 1, gt))
                        print(ground_truths)

                    predicted = generator.beam_search(sess,
                                                      img_batch[0:1, :, :, :])
                    print("Generated caption: %s\n" % predicted)
                    print('***************')

                # auto save
                if i_global % config.save_freq == 0:
                    saver.save(sess,
                               os.path.join(train_dir,
                                            'model_level1_auto_save'),
                               global_step=i_global)
                    print("model-auto-%s saved." % (i_global))

                # validate
                if i_global % config.valid_freq == 0:
                    curr_loss_val = 0
                    if config.print_bleu:
                        # TODO: some preparation for saving search result.
                        #all_gen_cap = np.ndarray((n_examples_val, 16))
                        pass

                    for it_val in range(n_iters_val):
                        idx_val = np.arange(it_val * config.batch_size,
                                            (it_val + 1) * config.batch_size)
                        stems_batch_val, mask_batch_val = utils.list2batch(
                            [val_stems_list[i] for i in idx_val])
                        img_idx_val = val_stem2image[idx_val]
                        img_batch_val = utils.crop_image(
                            val_images[img_idx_val], False)

                        feed_dict_val = {
                            model.level1_model.captions: stems_batch_val,
                            model.level1_model.mask: mask_batch_val,
                            model.level1_model.resnet.images: img_batch_val,
                            model.level1_model.resnet.is_training: False,
                            model.level1_model.keep_prob: 1.0
                        }
                        curr_loss_val += sess.run(loss, feed_dict_val)

                        if config.print_bleu:
                            # TODO: beam search and evaluate bleu.
                            pass

                    curr_loss_val /= n_iters_val

                    if curr_loss_val < best_loss_val:
                        best_loss_val = curr_loss_val
                        # better model
                        saver.save(sess,
                                   os.path.join(train_dir, 'model_level1_val'),
                                   global_step=i_global)
                        print('model-val-%s saved.' % (i_global))
                    else:
                        # TODO: early stop checking.
                        pass
            # end of per-iteration loop
            curr_loss_epo /= n_iters_per_epoch

            # epoch summary:
            print("Previous epoch loss: ", prev_loss_epo)
            print("Current epoch loss: ", curr_loss_epo)
            print("Elapsed time: ", time.time() - start_t)
            prev_loss_epo = curr_loss_epo
            curr_loss_epo = 0

            # save model's parameters
            saver.save(sess,
                       os.path.join(train_dir, 'model_level1_epo'),
                       global_step=epo + 1)
            print("model-epo-%s saved." % (epo + 1))
Example No. 5
    def train(self):
        # train/val dataset
        train_caps, train_lengths, train_ids = self.data.captions['train'], self.data.lengths['train'], \
                                               self.data.video_ids['train']
        n_examples = len(train_caps)
        n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))

        tags = [
            'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr',
            'ROUGE_L'
        ]
        # build graphs for training model and sampling captions
        with tf.Graph().as_default():
            with tf.device('/cpu:0'):
                with tf.variable_scope(tf.get_variable_scope()) as vscope:
                    tower_loss = []
                    tower_grad = []
                    tower_generated_cap = []
                    # create multi gpu train_op, loss_op and generated_captions_op
                    # create placeholder
                    self.features = tf.placeholder(tf.float32,
                                                   [None, self.L, self.D])
                    self.captions = tf.placeholder(tf.int32,
                                                   [None, self.max_words + 2])
                    # create train_op, loss_op and generated_captions_op
                    for i in range(self.num_gpus):
                        # on each gpu
                        with tf.device('/gpu:%d' % i):
                            with tf.name_scope('tower_%d' % i) as scope:
                                # create batch input for each gpu
                                _feat_batch = self.features[self.batch_size /
                                                            self.num_gpus *
                                                            i:self.batch_size /
                                                            self.num_gpus *
                                                            (i + 1), :, :]
                                _cap_batch = self.captions[self.batch_size /
                                                           self.num_gpus *
                                                           i:self.batch_size /
                                                           self.num_gpus *
                                                           (i + 1), :]
                                # compute loss
                                one_loss = self.model.build_model(
                                    _feat_batch, _cap_batch)
                                tower_loss.append(one_loss)
                                # reuse variables
                                tf.get_variable_scope().reuse_variables()
                                alphas, betas, generated_cap = self.model.build_sampler(
                                    _feat_batch, max_len=self.max_words)
                                tf.get_variable_scope().reuse_variables()
                                tower_generated_cap.append(generated_cap)
                                # compute grad
                                var_list = tf.trainable_variables()
                                grad = tf.gradients(one_loss, var_list)
                                tower_grad.append(grad)
                # multi gpu loss operation: average loss
                loss_op = self.average_loss(tower_loss)
                # caption operation
                generated_caption_op = tf.concat(tower_generated_cap, 0)
                # average grad
                average_grad = self.average_gradients(tower_grad)
                # initialize optimizer
                global_step = tf.Variable(0, trainable=False)
                increase_global_step_op = tf.assign(global_step,
                                                    global_step + 1)
                boundaries = [10]
                values = [self.learning_rate, 0.1 * self.learning_rate]
                piecewise_learning_rate = tf.train.piecewise_constant(
                    global_step, boundaries, values)
                learning_rate = piecewise_learning_rate
                optimizer = self.optimizer(learning_rate=learning_rate,
                                           beta1=0.1,
                                           beta2=0.001)
                # train operation: apply gradients
                train_op = optimizer.apply_gradients(
                    zip(average_grad, tf.trainable_variables()))

                # summary op
                tf.summary.scalar('learning_rate', learning_rate)
                tf.summary.scalar('batch_loss', loss_op)
                for var in tf.trainable_variables():
                    tf.summary.histogram(var.op.name, var)
                for grad, var in zip(average_grad, tf.trainable_variables()):
                    tf.summary.histogram(var.op.name + '/gradient', grad)

                summary_op = tf.summary.merge_all()

                # create session
                sess = tf.Session(config=tf.ConfigProto(
                    allow_soft_placement=True))
                summary_writer = tf.summary.FileWriter(self.log_path,
                                                       sess.graph)
                saver = tf.train.Saver(tf.global_variables())
                # initialized variables
                sess.run([
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                ])
                for epoch in range(self.n_epochs):
                    # shuffle train data
                    rand_idxs = np.random.permutation(n_examples)
                    train_caps = train_caps[rand_idxs]
                    train_ids = train_ids[rand_idxs]
                    train_lengths = train_lengths[rand_idxs]
                    for it in range(n_iters_per_epoch):
                        captions_batch = train_caps[it *
                                                    self.batch_size:(it + 1) *
                                                    self.batch_size]
                        image_idxs_batch = train_ids[it *
                                                     self.batch_size:(it + 1) *
                                                     self.batch_size]
                        if len(captions_batch) < self.batch_size:
                            l = len(captions_batch)
                            captions_batch = np.concatenate(
                                (captions_batch,
                                 train_caps[:self.batch_size - l]),
                                axis=0)
                            image_idxs_batch = np.concatenate(
                                (image_idxs_batch,
                                 train_ids[:self.batch_size - l]),
                                axis=0)
                        features_batch = [
                            self.data.feature(vid) for vid in image_idxs_batch
                        ]
                        feed_dict = {
                            self.features: features_batch,
                            self.captions: captions_batch
                        }
                        _, loss, summary_str = sess.run(
                            (train_op, loss_op, summary_op),
                            feed_dict=feed_dict)
                        # print epoch, it, loss
                        summary_writer.add_summary(
                            summary_str, epoch * n_iters_per_epoch + it)
                        if (it + 1) % self.print_every == 0:
                            print "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f" % (
                                epoch + 1, it + 1, loss)
                            ground_truths = train_caps[train_ids ==
                                                       image_idxs_batch[0]]
                            decoded = decode_captions(ground_truths[:, 1:],
                                                      self.data.vocab.idx2word)
                            for j, gt in enumerate(decoded):
                                print "Ground truth %d: %s" % (
                                    j + 1, gt.encode('utf-8'))
                            gen_caps = sess.run(generated_caption_op,
                                                feed_dict)
                            decoded = decode_captions(gen_caps,
                                                      self.data.vocab.idx2word)
                            print "Generated caption: %s\n" % decoded[0]
                    self.evaluate_on_split(
                        sess=sess,
                        generated_captions=generated_caption_op,
                        summary_writer=summary_writer,
                        epoch=epoch,
                        tags=tags,
                        split='train')
                    scores = self.evaluate_on_split(
                        sess=sess,
                        generated_captions=generated_caption_op,
                        summary_writer=summary_writer,
                        epoch=epoch,
                        tags=tags,
                        split='val')
                    write_bleu(scores=scores,
                               path=self.model_path,
                               epoch=epoch)
                    self.evaluate_on_split(
                        sess=sess,
                        generated_captions=generated_caption_op,
                        summary_writer=summary_writer,
                        epoch=epoch,
                        tags=tags,
                        split='test')
                    # save model
                    saver.save(sess,
                               os.path.join(self.model_path, 'model'),
                               global_step=epoch + 1)
                    print "model-%s saved." % (epoch + 1)
                    # increase global step, which is used to decay learning rate
                    sess.run(increase_global_step_op)
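Example No. 5 merges its per-GPU towers with self.average_loss and self.average_gradients, neither of which is shown. A common TF1-style sketch, assuming every tower's gradient list is aligned with tf.trainable_variables():

import tensorflow as tf

def average_gradients(tower_grads):
    # Sketch only: average each variable's gradient across towers.
    # Gradients that are None on every tower are passed through as None.
    averaged = []
    for grads in zip(*tower_grads):
        valid = [g for g in grads if g is not None]
        averaged.append(tf.add_n(valid) / float(len(valid)) if valid else None)
    return averaged

def average_loss(tower_losses):
    # Sketch only: mean of the per-tower scalar losses.
    return tf.add_n(tower_losses) / float(len(tower_losses))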
Example No. 6
    def test(self, split='train', save_sampled_captions=True):
        '''
        Args:
            - data: dictionary with the following keys:
                - features: Feature vectors of shape (5000, 196, 512)
                - file_names: Image file names of shape (5000, )
                - captions: Captions of shape (24210, 17)
                - image_idxs: Indices mapping each caption to its image, of shape (24210, )
                - features_to_captions: Mapping from features to captions (5000, 4~5)
            - split: 'train', 'val' or 'test'
            - attention_visualization: If True, visualize attention weights over the image for each sampled word (IPython notebook).
            - save_sampled_captions: If True, save sampled captions to a pkl file for computing BLEU scores.
        '''

        caps = self.data.captions[split]
        ids = self.data.video_ids[split]
        unique_ids = list(set(ids))
        n_examples = len(unique_ids)
        n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
        # build a graph to sample captions
        alphas, betas, sampled_captions = self.model.build_sampler(
            max_len=self.max_words)  # (N, max_len, L), (N, max_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        all_decoded = []
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, self.test_model)
            for i in range(n_iters_per_epoch):
                ids_batch = unique_ids[i * self.batch_size:(i + 1) *
                                       self.batch_size]
                features_batch = [self.data.feature(vid) for vid in ids_batch]
                features_batch = np.asarray(features_batch)
                feed_dict = {self.model.features: features_batch}
                alps, bts, sam_cap = sess.run(
                    [alphas, betas, sampled_captions],
                    feed_dict)  # (N, max_len, L), (N, max_len)
                decoded = decode_captions(sam_cap, self.data.vocab.idx2word)
                all_decoded.extend(decoded)

        # generate ref and cand
        ref = {}
        cand = {}
        for vid, dec in zip(unique_ids, all_decoded):
            gts = decode_captions(caps[ids == vid][:, 1:],
                                  self.data.vocab.idx2word)
            ref[vid] = gts
            cand[vid] = [dec]
        # print ground truths and generated sentences
        for vid in unique_ids:
            print '---' * 10
            for i, gt in enumerate(ref[vid]):
                print i + 1, ':', gt
            print 'generated :', cand[vid][0]
        scores = evaluate(ref, cand, get_scores=True)
        tags = [
            'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr',
            'ROUGE_L'
        ]
        for tag in tags:
            print tag, ':', scores[tag]
        print split, len(unique_ids), len(all_decoded)
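The evaluate() calls in Examples No. 1 and No. 6 take COCO-style dictionaries: ref maps each video id to its list of ground-truth sentences, and cand maps it to a one-element list holding the generated sentence. A usage illustration with made-up ids and sentences (assumes evaluate is imported from the project's evaluation utilities):

ref = {
    'video1': ['a man is playing a guitar', 'someone plays guitar on stage'],
    'video2': ['a cat jumps onto a table'],
}
cand = {
    'video1': ['a man plays the guitar'],
    'video2': ['a cat is jumping on a table'],
}
scores = evaluate(ref=ref, cand=cand, get_scores=True)
for tag in ['Bleu_1', 'Bleu_4', 'METEOR', 'CIDEr', 'ROUGE_L']:
    print tag, ':', scores[tag]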
Example No. 7
    def beam_search(self, sess, img):
        """
        Params:
            :sess: tf session
            :img: image of shape (1, width, height, channels)
        Returns:
            the top-ranked decoded literal sentence when level-2 is enabled (decoded);
            otherwise, the top-ranked sentence of word indices is returned (not decoded).
        """
        resnet = self.model.resnet
        level1 = self.model.level1_model

        # feed image into resnet and get image features
        image_features = sess.run(resnet.features, feed_dict={resnet.images: img})
        
        # level1 (skeleton)
        # initialize for beam search.
        (init_c, init_h, features_encode, features_proj) = sess.run(
                [level1.init_c, level1.init_h, 
                level1.features_encode, level1.features_proj],
                    feed_dict = {level1.image_features: image_features})
        
        initial_beam = Caption(
            sentence=[self.vocab_1level['START']],
            c=init_c, h=init_h,
            logprob=0.0, score=0.0, 
            embeds=[], contexts=[], hiddens=[])
        partial_captions = TopN(self.beam_size_1level)
        partial_captions.push(initial_beam)
        complete_captions = TopN(self.beam_size_1level)

        # Run beam search.
        for t in range(self.max_caption_length_1level):
            partial_captions_list = partial_captions.extract()
            partial_captions.reset()

            input_feed = np.array([c.sentence[-1] for c in partial_captions_list])
            h_feed = np.reshape(np.array([c.h for c in partial_captions_list]), (-1, level1.dim_hid))
            c_feed = np.reshape(np.array([c.c for c in partial_captions_list]), (-1, level1.dim_hid))
            (c, h, log_softmax, alpha, context) = sess.run([level1.c, level1.h, 
                                level1.log_softmax, level1.alpha, level1.context4next],
                                    feed_dict={level1.c_feed: c_feed, 
                                               level1.h_feed: h_feed, 
                                               level1.in_word: input_feed, 
                                               level1.image_features: image_features})

            for i, partial_caption in enumerate(partial_captions_list):
                word_probabilities = log_softmax[i]
                word_probabilities[2:] += self.encourage_1level
                # For this partial caption, get the beam_size most probable next words.
                words_and_probs = list(enumerate(word_probabilities))
                words_and_probs.pop(level1._start)      # exclude START
                words_and_probs.sort(key=lambda x: -x[1])
                words_and_probs = words_and_probs[:self.beam_size_1level]

                # Each next word gives a new partial caption.
                for w, logp in words_and_probs:
                    if self.level2:
                        embed = sess.run(level1.embed4next, feed_dict={level1.word_feed: np.array([w])})
                    else:
                        embed = None
                    sentence = partial_caption.sentence + [w]
                    logprob = partial_caption.logprob + logp
                    score = logprob
                    if w == level1.word_to_idx['EOS']:
                        if self.length_normalization_factor > 0:
                            score /= len(sentence) ** self.length_normalization_factor
                        beam = Caption(sentence, c[i], h[i], logprob, score,
                                       partial_caption.embeds, partial_caption.contexts, partial_caption.hiddens)
                        complete_captions.push(beam)
                    else:
                        beam = Caption(sentence, c[i], h[i], logprob, score,
                                       partial_caption.embeds + [embed],
                                       partial_caption.contexts + [context[i]],
                                       partial_caption.hiddens + [h[i]])
                        partial_captions.push(beam)
                        
            if partial_captions.size() == 0:
                # We have run out of partial candidates; happens when beam_size = 1.
                break
        if not complete_captions.size():
            complete_captions = partial_captions

        level1_top_captions = complete_captions.extract(sort=True)

        full_sentence = []
        # level2 can be excluded for analysis
        if self.level2:
            level2 = self.model.level2_model
            # level2 (attributes)
            for caption in level1_top_captions:
                # for each caption(only one sentence)
                sentence_level1 = caption.sentence
                embeds, contexts, hiddens = caption.embeds, caption.contexts, caption.hiddens

                # take only the best skeleton generated by level1,
                # and split it into a word sequence (be careful!)
                sent_level1 = utils.decode_captions(np.squeeze(np.asarray(sentence_level1)),
                                                    level1.idx_to_word)[0]
                words_level1 = sent_level1.split(' ') 

                attrs_level2 = []
                # iterate over the whole sentence word by word
                for t_level1 in range(len(embeds)):
                    # initialize for beam search.
                    embed = np.reshape(embeds[t_level1], (1, -1))
                    context = np.reshape(contexts[t_level1], (1, -1))
                    hidden = np.reshape(hiddens[t_level1], (1, -1))
                    (init_c, init_h) = sess.run([level2.init_c, level2.init_h],
                                                feed_dict={level2.embedding: embed, 
                                                           level2.context: context, 
                                                           level2.hidden: hidden})

                    initial_beam = Caption(
                                sentence=[self.vocab_2level['START']],
                                c=init_c, h=init_h,
                                logprob=0.0, score=0.0, info=False)
                    partial_captions = TopN(self.beam_size_2level)
                    partial_captions.push(initial_beam)
                    complete_captions = TopN(self.beam_size_2level)

                    # Run beam search.
                    for t in range(self.max_caption_length_2level):
                        partial_captions_list = partial_captions.extract()
                        partial_captions.reset()

                        input_feed = np.array([c.sentence[-1] for c in partial_captions_list])
                        h_feed = np.reshape(np.array([c.h for c in partial_captions_list]), (-1, level2.dim_hid))
                        c_feed = np.reshape(np.array([c.c for c in partial_captions_list]), (-1, level2.dim_hid))
                        (c, h, log_softmax) = sess.run([level2.c, level2.h, level2.log_softmax],
                                                       feed_dict={level2.c_feed: c_feed, 
                                                                  level2.h_feed: h_feed,
                                                                  level2.in_word: input_feed})

                        for i, partial_caption in enumerate(partial_captions_list):
                            word_probabilities = log_softmax[i]
                            word_probabilities[2:] += self.encourage_2level
                            words_and_probs = list(enumerate(word_probabilities))
                            words_and_probs.pop(level2._start)  # exclude START
                            words_and_probs.sort(key=lambda x: -x[1])
                            words_and_probs = words_and_probs[0:self.beam_size_2level]

                            for w, logp in words_and_probs:
                                sentence = partial_caption.sentence + [w]
                                logprob = partial_caption.logprob + logp
                                score = logprob

                                if w == level2.word_to_idx['EOS']:
                                    if self.length_normalization_factor > 0:
                                        score /= len(sentence) ** self.length_normalization_factor
                                    beam = Caption(sentence, c[i], h[i], logprob, score, info=False)
                                    complete_captions.push(beam)
                                else:
                                    beam = Caption(sentence, c[i], h[i], logprob, score, info=False)
                                    partial_captions.push(beam)

                        if partial_captions.size() == 0:
                            break
                    if not complete_captions.size():
                        complete_captions = partial_captions

                    # exclude START, only top-ranked attr is used.
                    # attr ~ list([str <x1>])
                    attr = utils.decode_captions(
                        np.squeeze(np.asarray(
                                complete_captions.extract(sort=True)[0].sentence
                            ))[1:], 
                        level2.idx_to_word)
                    # append str to list
                    attrs_level2.extend(attr) 
                full_sentence.append(' '.join([i + ' ' + j if i != '' else j for (j, i) in zip(words_level1, attrs_level2)]))
        else:
            # exclude START
            full_sentence = [i.sentence[1:] for i in level1_top_captions]
            full_sentence = utils.decode_captions(np.asarray(full_sentence), level1.idx_to_word)
        
        # only return top-ranked stem with attr.
        return full_sentence[0]
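The beam search in Example No. 7 stores candidates in a TopN container of Caption records, neither of which is defined here. A minimal heap-based sketch consistent with the push/extract/reset/size calls above, assuming Caption objects compare by score:

import heapq

class TopN(object):
    # Sketch only: keep the n highest-scoring items pushed so far.
    def __init__(self, n):
        self._n = n
        self._data = []

    def size(self):
        return len(self._data)

    def push(self, item):
        # Assumes items (Caption objects) define comparison by score.
        if len(self._data) < self._n:
            heapq.heappush(self._data, item)
        else:
            heapq.heappushpop(self._data, item)

    def extract(self, sort=False):
        data = self._data
        self._data = []
        if sort:
            data.sort(reverse=True)  # best-scoring caption first
        return data

    def reset(self):
        self._data = []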