class Trainer:
    def __init__(self):
        self.word_to_idx = utils.load_pickle(Const.vocab_path)
        self.model = CaptionGenerator(self.word_to_idx,
                                      dim_feature=[196, 512],
                                      dim_embed=512,
                                      dim_hidden=1024,
                                      n_time_step=33,
                                      prev2out=True,
                                      ctx2out=True,
                                      alpha_c=1.0,
                                      selector=True,
                                      dropout=True)
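        # dim_feature=[196, 512] corresponds to 196 spatial positions (a 14x14 grid) of
        # 512-dim VGG-19 conv5_3 features; n_time_step=33 presumably matches the padded
        # caption length used when the training vectors were built.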

        self.n_epochs = TrainingArg.n_epochs
        self.batch_size = TrainingArg.batch_size
        self.update_rule = TrainingArg.update_rule
        self.learning_rate = TrainingArg.learning_rate
        self.print_bleu = TrainingArg.print_bleu
        self.print_every = TrainingArg.print_every
        self.save_every = TrainingArg.save_every
        self.log_path = TrainingArg.log_path  # FLAGS.log_dir
        self.model_path = TrainingArg.model_path  # FLAGS.output_dir
        self.data_dir = Const.resize_train_out_path  # FLAGS.data_dir
        self.pretrained_model = TrainingArg.pretrained_model
        self.test_model = TrainingArg.test_model
        self.max_words_len = 35

        self.pre_mgr = PreData(vgg19_path=TrainingArg.vgg19_path)  # data manager

        # set an optimizer by update rule
        if self.update_rule == 'adam':
            self.optimizer = tf.train.AdamOptimizer
        elif self.update_rule == 'momentum':
            self.optimizer = tf.train.MomentumOptimizer
        elif self.update_rule == 'rmsprop':
            self.optimizer = tf.train.RMSPropOptimizer
        else:
            raise ValueError("unsupported update rule: {}".format(self.update_rule))

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        if not os.path.exists(self.log_path):
            os.makedirs(self.log_path)

        self.org_decoded = {}
        self.val_data_flag = False

    def train(self):
        """
        training
        :return:
        """
        loss = self.model.build_model()

        with tf.variable_scope(tf.get_variable_scope()) as scope:
            with tf.name_scope('optimizer'):
                tf.get_variable_scope().reuse_variables()
                _, _, generated_captions = self.model.build_sampler(max_len=self.max_words_len)

                self.global_step = tf.Variable(0, name="global_step", trainable=False)
                lr = tf.train.exponential_decay(learning_rate=self.learning_rate, global_step=self.global_step,
                                                decay_steps=TrainingArg.lr_decay_steps,
                                                decay_rate=0.96, staircase=True, name='learn_rate')
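                # with staircase=True the rate is piecewise-constant:
                # lr = learning_rate * 0.96 ** floor(global_step / lr_decay_steps)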
                optimizer = self.optimizer(learning_rate=lr)
                grads = tf.gradients(loss, tf.trainable_variables())
                grads_and_vars = list(zip(grads, tf.trainable_variables()))
        train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars, global_step=self.global_step)

        # summary op
        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads_and_vars:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradient', grad)

        summary_op = tf.summary.merge_all()

        config = tf.ConfigProto(allow_soft_placement=True)
        # config.gpu_options.per_process_gpu_memory_fraction=0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            self.pre_mgr.set_tf_sess(sess)

            tf.global_variables_initializer().run()
            summary_writer = tf.summary.FileWriter(self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=10)

            if self.pretrained_model is not None:
                print("Start training with pretrained Model..")
                saver.restore(sess, self.pretrained_model)

            curr_epoch = 0
            batchs = self.pre_mgr.fetch_batch(Const.caption_train_vector_path, self.data_dir,
                                              self.batch_size, self.n_epochs)

            for batch in batchs:
                caption_batch, image_batch, epoch = batch
                feed_dict = {self.model.features: image_batch, self.model.captions: caption_batch}
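                # features: (batch, 196, 512) conv feature maps; captions: padded
                # word-index sequences (presumably n_time_step + 1 tokens incl. <START>)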
                _, l, step = sess.run([train_op, loss, self.global_step], feed_dict)

                if step % self.print_every == 0 or step == 1:
                    summary = sess.run(summary_op, feed_dict)
                    summary_writer.add_summary(summary, step)
                    print("\nTrain loss at epoch %d & step %d (mini-batch): %.5f" % (epoch + 1, step, l))
                    # ground_truths = captions[image_idxs == image_idxs_batch[0]]
                    ground_truths = np.array([caption_batch[0]])
                    decoded = self.pre_mgr.decode_captions(ground_truths, self.model.idx_to_word)
                    for j, gt in enumerate(decoded):
                        print("Ground truth %d: %s" % (j + 1, gt))
                    gen_caps = sess.run(generated_captions, feed_dict)
                    decoded = self.pre_mgr.decode_captions(gen_caps, self.model.idx_to_word)
                    print("Generated caption: %s\n" % decoded[0])

                print('{}, epoch: {}, step: {}, current batch loss: {}'.format(datetime.datetime.now().isoformat(), epoch + 1, step, l))

                # print out BLEU scores and write them to file
                if curr_epoch != epoch or step == 1 or step % self.print_every == 0:
                    curr_epoch = epoch
                    val_data_batchs = self.pre_mgr.fetch_val_batch(Const.val_vector_out_path, self.data_dir, self.batch_size)
                    gen_caps = []
                    i = 0
                    for val_batch in val_data_batchs:
                        val_caption, val_image = val_batch
                        # features_batch = val_features[i * self.batch_size:(i + 1) * self.batch_size]
                        feed_dict = {self.model.features: val_image}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                        gen_caps.extend(gen_cap)
                        if not self.val_data_flag:
                            print('val batch loop {}'.format(i))
                            for item in val_caption:
                                self.org_decoded[i] = self.pre_mgr.decode_captions(np.array(item), self.model.idx_to_word,
                                                                                   ignore_start=True)
                                i += 1
                                # break
                    self.val_data_flag = True
                    gen_decoded = self.pre_mgr.decode_captions(np.array(gen_caps), self.model.idx_to_word)
                    for j in range(5):
                        print('val org sents: {}'.format(self.org_decoded[j]))
                        print('val gen sents: {}\n'.format(gen_decoded[j]))

                    scores = evaluate(gen_decoded, self.org_decoded, get_scores=True)
                    utils.write_bleu(scores=scores, path=self.model_path, epoch=epoch)

                    # save model's parameters
                    # if (e + 1) % self.save_every == 0:
                    saver.save(sess, os.path.join(self.model_path, 'model'), global_step=step)
                    print("model-%s saved." % (epoch + 1))
Example #2
class Estimate:
    def __init__(self):
        self.word_to_idx = utils.load_pickle(Const.vocab_path)
        self.model = CaptionGenerator(self.word_to_idx,
                                      dim_feature=[196, 512],
                                      dim_embed=512,
                                      dim_hidden=1024,
                                      n_time_step=33,
                                      prev2out=True,
                                      ctx2out=True,
                                      alpha_c=1.0,
                                      selector=True,
                                      dropout=True)

        self.n_epochs = TrainingArg.n_epochs
        self.batch_size = TrainingArg.batch_size
        self.update_rule = TrainingArg.update_rule
        self.learning_rate = TrainingArg.learning_rate
        self.print_bleu = TrainingArg.print_bleu
        self.print_every = TrainingArg.print_every
        self.save_every = TrainingArg.save_every
        self.log_path = TrainingArg.log_path
        self.model_path = TrainingArg.model_path
        self.pretrained_model = TrainingArg.pretrained_model
        self.test_model = TrainingArg.test_model
        self.max_words_len = 35

        self.pre_mgr = PreData(vgg19_path=TrainingArg.vgg19_path)  # data manager

    def test(self, image_path):
        """
        
        :return: 
        """
        alphas, betas, sampled_captions = self.model.build_sampler(max_len=self.max_words_len)  # (N, max_len, L), (N, max_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, TrainingArg.test_model)
            # saver.restore(sess, r'F:\4_study\show_tell\show-attend-and-tell-master\model\lstm\model-1')
            self.pre_mgr.set_tf_sess(sess)

            feature, resize_path = self.pre_mgr.pre_orig_image_to_tell(image_path)
            feed_dict = {self.model.features: feature}

            # batchs = self.pre_mgr.fetch_batch(Const.caption_train_vector_path, Const.resize_train_out_path,
            #                                   self.batch_size, self.n_epochs)
            #
            # for batch in batchs:
            #     caption_batch, image_batch, epoch = batch
            #     feed_dict = {self.model.features: image_batch}

            alps, bts, sam_cap = sess.run([alphas, betas, sampled_captions], feed_dict)  # (N, max_len, L), (N, max_len)
            #sam_cap = sess.run(sampled_captions, feed_dict)
            decoded = self.pre_mgr.decode_captions(sam_cap, self.model.idx_to_word)
            print(decoded)
            n = 0
            # Plot original image
            # resize_path = ''
            img = ndimage.imread(resize_path)
            plt.subplot(4, 5, 1)
            plt.imshow(img)
            plt.axis('off')

            # Plot images with attention weights
            words = decoded[n].split(" ")
            for t in range(len(words)):
                if t > 18:
                    break
                plt.subplot(4, 5, t + 2)
                plt.text(0, 1, '%s(%.2f)' % (words[t], bts[n, t]), color='black', backgroundcolor='white', fontsize=8)
                plt.imshow(img)
                alp_curr = alps[n, t, :].reshape(14, 14)
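                # the 196-dim attention vector is reshaped to the 14x14 feature grid;
                # pyramid_expand with upscale=16 then blows it up to 224x224 (assuming
                # 224x224 resized inputs) so it can be overlaid on the image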
                alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
                plt.imshow(alp_img, alpha=0.85)
                plt.axis('off')
            plt.show()

    def test_data(self, path):
        """
        
        :return: 
        """
        alphas, betas, sampled_captions = self.model.build_sampler(max_len=self.max_words_len)  # (N, max_len, L), (N, max_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, TrainingArg.test_model)
            self.pre_mgr.set_tf_sess(sess)

            test_result = []
            test_batch = self.pre_mgr.fetch_test_data(path)
            for feature, image_id in test_batch:
                feed_dict = {self.model.features: feature}
                sam_cap = sess.run(sampled_captions, feed_dict)  # (N, max_len)
                decoded = self.pre_mgr.decode_captions(sam_cap, self.model.idx_to_word)
                for i, v in enumerate(decoded):
                    item = {
                        'image_id': image_id[i],
                        'caption': decoded[i].replace(' ', '').rstrip('.')
                    }
                    test_result.append(item)
                print(test_result[-1])

            with open('adw_image_caption.json', 'w') as f:
                json.dump(test_result, f)
                print('save test json to adw_image_caption.json')
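
# A minimal usage sketch (not part of the original example), assuming TrainingArg.test_model
# points at a trained checkpoint; the image and data paths below are hypothetical placeholders.
if __name__ == '__main__':
    estimator = Estimate()
    estimator.test('example.jpg')        # caption one image and visualize its attention maps
    # estimator.test_data('test_data/')  # batch inference; writes adw_image_caption.json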
Example #3
def main(params):

    sys.path.insert(0, os.path.join(params.bottomup_path, 'lib'))
    from fast_rcnn.config import cfg, cfg_from_file
    from fast_rcnn.test import im_detect, _get_blobs
    from fast_rcnn.nms_wrapper import nms

    ###########################
    # CNN : Faster-RCNN setting
    data_path = os.path.join(params.bottomup_path, 'data/genome/1600-400-20')

    # Load classes
    classes = ['__background__']
    with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
        for obj in f.readlines():
            classes.append(obj.split(',')[0].lower().strip())

    # Load attributes
    attributes = ['__no_attribute__']
    with open(os.path.join(data_path, 'attributes_vocab.txt')) as f:
        for att in f.readlines():
            attributes.append(att.split(',')[0].lower().strip())

    GPU_ID = params.gpu_id  # if we have multiple GPUs, pick one
    caffe.init_log()
    caffe.set_device(GPU_ID)
    caffe.set_mode_gpu()
    net = None
    cfg_from_file(
        os.path.join(params.bottomup_path,
                     'experiments/cfgs/faster_rcnn_end2end_resnet.yml'))

    weights = os.path.join(
        params.bottomup_path,
        'data/faster_rcnn_models/resnet101_faster_rcnn_final.caffemodel')
    prototxt = os.path.join(
        params.bottomup_path,
        'models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt')

    net = caffe.Net(prototxt, caffe.TEST, weights=weights)

    conf_thresh = 0.4
    min_boxes = params.num_objects
    max_boxes = params.num_objects
    ###########################

    ###########################
    # RNN : Caption generation setting
    # load json file
    label_info = json.load(open(params.input_labels))
    word_to_idx = label_info['word_to_idx']

    # load h5 file
    caps_info = h5py.File(params.input_caps, 'r', driver='core')
    seq_length = caps_info['labels'].shape[1]

    # GPU options
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    # build a graph to sample captions
    graph_gen_cap = tf.Graph()
    sess_gen_cap = tf.Session(graph=graph_gen_cap, config=config)
    with graph_gen_cap.as_default():
        model = CaptionGenerator(word_to_idx,
                                 num_features=params.num_objects,
                                 dim_feature=params.dim_features,
                                 dim_embed=params.dim_word_emb,
                                 dim_hidden=params.rnn_hid_size,
                                 dim_attention=params.att_hid_size,
                                 n_time_step=seq_length - 1)
        alphas, sampled_captions = model.build_sampler(max_len=params.max_len)
        saver1 = tf.train.Saver()
        saver1.restore(sess_gen_cap, params.test_model)
    tf.reset_default_graph()
    ############################

    ###########################
    # Face : Replacer
    name_replacer = NameReplacer(model.idx_to_word, params.score_thr)
    ############################

    ###########################
    # Run Image Captioning with face detection

    while True:
        full_fname = raw_input("Enter the image path and name:")
        if full_fname == 'Exit':
            break
        if not os.path.exists(full_fname):
            print("Not Exist File : {}".format(full_fname))
            continue

        ###########################
        # Object Detection
        im = cv2.imread(full_fname)
        scores, boxes, attr_scores, rel_scores = im_detect(net, im)

        # Keep the original boxes, don't worry about the regression bbox outputs
        rois = net.blobs['rois'].data.copy()
        # unscale back to raw image space
        blobs, im_scales = _get_blobs(im, None)

        cls_boxes = rois[:, 1:5] / im_scales[0]
        cls_prob = net.blobs['cls_prob'].data
        attr_prob = net.blobs['attr_prob'].data
        pool5 = net.blobs['pool5_flat'].data

        # Keep only the best detections
        max_conf = np.zeros((rois.shape[0]))
        for cls_ind in range(1, cls_prob.shape[1]):
            cls_scores = scores[:, cls_ind]
            dets = np.hstack(
                (cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32)
            keep = np.array(nms(dets, cfg.TEST.NMS))
            max_conf[keep] = np.where(cls_scores[keep] > max_conf[keep],
                                      cls_scores[keep], max_conf[keep])

        keep_boxes = np.where(max_conf >= conf_thresh)[0]
        if len(keep_boxes) < min_boxes:
            keep_boxes = np.argsort(max_conf)[::-1][:min_boxes]
        elif len(keep_boxes) > max_boxes:
            keep_boxes = np.argsort(max_conf)[::-1][:max_boxes]

        feats = pool5[keep_boxes]
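        # feats: one pooled ResNet-101 feature vector (2048-dim, from pool5_flat) per kept
        # box; the min/max box logic above clamps this to exactly num_objects regions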
        ############################

        ###########################
        # Caption generation using CNN features
        feed_dict = {model.features: [feats]}
        alps, sam_cap = sess_gen_cap.run([alphas, sampled_captions], feed_dict)
        decoded = decode_captions(sam_cap, model.idx_to_word)
        ############################

        ###########################
        # Name replacer
        name_list, conf_list, roi_list = vtt_face_recognize(
            full_fname, params.url, params.post_data)
        replace_decoded, words = name_replacer.name_replace_caps(
            sam_cap, alps, cls_boxes, name_list, conf_list, roi_list)
        print("Original caption : %s" % decoded[0])
        print("Replaced caption : %s" % replace_decoded[0])
        ############################

        ###########################
        # Showing
        img = skimage.io.imread(full_fname)
        img = skimage.img_as_float(img)
        boxes = cls_boxes[keep_boxes]
        boxes = boxes.astype(int)

        # draw attention map
        fig = plt.figure(figsize=(16, 8))
        ax = fig.add_subplot(3, 6, 1)
        ax.imshow(img)
        plt.axis('off')

        # Plot images with attention weights
        words = words[0]
        for t in range(len(words)):
            if t > 16:
                break
            if words[t] == '<BLANK>':
                continue
            alphamap = np.zeros((img.shape[0], img.shape[1]))
            for b in range(boxes.shape[0]):
                alphamap[boxes[b, 1]:boxes[b, 3],
                         boxes[b, 0]:boxes[b, 2]] += alps[0, t, b]
            max_idx = np.argmax(alps[0, t, :])
            att_img = np.dstack((img, alphamap))
            ax = fig.add_subplot(3, 6, t + 2)
            plt.text(0,
                     1,
                     '%s' % (words[t]),
                     color='black',
                     backgroundcolor='white',
                     fontsize=8)
            ax.imshow(att_img)
            ax.add_patch(
                patches.Rectangle((boxes[max_idx, 0], boxes[max_idx, 1]),
                                  boxes[max_idx, 2] - boxes[max_idx, 0],
                                  boxes[max_idx, 3] - boxes[max_idx, 1],
                                  linewidth=1,
                                  edgecolor='r',
                                  facecolor='none'))
            plt.axis('off')

        fig.tight_layout()
        plt.show()
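
# A minimal invocation sketch (not part of the original example). The argument names below
# are the attributes actually read from `params` in main(); every default value is a
# hypothetical placeholder and would need to match the local setup.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--bottomup_path', default='./bottom-up-attention')
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--num_objects', type=int, default=36)
    parser.add_argument('--dim_features', type=int, default=2048)
    parser.add_argument('--dim_word_emb', type=int, default=512)
    parser.add_argument('--rnn_hid_size', type=int, default=1024)
    parser.add_argument('--att_hid_size', type=int, default=512)
    parser.add_argument('--max_len', type=int, default=20)
    parser.add_argument('--score_thr', type=float, default=0.5)
    parser.add_argument('--input_labels', default='data/labels.json')
    parser.add_argument('--input_caps', default='data/captions.h5')
    parser.add_argument('--test_model', default='model/model.ckpt')
    parser.add_argument('--url', default='http://localhost:8080/face_recognition')
    parser.add_argument('--post_data', default=None)
    main(parser.parse_args())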
Example #4
class CaptionInference(object):
    def __init__(self, sess, model_path, use_inception):

        path_prefix = os.path.dirname(os.path.realpath(__file__))
        # word to index mapping
        with open(os.path.join(path_prefix, 'data/train/word_to_idx.pkl'),
                  "rb") as f:
            self.word_to_idx = pickle.load(f)

        if use_inception:
            L = 64
            D = 2048
            cnn_model_path = os.path.join(path_prefix,
                                          'data/inception_v3.ckpt')
        else:
            L = 196
            D = 512
            cnn_model_path = os.path.join(
                path_prefix, './data/imagenet-vgg-verydeep-19.mat')

        self.batch_size = 128
        self.sess = sess
        self.use_inception = use_inception
        print("Creating model...")
        self.model = CaptionGenerator(
            self.word_to_idx,
            dim_feature=[L, D],
            dim_embed=512,
            dim_hidden=1800,
            n_time_step=16,
            prev2out=True,
            ctx2out=True,
            alpha_c=5.0,
            selector=True,
            dropout=True,
            use_cnn="inception" if use_inception else "vgg",
            cnn_model_path=cnn_model_path)

        print("Loading CNN weights...")
        self.model.cnn.load_weights(sess)
        print("Building sampler...")
        self.alphas, self.betas, self.generated_captions = self.model.build_sampler(
            max_len=20)

        # initialize model and load weights
        print("Loading LSTM weights...")
        # tf.global_variables_initializer().run()
        saver = tf.train.Saver(self.model.sampler_vars)
        saver.restore(sess, model_path)

    def inference_np(self, images):
        nimgs = images.shape[0]
        print("Running inference on {} images...".format(nimgs))
        nbatches = int(math.ceil(nimgs / self.batch_size))
        all_decoded = []
        all_alphas = None
        all_betas = None
        for i in range(nbatches):
            start = i * self.batch_size
            end = (i + 1) * self.batch_size
            end = nimgs if end >= nimgs else end
            batch_images = images[start:end]
            print("processing {} images ({} to {})".format(
                batch_images.shape[0], start + 1, end))
            batch_alphas, batch_betas, batch_gen_cap = self.sess.run(
                [self.alphas, self.betas, self.generated_captions],
                feed_dict={self.model.images: batch_images})
            # batch_gen_cap = self.sess.run(self.generated_captions, feed_dict = {self.model.images: batch_images})
            batch_decoded = decode_captions(batch_gen_cap,
                                            self.model.idx_to_word)
            all_decoded.extend(batch_decoded)
            all_alphas = np.concatenate([
                all_alphas, batch_alphas
            ]) if all_alphas is not None else batch_alphas
            all_betas = np.concatenate([
                all_betas, batch_betas
            ]) if all_betas is not None else batch_betas
        return all_alphas, all_betas, all_decoded

    @staticmethod
    def resize_image(image, image_size):
        width, height = image.size
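        # center-crop the longer side to a square so the aspect ratio is preserved,
        # then resize to image_size x image_size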
        if width > height:
            left = (width - height) / 2
            right = width - left
            top = 0
            bottom = height
        else:
            top = (height - width) / 2
            bottom = height - top
            left = 0
            right = width
        image = image.crop((left, top, right, bottom))
        image = image.resize([image_size, image_size], Image.ANTIALIAS)
        return image

    def preprocess_file(self, file_name):
        print("preprocess", file_name)
        if os.path.splitext(file_name)[1] == ".npy":
            return np.squeeze(np.load(file_name))
        else:
            img_np = np.array(
                self.resize_image(Image.open(file_name),
                                  self.model.cnn.image_size)).astype(
                                      np.float32)
            # scale pixel values from [0, 255] to [-1, 1], the range Inception v3 expects
            if self.use_inception:
                img_np /= 255.0
                img_np -= 0.5
                img_np *= 2.0
            # convert a grey scale image to 3 channels by stacking it
            if img_np.ndim == 2:
                img_np = np.stack((img_np, ) * 3, axis=-1)
            return img_np

    def inference_files(self, image_files):
        print("processing {} images...".format(len(image_files)))
        image_batch = np.array([self.preprocess_file(x) for x in image_files])
        return self.inference_np(image_batch)
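
# A minimal usage sketch (not part of the original example), assuming a trained LSTM
# checkpoint and the CNN weight files referenced in __init__ are present; the checkpoint
# and image paths below are hypothetical placeholders.
if __name__ == '__main__':
    with tf.Session() as sess:
        cap_infer = CaptionInference(sess, 'model/lstm/model-20', use_inception=False)
        alphas, betas, captions = cap_infer.inference_files(['example1.jpg', 'example2.jpg'])
        for caption in captions:
            print(caption)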