class Trainer:
    def __init__(self):
        self.word_to_idx = utils.load_pickle(Const.vocab_path)
        self.model = CaptionGenerator(self.word_to_idx, dim_feature=[196, 512], dim_embed=512,
                                      dim_hidden=1024, n_time_step=33, prev2out=True,
                                      ctx2out=True, alpha_c=1.0, selector=True, dropout=True)
        self.n_epochs = TrainingArg.n_epochs
        self.batch_size = TrainingArg.batch_size
        self.update_rule = TrainingArg.update_rule
        self.learning_rate = TrainingArg.learning_rate
        self.print_bleu = TrainingArg.print_bleu
        self.print_every = TrainingArg.print_every
        self.save_every = TrainingArg.save_every
        self.log_path = TrainingArg.log_path          # FLAGS.log_dir
        self.model_path = TrainingArg.model_path      # FLAGS.output_dir
        self.data_dir = Const.resize_train_out_path   # FLAGS.data_dir
        self.pretrained_model = TrainingArg.pretrained_model
        self.test_model = TrainingArg.test_model
        self.max_words_len = 35
        self.pre_mgr = PreData(vgg19_path=TrainingArg.vgg19_path)  # data manager

        # set an optimizer by update rule
        if self.update_rule == 'adam':
            self.optimizer = tf.train.AdamOptimizer
        elif self.update_rule == 'momentum':
            self.optimizer = tf.train.MomentumOptimizer
        elif self.update_rule == 'rmsprop':
            self.optimizer = tf.train.RMSPropOptimizer

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        if not os.path.exists(self.log_path):
            os.makedirs(self.log_path)

        self.org_decoded = {}
        self.val_data_flag = False

    def train(self):
        """Run the training loop.

        :return: None
        """
        loss = self.model.build_model()
        with tf.variable_scope(tf.get_variable_scope()):
            with tf.name_scope('optimizer'):
                tf.get_variable_scope().reuse_variables()
                _, _, generated_captions = self.model.build_sampler(max_len=self.max_words_len)
                self.global_step = tf.Variable(0, name="global_step", trainable=False)
                lr = tf.train.exponential_decay(learning_rate=self.learning_rate,
                                                global_step=self.global_step,
                                                decay_steps=TrainingArg.lr_decay_steps,
                                                decay_rate=0.96, staircase=True,
                                                name='learn_rate')
                optimizer = self.optimizer(learning_rate=lr)
                grads = tf.gradients(loss, tf.trainable_variables())
                grads_and_vars = list(zip(grads, tf.trainable_variables()))
                train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars,
                                                     global_step=self.global_step)

        # summary op
        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads_and_vars:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradient', grad)
        summary_op = tf.summary.merge_all()

        config = tf.ConfigProto(allow_soft_placement=True)
        # config.gpu_options.per_process_gpu_memory_fraction = 0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            self.pre_mgr.set_tf_sess(sess)
            tf.global_variables_initializer().run()
            summary_writer = tf.summary.FileWriter(self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=10)

            if self.pretrained_model is not None:
                print("Start training with pretrained model...")
                saver.restore(sess, self.pretrained_model)

            curr_epoch = 0
            batchs = self.pre_mgr.fetch_batch(Const.caption_train_vector_path, self.data_dir,
                                              self.batch_size, self.n_epochs)
            for batch in batchs:
                caption_batch, image_batch, epoch = batch
                feed_dict = {self.model.features: image_batch, self.model.captions: caption_batch}
                _, l, step = sess.run([train_op, loss, self.global_step], feed_dict)

                if step % self.print_every == 0 or step == 1:
                    summary = sess.run(summary_op, feed_dict)
                    summary_writer.add_summary(summary, step)
                    print("\nTrain loss at epoch %d & step %d (mini-batch): %.5f" % (epoch + 1, step, l))
                    # ground_truths = captions[image_idxs == image_idxs_batch[0]]
                    ground_truths = np.array([caption_batch[0]])
                    decoded = self.pre_mgr.decode_captions(ground_truths, self.model.idx_to_word)
                    for j, gt in enumerate(decoded):
                        print("Ground truth %d: %s" % (j + 1, gt))
                    gen_caps = sess.run(generated_captions, feed_dict)
                    decoded = self.pre_mgr.decode_captions(gen_caps, self.model.idx_to_word)
                    print("Generated caption: %s\n" % decoded[0])

                print('{}, epoch: {} step: {}, current epoch loss: {}'.format(
                    datetime.datetime.now().isoformat(), epoch + 1, step, l))

                # print out BLEU scores and write them to file
                if curr_epoch != epoch or step == 1 or step % self.print_every == 0:
                    curr_epoch = epoch
                    val_data_batchs = self.pre_mgr.fetch_val_batch(Const.val_vector_out_path,
                                                                   self.data_dir, self.batch_size)
                    gen_caps = []
                    i = 0
                    for val_batch in val_data_batchs:
                        val_caption, val_image = val_batch
                        # features_batch = val_features[i * self.batch_size:(i + 1) * self.batch_size]
                        feed_dict = {self.model.features: val_image}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                        gen_caps.extend(gen_cap)
                        if not self.val_data_flag:
                            print('val batch loop {}'.format(i))
                            for item in val_caption:
                                self.org_decoded[i] = self.pre_mgr.decode_captions(
                                    np.array(item), self.model.idx_to_word, ignore_start=True)
                                i += 1
                        # break
                    self.val_data_flag = True

                    gen_decoded = self.pre_mgr.decode_captions(np.array(gen_caps), self.model.idx_to_word)
                    for j in range(5):
                        print('val org sents: {}'.format(self.org_decoded[j]))
                        print('val gen sents: {}\n'.format(gen_decoded[j]))

                    scores = evaluate(gen_decoded, self.org_decoded, get_scores=True)
                    utils.write_bleu(scores=scores, path=self.model_path, epoch=epoch)

                    # save model's parameters
                    # if (e + 1) % self.save_every == 0:
                    saver.save(sess, os.path.join(self.model_path, 'model'), global_step=step)
                    print("model-%s saved." % (epoch + 1))
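# Illustrative helper (not in the original source): a minimal way to launch training,
# assuming Const.vocab_path, Const.caption_train_vector_path and the resized-image
# directory in Const.resize_train_out_path have already been prepared by PreData.
def run_training():
    trainer = Trainer()
    trainer.train()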
class Estimate:
    def __init__(self):
        self.word_to_idx = utils.load_pickle(Const.vocab_path)
        self.model = CaptionGenerator(self.word_to_idx, dim_feature=[196, 512], dim_embed=512,
                                      dim_hidden=1024, n_time_step=33, prev2out=True,
                                      ctx2out=True, alpha_c=1.0, selector=True, dropout=True)
        self.n_epochs = TrainingArg.n_epochs
        self.batch_size = TrainingArg.batch_size
        self.update_rule = TrainingArg.update_rule
        self.learning_rate = TrainingArg.learning_rate
        self.print_bleu = TrainingArg.print_bleu
        self.print_every = TrainingArg.print_every
        self.save_every = TrainingArg.save_every
        self.log_path = TrainingArg.log_path
        self.model_path = TrainingArg.model_path
        self.pretrained_model = TrainingArg.pretrained_model
        self.test_model = TrainingArg.test_model
        self.max_words_len = 35
        self.pre_mgr = PreData(vgg19_path=TrainingArg.vgg19_path)  # data manager

    def test(self, image_path):
        """Caption a single image and visualize its attention maps.

        :return: None
        """
        # (N, max_len, L), (N, max_len)
        alphas, betas, sampled_captions = self.model.build_sampler(max_len=self.max_words_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, TrainingArg.test_model)
            # saver.restore(sess, r'F:\4_study\show_tell\show-attend-and-tell-master\model\lstm\model-1')
            self.pre_mgr.set_tf_sess(sess)
            feature, resize_path = self.pre_mgr.pre_orig_image_to_tell(image_path)
            feed_dict = {self.model.features: feature}
            # batchs = self.pre_mgr.fetch_batch(Const.caption_train_vector_path, Const.resize_train_out_path,
            #                                   self.batch_size, self.n_epochs)
            # for batch in batchs:
            #     caption_batch, image_batch, epoch = batch
            #     feed_dict = {self.model.features: image_batch}
            alps, bts, sam_cap = sess.run([alphas, betas, sampled_captions], feed_dict)  # (N, max_len, L), (N, max_len)
            # sam_cap = sess.run(sampled_captions, feed_dict)
            decoded = self.pre_mgr.decode_captions(sam_cap, self.model.idx_to_word)
            print(decoded)

            n = 0
            # Plot original image
            # resize_path = ''
            img = ndimage.imread(resize_path)
            plt.subplot(4, 5, 1)
            plt.imshow(img)
            plt.axis('off')

            # Plot images with attention weights
            words = decoded[n].split(" ")
            for t in range(len(words)):
                if t > 18:
                    break
                plt.subplot(4, 5, t + 2)
                plt.text(0, 1, '%s(%.2f)' % (words[t], bts[n, t]), color='black',
                         backgroundcolor='white', fontsize=8)
                plt.imshow(img)
                alp_curr = alps[n, t, :].reshape(14, 14)
                alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
                plt.imshow(alp_img, alpha=0.85)
                plt.axis('off')
            plt.show()

    def test_data(self, path):
        """Caption a test set and dump the results to a JSON file.

        :return: None
        """
        # (N, max_len, L), (N, max_len)
        alphas, betas, sampled_captions = self.model.build_sampler(max_len=self.max_words_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, TrainingArg.test_model)
            self.pre_mgr.set_tf_sess(sess)
            test_result = []
            test_batch = self.pre_mgr.fetch_test_data(path)
            for feature, image_id in test_batch:
                feed_dict = {self.model.features: feature}
                sam_cap = sess.run(sampled_captions, feed_dict)  # (N, max_len)
                decoded = self.pre_mgr.decode_captions(sam_cap, self.model.idx_to_word)
                for i, v in enumerate(decoded):
                    item = {
                        'image_id': image_id[i],
                        'caption': decoded[i].replace(' ', '').rstrip('.')
                    }
                    test_result.append(item)
                print(test_result[-1])
            with open('adw_image_caption.json', 'w') as f:
                json.dump(test_result, f)
            print('save test json to adw_image_caption.json')
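# Illustrative helper (not in the original source): caption one image or a whole test
# directory with a trained checkpoint (TrainingArg.test_model). The default image path
# below is a placeholder.
def run_estimate(image_path='path/to/image.jpg', test_dir=None):
    estimator = Estimate()
    if test_dir is not None:
        estimator.test_data(test_dir)   # writes adw_image_caption.json
    else:
        estimator.test(image_path)      # prints the caption and shows attention maps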
def main(params):
    sys.path.insert(0, os.path.join(params.bottomup_path, 'lib'))
    from fast_rcnn.config import cfg, cfg_from_file
    from fast_rcnn.test import im_detect, _get_blobs
    from fast_rcnn.nms_wrapper import nms

    ###########################
    # CNN : Faster-RCNN setting
    data_path = os.path.join(params.bottomup_path, 'data/genome/1600-400-20')

    # Load classes
    classes = ['__background__']
    with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
        for obj in f.readlines():
            classes.append(obj.split(',')[0].lower().strip())

    # Load attributes
    attributes = ['__no_attribute__']
    with open(os.path.join(data_path, 'attributes_vocab.txt')) as f:
        for att in f.readlines():
            attributes.append(att.split(',')[0].lower().strip())

    GPU_ID = params.gpu_id  # if we have multiple GPUs, pick one
    caffe.init_log()
    caffe.set_device(GPU_ID)
    caffe.set_mode_gpu()
    net = None
    cfg_from_file(os.path.join(params.bottomup_path,
                               'experiments/cfgs/faster_rcnn_end2end_resnet.yml'))
    weights = os.path.join(params.bottomup_path,
                           'data/faster_rcnn_models/resnet101_faster_rcnn_final.caffemodel')
    prototxt = os.path.join(params.bottomup_path,
                            'models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt')

    net = caffe.Net(prototxt, caffe.TEST, weights=weights)
    conf_thresh = 0.4
    min_boxes = params.num_objects
    max_boxes = params.num_objects
    ###########################

    ###########################
    # RNN : Caption generation setting
    # load json file
    label_info = json.load(open(params.input_labels))
    word_to_idx = label_info['word_to_idx']
    # load h5 file
    caps_info = h5py.File(params.input_caps, 'r', driver='core')
    seq_length = caps_info['labels'].shape[1]

    # GPU options
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    # build a graph to sample captions
    graph_gen_cap = tf.Graph()
    sess_gen_cap = tf.Session(graph=graph_gen_cap, config=config)
    with graph_gen_cap.as_default():
        model = CaptionGenerator(word_to_idx,
                                 num_features=params.num_objects,
                                 dim_feature=params.dim_features,
                                 dim_embed=params.dim_word_emb,
                                 dim_hidden=params.rnn_hid_size,
                                 dim_attention=params.att_hid_size,
                                 n_time_step=seq_length - 1)
        alphas, sampled_captions = model.build_sampler(max_len=params.max_len)
        saver1 = tf.train.Saver()
        saver1.restore(sess_gen_cap, params.test_model)
    tf.reset_default_graph()
    ############################

    ###########################
    # Face : Replacer
    name_replacer = NameReplacer(model.idx_to_word, params.score_thr)
    ############################

    ###########################
    # Run Image Captioning with face detection
    while True:
        full_fname = raw_input("Enter the image path and name:")
        if full_fname == 'Exit':
            break
        if not os.path.exists(full_fname):
            print("File does not exist: {}".format(full_fname))
            continue

        ###########################
        # Object Detection
        im = cv2.imread(full_fname)
        scores, boxes, attr_scores, rel_scores = im_detect(net, im)

        # Keep the original boxes, don't worry about the regression bbox outputs
        rois = net.blobs['rois'].data.copy()
        # unscale back to raw image space
        blobs, im_scales = _get_blobs(im, None)

        cls_boxes = rois[:, 1:5] / im_scales[0]
        cls_prob = net.blobs['cls_prob'].data
        attr_prob = net.blobs['attr_prob'].data
        pool5 = net.blobs['pool5_flat'].data

        # Keep only the best detections
        max_conf = np.zeros((rois.shape[0]))
        for cls_ind in range(1, cls_prob.shape[1]):
            cls_scores = scores[:, cls_ind]
            dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32)
            keep = np.array(nms(dets, cfg.TEST.NMS))
            max_conf[keep] = np.where(cls_scores[keep] > max_conf[keep],
                                      cls_scores[keep], max_conf[keep])

        keep_boxes = np.where(max_conf >= conf_thresh)[0]
        if len(keep_boxes) < min_boxes:
            keep_boxes = np.argsort(max_conf)[::-1][:min_boxes]
        elif len(keep_boxes) > max_boxes:
            keep_boxes = np.argsort(max_conf)[::-1][:max_boxes]
        feats = pool5[keep_boxes]
        ############################

        ###########################
        # Caption generation using CNN features
        feed_dict = {model.features: [feats]}
        alps, sam_cap = sess_gen_cap.run([alphas, sampled_captions], feed_dict)
        decoded = decode_captions(sam_cap, model.idx_to_word)
        ############################

        ###########################
        # Name replacer
        name_list, conf_list, roi_list = vtt_face_recognize(full_fname, params.url, params.post_data)
        replace_decoded, words = name_replacer.name_replace_caps(sam_cap, alps, cls_boxes,
                                                                 name_list, conf_list, roi_list)
        print("Original caption : %s" % decoded[0])
        print("Replaced caption : %s" % replace_decoded[0])
        ############################

        ###########################
        # Showing
        img = skimage.io.imread(full_fname)
        img = skimage.img_as_float(img)
        boxes = cls_boxes[keep_boxes]
        boxes = boxes.astype(int)

        # draw attention map
        fig = plt.figure(figsize=(16, 8))
        ax = fig.add_subplot(3, 6, 1)
        ax.imshow(img)
        plt.axis('off')

        # Plot images with attention weights
        words = words[0]
        for t in range(len(words)):
            if t > 16:
                break
            if words[t] == '<BLANK>':
                continue
            alphamap = np.zeros((img.shape[0], img.shape[1]))
            for b in range(boxes.shape[0]):
                alphamap[boxes[b, 1]:boxes[b, 3], boxes[b, 0]:boxes[b, 2]] += alps[0, t, b]
            max_idx = np.argmax(alps[0, t, :])
            att_img = np.dstack((img, alphamap))

            ax = fig.add_subplot(3, 6, t + 2)
            plt.text(0, 1, '%s' % (words[t]), color='black', backgroundcolor='white', fontsize=8)
            ax.imshow(att_img)
            ax.add_patch(patches.Rectangle((boxes[max_idx, 0], boxes[max_idx, 1]),
                                           boxes[max_idx, 2] - boxes[max_idx, 0],
                                           boxes[max_idx, 3] - boxes[max_idx, 1],
                                           linewidth=1, edgecolor='r', facecolor='none'))
            plt.axis('off')
        fig.tight_layout()
        plt.show()
class CaptionInference(object):
    def __init__(self, sess, model_path, use_inception):
        path_prefix = os.path.dirname(os.path.realpath(__file__))

        # word to index mapping
        with open(os.path.join(path_prefix, 'data/train/word_to_idx.pkl'), "rb") as f:
            self.word_to_idx = pickle.load(f)

        if use_inception:
            L = 64
            D = 2048
            cnn_model_path = os.path.join(path_prefix, 'data/inception_v3.ckpt')
        else:
            L = 196
            D = 512
            cnn_model_path = os.path.join(path_prefix, './data/imagenet-vgg-verydeep-19.mat')

        self.batch_size = 128
        self.sess = sess
        self.use_inception = use_inception

        print("Creating model...")
        self.model = CaptionGenerator(self.word_to_idx,
                                      dim_feature=[L, D],
                                      dim_embed=512,
                                      dim_hidden=1800,
                                      n_time_step=16,
                                      prev2out=True,
                                      ctx2out=True,
                                      alpha_c=5.0,
                                      selector=True,
                                      dropout=True,
                                      use_cnn="inception" if use_inception else "vgg",
                                      cnn_model_path=cnn_model_path)

        print("Loading CNN weights...")
        self.model.cnn.load_weights(sess)

        print("Building sampler...")
        self.alphas, self.betas, self.generated_captions = self.model.build_sampler(max_len=20)

        # initialize model and load weights
        print("Loading LSTM weights...")
        # tf.global_variables_initializer().run()
        saver = tf.train.Saver(self.model.sampler_vars)
        saver.restore(sess, model_path)

    def inference_np(self, images):
        nimgs = images.shape[0]
        print("Running inference on {} images...".format(nimgs))
        nbatches = int(math.ceil(nimgs / float(self.batch_size)))
        all_decoded = []
        all_alphas = None
        all_betas = None
        for i in range(nbatches):
            start = i * self.batch_size
            end = (i + 1) * self.batch_size
            end = nimgs if end >= nimgs else end
            batch_images = images[start:end]
            print("processing {} images ({} to {})".format(batch_images.shape[0], start + 1, end))
            batch_alphas, batch_betas, batch_gen_cap = self.sess.run(
                [self.alphas, self.betas, self.generated_captions],
                feed_dict={self.model.images: batch_images})
            # batch_gen_cap = self.sess.run(self.generated_captions, feed_dict={self.model.images: batch_images})
            batch_decoded = decode_captions(batch_gen_cap, self.model.idx_to_word)
            all_decoded.extend(batch_decoded)
            all_alphas = np.concatenate([all_alphas, batch_alphas]) if all_alphas is not None else batch_alphas
            all_betas = np.concatenate([all_betas, batch_betas]) if all_betas is not None else batch_betas
        return all_alphas, all_betas, all_decoded

    @staticmethod
    def resize_image(image, image_size):
        # center-crop to a square, then resize to image_size x image_size
        width, height = image.size
        if width > height:
            left = (width - height) / 2
            right = width - left
            top = 0
            bottom = height
        else:
            top = (height - width) / 2
            bottom = height - top
            left = 0
            right = width
        image = image.crop((left, top, right, bottom))
        image = image.resize([image_size, image_size], Image.ANTIALIAS)
        return image

    def preprocess_file(self, file_name):
        print("preprocess", file_name)
        if os.path.splitext(file_name)[1] == ".npy":
            return np.squeeze(np.load(file_name))
        else:
            img_np = np.array(self.resize_image(Image.open(file_name),
                                                self.model.cnn.image_size)).astype(np.float32)
            if self.use_inception:
                img_np /= 255.0
                img_np -= 0.5
                img_np *= 2.0
            # convert grey scale image to 3-channel
            if img_np.ndim == 2:
                img_np = np.stack((img_np,) * 3, axis=-1)
            return img_np

    def inference_files(self, image_files):
        print("processing {} images...".format(len(image_files)))
        image_batch = np.array([self.preprocess_file(x) for x in image_files])
        return self.inference_np(image_batch)
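# Illustrative helper (not in the original source): run CaptionInference on a list of
# image files. The checkpoint path is a placeholder; use_inception selects between the
# Inception-v3 (64x2048) and VGG-19 (196x512) feature layouts the model was built for.
def run_caption_inference(image_files, model_path='model/lstm/model-20', use_inception=False):
    with tf.Session() as sess:
        cap_infer = CaptionInference(sess, model_path, use_inception)
        alphas, betas, captions = cap_infer.inference_files(image_files)
    for fname, cap in zip(image_files, captions):
        print(fname, '->', cap)
    return alphas, betas, captions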