def _extract_feature(vgg_model_path, batch_size=32):
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for split in ['train', 'val', 'test']:
            anno_path = '/home/yifan/PythonProjects/im2txt-att/data/%s/%s.annotations.pkl' % (split, split)
            save_path = '/home/yifan/PythonProjects/im2txt-att/data/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = list(annotations['file_name'].unique())
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                # list(...) is required under Python 3, where map() returns an iterator
                image_batch = np.array(list(map(lambda x: ndimage.imread(x, mode='RGB'),
                                                image_batch_file))).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print('Processed %d %s features..' % (end, split))
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print('Saved %s..' % save_path)
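# A minimal usage sketch for _extract_feature. The weight-file name matches the
# paths used elsewhere in this repo; adjust batch_size to fit GPU memory. This
# assumes Vgg19, load_pickle, hickle, ndimage, np, and tf are imported as in the
# surrounding scripts.
if __name__ == '__main__':
    _extract_feature('./data/imagenet-vgg-verydeep-19.mat', batch_size=32)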
def __init__(self, vgg19_path=None):
    if vgg19_path is not None:
        self.vggnet = Vgg19(vgg19_path)
        self.vggnet.build()
    self.sess = None
    self.val_data = None
def __init__(self, word_to_idx, dim_feature=[196, 512], dim_embed=512, dim_hidden=1024,
             n_time_step=16, prev2out=True, ctx2out=True, alpha_c=0.0, selector=True,
             dropout=True, use_cnn=None, cnn_model_path=None):
    """
    Args:
        word_to_idx: word-to-index mapping dictionary.
        dim_feature: (optional) Dimension of vggnet19 conv5_3 feature vectors.
        dim_embed: (optional) Dimension of word embedding.
        dim_hidden: (optional) Dimension of all hidden states.
        n_time_step: (optional) Time step size of LSTM.
        prev2out: (optional) Feed previously generated word to hidden state. (see Eq (7) for explanation)
        ctx2out: (optional) Feed context to hidden state. (see Eq (7) for explanation)
        alpha_c: (optional) Doubly stochastic regularization coefficient. (see Section (4.2.1) for explanation)
        selector: (optional) Gating scalar for context vector. (see Section (4.2.1) for explanation)
        dropout: (optional) If true, a dropout layer is added.
        use_cnn: (optional) Either "inception" or "vgg"; if None, precomputed features are fed in directly.
        cnn_model_path: (optional) Path to the pretrained CNN weights, used when use_cnn is set.
    """
    self.word_to_idx = word_to_idx
    self.idx_to_word = {i: w for w, i in iteritems(word_to_idx)}
    self.prev2out = prev2out
    self.ctx2out = ctx2out
    self.alpha_c = alpha_c
    self.selector = selector
    self.dropout = dropout
    self.use_cnn = use_cnn
    self.V = len(word_to_idx)
    self.L = dim_feature[0]
    self.D = dim_feature[1]
    self.M = dim_embed
    self.H = dim_hidden
    self.T = n_time_step
    self._start = word_to_idx['<START>']
    self._null = word_to_idx['<NULL>']

    self.weight_initializer = tf.contrib.layers.xavier_initializer()
    self.const_initializer = tf.constant_initializer(0.0)
    self.emb_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

    if use_cnn is None:
        # placeholder for precomputed features
        self.features = tf.placeholder(tf.float32, [None, self.L, self.D])
    else:
        # build the CNN feature extractor
        if use_cnn == "inception":
            self.cnn = InceptionV3(cnn_model_path)
        elif use_cnn == "vgg":
            self.cnn = Vgg19(cnn_model_path)
        else:
            raise RuntimeError("Unknown CNN model " + use_cnn)
        self.cnn.build()
        # placeholder for image input
        self.images = self.cnn.images
        # output features from the CNN
        self.features = self.cnn.features

    # placeholder for captions
    self.captions = tf.placeholder(tf.int32, [None, self.T + 1])
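# A minimal instantiation sketch (the class name is hypothetical; use whatever
# class owns the constructor above). word_to_idx is assumed to come from the
# preprocessed vocabulary pickle; passing use_cnn='vgg' wires the raw-image
# placeholder through the CNN instead of exposing a feature placeholder.
# word_to_idx = load_pickle('./data/train/word_to_idx.pkl')
# model = CaptionGenerator(word_to_idx, dim_feature=[196, 512], dim_embed=512,
#                          dim_hidden=1024, n_time_step=16, use_cnn='vgg',
#                          cnn_model_path='./data/imagenet-vgg-verydeep-19.mat')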
def main():
    # maximum length of a caption (number of words); longer captions are dropped.
    max_length = 20
    # words occurring fewer than word_count_threshold times in the training set map to the unknown token.
    word_count_threshold = 1

    # about 500 images and 2500 captions
    test_dataset = _process_test_data(image_dir='image/XingBi_image_resized/')
    print('Finished processing caption data')

    split = 'X_test'
    save_pickle(test_dataset, 'data/X_test/X_test.annotations.pkl')
    annotations = load_pickle('./data/%s/%s.annotations.pkl' % (split, split))

    file_names = np.asarray(annotations['file_name'])
    save_pickle(file_names, './data/%s/%s.file.names.pkl' % (split, split))
    image_idxs = _build_image_idxs(annotations)
    save_pickle(image_idxs, './data/%s/%s.image.idxs.pkl' % (split, split))

    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    # batch size for extracting feature vectors from vggnet
    batch_size = 80
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        anno_path = './data/%s/%s.annotations.pkl' % (split, split)
        annotations = load_pickle(anno_path)
        image_path = list(annotations['file_name'].unique())
        save_path = './data/%s/%s.features.hkl' % (split, split)
        n_examples = len(image_path)
        all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
        for start, end in zip(range(0, n_examples, batch_size),
                              range(batch_size, n_examples + batch_size, batch_size)):
            # clamp the last batch so the progress message does not overshoot
            end = min(end, n_examples)
            image_batch_file = image_path[start:end]
            image_batch = np.array(list(map(lambda x: ndimage.imread(x, mode='RGB'),
                                            image_batch_file))).astype(np.float32)
            feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
            all_feats[start:end, :] = feats
            print("Processed %d %s features.." % (end, split))
        # use hickle to save huge feature vectors
        hickle.dump(all_feats, save_path)
        print("Saved %s.." % save_path)
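# The start/end pairing above recurs throughout these scripts. A small generator
# (a hypothetical helper, not part of the original code) makes the clamping
# explicit and keeps the extraction loops shorter:
def iter_batches(n_examples, batch_size):
    """Yield (start, end) index pairs covering range(n_examples) in batch_size steps."""
    for start in range(0, n_examples, batch_size):
        yield start, min(start + batch_size, n_examples)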
def get_val(filenames):
    # batch size for extracting feature vectors from vggnet (currently unused:
    # the whole file list is fed as a single batch below).
    batch_size = 2
    # vgg model path
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        image_batch = np.array(list(map(lambda x: ndimage.imread(x, mode='RGB'),
                                        filenames))).astype(np.float32)
        feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
    data = {}
    data['filenames'] = filenames
    data['features'] = feats
    return data
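# Usage sketch: extract features for a couple of validation images in one call.
# The file names are placeholders; any RGB images resized to 224x224 will do.
# val = get_val(['./image/val2014_resized/img_0001.jpg',
#                './image/val2014_resized/img_0002.jpg'])
# print(val['features'].shape)  # expected (2, 196, 512) for conv5_3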
def main():
    # batch size for extracting feature vectors from vggnet.
    batch_size = 500
    # maximum length of a caption (number of words); longer captions are dropped.
    max_length = 100
    # words occurring fewer than word_count_threshold times in the training set map to the unknown token.
    word_count_threshold = 1
    # data path
    data_path = './data/Sample1/'
    # vgg model path
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
    # caption_file = 'data/annotations/captions_train2014.json'
    # image_dir = './image/2014_resized/'

    train_dataset = _process_caption_data(caption_data=data_path + 'train/train.json', max_length=max_length)
    test_dataset = _process_caption_data(caption_data=data_path + 'test/test.json', max_length=max_length)
    print('Finished processing caption data')

    save_pickle(train_dataset, data_path + 'train/train.annotations.pkl')
    save_pickle(test_dataset, data_path + 'test/test.annotations.pkl')

    for split in ['train', 'test']:
        # for split in ['train', 'val']:
        annotations = load_pickle(data_path + '%s/%s.annotations.pkl' % (split, split))
        if split == 'train':
            word_to_idx, process_vocab, max_l = _build_vocab(annotations=annotations,
                                                             threshold=word_count_threshold)
            save_pickle(word_to_idx, data_path + '%s/word_to_idx.pkl' % split)
            save_pickle(process_vocab, data_path + '%s/process-vocab.pkl' % split)

        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx, max_length=max_l)
        save_pickle(captions, data_path + '%s/%s.captions.pkl' % (split, split))

        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, data_path + '%s/%s.file.names.pkl' % (split, split))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, data_path + '%s/%s.image.idxs.pkl' % (split, split))

        # prepare reference captions to compute BLEU scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['report_id']):
            if image_id not in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, data_path + '%s/%s.references.pkl' % (split, split))
        print("Finished building %s caption dataset" % split)

    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    # model = pretrained_model()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for split in ['train', 'test']:
            anno_path = data_path + '%s/%s.annotations.pkl' % (split, split)
            save_path = data_path + '%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = annotations['images']
            n_examples = len(image_path)
            # ndarray to store the merged feature maps of the two images per example
            all_feats = np.ndarray([n_examples, 196, 1024], dtype=np.float32)
            # all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            i = 0
            for image_record in list(image_path):
                j = 0
                comb_map = np.ndarray([len(image_record), 196, 512], dtype=np.float32)
                for image in image_record:
                    # each image is fed individually; expand_dims adds the batch axis
                    image_batch = np.expand_dims(
                        np.array(ndimage.imread(image, mode='RGB').astype(np.float32)), axis=0)
                    feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                    # feats = extract_features(model, image_batch_file)
                    # feats = featureVec_image(image_batch_file, model)
                    # feats = Main_hyper(image_batch)
                    comb_map[j, :] = feats
                    j += 1
                new_map = merge_feature_maps(comb_map)
                all_feats[i, :] = new_map
                # all_feats[i, :] = comb_map[0, :]
                i += 1
                print("Processed %d %s features" % (i, split))
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print("Saved %s.." % save_path)
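# merge_feature_maps is referenced above but not shown here. A plausible minimal
# sketch, assuming the intent is to concatenate the per-image conv5_3 maps along
# the channel axis so that a (2, 196, 512) stack becomes (196, 1024), matching
# the all_feats allocation above:
def merge_feature_maps(comb_map):
    """Concatenate per-image feature maps channel-wise: (k, 196, 512) -> (196, 512*k)."""
    # np.concatenate treats the first axis of comb_map as the sequence of maps
    return np.concatenate(comb_map, axis=-1)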
def train(self):
    # train/val dataset
    n_examples = self.data['captions'].shape[0]
    n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
    # features = self.data['features']
    captions = self.data['captions']
    image_idxs = self.data['image_idxs']
    train_file_names = self.data['file_names']
    # val_features = self.val_data['features']
    val_file_names = self.val_data['file_names']
    n_iters_val = int(np.ceil(float(val_file_names.shape[0]) / self.batch_size))

    # build graphs for training model and sampling captions
    loss = self.model.build_model()

    # addition: build a Vgg19 so features are extracted on the fly instead of preloaded
    vgg_model_path = '/mnt/zye/show-attend-and-tell/data/imagenet-vgg-verydeep-19.mat'
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()

    # train op
    with tf.name_scope('optimizer'):
        optimizer = self.optimizer(learning_rate=self.learning_rate)
        grads = tf.gradients(loss, tf.trainable_variables())
        grads_and_vars = list(zip(grads, tf.trainable_variables()))
        train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)

    tf.get_variable_scope().reuse_variables()
    _, _, generated_captions = self.model.build_sampler(max_len=20)

    # summary op
    tf.summary.scalar('batch_loss', loss)
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    for grad, var in grads_and_vars:
        tf.summary.histogram(var.op.name + '/gradient', grad)
    summary_op = tf.summary.merge_all()

    print "The number of epochs: %d" % self.n_epochs
    print "Data size: %d" % n_examples
    print "Batch size: %d" % self.batch_size
    print "Iterations per epoch: %d" % n_iters_per_epoch

    config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.per_process_gpu_memory_fraction = 0.9
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.variables_initializer(tf.global_variables() + tf.get_collection('Vgg19')))
        summary_writer = tf.summary.FileWriter(self.log_path, graph=tf.get_default_graph())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        if self.pretrained_model is not None:
            print "Start training with pretrained model.."
            saver.restore(sess, self.pretrained_model)

        prev_loss = -1
        curr_loss = 0
        start_t = time.time()
        for e in range(self.n_epochs):
            rand_idxs = np.random.permutation(n_examples)
            captions = captions[rand_idxs]
            image_idxs = image_idxs[rand_idxs]
            for i in range(n_iters_per_epoch):
                captions_batch = captions[i * self.batch_size:(i + 1) * self.batch_size]
                image_idxs_batch = image_idxs[i * self.batch_size:(i + 1) * self.batch_size]
                # addition: extract the feature batch on the fly from the raw images
                image_batch_file = list(train_file_names[image_idxs_batch])
                image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                           image_batch_file)).astype(np.float32)
                features_batch = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                # features_batch = features[image_idxs_batch]
                feed_dict = {self.model.features: features_batch, self.model.captions: captions_batch}
                _, l = sess.run([train_op, loss], feed_dict)
                curr_loss += l

                # write summary for tensorboard visualization
                if i % 10 == 0:
                    summary = sess.run(summary_op, feed_dict)
                    summary_writer.add_summary(summary, e * n_iters_per_epoch + i)

                if (i + 1) % self.print_every == 0:
                    print "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f" % (e + 1, i + 1, l)
                    ground_truths = captions[image_idxs == image_idxs_batch[0]]
                    decoded = decode_captions(ground_truths, self.model.idx_to_word)
                    for j, gt in enumerate(decoded):
                        print "Ground truth %d: %s" % (j + 1, gt)
                    gen_caps = sess.run(generated_captions, feed_dict)
                    decoded = decode_captions(gen_caps, self.model.idx_to_word)
                    print "Generated caption: %s\n" % decoded[0]

            print "Previous epoch loss: ", prev_loss
            print "Current epoch loss: ", curr_loss
            print "Elapsed time: ", time.time() - start_t
            prev_loss = curr_loss
            curr_loss = 0

            # print out BLEU scores and write them to file
            if self.print_bleu:
                all_gen_cap = np.ndarray((val_file_names.shape[0], 20))
                for i in range(n_iters_val):
                    # addition: on-the-fly feature extraction for the validation batch
                    image_batch_file = list(val_file_names[i * self.batch_size:(i + 1) * self.batch_size])
                    image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                               image_batch_file)).astype(np.float32)
                    features_batch = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                    # features_batch = val_features[i*self.batch_size:(i+1)*self.batch_size]
                    feed_dict = {self.model.features: features_batch}
                    gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                    all_gen_cap[i * self.batch_size:(i + 1) * self.batch_size] = gen_cap
                all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                save_pickle(all_decoded, "./data/val/val.candidate.captions.pkl")
                scores = evaluate(data_path='./data', split='val', get_scores=True)
                write_bleu(scores=scores, path=self.model_path, epoch=e)

            # save model parameters
            if (e + 1) % self.save_every == 0:
                saver.save(sess, os.path.join(self.model_path, 'model'), global_step=e + 1)
                print "model-%s saved." % (e + 1)
def test(self, split='test', attention_visualization=True, save_sampled_captions=True):
    '''
    Args:
        - self.test_data: dictionary with the following keys:
            - features: Feature vectors of shape (5000, 196, 512) (no longer preloaded;
              extracted on the fly below)
            - file_names: Image file names of shape (5000,)
            - captions: Captions of shape (24210, 17)
            - image_idxs: Indices mapping captions to images, shape (24210,)
            - features_to_captions: Mapping from features to captions (5000, 4~5)
        - split: 'train', 'val' or 'test'
        - attention_visualization: If True, visualize attention weights with images
          for each sampled word. (IPython notebook)
        - save_sampled_captions: If True, save sampled captions to a pkl file for
          computing BLEU scores.
    '''
    # features = data['features']

    # build a graph to sample captions
    alphas, betas, sampled_captions = self.model.build_sampler(max_len=20)  # (N, max_len, L), (N, max_len)

    # addition: build a Vgg19 so features are extracted on the fly
    vgg_model_path = '/mnt/zye/show-attend-and-tell/data/imagenet-vgg-verydeep-19.mat'
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # sess.run(tf.global_variables_initializer())
        sess.run(tf.variables_initializer(tf.global_variables() + tf.get_collection('Vgg19')))
        saver = tf.train.Saver(tf.global_variables())
        # ckpt = tf.train.get_checkpoint_state(self.test_model)
        saver.restore(sess, self.test_model)
        print 'Restored %s' % self.test_model

        data_size = self.test_data['file_names'].shape[0]
        mask = np.random.choice(data_size, self.batch_size)
        image_files = self.test_data['file_names'][mask]
        image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                   list(image_files))).astype(np.float32)
        features_batch = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
        # features_batch, image_files = sample_coco_minibatch(data, self.batch_size)
        feed_dict = {self.model.features: features_batch}
        alps, bts, sam_cap = sess.run([alphas, betas, sampled_captions], feed_dict)  # (N, max_len, L), (N, max_len)
        decoded = decode_captions(sam_cap, self.model.idx_to_word)

        if attention_visualization:
            # visualize the first 10 sampled images (assumes batch_size >= 10)
            for n in range(10):
                print "Sampled Caption: %s" % decoded[n]
                # plot original image
                img = ndimage.imread(image_files[n])
                plt.subplot(4, 5, 1)
                plt.imshow(img)
                plt.axis('off')
                # plot images with attention weights
                words = decoded[n].split(" ")
                for t in range(len(words)):
                    if t > 18:
                        break
                    plt.subplot(4, 5, t + 2)
                    plt.text(0, 1, '%s(%.2f)' % (words[t], bts[n, t]), color='black',
                             backgroundcolor='white', fontsize=8)
                    plt.imshow(img)
                    alp_curr = alps[n, t, :].reshape(14, 14)
                    alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
                    plt.imshow(alp_img, alpha=0.85)
                    plt.axis('off')
                plt.show()

        if save_sampled_captions:
            all_sam_cap = np.ndarray((data_size, 20))
            num_iter = int(np.ceil(float(data_size) / self.batch_size))
            for i in range(num_iter):
                image_batch_file = list(self.test_data['file_names'][i * self.batch_size:(i + 1) * self.batch_size])
                image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                           image_batch_file)).astype(np.float32)
                features_batch = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                # features_batch = features[i*self.batch_size:(i+1)*self.batch_size]
                feed_dict = {self.model.features: features_batch}
                all_sam_cap[i * self.batch_size:(i + 1) * self.batch_size] = sess.run(sampled_captions, feed_dict)
            all_decoded = decode_captions(all_sam_cap, self.model.idx_to_word)
            save_pickle(all_decoded, "./data/%s/%s.candidate.captions.pkl" % (split, split))
def main():
    import functools  # for cmp_to_key; the frame comparator below is Python 2 style

    PATH = os.getcwd()
    vgg_model_path = PATH + '/data/imagenet-vgg-verydeep-19.mat'
    num_of_image_per_video = 17
    splits = ['train', 'val', 'test']
    # TIME = str(datetime.now())

    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for each in splits:
            # settle the paths
            path = PATH + '/data/data_set/' + each + '/'
            save_path_feats = path + 'features_' + each + '.hkl'
            save_path_labels_all = path + 'labels_all_' + each + '.hkl'
            # load video filenames and labels
            video_filename = load_pickle(path + 'video_filenames_' + each + '.pkl')
            labels = load_pickle(path + 'labels_' + each + '.pkl')
            # gather the whole data for the current split
            all_feats = np.ndarray([len(video_filename), num_of_image_per_video, 196, 512],
                                   dtype=np.float32)
            all_labels = [None] * len(video_filename)

            # feature extraction
            for idx, vf in enumerate(video_filename):
                # sorted(..., cmp=comp) is Python 2 only; cmp_to_key keeps the
                # same comparator under Python 3
                images_list = sorted(list(os.walk(vf))[0][-1], key=functools.cmp_to_key(comp))
                print('Processed ' + str(idx + 1) + ' videos..')

                # generate image paths
                cur_images_path = [vf + '/' + image for image in images_list]
                step = int(float(len(images_list)) / float(num_of_image_per_video))
                print(step)

                # too few frames: pad by repeating the last frame
                if step == 0:
                    cur_images_path += [cur_images_path[-1]] * (num_of_image_per_video - len(cur_images_path))
                # no subsampling needed: cut a window from the middle
                if step == 1:
                    start_num = np.floor(float(len(images_list) - num_of_image_per_video) / 2)
                    start = 1 if start_num == 0 else start_num
                    cur_images_path = cur_images_path[int(start - 1):int(num_of_image_per_video + start - 1)]
                # subsample: take the last frame of each partition, then cut from
                # the middle again in case of residual frames
                if step > 1:
                    cur_images_path = cur_images_path[step - 1::step]
                    start_num = np.floor(float(len(cur_images_path) - num_of_image_per_video) / 2)
                    start = 1 if start_num == 0 else start_num
                    cur_images_path = cur_images_path[int(start - 1):int(num_of_image_per_video + start - 1)]

                # in case of failure
                if len(cur_images_path) != num_of_image_per_video:
                    print('step: ' + str(step))
                    print('length of original images: ' + str(len(images_list)))
                    print('length of standard: ' + str(num_of_image_per_video))
                    print('length: ' + str(len(cur_images_path)))
                    print('errors occur..')
                    exit()

                cur_labels = labels[idx]
                # read images and extract features
                image_batch = np.array(list(map(lambda x: ndimage.imread(x, mode='RGB'),
                                                cur_images_path))).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[idx, :] = feats
                all_labels[idx] = [cur_labels] * num_of_image_per_video

            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path_feats)
            all_labels = np.array(all_labels)
            hickle.dump(all_labels, save_path_labels_all)
            print("Saved %s.." % save_path_feats)
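# comp is referenced above but not defined here. A plausible sketch, assuming
# frame files carry a numeric stem (e.g. '12.jpg') that should sort numerically
# rather than lexicographically; both the name and the naming scheme are
# assumptions about the original helper:
def comp(a, b):
    """Order frame filenames by their integer stem, so '2.jpg' < '10.jpg'."""
    ia = int(os.path.splitext(a)[0])
    ib = int(os.path.splitext(b)[0])
    return (ia > ib) - (ia < ib)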
def main():
    start = datetime.now()
    caption_file = 'data/annotations/captions_train2014.json'
    image_dir = 'image/train2014_resized'
    max_length = 15
    word_count_threshold = 1
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
    batch_size = 50

    print '1. Building top-1K dictionary from train dataset'
    if not os.path.exists('./data/top1k.pkl'):
        train_dataset = _process_caption_data(caption_file=caption_file, image_dir=image_dir,
                                              max_length=max_length)
        word_to_idx = _build_vocab(annotations=train_dataset, threshold=word_count_threshold)
        save_pickle(word_to_idx, './data/word_to_idx.pkl')
        top1k = _build_top1k_noun()
        save_pickle(top1k, './data/top1k.pkl')
    else:
        top1k = load_pickle('./data/top1k.pkl')

    print '2. Downloading and processing each keyword'
    cur_dir = os.getcwd()
    wnid_idx = 0
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    captions = {}

    if not os.path.exists('./data/imagenet/features/'):
        os.makedirs('./data/imagenet/features/')

    for wnid_idx, (wnid, word) in enumerate(top1k):
        save_path = './data/imagenet/features/%s.list' % wnid
        if not os.path.exists(save_path):
            print ' ----- Processing %s, %s, %d / %d' % (wnid, word, wnid_idx, len(top1k))
            print '\tdownloading'
            pre_url = 'http://www.image-net.org/download/synset?wnid='
            post_url = '&username=intuinno&accesskey=6be8155ee3d56b5120241b3bda13412d3cc0cd42&release=latest&src=stanford'
            testfile = urllib.URLopener()
            try:
                testfile.retrieve(pre_url + wnid + post_url, wnid + '.tar')
            except IOError as e:
                print 'Failed to download'
            else:
                original_dir = './data/imagenet/photos/%s/original/' % wnid
                resized_dir = './data/imagenet/photos/%s/resized/' % wnid
                if not os.path.exists(original_dir):
                    os.makedirs(original_dir)
                    os.makedirs(resized_dir)
                    os.rename(wnid + '.tar', original_dir + 'data.tar')
                    os.chdir(original_dir)
                    tar = tarfile.open('data.tar')
                    tar.extractall()
                    tar.close()
                    os.remove('data.tar')
                    os.chdir(cur_dir)
                else:
                    # directory already populated; discard the fresh download
                    os.remove('%s.tar' % wnid)

                print '\tresizing'
                resized_files = []
                image_files = os.listdir(original_dir)
                for i, image_file in enumerate(image_files):
                    try:
                        image = Image.open(os.path.join(original_dir, image_file))
                    except IOError as e:
                        print 'Error: cannot open %s' % os.path.join(original_dir, image_file)
                    else:
                        image = resize_image(image)
                        image.save(os.path.join(resized_dir, image_file), image.format)
                        resized_files.append(image_file)
                image_files = resized_files

                print '\tget vgg19 image features'
                with tf.Session() as sess:
                    tf.global_variables_initializer().run()
                    n_examples = len(image_files)
                    all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
                    for start, end in zip(range(0, n_examples, batch_size),
                                          range(batch_size, n_examples + batch_size, batch_size)):
                        image_batch_file = image_files[start:end]
                        image_batch = np.array(
                            map(lambda x: ndimage.imread(os.path.join(resized_dir, x), mode='RGB'),
                                image_batch_file)).astype(np.float32)
                        feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                        all_feats[start:end, :] = feats

                save_path = './data/imagenet/features/%s.hkl' % wnid
                hickle.dump(all_feats, save_path)
                save_path = './data/imagenet/features/%s.list' % wnid
                save_pickle(image_files, save_path)
                print "\tSaved %s.." % save_path
def main():
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # batch size for extracting feature vectors from vggnet.
    batch_size = 100
    # maximum length of a caption (number of words); longer captions are dropped.
    max_length = 20
    # words occurring fewer than word_count_threshold times in the training set map to the unknown token.
    word_count_threshold = 1
    # vgg model path
    vgg_model_path = '/home/jason6582/sfyc/attention-tensorflow/imagenet-vgg-verydeep-19.mat'

    # about 80000 images and 400000 captions for the train dataset
    train_dataset = _process_caption_data(
        caption_file='/home/jason6582/sfyc/attention-tensorflow/nus-wide/lite_train.json',
        image_dir='/home/jason6582/sfyc/NUS-WIDE/flickrfeature_resized/',
        max_length=max_length)
    # about 40000 images and 200000 captions
    test_dataset = _process_caption_data(
        caption_file='/home/jason6582/sfyc/attention-tensorflow/nus-wide/lite_test.json',
        image_dir='/home/jason6582/sfyc/NUS-WIDE/flickrfeature_resized/',
        max_length=max_length)
    # about 4000 images and 20000 captions for the val dataset
    val_cutoff = int(len(train_dataset) / 10)
    val_dataset = train_dataset[:val_cutoff]
    train_dataset = train_dataset[val_cutoff:]
    print 'Finished processing caption data'

    # split the training annotations into 16 parts
    train_cutoff = [0]
    for i in range(15):
        train_cutoff.append(int(len(train_dataset) / 16) * (i + 1))
    for i in range(15):
        save_pickle(train_dataset[train_cutoff[i]:train_cutoff[i + 1]],
                    'data/train/train.annotations81_%s.pkl' % str(i))
    save_pickle(train_dataset[train_cutoff[15]:], 'data/train/train.annotations81_15.pkl')
    save_pickle(val_dataset, 'data/val/val.annotations81.pkl')
    save_pickle(test_dataset.reset_index(drop=True), 'data/test/test.annotations81.pkl')

    # build the vocabulary over all training parts
    split = 'train'
    word_to_idx = {}
    for part in range(16):
        annotations = load_pickle('./data/%s/%s.annotations81_%s.pkl' % (split, split, str(part)))
        word_to_idx_part = _build_vocab(annotations=annotations, threshold=word_count_threshold)
        for key in word_to_idx_part:
            word_to_idx[key] = 0
    word_list = sorted(word_to_idx.iterkeys())
    for i, word in enumerate(word_list):
        word_to_idx[word] = i
    save_pickle(word_to_idx, './data/%s/word_to_idx81.pkl' % split)

    for part in range(16):
        annotations = load_pickle('./data/%s/%s.annotations81_%s.pkl' % (split, split, str(part)))
        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx,
                                         max_length=max_length)
        save_pickle(captions, './data/%s/%s.captions81_%s.pkl' % (split, split, str(part)))
        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names81_%s.pkl' % (split, split, str(part)))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, './data/%s/%s.image.idxs81_%s.pkl' % (split, split, str(part)))

        # prepare reference captions to compute BLEU scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references81_%s.pkl' % (split, split, str(part)))
    print "Finished building %s caption dataset" % split

    for split in ['val', 'test']:
        annotations = load_pickle('./data/%s/%s.annotations81.pkl' % (split, split))
        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx,
                                         max_length=max_length)
        save_pickle(captions, './data/%s/%s.captions81.pkl' % (split, split))
        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names81.pkl' % (split, split))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, './data/%s/%s.image.idxs81.pkl' % (split, split))

        # prepare reference captions to compute bleu scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references81.pkl' % (split, split))
        print "Finished building %s caption dataset" % split

    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        split = 'train'
        for part in range(16):
            print "part", part, "of %s features" % split
            anno_path = './data/%s/%s.annotations81_%s.pkl' % (split, split, str(part))
            save_path = './data/%s/%s.features81_%s.hkl' % (split, split, str(part))
            annotations = load_pickle(anno_path)
            image_path = list(annotations['file_name'].unique())
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                           image_batch_file)).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print "Processed %d %s features.." % (end, split)
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print "Saved %s.." % save_path

        # for split in ['val', 'test']:
        for split in ['test']:
            anno_path = './data/%s/%s.annotations81.pkl' % (split, split)
            save_path = './data/%s/%s.features81.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = list(annotations['file_name'].unique())
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                           image_batch_file)).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print "Processed %d %s features.." % (end, split)
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print "Saved %s.." % save_path
def main():
    PATH = os.getcwd()
    vgg_model_path = PATH + '/data/imagenet-vgg-verydeep-19.mat'
    data_dir = '../Dataset/data/tobii/'
    num_of_image_per_video = 17
    splits = ['test']
    # splits = ['train', 'val', 'test']
    # TIME = str(datetime.now())

    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for each in splits:
            # settle the paths
            if each == 'train':
                session = '0409-b'
            elif each == 'val':
                session = '0409-c'
            elif each == 'test':
                session = '0409-e'
            img_dir = '%s/frames/' % (data_dir + session)
            label_dir = '%s/label_all.txt' % (data_dir + session)
            path = PATH + '/data/data_set/' + each + '/'

            # generate image paths in natural (numeric) order
            images_list = natsorted([img_dir + file for file in os.listdir(img_dir)
                                     if file.endswith('.jpg')])
            step = int(float(len(images_list)) / float(num_of_image_per_video))
            print(step)
            all_feats = np.ndarray([step, num_of_image_per_video, 196, 512], dtype=np.float32)

            # read images and extract features
            for i in range(step):
                print('Processing No.' + str(i + 1) + '/%d batch..' % step)
                cur_images_path = images_list[i * 17:i * 17 + 17]
                image_batch = []
                for img_file in cur_images_path:
                    img = image.load_img(img_file, target_size=[224, 224])
                    x = image.img_to_array(img)
                    image_batch.append(x)
                image_batch = np.array(image_batch).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[i, :] = feats

            # read labels; iterating the file handle directly fixes the original
            # double-read (a for-loop over open() combined with f.readline()
            # skipped every other line)
            label = []
            with open(label_dir, 'r') as f:
                for line in f:
                    line = line.strip().split(',')
                    label.append(line[1])
            label_reshape = np.array(label)
            label_reshape = label_reshape[:step * 17].reshape(step, 17)
            filenames_new = np.array(list(range(step)))

            train_data = {
                'features': all_feats,
                'labels': label_reshape,
                'new_filename': filenames_new
            }
            # pickle the (large) feature dictionary
            with open(each + '_data_vgg' + '.pkl', 'wb') as f:
                pickle.dump(train_data, f)
def main(params):
    batch_size = params['batch_size']
    max_length = params['max_length']
    word_count_threshold = params['word_count_threshold']
    vgg_model_path = params['vgg_model_path']
    splits = ['val', 'test']

    for split in splits:
        annotations = load_pickle('./data/%s/%s.annotations.pkl' % (split, split))
        if split == 'train':
            word_to_idx = build_word_to_idx(annotations['caption'], word_count_threshold)
            save_pickle(word_to_idx, './data/%s/word_to_idx.pkl' % split)
        else:
            # splits here omit 'train', so reuse the vocabulary saved by a previous
            # training run (otherwise word_to_idx would be undefined below)
            word_to_idx = load_pickle('./data/train/word_to_idx.pkl')
        captions = build_caption_vectors(annotations, word_to_idx, max_length)
        save_pickle(captions, './data/%s/%s.captions.pkl' % (split, split))
        file_names, id_to_idx = get_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names.pkl' % (split, split))
        if split == 'train':
            image_idxs = get_image_idxs(annotations, id_to_idx)
            save_pickle(image_idxs, './data/%s/%s.image.idxs.pkl' % (split, split))

        # prepare reference captions to compute BLEU scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references.pkl' % (split, split))

    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for split in splits:
            anno_path = './data/%s/%s.annotations.pkl' % (split, split)
            save_path = './data/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_list = list(annotations['file_name'].unique())
            # list(...) so the paths can be counted and sliced under Python 3
            if split == 'train':
                image_path = list(map(lambda x: os.path.join('./image/train2014_resized', str(x)),
                                      image_list))
            else:
                image_path = list(map(lambda x: os.path.join('./image/val2014_resized', str(x)),
                                      image_list))
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                image_batch = np.array(list(map(lambda x: ndimage.imread(x, mode='RGB'),
                                                image_batch_file))).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print("Processed %d %s features.." % (end, split))

            # normalize feature vectors to zero mean and unit variance per channel
            all_feats = np.reshape(all_feats, [-1, 512])
            mean = np.mean(all_feats, 0)
            var = np.var(all_feats, 0)
            all_feats = (all_feats - mean) / np.sqrt(var)
            all_feats = np.reshape(all_feats, [-1, 196, 512])

            # use hickle to save the huge numpy array
            hickle.dump(all_feats, save_path)
            print("Saved %s.." % save_path)
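# Note that the normalization above computes mean/var per split, so val and test
# end up standardized with different statistics. A sketch of the usual alternative
# (an assumption about intent, not original code): compute the statistics once on
# the training features and reuse them for every split.
# train_mean, train_var = np.mean(train_feats, 0), np.var(train_feats, 0)
# val_feats = (val_feats - train_mean) / np.sqrt(train_var)
# test_feats = (test_feats - train_mean) / np.sqrt(train_var)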
def main():
    # batch size for extracting feature vectors from vggnet.
    batch_size = 100
    # maximum length of a caption (number of words); longer captions are dropped.
    max_length = 15
    # words occurring fewer than word_count_threshold times in the training set map to the unknown token.
    word_count_threshold = 1
    # vgg model path
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
    # path to resized images
    i_fp = './image/2014_resized/'
    # n_images = 67691

    # build the dataset
    print 'Start processing caption data'
    train_dataset = get_caption_data(i_fp, max_length)
    print 'Finished processing caption data'

    # train / val / test split: 70% / 15% / 15%
    train_cutoff = int(0.70 * len(train_dataset))
    val_cutoff = int(0.85 * len(train_dataset))

    # path to data directory
    d_fp = './data'
    if not os.path.exists(d_fp + '/train'):
        os.makedirs(d_fp + '/train')
    if not os.path.exists(d_fp + '/val'):
        os.makedirs(d_fp + '/val')
    if not os.path.exists(d_fp + '/test'):
        os.makedirs(d_fp + '/test')

    save_pickle(train_dataset[:train_cutoff], d_fp + '/train/train.annotations.pkl')
    save_pickle(train_dataset[train_cutoff:val_cutoff].reset_index(drop=True),
                d_fp + '/val/val.annotations.pkl')
    # note: starting at val_cutoff + 1 skips one example between val and test
    save_pickle(train_dataset[val_cutoff + 1:].reset_index(drop=True),
                d_fp + '/test/test.annotations.pkl')

    # ----- train, val, and test annotations saved -----
    for split in ['train', 'val', 'test']:
        annotations = load_pickle(d_fp + '/%s/%s.annotations.pkl' % (split, split))
        if split == 'train':
            word_to_idx = _build_vocab(annotations=annotations, threshold=word_count_threshold)
            save_pickle(word_to_idx, d_fp + '/%s/word_to_idx.pkl' % split)
        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx,
                                         max_length=max_length)
        save_pickle(captions, d_fp + '/%s/%s.captions.pkl' % (split, split))
        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, d_fp + '/%s/%s.file.names.pkl' % (split, split))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, d_fp + '/%s/%s.image.idxs.pkl' % (split, split))

        # prepare reference captions to compute BLEU scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, d_fp + '/%s/%s.references.pkl' % (split, split))
        print "Finished building %s caption dataset" % split

    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for split in ['train', 'val', 'test']:
            anno_path = d_fp + '/%s/%s.annotations.pkl' % (split, split)
            save_path = d_fp + '/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = list(annotations['file_name'].unique())
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                print start, '-', end
                image_batch_file = image_path[start:end]
                image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'),
                                           image_batch_file)).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print "Processed %d %s features.." % (end, split)
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print "Saved %s.." % save_path
def main():
    # batch size for extracting feature vectors from vggnet.
    batch_size = 100
    # maximum length of a caption (number of words); longer captions are dropped.
    max_length = 15
    # words occurring fewer than word_count_threshold times in the training set map to the unknown token.
    word_count_threshold = 1

    train_caption_file = TRAIN_DATA_PATH + '/caption_train_annotations_20170902.json'
    image_dir = TRAIN_DATA_PATH + '/caption_train_images_20170902/'
    val_caption_file = VAL_DATA_PATH + '/caption_validation_annotations_20170910.json'
    val_image_dir = VAL_DATA_PATH + '/caption_validation_images_20170910/'
    train_dataset = _process_caption_data(train_caption_file, image_dir, max_length)
    val_dataset = _process_caption_data(val_caption_file, val_image_dir, max_length)

    # create the output directories
    sub_train_split = ['train' + str(i) for i in range(21)]
    split_parts = ['train', 'val', 'test'] + sub_train_split
    for split in split_parts:
        path = 'data/' + split
        if not os.path.exists(path):
            os.makedirs(path)

    save_pickle(train_dataset, 'data/train/train.annotations.pkl')
    save_pickle(val_dataset[:-5 * 4000].reset_index(drop=True), 'data/val/val.annotations.pkl')
    save_pickle(val_dataset[-5 * 4000:].reset_index(drop=True), 'data/test/test.annotations.pkl')
    block_size = len(train_dataset) / 21
    for i in range(21):
        save_pickle(train_dataset[i * block_size:(i + 1) * block_size].reset_index(drop=True),
                    'data/train%d/train%d.annotations.pkl' % (i, i))

    for split in split_parts:
        annotations = load_pickle('./data/%s/%s.annotations.pkl' % (split, split))
        if split == 'train':
            word_to_idx = _build_vocab(annotations=annotations, threshold=word_count_threshold)
            save_pickle(word_to_idx, './data/%s/word_to_idx.pkl' % split)
        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx,
                                         max_length=max_length)
        save_pickle(captions, './data/%s/%s.captions.pkl' % (split, split))
        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names.pkl' % (split, split))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, './data/%s/%s.image.idxs.pkl' % (split, split))

        # prepare reference captions to compute BLEU scores later
        image_ids = set()
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in image_ids:
                image_ids.add(image_id)
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references.pkl' % (split, split))
        print "Finished building %s caption dataset" % split

    # extract conv5_3 feature vectors
    tf.reset_default_graph()
    vggnet = Vgg19(VGG_MODEL_PATH)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for split in split_parts[1:]:
            anno_path = './data/%s/%s.annotations.pkl' % (split, split)
            save_path = './data/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = list(annotations['image_file_name'].unique())
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                image_batch = np.array(map(lambda x: np.array(resize_image(Image.open(x))),
                                           image_batch_file)).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print "Processed %d %s features.." % (end, split)
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print "Saved %s.." % save_path
from core.vggnet import Vgg19
import tensorflow as tf
import numpy as np
import json

import moxel
from moxel.space import Image, String, Array

vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
vggnet = Vgg19(vgg_model_path)
vggnet.build()


def predict(image):
    image.resize((224, 224))
    image_batch = np.array([image.to_numpy()]).astype(np.float32)
    # note: a fresh session is created (and variables re-initialized) on every call
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
    return {
        # 'feature': String.from_str(str(feats))
        'feature': Array.from_numpy(feats)
    }
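# Since predict() above rebuilds a session per request, a hedged alternative
# (a sketch under the same imports, not moxel-specific API) is to initialize
# once at module load and reuse the session across calls:
# _sess = tf.Session()
# _sess.run(tf.global_variables_initializer())
# def predict(image):
#     image.resize((224, 224))
#     batch = np.array([image.to_numpy()]).astype(np.float32)
#     feats = _sess.run(vggnet.features, feed_dict={vggnet.images: batch})
#     return {'feature': Array.from_numpy(feats)}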
def test_custom(self, image_dir, vgg_model_path, attention_visualization=True):
    '''
    Args:
        - image_dir: Directory of .jpg images to caption.
        - vgg_model_path: Path to the pretrained VGG-19 weights.
        - attention_visualization: If True, visualize attention weights with images
          for each sampled word. (IPython notebook)
    '''
    # build a graph to sample captions
    alphas, betas, sampled_captions = self.model.build_sampler(max_len=20)  # (N, max_len, L), (N, max_len)

    # collect image files
    import glob, os
    image_files = []
    for file in glob.glob(image_dir + "*.jpg"):
        image_files.append(file)

    # read in image features
    image_features = []
    imgs = []
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session() as sess:
            vggnet = Vgg19(vgg_model_path)
            vggnet.build()
            tf.global_variables_initializer().run()
            for image_file in image_files:
                with open(image_file, 'r+b') as f:
                    with Image.open(f) as image:
                        img = np.asarray(resize_image(image))
                        imgs.append(img)
            image_batch = np.array(imgs)
            image_features = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver()
        saver.restore(sess, self.test_model)
        feed_dict = {self.model.features: image_features}
        alps, bts, sam_cap = sess.run([alphas, betas, sampled_captions], feed_dict)  # (N, max_len, L), (N, max_len)
        decoded = decode_captions(sam_cap, self.model.idx_to_word)
        for i in range(len(imgs)):
            img = imgs[i]
            print("Sampled Caption: %s" % decoded[i])
            # plot original image
            # plt.figure(figsize=(18, 9))
            plt.subplot(4, 5, 1)
            plt.imshow(img)
            plt.axis('off')
            if attention_visualization:
                # plot images with attention weights
                words = decoded[i].split(" ")
                for t in range(len(words)):
                    if t > 18:
                        break
                    plt.subplot(4, 5, t + 2)
                    plt.text(0, 1, '%s(%.2f)' % (words[t], bts[i, t]), color='black',
                             backgroundcolor='white', fontsize=8)
                    plt.imshow(img)
                    alp_curr = alps[i, t, :].reshape(14, 14)
                    alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
                    plt.imshow(alp_img, alpha=0.85)
                    plt.axis('off')
            plt.show()
from collections import Counter
from core.vggnet import Vgg19
from core.utils import *
import tensorflow as tf
import numpy as np
import pandas as pd
import hickle
import os
import json
import jieba

batch_size = 50
vggnet = Vgg19('./ai.challenger/data/imagenet-vgg-verydeep-19.mat')
vggnet.build()

os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# with tf.Session(config=config) as sess:
#     init = tf.global_variables_initializer()
#     sess.run(init)
#     for split in ['train']:
#         anno_path = './data/%s/%s.annotations.pkl' % (split, split)
#         annotations = load_pickle(anno_path)
#         image_path = list(annotations['file_name'].unique())
#         n_examples = len(image_path)
def __init__(self, sess, word_to_idx, dim_embed=512, dim_hidden=1024, n_time_step=16,
             prev2out=True, ctx2out=True, emo2out=True, alpha_c=0.0, selector=True,
             dropout=True, update_rule='adam', learning_rate=None,
             vgg_model_path='./data/imagenet-vgg-verydeep-19.mat',
             features_extractor='vgg', pretrained_model=None):
    """
    Args:
        word_to_idx: word-to-index mapping dictionary.
        dim_embed: (optional) Dimension of word embedding.
        dim_hidden: (optional) Dimension of all hidden states.
        n_time_step: (optional) Time step size of LSTM.
        prev2out: (optional) Feed previously generated word to hidden state. (see Eq (7) for explanation)
        ctx2out: (optional) Feed context to hidden state. (see Eq (7) for explanation)
        emo2out: (optional) Feed emotion vector to hidden state.
        alpha_c: (optional) Doubly stochastic regularization coefficient. (see Section (4.2.1) for explanation)
        selector: (optional) Gating scalar for context vector. (see Section (4.2.1) for explanation)
        dropout: (optional) If true, a dropout layer is added.
        update_rule: (optional) 'adam', 'momentum' or 'rmsprop'.
        learning_rate: (optional) If set, a training op is built.
        features_extractor: (optional) 'vgg' (196x512 conv5_3 features) or 'resnet' (49x2048 res5c features).
        pretrained_model: (optional) Directory of a pretrained generator checkpoint to restore.
    """
    self.word_to_idx = word_to_idx
    self.idx_to_word = {i: w for w, i in word_to_idx.items()}
    self.prev2out = prev2out
    self.ctx2out = ctx2out
    self.emo2out = emo2out
    self.alpha_c = alpha_c
    self.selector = selector
    self.dropout = dropout
    self.V = len(word_to_idx)
    self.M = dim_embed
    self.H = dim_hidden
    self.T = n_time_step + 1
    self._start = word_to_idx['<START>']
    self._null = word_to_idx['<NULL>']

    self.weight_initializer = tf.contrib.layers.xavier_initializer()
    self.const_initializer = tf.constant_initializer(0.0)
    self.emb_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

    self.features_extractor = features_extractor
    if features_extractor == 'vgg':
        self.vggnet = Vgg19(vgg_model_path)
        dim_feature = [196, 512]
    elif features_extractor == 'resnet':
        self.resnet152 = ResNetFeatureExtractor(M.resnet152(pretrained=True).to(device),
                                                feat_layer="res5c")
        dim_feature = [49, 2048]
    self.L = dim_feature[0]
    self.D = dim_feature[1]

    # placeholders
    self.features = tf.placeholder(tf.float32, [None, self.L, self.D])
    self.captions = tf.placeholder(tf.int32, [None, self.T + 1])
    self.emotions = tf.placeholder(tf.float32, [None, 3])
    # rewards come from the rollout policy and discriminator
    self.rewards = tf.placeholder(tf.float32, shape=[None, self.T])
    self.mode_learning = tf.placeholder(tf.int32)

    # build graphs for training model and sampling captions
    with tf.variable_scope(tf.get_variable_scope()):
        self.loss = self.build_model()
        tf.get_variable_scope().reuse_variables()
        _, _, self.generated_captions = self.build_sampler()

    # set the optimizer by update rule
    if update_rule == 'adam':
        self.optimizer = tf.train.AdamOptimizer
    elif update_rule == 'momentum':
        self.optimizer = tf.train.MomentumOptimizer
    elif update_rule == 'rmsprop':
        self.optimizer = tf.train.RMSPropOptimizer

    # train op
    if learning_rate:
        with tf.variable_scope(tf.get_variable_scope(), reuse=False):
            optimizer = self.optimizer(learning_rate=learning_rate)
            grads = tf.gradients(self.loss, tf.trainable_variables())
            self.grads_and_vars = list(zip(grads, tf.trainable_variables()))
            self.train_op = optimizer.apply_gradients(grads_and_vars=self.grads_and_vars)

    # init
    self.prev_loss = -1
    self.sess = sess

    # load pretrained model
    self.saver = tf.train.Saver(max_to_keep=40)
    if pretrained_model is not None:
        print("Pretrained generator loaded")
        self.saver.restore(sess=self.sess, save_path=os.path.join(pretrained_model, 'model.ckpt'))
    initialize_uninitialized(self.sess)
def main():
    # batch size for extracting feature vectors from vggnet.
    batch_size = 100
    # maximum length of a caption (number of words); longer captions are dropped.
    max_length = 15
    # words occurring fewer than word_count_threshold times in the training set map to the unknown token.
    word_count_threshold = 1
    # vgg model path
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'

    # about 80000 images and 400000 captions for the train dataset
    train_dataset = _process_caption_data(
        caption_file='data/annotations/captions_train2014.json',
        image_dir='image/train2014_resized/',
        max_length=max_length)
    # about 40000 images and 200000 captions
    val_dataset = _process_caption_data(
        caption_file='data/annotations/captions_val2014.json',
        image_dir='image/val2014_resized/',
        max_length=max_length)
    # about 4000 images and 20000 captions each for the val / test datasets
    val_cutoff = int(0.1 * len(val_dataset))
    test_cutoff = int(0.2 * len(val_dataset))
    print('Finished processing caption data')

    save_pickle(train_dataset, 'data/train/train.annotations.pkl')
    save_pickle(val_dataset[:val_cutoff], 'data/val/val.annotations.pkl')
    save_pickle(val_dataset[val_cutoff:test_cutoff].reset_index(drop=True),
                'data/test/test.annotations.pkl')

    for split in ['train', 'val', 'test']:
        annotations = load_pickle('./data/%s/%s.annotations.pkl' % (split, split))
        if split == 'train':
            word_to_idx = _build_vocab(annotations=annotations, threshold=word_count_threshold)
            save_pickle(word_to_idx, './data/%s/word_to_idx.pkl' % split)
        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx,
                                         max_length=max_length)
        save_pickle(captions, './data/%s/%s.captions.pkl' % (split, split))
        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names.pkl' % (split, split))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, './data/%s/%s.image.idxs.pkl' % (split, split))

        # prepare reference captions to compute BLEU scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references.pkl' % (split, split))
        print("Finished building %s caption dataset" % split)

    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for split in ['train', 'val', 'test']:
            anno_path = './data/%s/%s.annotations.pkl' % (split, split)
            save_path = './data/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = list(annotations['file_name'].unique())
            n_examples = len(image_path)
            all_feats = np.ndarray([n_examples, 196, 512], dtype=np.float32)
            for start, end in zip(range(0, n_examples, batch_size),
                                  range(batch_size, n_examples + batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                image_batch = np.array(list(map(lambda x: ndimage.imread(x, mode='RGB'),
                                                image_batch_file))).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print("Processed %d %s features.." % (end, split))
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print("Saved %s.." % save_path)
def get_featrues():
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
    vggnet = Vgg19(vgg_model_path)
    vggnet.build()

    img_path = r"C:\Users\song\Desktop\511project\show-attend-and-tell-tensorflow\image_data_to_be_labeled\Object_feature\JPEGImages"
    resized_img_path = r'C:\Users\song\Desktop\511project\show-attend-and-tell-tensorflow\image_data_to_be_labeled\Object_feature\resized'
    # resize every image to the 224x224 input expected by VGG-19
    for image in os.listdir(img_path):
        pil_im = Image.open(os.path.join(img_path, image))
        size = 224, 224
        pil_im = pil_im.resize(size)
        pil_im.save(os.path.join(resized_img_path, image))

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        image_batch_file = []
        for image in os.listdir(resized_img_path):
            image_batch_file.append(os.path.join(r".\image_data_to_be_labeled\Object_feature\resized",
                                                 image.rstrip('\n')))
        print(len(image_batch_file))

        # write the train list
        train_list_path = r'C:\Users\song\Desktop\511project\show-attend-and-tell-tensorflow\image_data_to_be_labeled\Object_feature\train_list.txt'
        train_batch_file = image_batch_file.copy()
        with open(train_list_path, 'w') as f:
            for train in train_batch_file:
                f.write(train + '\n')
        # (a parallel, commented-out block wrote test_list.txt and extracted test features)

        train_feats = np.ndarray([len(train_batch_file), 196, 512], dtype=np.float32)
        train_batch = []
        for image in train_batch_file:
            image_read = ndimage.imread(image, mode='RGB').astype(np.float32)
            train_batch.append(image_read)
        train_batch = np.array(train_batch)
        print(train_batch.shape)

        # extract features in batches of 10 (22 batches = 220 images; adjust the
        # range if the image count changes)
        for i in range(22):
            train_feats[i * 10:(i + 1) * 10] = sess.run(
                vggnet.features,
                feed_dict={vggnet.images: train_batch[i * 10:(i + 1) * 10]})
        print(train_feats.shape)

        # use hickle to save huge feature vectors
        hickle.dump(train_feats,
                    r"C:\Users\song\Desktop\511project\show-attend-and-tell-tensorflow\image_data_to_be_labeled\Object_feature\our_data\train.features.hkl")