def main():
    # The dataset (987 MB) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # It contains extracted features for images in the MSCOCO dataset and
    # should be placed in the same folder as the code.

    # Load COCO data from disk; this returns a dictionary.
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with a vanilla RNN.
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={'learning_rate': 5e-3},
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )

    small_rnn_solver.train()

    # Plot the training loss. Note that savefig must come before show,
    # since show() clears the current figure.
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # Some image URLs may no longer resolve; rerun the code a few times
        # if downloading the sample images fails.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(
            gt_captions, small_data['idx_to_word'])
        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(
            sample_captions, small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(
                zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
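# The loop above notes that some image URLs go stale and suggests rerunning
# until the downloads succeed. A small retry wrapper can automate that; this
# is a hypothetical helper (not part of the original code) that assumes
# image_from_url raises an exception on a failed download.
import time

def image_from_url_with_retry(url, retries=3, delay=1.0):
    """Retry a flaky image download a few times before giving up."""
    for attempt in range(retries):
        try:
            return image_from_url(url)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)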
def show_image_by_image_idxs(self, coco_data, img_idxs):
    """Look up image URLs by dataset index and display each image."""
    urls = coco_data.get_urls_by_image_index(img_idxs)
    for url in urls:
        plt.imshow(image_from_url(url))
        plt.axis('off')
        plt.show()
def getAnnotatedImage(self, data, split):
    """Sample one image and return it with its GT and generated caption."""
    minibatch = sample_coco_minibatch(data, batch_size=1, split=split)
    captions, features, urls = minibatch

    # Generate a caption from the image features.
    gt_captions = decode_captions(captions, data['idx_to_word'])
    _, captions_out = self.beam_decode(features)
    # captions_out = self.sample(features)
    sample_captions = [decode_captions(captions_out, data['idx_to_word'])]

    for gt_caption, sample_caption, url in zip(gt_captions,
                                               sample_captions, urls):
        img = np.asarray(image_from_url(url))
        try:
            # Convert from (H, W, C) to (C, H, W).
            img = np.swapaxes(img, 0, 2).transpose(0, 2, 1)
        except ValueError:
            # Fall back to a random image if the download is malformed.
            img = np.random.rand(3, 256, 256)
        caption = '%s \n %s \n GT:%s' % (split, sample_caption, gt_caption)
        return img, caption
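# beam_decode is called above but not shown in this section. For reference,
# a generic beam search over a per-step log-probability function might look
# like the following sketch; step_fn, start_token, and end_token are
# assumptions, not the actual interface of beam_decode.
import numpy as np

def beam_search(step_fn, start_token, end_token, beam_size=3, max_len=16):
    """Keep the beam_size highest-scoring partial captions at each step.
    step_fn(tokens) must return log-probabilities over the vocabulary."""
    beams = [([start_token], 0.0)]  # (token sequence, cumulative log-prob)
    for _ in range(max_len):
        candidates = []
        for tokens, score in beams:
            if tokens[-1] == end_token:
                candidates.append((tokens, score))  # finished: carry over
                continue
            log_probs = step_fn(tokens)
            for w in np.argsort(log_probs)[-beam_size:]:
                candidates.append((tokens + [int(w)], score + log_probs[w]))
        beams = sorted(candidates, key=lambda b: b[1], reverse=True)[:beam_size]
    return beams[0][0]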
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    # LSTM test-time sampling.
    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data, split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])
        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    # Train a good model on a larger subset of the data.
    sdata = load_coco_data(max_train=10000)
    lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=sdata['word_to_idx'],
        input_dim=sdata['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
def main():
    # The dataset can be downloaded from
    # https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # It contains extracted features for images in the MSCOCO dataset.

    # Load COCO data from disk; this returns a dictionary.
    small_data = load_coco_data(max_train=50)

    # Experiment with a vanilla RNN.
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={'learning_rate': 5e-3},
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )

    small_rnn_solver.train()

    # Plot the training loss.
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data, split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])
        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    # Experiment with an LSTM.
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )

    small_lstm_solver = CaptioningSolver(
        small_lstm_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={'learning_rate': 5e-3},
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )

    small_lstm_solver.train()

    # Plot the training loss.
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data, split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])
        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
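# Both solvers above decay the learning rate once per epoch (0.95 for the
# RNN, 0.995 for the LSTM). A quick sanity check of the resulting schedule,
# assuming CaptioningSolver multiplies the rate by lr_decay at each epoch
# boundary:
def decayed_lr(initial_lr, lr_decay, epoch):
    return initial_lr * lr_decay ** epoch

# With the settings above, after 50 epochs the RNN's rate falls to
# 5e-3 * 0.95**50 ≈ 3.9e-4, while the LSTM's gentler decay leaves ≈ 3.9e-3.
print(decayed_lr(5e-3, 0.95, 50), decayed_lr(5e-3, 0.995, 50))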
def main(_):
    # Load data.
    data = load_coco_data(FLAGS.data_dir)

    # Force padded_length to equal the caption length minus one.
    # model_config.padded_length = len(data['train_captions'][0]) - 1

    # Build the TensorFlow graph and train it.
    g = tf.Graph()
    with g.as_default():
        # Build the model. If FLAGS.glove_vocab is empty we start from
        # random word vectors; otherwise we initialize with GloVe vectors.
        if FLAGS.glove_vocab == '':
            model = build_model(model_config, mode=mode)
        else:
            glove_vocab = np.load(FLAGS.glove_vocab)
            model = build_model(model_config, mode=mode,
                                glove_vocab=glove_vocab)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training op.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        # Initialize all variables.
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)

            num_epochs = training_config.total_num_epochs
            num_train = data['train_captions'].shape[0]
            iterations_per_epoch = max(num_train // model_config.batch_size, 1)
            num_iterations = int(num_epochs * iterations_per_epoch)

            # Set up some variables for bookkeeping.
            epoch = 0
            best_val_acc = 0
            best_params = {}
            loss_history = []
            train_acc_history = []
            val_acc_history = []

            print("\n\nTotal training iterations: ", num_iterations, "\n\n")
            time_now = datetime.now()
            for t in range(num_iterations):
                # Run one training step.
                total_loss_value = _step(sess, data, train_op, model,
                                         model_config.lstm_dropout_keep_prob)
                loss_history.append(total_loss_value)

                # Print the training loss.
                if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                    print('(Iteration %d / %d) loss: %f, time elapsed: %.2f minutes' %
                          (t + 1, num_iterations, float(loss_history[-1]),
                           (datetime.now() - time_now).seconds / 60.0))

                # Write out some sample captioned images.
                if FLAGS.sample_every > 0 and (t + 1) % FLAGS.sample_every == 0:
                    temp_dir = os.path.join(FLAGS.sample_dir,
                                            'temp_dir_{}'.format(t + 1))
                    if not os.path.exists(temp_dir):
                        os.makedirs(temp_dir)
                    # The output has size (32, 16).
                    captions_pred, urls = _run_validation(
                        sess, data, model_config.batch_size, model, 1.0)
                    captions_pred = [unpack.reshape(-1, 1)
                                     for unpack in captions_pred]
                    captions_pred = np.concatenate(captions_pred, 1)
                    captions_deco = decode_captions(captions_pred,
                                                    data['idx_to_word'])
                    for j in range(len(captions_deco)):
                        img_name = os.path.join(temp_dir,
                                                'image_{}.jpg'.format(j))
                        img = image_from_url(urls[j])
                        write_text_on_image(img, img_name, captions_deco[j])

                # Save checkpoints periodically so training can survive
                # an interruption.
                if FLAGS.saveModel_every > 0 and (t + 1) % FLAGS.saveModel_every == 0:
                    if not os.path.exists(FLAGS.savedSession_dir):
                        os.makedirs(FLAGS.savedSession_dir)
                    checkpoint_name = (savedModelName[:-5] +
                                       '_checkpoint{}.ckpt'.format(t + 1))
                    save_path = model['saver'].save(
                        sess, os.path.join(FLAGS.savedSession_dir,
                                           checkpoint_name))

            # Save the final model.
            if not os.path.exists(FLAGS.savedSession_dir):
                os.makedirs(FLAGS.savedSession_dir)
            save_path = model['saver'].save(
                sess, os.path.join(FLAGS.savedSession_dir, savedModelName))
            print("done. Model saved at: ",
                  os.path.join(FLAGS.savedSession_dir, savedModelName))
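# With staircase=True, tf.train.exponential_decay computes
# initial_lr * decay_rate ** floor(global_step / decay_steps). A plain-Python
# sketch of the same schedule, handy for sanity-checking decay_steps (the
# numbers below are illustrative, not taken from the configs):
def staircase_decay(initial_lr, decay_rate, decay_steps, global_step):
    return initial_lr * decay_rate ** (global_step // decay_steps)

# Illustrative values: the rate halves every 1000 steps.
for step in (0, 999, 1000, 2500):
    print(step, staircase_decay(2.0, 0.5, 1000, step))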
with g.as_default():
    # Build the model.
    model = build_model(model_config, mode,
                        inference_batch=BATCH_SIZE_INFERENCE)

    # Run inference.
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        model['saver'].restore(sess, directory + "savedSession/model0.ckpt")
        print("Model restored! Last step run: ",
              sess.run(model['global_step']))

        for i in range(TOTAL_INFERENCE_STEP):
            # The output has size (32, 16).
            captions_pred, urls = _step_test(sess, data,
                                             BATCH_SIZE_INFERENCE, model, 1.0)
            captions_pred = [unpack.reshape(-1, 1)
                             for unpack in captions_pred]
            captions_pred = np.concatenate(captions_pred, 1)
            captions_deco = decode_captions(captions_pred,
                                            data['idx_to_word'])
            for j in range(len(captions_deco)):
                img_name = directory + 'image_' + str(j) + '.jpg'
                img = image_from_url(urls[j])
                write_text_on_image(img, img_name, captions_deco[j])
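# write_text_on_image is used throughout but not defined in this section.
# One plausible implementation, sketched with Pillow under the assumption
# that img is an RGB uint8 numpy array; this is not necessarily how the
# repository actually implements it.
import numpy as np
from PIL import Image, ImageDraw

def write_text_on_image(img, img_name, text):
    """Draw a caption near the top-left corner and save the image."""
    im = Image.fromarray(np.asarray(img, dtype=np.uint8))
    draw = ImageDraw.Draw(im)
    draw.text((10, 10), text, fill=(255, 255, 255))
    im.save(img_name)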
def train_model(model, config, data):
    # g = tf.Graph()
    # with g.as_default():

    # Define the optimizer and learning-rate schedule.
    num_batches = config.total_instances / config.batch_size
    decay_steps = int(num_batches * config.num_epochs_per_decay)
    learning_rate = tf.constant(config.initial_learning_rate)

    def _decay_fn(learning_rate, global_step):
        return tf.train.exponential_decay(learning_rate,
                                          global_step,
                                          decay_steps=decay_steps,
                                          decay_rate=0.5,
                                          staircase=True)

    learning_rate_decay_fn = _decay_fn

    train_op = tf.contrib.layers.optimize_loss(
        loss=model.total_loss,
        global_step=model.global_step,
        learning_rate=learning_rate,
        optimizer='SGD',
        clip_gradients=config.clip_gradients,
        learning_rate_decay_fn=learning_rate_decay_fn)

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    # Allow GPU memory growth to work around BLAS memory dump failures.
    config_ = tf.ConfigProto()
    config_.gpu_options.allow_growth = True

    with tf.Session(config=config_) as sess:
        sess.run(init)

        # If a checkpoint exists, restore it.
        ckpt = tf.train.get_checkpoint_state(
            os.path.dirname('checkpoints/checkpoint'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("successfully restored the checkpoint")

        rand_int = np.random.randint(1, 100)
        caption_in, caption_out, mask, image_features, urls = minibatch(
            data, rand_int, config.batch_size, config.total_instances)

        if not os.path.exists('test_caption'):
            os.makedirs('test_caption')

        # The output has size (32, 16).
        captions_pred = _run_validation(sess, caption_in, image_features,
                                        config.batch_size, model,
                                        config.input_len)
        captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        captions_pred = np.concatenate(captions_pred, 1)
        captions_deco = decode_captions(captions_pred, data['idx_to_word'])
        for j in range(len(captions_deco)):
            img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
            img = image_from_url(urls[j])
            write_text_on_image(img, img_name, captions_deco[j])
        print("saved predicted images into ./test_caption folder")

        # Training loop for 100 epochs (currently disabled); uncomment
        # to train.
        # total_runs = int((config.total_instances / config.batch_size) *
        #                  config.num_epochs)
        # initial_step = model.global_step.eval()
        #
        # # Initialize the summary writer.
        # tf.summary.scalar("learning_rate", learning_rate)
        # a = tf.summary.merge_all()
        # writer = tf.summary.FileWriter('./graphs/singlelayer_lstm',
        #                                sess.graph)
        #
        # time_now = datetime.now()
        # for t in range(total_runs):
        #     caption_in, caption_out, mask, image_features, urls = minibatch(
        #         data, t, config.batch_size, config.total_instances)
        #     # Feed data.
        #     feed_dict = {model.image_feature: image_features,
        #                  model.caption_in: caption_in,
        #                  model.caption_out: caption_out,
        #                  model.caption_mask: mask}
        #     merge_op, _, total_loss, b = sess.run(
        #         [model.summary_op, train_op, model.total_loss, a],
        #         feed_dict=feed_dict)
        #     writer.add_summary(merge_op, global_step=t)
        #     writer.add_summary(b, global_step=t)
        #
        #     # Print loss info.
        #     if (t + 1) % 20 == 0:
        #         print('(Iteration %d / %d) loss: %f, time elapsed: %.2f minutes' % (
        #             t + 1, total_runs, float(total_loss),
        #             (datetime.now() - time_now).seconds / 60.0))
        #
        #     # Write out sample captioned images.
        #     if (t + 1) % 100 == 0:
        #         if not os.path.exists('test_caption'):
        #             os.makedirs('test_caption')
        #         # The output has size (32, 16).
        #         captions_pred = _run_validation(sess, caption_in,
        #                                         image_features, 1, model,
        #                                         config.input_len)
        #         captions_pred = [unpack.reshape(-1, 1)
        #                          for unpack in captions_pred]
        #         captions_pred = np.concatenate(captions_pred, 1)
        #         captions_deco = decode_captions(captions_pred,
        #                                         data['idx_to_word'])
        #         for j in range(len(captions_deco)):
        #             img_name = os.path.join('test_caption',
        #                                     'image_{}.jpg'.format(j))
        #             img = image_from_url(urls[j])
        #             write_text_on_image(img, img_name, captions_deco[j])
        #
        #     # Save the model.
        #     if (t + 1) % 50 == 0 or t == (total_runs - 1):
        #         if not os.path.exists('checkpoints/singlelayer_lstm'):
        #             os.makedirs('checkpoints/singlelayer_lstm')
        #         saver.save(sess, 'checkpoints/singlelayer_lstm', t)

        # Visualize the embedding matrix with the TensorBoard projector.
        # It has to be a tf.Variable; constants don't work here, and
        # model.embed_map can't be reused directly.
        final_embed_matrix = sess.run(model.embed_map)
        embedding_var = tf.Variable(final_embed_matrix[:1000],
                                    name='embedding')
        sess.run(embedding_var.initializer)

        proj_config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')

        # Add the embedding to the projector config file.
        embedding = proj_config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # Link this tensor to its metadata file, in this case the first
        # 1000 words of the vocabulary.
        # metadata_path = './processed/metadata.tsv'
        # if not os.path.exists(metadata_path):
        #     f = open(metadata_path, "w")
        #     f.close()
        embedding.metadata_path = os.path.join('processed', 'metadata.tsv')

        # Save a configuration file that TensorBoard reads during startup.
        projector.visualize_embeddings(summary_writer, proj_config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)
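# The projector config above points at processed/metadata.tsv, but the code
# that writes it is commented out. A minimal sketch of generating the file
# from data['idx_to_word'], assuming the mapping covers the 1000 embedded
# rows (TensorBoard expects one label per line, in row order):
import os

def write_projector_metadata(idx_to_word, path='processed/metadata.tsv',
                             num_words=1000):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        for i in range(num_words):
            f.write(idx_to_word[i] + '\n')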