def load_vocabulary_embedding(dimension=50):
    """Build an embedding matrix for the COCO vocabulary from a word-vector file.

    Every row starts as uniform random values from ``np.random.rand``. Rows
    whose word appears in the vector file at
    ``BASE_VOCAB_PATH + '<dimension>d.txt'`` are overwritten with that word's
    vector; words in the file that are not in the COCO vocabulary are skipped.

    Params:
        dimension: the dimensionality of the word vector; must be one of
            50, 100, 200 or 300.
    Returns:
        numpy array of shape ``(len(word_to_idx), dimension)`` whose row
        order follows ``load_coco_data()['word_to_idx']``.
    Raises:
        AssertionError: if ``dimension`` is not a supported size.
    """
    # Validate first so an unsupported size fails before the dataset is loaded.
    assert dimension in (50, 100, 200, 300)

    coco_data = load_coco_data()
    word_to_idx = coco_data['word_to_idx']
    embeddings = np.random.rand(len(word_to_idx), dimension)
    vocab_path = BASE_VOCAB_PATH + str(dimension) + 'd.txt'

    missing, matched = 0, 0
    with open(vocab_path, 'r') as vocabulary_file:
        # line_no is kept separate from the vocabulary index so the progress
        # report really counts lines read from the file.
        for line_no, line in enumerate(vocabulary_file):
            tokens = line.split(' ')
            try:
                word_idx = word_to_idx[tokens[0]]
            except KeyError:
                missing += 1
            else:
                # A concrete list is required: on Python 3 ``map`` returns an
                # iterator, which numpy cannot assign into a row.
                embeddings[word_idx] = [float(tok) for tok in tokens[1:]]
                matched += 1
            if line_no % 10000 == 0:
                print('%d tokens processed'%(line_no))
    print('%d words not present while %d words processed'%(missing, matched))
    return embeddings
def generate_image_index_to_reference_captions():
    """Write per-image reference captions for the train and val splits as JSON.

    For each split, captions are grouped by the string form of their image
    index; every entry is ``{'caption': <decoded caption>}``, where the caption
    is decoded with its first token dropped. The results are written to
    ``train_img_idx_to_captions.json`` and ``val_img_idx_to_captions.json``
    in the current working directory.
    """
    data = load_coco_data()
    # (image-index key, captions key, output file) for each split, in write order.
    split_specs = (
        ('train_image_idxs', 'train_captions', 'train_img_idx_to_captions.json'),
        ('val_image_idxs', 'val_captions', 'val_img_idx_to_captions.json'),
    )
    for image_idxs_key, captions_key, out_path in split_specs:
        references = {}
        for caption_pos, image_id in enumerate(data[image_idxs_key]):
            bucket = references.setdefault(str(image_id), [])
            # [1:] drops the first token of the encoded caption before decoding.
            decoded = decode_captions(data[captions_key][caption_pos][1:],
                                      data['idx_to_word'])
            bucket.append({'caption': decoded})
        with open(out_path, 'wb') as out_file:
            out_file.write(json.dumps(references).encode('ascii'))
def __init__(self, mode='PG', batch_size=50, START_TOKEN='<START>', END_TOKEN='<END>', NULL_TOKEN='<NULL>'):
    """Load the COCO data and derive the sizes and token ids used by this object.

    Params:
        mode: stored on the instance as ``self.mode``.
        batch_size: stored on the instance as ``self.batch_size``.
        START_TOKEN, END_TOKEN, NULL_TOKEN: vocabulary words whose indices are
            looked up in ``word_to_idx`` and stored as ``START_ID``, ``END_ID``
            and ``NULL_ID``.

    Also reads ``train_img_idx_to_captions.json`` and
    ``val_img_idx_to_captions.json`` from the current working directory into
    ``self.data``.
    """
    self.mode = mode
    self.batch_size = batch_size

    coco = load_coco_data(pca_features=False)
    self.data = coco
    word_to_idx = coco['word_to_idx']

    self.vocab_dim = len(word_to_idx)
    self.image_feature_dim = coco['val_features'].shape[1]
    self.word_embedding_dim = coco['word_embedding'].shape[1]

    self.NULL_ID = word_to_idx[NULL_TOKEN]
    self.START_ID = word_to_idx[START_TOKEN]
    self.END_ID = word_to_idx[END_TOKEN]

    self.valid_splits = ['val', 'train']
    self.index_orders = {}
    self.prep_index_orders()

    # Reference captions keyed by image index, one JSON file per split.
    caption_files = (
        ('train_image_idx_to_captions', 'train_img_idx_to_captions.json'),
        ('val_image_idx_to_captions', 'val_img_idx_to_captions.json'),
    )
    for data_key, json_path in caption_files:
        with open(json_path, 'rb') as f:
            self.data[data_key] = json.load(f)
def generate_image_index_to_reference_captions(base_dir="datasets/self_process"):
    """Dump decoded reference captions, grouped by image index, to JSON files.

    Params:
        base_dir: directory handed to ``load_coco_data``.

    Writes ``train_img_idx_to_captions.json`` and then
    ``val_img_idx_to_captions.json`` in the current working directory. Each
    file maps ``str(image index)`` to a list of ``{'caption': ...}`` entries.
    """
    data = load_coco_data(base_dir=base_dir, pca_features=False, is_caption_separated=True)

    def group_by_image(image_idxs, captions):
        # Collect every caption of an image under that image's stringified index.
        grouped = {}
        for position, image_idx in enumerate(image_idxs):
            key = str(image_idx)
            if key not in grouped:
                grouped[key] = []
            # The first token of each encoded caption is skipped before decoding.
            text = decode_captions(captions[position][1:], data['idx_to_word'])
            grouped[key].append({'caption': text})
        return grouped

    train_refs = group_by_image(data['train_image_idxs'], data['train_captions'])
    with open('train_img_idx_to_captions.json', 'w') as handle:
        handle.write(json.dumps(train_refs))

    val_refs = group_by_image(data['val_image_idxs'], data['val_captions'])
    with open('val_img_idx_to_captions.json', 'w') as handle:
        handle.write(json.dumps(val_refs))
def __init__(self, mode='PG', batch_size=50, START_TOKEN='<START>', END_TOKEN='<END>', NULL_TOKEN='<NULL>', UNK_TOKEN='<UNK>'):
    """Load COCO data from ``datasets/self_process`` and cache derived metadata.

    Params:
        mode: stored on the instance as ``self.mode``.
        batch_size: stored on the instance as ``self.batch_size``.
        START_TOKEN, END_TOKEN, NULL_TOKEN, UNK_TOKEN: vocabulary words whose
            indices in ``word_to_idx`` become ``START_ID``, ``END_ID``,
            ``NULL_ID`` and ``UNK_ID``.
    """
    self.mode = mode
    self.batch_size = batch_size

    dataset = load_coco_data(
        base_dir="datasets/self_process",
        pca_features=False,
        is_caption_separated=True,
    )
    self.data = dataset
    vocabulary = dataset['word_to_idx']

    # Dimensions are read straight from the loaded arrays.
    self.vocab_dim = len(vocabulary)
    self.image_feature_dim = dataset['val_features'].shape[1]
    self.word_embedding_dim = dataset['word_embedding'].shape[1]

    # Indices of the special tokens.
    self.NULL_ID = vocabulary[NULL_TOKEN]
    self.START_ID = vocabulary[START_TOKEN]
    self.END_ID = vocabulary[END_TOKEN]
    self.UNK_ID = vocabulary[UNK_TOKEN]

    self.valid_splits = ['val', 'train']
    self.index_orders = {}
    self.prep_index_orders()

    for split in ('train', 'val'):
        self.build_image_idx_to_caption_idxs(split)

    # Second axis of the training caption array.
    self.caption_length = self.data['train_captions'].shape[1]
def main():
    """Overfit a vanilla-RNN captioning model on 50 training samples and save plots.

    Trains a ``CaptioningRNN`` with ``cell_type='rnn'`` through
    ``CaptioningSolver``, writes the loss curve to ``loss_rnn.png``, then
    samples captions for two images from each of the ``train`` and ``val``
    splits and writes each annotated image to ``<split>_rnn_<i>.png``.
    """
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # The dataset contains the feature of images in MSCOCO dataset
    # The data should be in the same folder as the code

    # Load COCO data from disk; this returns a dictionary
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model, small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )
    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    # Save BEFORE show(): with a blocking GUI backend the figure is destroyed
    # when the window is closed, so saving afterwards writes a blank image.
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # some images might be deprecated. You may rerun the code several times
        # to successfully get the sample images from url.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions, small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            # Same ordering rule as above: write the file first, then display.
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
def main(argv):
    """Load COCO data, re-split it and write train/val/test records.

    The original training captions are kept as training data. The original
    validation set is cut by position: the first 85% is appended to the
    training data, the next 5% becomes the new validation set and the last
    10% becomes the test set. Each split is then passed to
    ``_process_dataset`` together with its module-level output directory.

    Params:
        argv: unused; accepted so the function can serve as an app entry point.
    """
    print('Loading Data')
    start = timeit.default_timer()
    data = load_coco_data(pca_features=False)
    print('Data Loaded in %ds'%(timeit.default_timer()-start))

    ## Collect training data
    vgg16_features_train = []
    caption_ids_train = []
    print('Preparing training data')
    # tqdm wraps the sized iterable (not the enumerate generator) so that the
    # progress bar knows the total and can show percentage and ETA.
    for i,idx in enumerate(tqdm(data['train_image_idxs'])):
        vgg16_features_train.append(data['train_features'][idx])
        caption_ids_train.append(data['train_captions'][i])

    ## Collect Validation data
    vgg16_features_val = []
    caption_ids_val = []
    print('Preparing validation data')
    for i,idx in enumerate(tqdm(data['val_image_idxs'])):
        vgg16_features_val.append(data['val_features'][idx])
        caption_ids_val.append(data['val_captions'][i])

    ## Redistribute the dataset
    # Both cutoffs are positions inside the *validation* lists.
    train_cutoff = int(0.85 * len(vgg16_features_val))
    val_cutoff = int(0.90 * len(vgg16_features_val))

    vgg16_features_train = vgg16_features_train + vgg16_features_val[:train_cutoff]
    caption_ids_train = caption_ids_train + caption_ids_val[:train_cutoff]

    # The test slice is taken before the validation lists are overwritten below.
    vgg16_features_test = vgg16_features_val[val_cutoff:]
    caption_ids_test = caption_ids_val[val_cutoff:]

    vgg16_features_val = vgg16_features_val[train_cutoff:val_cutoff]
    caption_ids_val = caption_ids_val[train_cutoff:val_cutoff]

    print("Length of training data: %d"%(len(vgg16_features_train)))
    print("Length of validation data: %d"%(len(vgg16_features_val)))
    print("Length of test data: %d"%(len(vgg16_features_test)))
    print("Vocabulary Size: %d"%(len(data['idx_to_word'])))

    # NOTE(review): the integer passed to _process_dataset (256 / 4 / 8) is
    # presumably a shard count -- confirm against _process_dataset.
    start = timeit.default_timer()
    print('Preparing tf records for training data: ')
    _process_dataset("train", vgg16_features_train,caption_ids_train,256,output_dir_train)
    print('Completed in %ds'%(timeit.default_timer()-start))

    start = timeit.default_timer()
    print('Preparing tf records for validation data: ')
    _process_dataset("val", vgg16_features_val,caption_ids_val,4,output_dir_eval)
    print('Completed in %ds'%(timeit.default_timer()-start))

    start = timeit.default_timer()
    print('Preparing tf records for testingdata: ')
    _process_dataset("test", vgg16_features_test,caption_ids_test,8,output_dir_test)
    print('Completed in %ds'%(timeit.default_timer()-start))
def __init__(self, START_TOKEN='<START>', END_TOKEN='<END>', NULL_TOKEN='<NULL>'):
    """Load the COCO data and record vocabulary/feature sizes and token ids.

    Params:
        START_TOKEN, END_TOKEN, NULL_TOKEN: vocabulary words whose indices in
            ``word_to_idx`` are stored as ``START_ID``, ``END_ID`` and
            ``NULL_ID``.
    """
    coco = load_coco_data(pca_features=False)
    self.data = coco
    token_ids = coco['word_to_idx']

    # Sizes come from the loaded vocabulary and the second axis of each array.
    self.vocab_dim = len(token_ids)
    self.image_feature_dim = coco['val_features'].shape[1]
    self.word_embedding_dim = coco['word_embedding'].shape[1]

    self.NULL_ID = token_ids[NULL_TOKEN]
    self.START_ID = token_ids[START_TOKEN]
    self.END_ID = token_ids[END_TOKEN]

    self.index_orders = {}
def main():
    """Train an LSTM captioning model for one epoch and plot its loss history."""
    # Load COCO data from disk
    data = load_coco_data()

    # Create Caption Model
    model_kwargs = {
        'cell_type': 'lstm',
        'word_to_idx': data['word_to_idx'],
        'input_dim': data['train_features'].shape[1],
        'hidden_dim': 512,
        'wordvec_dim': 256,
        'dtype': np.float32,
    }
    model = CaptioningRNN(**model_kwargs)

    solver_kwargs = {
        'update_rule': 'adam',
        'num_epochs': 1,
        'batch_size': 100,
        'optim_config': {'learning_rate': 5e-3},
        'lr_decay': 0.995,
        'verbose': True,
        'print_every': 10,
        'eval_every': 100,
    }
    solver = CaptioningSolver(model, data, **solver_kwargs)
    solver.train()

    # Plot the training losses
    plt.rcParams.update({
        'figure.figsize': (10.0, 8.0),  # set default size of plots
        'image.interpolation': 'nearest',
        'image.cmap': 'gray',
    })
    plt.plot(solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()
# Set all model parameters to fixed values for k, v in model.params.iteritems(): model.params[k] = np.linspace(-1.4, 1.3, num=v.size).reshape(*v.shape) features = np.linspace(-0.5, 1.7, num=N*D).reshape(N, D) captions = (np.arange(N * T) % V).reshape(N, T) loss, grads = model.loss(features, captions) expected_loss = 9.82445935443 print 'loss: ', loss print 'expected loss: ', expected_loss print 'difference: ', abs(loss - expected_loss) data = load_coco_data(pca_features=True) #Overfit LSTM captioning model small_data = load_coco_data(max_train=50) small_lstm_model = CaptioningRNN( cell_type='lstm', word_to_idx=data['word_to_idx'], input_dim=data['train_features'].shape[1], hidden_dim=512, wordvec_dim=256, dtype=np.float32, ) small_lstm_solver = CaptioningSolver(small_lstm_model, small_data, update_rule='adam',
import matplotlib.pyplot as plt
from gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
# from rnn_layers import *
from captioning_solver import CaptioningSolver
from classifiers.rnn import CaptioningRNN
from coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from image_utils import image_from_url


def rel_error(x, y):
    """Return the maximum element-wise relative error between x and y.

    The denominator is floored at 1e-8 so that entries where both inputs are
    zero do not divide by zero.
    """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    # isinstance (not an exact type match) so ndarray subclasses are also
    # reported with their shape and dtype.
    if isinstance(v, np.ndarray):
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

# Sample a minibatch and show the images and captions
batch_size = 3

captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
    plt.imshow(image_from_url(url))
    plt.axis('off')
def _plot_loss_history(solver):
    """Show the training-loss curve recorded in ``solver.loss_history``."""
    plt.plot(solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()


def _show_sample_captions(model, data):
    """Display two images per split with the model's caption and the ground truth."""
    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(data, split=split, batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])

        sample_captions = model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()


def main():
    """Overfit a vanilla RNN and then an LSTM captioning model on 50 samples.

    For each model the training-loss curve is shown, followed by sampled
    captions for two images from each of the train and val splits.
    """
    # The dataset can be downloaded in https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # The dataset contains the feature of images in MSCOCO dataset

    # Load COCO data from disk; this returns a dictionary
    small_data = load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model, small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )
    small_rnn_solver.train()

    # Plot the training losses
    _plot_loss_history(small_rnn_solver)
    _show_sample_captions(small_rnn_model, small_data)

    ##################################################################################################

    # Experiment with LSTM
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )

    small_lstm_solver = CaptioningSolver(
        small_lstm_model, small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )
    small_lstm_solver.train()

    # Plot the training losses
    _plot_loss_history(small_lstm_solver)
    _show_sample_captions(small_lstm_model, small_data)
def main(_):
    """Build the captioning graph, train it, and save samples and checkpoints.

    Reads its configuration from the module-level ``FLAGS``, ``model_config``
    and ``training_config``. During training it:

    * prints the loss every ``FLAGS.print_every`` iterations,
    * writes captioned validation images under ``FLAGS.sample_dir`` every
      ``FLAGS.sample_every`` iterations,
    * saves a checkpoint into ``FLAGS.savedSession_dir`` every
      ``FLAGS.saveModel_every`` iterations, and once more after the last one.

    Params:
        _: unused positional argument supplied by the application runner.
    """
    # load data
    data = load_coco_data(FLAGS.data_dir)

    # force padded_length equal to padded_length - 1
    # model_config.padded_length = len(data['train_captions'][0]) - 1

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():
        # Build the model. If FLAGS.glove_vocab is null, we do not initialize the model with word vectors; if not, we initialize with glove vectors
        # Truthiness test instead of `is ''`: identity comparison against a
        # string literal depends on interning and is not a value comparison.
        if not FLAGS.glove_vocab:
            model = build_model(model_config, mode=mode)
        else:
            glove_vocab = np.load(FLAGS.glove_vocab)
            model = build_model(model_config, mode=mode, glove_vocab=glove_vocab)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training ops.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        # initialize all variables
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)

            num_epochs = training_config.total_num_epochs
            num_train = data['train_captions'].shape[0]
            iterations_per_epoch = max(num_train / model_config.batch_size, 1)
            num_iterations = int(num_epochs * iterations_per_epoch)

            # Set up some variables for book-keeping
            loss_history = []

            print("\n\nTotal training iter: ", num_iterations, "\n\n")
            time_now = datetime.now()
            for t in range(num_iterations):
                # run each training step
                total_loss_value = _step(sess, data, train_op, model,
                                         model_config.lstm_dropout_keep_prob)
                loss_history.append(total_loss_value)

                # Print out training loss
                if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                    # total_seconds() covers the whole interval; `.seconds`
                    # alone drops full days and wraps on runs longer than 24h.
                    elapsed_minutes = (datetime.now() - time_now).total_seconds() / 60.0
                    print(
                        '(Iteration %d / %d) loss: %f, and time eclipsed: %.2f minutes'
                        % (t + 1, num_iterations, float(loss_history[-1]),
                           elapsed_minutes))

                # Print out some image sample results
                if FLAGS.sample_every > 0 and (t + 1) % FLAGS.sample_every == 0:
                    temp_dir = os.path.join(FLAGS.sample_dir,
                                            'temp_dir_{}//'.format(t + 1))
                    if not os.path.exists(temp_dir):
                        os.makedirs(temp_dir)
                    captions_pred, urls = _run_validation(
                        sess, data, model_config.batch_size, model,
                        1.0)  # the output is size (32, 16)
                    # Each prediction becomes one column; concatenating along
                    # axis 1 yields one row per sample.
                    captions_pred = [
                        unpack.reshape(-1, 1) for unpack in captions_pred
                    ]
                    captions_pred = np.concatenate(captions_pred, 1)
                    captions_deco = decode_captions(captions_pred,
                                                    data['idx_to_word'])
                    for j in range(len(captions_deco)):
                        img_name = os.path.join(temp_dir,
                                                'image_{}.jpg'.format(j))
                        img = image_from_url(urls[j])
                        write_text_on_image(img, img_name, captions_deco[j])

                # save the model continuously to avoid interruption
                if FLAGS.saveModel_every > 0 and (
                        t + 1) % FLAGS.saveModel_every == 0:
                    if not os.path.exists(FLAGS.savedSession_dir):
                        os.makedirs(FLAGS.savedSession_dir)
                    # [:-5] strips the last five characters of the model name
                    # (presumably its '.ckpt' suffix) before tagging the step.
                    checkpoint_name = savedModelName[:-5] + '_checkpoint{}.ckpt'.format(t + 1)
                    model['saver'].save(
                        sess,
                        os.path.join(FLAGS.savedSession_dir, checkpoint_name))

            # Final save once all iterations are done.
            if not os.path.exists(FLAGS.savedSession_dir):
                os.makedirs(FLAGS.savedSession_dir)
            model['saver'].save(
                sess, os.path.join(FLAGS.savedSession_dir, savedModelName))
            print("done. Model saved at: ",
                  os.path.join(FLAGS.savedSession_dir, savedModelName))
model['input_mask']: mask, model['keep_prob']: keep_prob } current_pred, state = sess.run([model['preds'], model['final_state']], feed_dict=feed_dict) current_pred = current_pred.reshape(-1, 1) final_preds.append(current_pred) return final_preds, urls # load data data = load_coco_data(base_dir='/home/ubuntu/COCO/dataset/COCO_captioning/') TOTAL_INFERENCE_STEP = 1 BATCH_SIZE_INFERENCE = 32 # Build the TensorFlow graph and train it g = tf.Graph() with g.as_default(): # Build the model. model = build_model(model_config, mode, inference_batch=BATCH_SIZE_INFERENCE) # run training init = tf.global_variables_initializer() with tf.Session() as sess:
def main():
    """Create the config, load COCO data, build the LSTM graph and train it."""
    cfg = Config()
    coco = load_coco_data()

    # The model is constructed in 'train' mode and its graph is built before
    # it is handed to the training routine.
    lstm_model = LSTM_Model('train', cfg)
    lstm_model.build_graph()

    train_model(lstm_model, cfg, coco)