def load_vocabulary_embedding(dimension=50):
    """
    Params:
    dimension: the dimensionality of the word vectors (50, 100, 200, or 300)
    Returns:
    numpy array of word vectors, row-aligned with the word_to_idx mapping
    from load_coco_data(); words missing from the vocabulary file keep
    their random initialization
    """
    assert dimension in (50, 100, 200, 300)
    coco_data = load_coco_data()
    embeddings = np.random.rand(len(coco_data['word_to_idx']), dimension)
    vocab_path = BASE_VOCAB_PATH + str(dimension) + 'd.txt'
    missing, found = 0, 0
    with open(vocab_path, 'r') as vocabulary_file:
        for line_num, line in enumerate(vocabulary_file):
            tokens = line.split(' ')
            try:
                idx = coco_data['word_to_idx'][tokens[0]]
                # list(...) is required in Python 3, where map() is lazy
                embeddings[idx] = list(map(float, tokens[1:]))
                found += 1
            except KeyError:
                missing += 1
                continue
            if line_num % 10000 == 0:
                print('%d lines processed' % line_num)

    print('%d words not present while %d words processed' % (missing, found))

    return embeddings
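
A minimal usage sketch for the loader above, assuming BASE_VOCAB_PATH points at
GloVe-style text files (one word followed by its vector per line) and that
load_coco_data is the same helper used throughout these examples; 'dog' is just
an arbitrary example word:

# Hypothetical usage of load_vocabulary_embedding (names assumed as above).
embeddings = load_vocabulary_embedding(dimension=50)
word_to_idx = load_coco_data()['word_to_idx']

vec = embeddings[word_to_idx['dog']]  # 50-d vector; random if 'dog' was not in the file
print(vec.shape)                      # (50,)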
Example #2
def generate_image_index_to_reference_captions():
    data = load_coco_data()

    gts_train = {}
    for cap_idx, img_idx in enumerate(data['train_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_train:
            gts_train[img_idx] = []

        gts_train[img_idx].append({
            'caption':
            decode_captions(data['train_captions'][cap_idx][1:],
                            data['idx_to_word'])
        })

    with open('train_img_idx_to_captions.json', 'wb') as f:
        f.write(json.dumps(gts_train).encode('ascii'))

    gts_val = {}
    for cap_idx, img_idx in enumerate(data['val_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_val:
            gts_val[img_idx] = []

        gts_val[img_idx].append({
            'caption':
            decode_captions(data['val_captions'][cap_idx][1:],
                            data['idx_to_word'])
        })

    with open('val_img_idx_to_captions.json', 'wb') as f:
        f.write(json.dumps(gts_val).encode('ascii'))
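
For reference, the JSON written above maps each image index (as a string) to a
list of single-key caption dicts, the "gts" layout the COCO caption scorers
expect; the values below are illustrative only:

# Illustrative shape of train_img_idx_to_captions.json:
# {
#   "0": [{"caption": "a man riding a wave on a surfboard"},
#         {"caption": "a surfer in a wetsuit on a wave"}],
#   "1": [{"caption": "a plate of food on a wooden table"}]
# }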
Example #3
  def __init__(self,
               mode='PG',
               batch_size=50,
               START_TOKEN='<START>',
               END_TOKEN='<END>',
               NULL_TOKEN='<NULL>'):
    self.mode = mode
    self.batch_size = batch_size

    self.data = load_coco_data(pca_features=False)

    self.vocab_dim          = len(self.data['word_to_idx'])
    self.image_feature_dim  = self.data['val_features'].shape[1]
    self.word_embedding_dim = self.data['word_embedding'].shape[1]

    self.NULL_ID  = self.data['word_to_idx'][NULL_TOKEN]
    self.START_ID = self.data['word_to_idx'][START_TOKEN]
    self.END_ID   = self.data['word_to_idx'][END_TOKEN]

    self.valid_splits = ['val', 'train']
    self.index_orders = {}
    self.prep_index_orders()

    with open('train_img_idx_to_captions.json', 'rb') as f:
      self.data['train_image_idx_to_captions'] = json.load(f)

    with open('val_img_idx_to_captions.json', 'rb') as f:
      self.data['val_image_idx_to_captions'] = json.load(f)
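
  # prep_index_orders() is called above but not shown in this snippet. A
  # minimal sketch of what such a helper could look like, assuming it just
  # keeps a shuffled caption-index order per split for minibatching
  # (hypothetical implementation, not the repository's):
  def prep_index_orders(self):
    import numpy as np  # local import so the sketch stands alone
    for split in self.valid_splits:
      num_captions = self.data['%s_captions' % split].shape[0]
      self.index_orders[split] = np.random.permutation(num_captions)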
Example #4
def generate_image_index_to_reference_captions(base_dir="datasets/self_process"):
  data = load_coco_data(base_dir=base_dir, pca_features=False, is_caption_separated=True)

  gts_train = {}
  for cap_idx, img_idx in enumerate(data['train_image_idxs']):
    img_idx = str(img_idx)
    if img_idx not in gts_train:
      gts_train[img_idx] = []

    gts_train[img_idx].append({'caption': decode_captions(data['train_captions'][cap_idx][1:], data['idx_to_word'])})

  with open('train_img_idx_to_captions.json', 'w') as f:
    f.write(json.dumps(gts_train))


  gts_val = {}
  for cap_idx, img_idx in enumerate(data['val_image_idxs']):
    img_idx = str(img_idx)
    if img_idx not in gts_val:
      gts_val[img_idx] = []

    gts_val[img_idx].append({'caption': decode_captions(data['val_captions'][cap_idx][1:], data['idx_to_word'])})

  with open('val_img_idx_to_captions.json', 'w') as f:
    f.write(json.dumps(gts_val))
Example #5
  def __init__(self,
               mode='PG',
               batch_size=50,
               START_TOKEN='<START>',
               END_TOKEN='<END>',
               NULL_TOKEN='<NULL>',
               UNK_TOKEN='<UNK>'):
    self.mode = mode
    self.batch_size = batch_size

    self.data = load_coco_data(base_dir="datasets/self_process", pca_features=False, is_caption_separated=True)

    self.vocab_dim          = len(self.data['word_to_idx'])
    self.image_feature_dim  = self.data['val_features'].shape[1]
    self.word_embedding_dim = self.data['word_embedding'].shape[1]

    self.NULL_ID  = self.data['word_to_idx'][NULL_TOKEN]
    self.START_ID = self.data['word_to_idx'][START_TOKEN]
    self.END_ID   = self.data['word_to_idx'][END_TOKEN]
    self.UNK_ID   = self.data['word_to_idx'][UNK_TOKEN]

    self.valid_splits = ['val', 'train']
    self.index_orders = {}
    self.prep_index_orders()

    self.build_image_idx_to_caption_idxs('train')
    self.build_image_idx_to_caption_idxs('val')

    self.caption_length = self.data['train_captions'].shape[1]
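
  # build_image_idx_to_caption_idxs() is likewise not shown here. A plausible
  # sketch, assuming it inverts the split's image-index array into a map from
  # image index to the caption indices that describe it (hypothetical):
  def build_image_idx_to_caption_idxs(self, split):
    mapping = {}
    for cap_idx, img_idx in enumerate(self.data['%s_image_idxs' % split]):
      mapping.setdefault(int(img_idx), []).append(cap_idx)
    self.data['%s_image_idx_to_caption_idxs' % split] = mapping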
Example #6
def main():
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # The dataset contains the feature of images in MSCOCO dataset
    # The data should be in the same folder as the code
    # Load COCO data from disk; this returns a dictionary
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
          cell_type='rnn',
          word_to_idx=small_data['word_to_idx'],
          input_dim=small_data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(small_rnn_model, small_data,
           update_rule='adam',
           num_epochs=50,
           batch_size=25,
           optim_config={
             'learning_rate': 5e-3,
           },
           lr_decay=0.95,
           verbose=True, print_every=10,
         )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    # savefig must come before show(), which closes the figure
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # Some image URLs may no longer resolve; you may need to rerun this
        # code a few times to fetch the sample images successfully.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions,
                                                 small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions,
                                                     small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
Example #7
def main(argv):
    print('Loading Data')
    start = timeit.default_timer()
    data = load_coco_data(pca_features=False)

    print('Data Loaded in %ds'%(timeit.default_timer()-start))

    ## Collect training data
    vgg16_features_train = []
    caption_ids_train = []
    print('Preparing training data')

    for i,idx in tqdm(enumerate(data['train_image_idxs'])):
        vgg16_features_train.append(data['train_features'][idx])
        caption_ids_train.append(data['train_captions'][i])


    ## Collect Validation data
    vgg16_features_val = []
    caption_ids_val = []
    print('Preparing validation data')

    for i,idx in tqdm(enumerate(data['val_image_idxs'])):
        vgg16_features_val.append(data['val_features'][idx])
        caption_ids_val.append(data['val_captions'][i])



    ## Redistribute the dataset
    train_cutoff = int(0.85 * len(vgg16_features_val))
    val_cutoff = int(0.90 * len(vgg16_features_val))

    vgg16_features_train = vgg16_features_train + vgg16_features_val[:train_cutoff]
    caption_ids_train = caption_ids_train + caption_ids_val[:train_cutoff]
    vgg16_features_test = vgg16_features_val[val_cutoff:]
    caption_ids_test = caption_ids_val[val_cutoff:]
    vgg16_features_val = vgg16_features_val[train_cutoff:val_cutoff]
    caption_ids_val = caption_ids_val[train_cutoff:val_cutoff]

    print("Length of training data: %d"%(len(vgg16_features_train)))
    print("Length of validation data: %d"%(len(vgg16_features_val)))
    print("Length of test data: %d"%(len(vgg16_features_test)))
    print("Vocabulary Size: %d"%(len(data['idx_to_word'])))

    start = timeit.default_timer()
    print('Preparing tf records for training data: ')
    _process_dataset("train", vgg16_features_train,caption_ids_train,256,output_dir_train)
    print('Completed in %ds'%(timeit.default_timer()-start))
    start = timeit.default_timer()
    print('Preparing tf records for validation data: ')
    _process_dataset("val", vgg16_features_val,caption_ids_val,4,output_dir_eval)
    print('Completed in %ds'%(timeit.default_timer()-start))
    start = timeit.default_timer()
    print('Preparing tf records for test data: ')
    _process_dataset("test", vgg16_features_test,caption_ids_test,8,output_dir_test)
    print('Completed in %ds'%(timeit.default_timer()-start))
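
_process_dataset is not included in this example. A minimal sketch of the usual
TFRecord sharding pattern it presumably follows, written for TF 1.x like the
rest of the snippet (hypothetical names and record layout):

def _process_dataset(name, features, caption_ids, num_shards, output_dir):
    # Split the (feature, caption) pairs into num_shards TFRecord files
    # named like 'train-00000-of-00256'.
    import os
    import numpy as np
    import tensorflow as tf

    spacing = np.linspace(0, len(features), num_shards + 1).astype(int)
    for shard in range(num_shards):
        path = os.path.join(output_dir, '%s-%.5d-of-%.5d' % (name, shard, num_shards))
        with tf.python_io.TFRecordWriter(path) as writer:
            for i in range(spacing[shard], spacing[shard + 1]):
                example = tf.train.Example(features=tf.train.Features(feature={
                    'feature': tf.train.Feature(
                        float_list=tf.train.FloatList(value=features[i].tolist())),
                    'caption_ids': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=caption_ids[i].tolist())),
                }))
                writer.write(example.SerializeToString())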
Example #8
    def __init__(self,
                 START_TOKEN='<START>',
                 END_TOKEN='<END>',
                 NULL_TOKEN='<NULL>'):

        self.data = load_coco_data(pca_features=False)

        self.vocab_dim = len(self.data['word_to_idx'])
        self.image_feature_dim = self.data['val_features'].shape[1]
        self.word_embedding_dim = self.data['word_embedding'].shape[1]

        self.NULL_ID = self.data['word_to_idx'][NULL_TOKEN]
        self.START_ID = self.data['word_to_idx'][START_TOKEN]
        self.END_ID = self.data['word_to_idx'][END_TOKEN]

        self.index_orders = {}
Example #9
def main():
    # Load COCO data from disk
    data = load_coco_data()
    # Create Caption Model
    model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=data['word_to_idx'],
        input_dim=data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
    solver = CaptioningSolver(model,
                              data,
                              update_rule='adam',
                              num_epochs=1,
                              batch_size=100,
                              optim_config={
                                  'learning_rate': 5e-3,
                              },
                              lr_decay=0.995,
                              verbose=True,
                              print_every=10,
                              eval_every=100)
    solver.train()

    # Plot the training losses
    plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
    plt.rcParams['image.interpolation'] = 'nearest'
    plt.rcParams['image.cmap'] = 'gray'

    plt.plot(solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()
Example #10
# Set all model parameters to fixed values
for k, v in model.params.items():
  model.params[k] = np.linspace(-1.4, 1.3, num=v.size).reshape(*v.shape)

features = np.linspace(-0.5, 1.7, num=N*D).reshape(N, D)
captions = (np.arange(N * T) % V).reshape(N, T)

loss, grads = model.loss(features, captions)
expected_loss = 9.82445935443

print('loss: ', loss)
print('expected loss: ', expected_loss)
print('difference: ', abs(loss - expected_loss))

data = load_coco_data(pca_features=True)

#Overfit LSTM captioning model
small_data = load_coco_data(max_train=50)

small_lstm_model = CaptioningRNN(
          cell_type='lstm',
          word_to_idx=data['word_to_idx'],
          input_dim=data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
          dtype=np.float32,
        )

small_lstm_solver = CaptioningSolver(small_lstm_model, small_data,
           update_rule='adam',
Example #11
import numpy as np
import matplotlib.pyplot as plt

from gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
# from rnn_layers import *
from captioning_solver import CaptioningSolver
from classifiers.rnn import CaptioningRNN
from coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from image_utils import image_from_url


def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))
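
A short usage sketch for rel_error, checking a toy analytic gradient against
eval_numerical_gradient_array; checks like this are typically used to compare
analytic RNN/LSTM layer gradients against numeric ones:

# Toy check: for f(x) = x**2 elementwise, the analytic gradient is 2*x.
x = np.random.randn(4, 5)
df = np.ones_like(x)
analytic_grad = 2 * x
numeric_grad = eval_numerical_gradient_array(lambda v: v ** 2, x, df)
print('relative error:', rel_error(analytic_grad, numeric_grad))  # ~1e-10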


data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

# Sample a minibatch and show the images and captions
batch_size = 3

captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
    plt.imshow(image_from_url(url))
    plt.axis('off')
Example #12
def main():
    # The dataset can be downloaded in https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # The dataset contains the feature of images in MSCOCO dataset
    # Load COCO data from disk; this returns a dictionary
    small_data = load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    ##################################################################################################

    # Experiment with LSTM
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
    small_lstm_solver = CaptioningSolver(
        small_lstm_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )

    small_lstm_solver.train()
    # Plot the training losses
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
Example #13
def main(_):
    # load data
    data = load_coco_data(FLAGS.data_dir)

    # force padded_length equal to padded_length - 1
    # model_config.padded_length = len(data['train_captions'][0]) - 1

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():

        # Build the model. If FLAGS.glove_vocab is empty, word vectors are
        # trained from scratch; otherwise they are initialized with GloVe vectors.
        if FLAGS.glove_vocab == '':
            model = build_model(model_config, mode=mode)
        else:
            glove_vocab = np.load(FLAGS.glove_vocab)
            model = build_model(model_config,
                                mode=mode,
                                glove_vocab=glove_vocab)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training ops.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        # initialize all variables
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)

            num_epochs = training_config.total_num_epochs

            num_train = data['train_captions'].shape[0]
            iterations_per_epoch = max(num_train / model_config.batch_size, 1)
            num_iterations = int(num_epochs * iterations_per_epoch)

            # Set up some variables for book-keeping
            epoch = 0
            best_val_acc = 0
            best_params = {}
            loss_history = []
            train_acc_history = []
            val_acc_history = []

            print("\n\nTotal training iter: ", num_iterations, "\n\n")
            time_now = datetime.now()
            for t in range(num_iterations):

                total_loss_value = _step(sess, data, train_op, model,
                                         model_config.lstm_dropout_keep_prob
                                         )  # run each training step

                loss_history.append(total_loss_value)

                # Print out training loss
                if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                    print(
                        '(Iteration %d / %d) loss: %f, and time elapsed: %.2f minutes'
                        % (t + 1, num_iterations, float(loss_history[-1]),
                           (datetime.now() - time_now).seconds / 60.0))

                # Print out some image sample results
                if FLAGS.sample_every > 0 and (t +
                                               1) % FLAGS.sample_every == 0:
                    temp_dir = os.path.join(FLAGS.sample_dir,
                                            'temp_dir_{}//'.format(t + 1))
                    if not os.path.exists(temp_dir):
                        os.makedirs(temp_dir)
                    captions_pred, urls = _run_validation(
                        sess, data, model_config.batch_size, model,
                        1.0)  # the output is size (32, 16)
                    captions_pred = [
                        unpack.reshape(-1, 1) for unpack in captions_pred
                    ]
                    captions_pred = np.concatenate(captions_pred, 1)

                    captions_deco = decode_captions(captions_pred,
                                                    data['idx_to_word'])

                    for j in range(len(captions_deco)):
                        img_name = os.path.join(temp_dir,
                                                'image_{}.jpg'.format(j))
                        img = image_from_url(urls[j])
                        write_text_on_image(img, img_name, captions_deco[j])

                # save the model continuously to avoid interruption
                if FLAGS.saveModel_every > 0 and (
                        t + 1) % FLAGS.saveModel_every == 0:
                    if not os.path.exists(FLAGS.savedSession_dir):
                        os.makedirs(FLAGS.savedSession_dir)
                    checkpoint_name = savedModelName[:-5] + '_checkpoint{}.ckpt'.format(t + 1)
                    save_path = model['saver'].save(
                        sess,
                        os.path.join(FLAGS.savedSession_dir, checkpoint_name))

            if not os.path.exists(FLAGS.savedSession_dir):
                os.makedirs(FLAGS.savedSession_dir)
            save_path = model['saver'].save(
                sess, os.path.join(FLAGS.savedSession_dir, savedModelName))
            print("done. Model saved at: ",
                  os.path.join(FLAGS.savedSession_dir, savedModelName))
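
_step is called in the training loop above but not defined in this snippet. A
minimal sketch of what a single step plausibly does, assuming the model dict
exposes the placeholders used elsewhere in this example and that
sample_coco_minibatch is available (hypothetical input/target names):

def _step(sess, data, train_op, model, keep_prob):
    # Sample a caption minibatch, split it into inputs (all but the last
    # token) and targets (all but the first), and run one update.
    captions, features, _ = sample_coco_minibatch(
        data, batch_size=model_config.batch_size, split='train')
    feed_dict = {
        model['image_feature']: features,        # hypothetical placeholder name
        model['input_seqs']: captions[:, :-1],   # hypothetical placeholder name
        model['target_seqs']: captions[:, 1:],   # hypothetical placeholder name
        model['input_mask']: (captions[:, 1:] != 0).astype(np.int32),  # assumes <NULL> id 0
        model['keep_prob']: keep_prob,
    }
    total_loss, _ = sess.run([model['total_loss'], train_op], feed_dict=feed_dict)
    return total_loss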
Example #14
            model['input_mask']: mask,
            model['keep_prob']: keep_prob
        }

        current_pred, state = sess.run([model['preds'], model['final_state']],
                                       feed_dict=feed_dict)

        current_pred = current_pred.reshape(-1, 1)

        final_preds.append(current_pred)

    return final_preds, urls


# load data
data = load_coco_data(base_dir='/home/ubuntu/COCO/dataset/COCO_captioning/')

TOTAL_INFERENCE_STEP = 1
BATCH_SIZE_INFERENCE = 32

# Build the TensorFlow graph and train it
g = tf.Graph()
with g.as_default():
    # Build the model.
    model = build_model(model_config,
                        mode,
                        inference_batch=BATCH_SIZE_INFERENCE)

    # run training
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
Example #15
def main():
    config = Config()
    data = load_coco_data()
    model = LSTM_Model('train', config)
    model.build_graph()
    train_model(model, config, data)
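
train_model is not shown in this example. A minimal sketch of the outer loop it
presumably wraps, with hypothetical attribute names for the model's ops and
placeholders:

import tensorflow as tf
from coco_utils import sample_coco_minibatch

def train_model(model, config, data):
    # Hypothetical training loop: run one optimizer step per sampled
    # minibatch and report the loss periodically.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(config.num_steps):  # config.num_steps is assumed
            captions, features, _ = sample_coco_minibatch(
                data, batch_size=config.batch_size, split='train')
            _, loss = sess.run(
                [model.train_op, model.loss],
                feed_dict={model.features: features, model.captions: captions})
            if step % 100 == 0:
                print('step %d: loss = %.3f' % (step, loss))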