Example #1
    def __init__(self, data, max_train_data=2000, pca=True, hidden_dim=512, wordvec_dim=256, num_epochs=50, batch_size=50, lr=5e-3, lr_decay=0.997):
        
        np.random.seed(231)

        self.small_data = load_coco_data(max_train=max_train_data, pca_features=pca)

        self.small_lstm_model = CaptioningRNN(
            cell_type='lstm',
            word_to_idx=data['word_to_idx'],
            input_dim=data['train_features'].shape[1],
            hidden_dim=hidden_dim,
            wordvec_dim=wordvec_dim,
            dtype=np.float32,
        )
        
        self.small_lstm_solver = CaptioningSolver(
            self.small_lstm_model, self.small_data,
            update_rule='adam',
            num_epochs=num_epochs,    # was hardcoded to 50, ignoring the parameter
            batch_size=batch_size,    # was hardcoded to 50, ignoring the parameter
            optim_config={
                'learning_rate': lr,
            },
            lr_decay=lr_decay,
            verbose=True, print_every=10,
        )
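
    def train(self):
        # Hypothetical companion method (not in the original snippet): the
        # constructor above builds the solver but never runs it, so something
        # has to trigger training; a minimal sketch might look like this.
        self.small_lstm_solver.train()
        return self.small_lstm_solver.loss_history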
Example #2
def overfit_small_data():
    """
    Similar to the Solver class that we used to train image classification models in the
    previous assignment, in this assignment we use a CaptioningSolver class to train
    image captioning models. Open the file cs231n/captioning_solver.py and read through
    the CaptioningSolver class; it should look very familiar.

    Once you have familiarized yourself with the API, run the following to make sure your
    model overfits a small sample of 50 training examples. You should see a final loss
    of less than 0.1.
    """
    np.random.seed(231)

    small_data = load_coco_data(max_train=50)

    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=data['word_to_idx'],
        input_dim=data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )
    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )
    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        gt_captions, features, urls = sample_coco_minibatch(small_data,
                                                            split=split,
                                                            batch_size=2)
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
Example #3
def overfit_lstm_captioning_model():
    """You should see a final loss less than 0.5."""
    np.random.seed(231)

    small_data = load_coco_data(max_train=50)

    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=data['word_to_idx'],
        input_dim=data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )

    small_lstm_solver = CaptioningSolver(
        small_lstm_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )

    small_lstm_solver.train()

    # Plot the training losses
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])

        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
Example #4
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))
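
# A quick sanity check of rel_error (illustration only, not in the original
# listing): two nearly identical arrays should give a tiny relative error.
a = np.linspace(1, 2, 10)
b = a + 1e-9
print(rel_error(a, b))  # ~5e-10: effectively equal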

# Load COCO data from disk; this returns a dictionary
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

np.random.seed(231)

N, D, H = 4, 5, 6
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
prev_c = np.random.randn(N, H)
Wx = np.random.randn(D, 4 * H)
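
# The setup above stops at Wx; a plausible completion for exercising the LSTM
# step (assuming lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b) from
# cs231n.rnn_layers is imported, as elsewhere in the assignment):
Wh = np.random.randn(H, 4 * H)
b = np.random.randn(4 * H)

next_h, next_c, _ = lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b)
print(next_h.shape, next_c.shape)  # both should be (N, H)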
Example #5
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
#%load_ext autoreload
#%autoreload 2


def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


# Load COCO data from disk; this returns a dictionary
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)
#
# Print out all the keys and values from the data dictionary
#for k, v in data.items():
#  if type(v) == np.ndarray:
#    print(k, type(v), v.shape, v.dtype)
#  else:
#    print(k, type(v), len(v))
#
#N, D, H = 3, 4, 5
#x = np.linspace(-0.4, 1.2, num=N*D).reshape(N, D)
#prev_h = np.linspace(-0.3, 0.7, num=N*H).reshape(N, H)
#prev_c = np.linspace(-0.4, 0.9, num=N*H).reshape(N, H)
#Wx = np.linspace(-2.1, 1.3, num=4*D*H).reshape(D, 4 * H)
#Wh = np.linspace(-0.7, 2.2, num=4*H*H).reshape(H, 4 * H)
#b = np.linspace(0.3, 0.7, num=4*H)
Example #6
# We have preprocessed the data and extracted features for you already. For all images we have extracted features from the fc7 layer of the VGG-16 network pretrained on ImageNet; these features are stored in the files `train2014_vgg16_fc7.h5` and `val2014_vgg16_fc7.h5` respectively. To cut down on processing time and memory requirements, we have reduced the dimensionality of the features from 4096 to 512; these features can be found in the files `train2014_vgg16_fc7_pca.h5` and `val2014_vgg16_fc7_pca.h5`.
# 
# The raw images take up a lot of space (nearly 20GB), so we have not included them in the download. However, all images are taken from Flickr, and URLs of the training and validation images are stored in the files `train2014_urls.txt` and `val2014_urls.txt` respectively. This allows you to download images on the fly for visualization. Since images are downloaded on-the-fly, **you must be connected to the internet to view images**.
# 
# Dealing with strings is inefficient, so we will work with an encoded version of the captions. Each word is assigned an integer ID, allowing us to represent a caption by a sequence of integers. The mapping between integer IDs and words is in the file `coco2014_vocab.json`, and you can use the function `decode_captions` from the file `cs231n/coco_utils.py` to convert numpy arrays of integer IDs back into strings.
# 
# There are a couple of special tokens that we add to the vocabulary. We prepend a special `<START>` token and append an `<END>` token to the beginning and end of each caption respectively. Rare words are replaced with a special `<UNK>` token (for "unknown"). In addition, since we want to train with minibatches containing captions of different lengths, we pad short captions with a special `<NULL>` token after the `<END>` token and don't compute loss or gradient for `<NULL>` tokens. Since they are a bit of a pain, we have taken care of all implementation details around special tokens for you.
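# 
# As a quick illustration (not from the original notebook; the token IDs below
# are made up, the real mapping lives in `coco2014_vocab.json`), a padded,
# encoded caption and its decoding might look like:
# 
#   caption = data['train_captions'][0]  # e.g. array([1, 4, 537, 23, 2, 0, 0])
#   print(decode_captions(caption, data['idx_to_word']))
#   # -> something like '<START> a man riding a horse <END>'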
# 
# You can load all of the MS-COCO data (captions, features, URLs, and vocabulary) using the `load_coco_data` function from the file `cs231n/coco_utils.py`. Run the following cell to do so:

# In[ ]:

# Load COCO data from disk; this returns a dictionary
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))


# ## Look at the data
# It is always a good idea to look at examples from the dataset before working with it.
# 
# You can use the `sample_coco_minibatch` function from the file `cs231n/coco_utils.py` to sample minibatches of data from the data structure returned from `load_coco_data`. Run the following to sample a small minibatch of training data and show the images and their captions. Running it multiple times and looking at the results helps you to get a sense of the dataset.
# 
# Note that we decode the captions using the `decode_captions` function and that we download the images on-the-fly using their Flickr URL, so **you must be connected to the internet to view images**.
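
# In[ ]:

# A sketch of the sampling cell the text above describes (not in this excerpt;
# it assumes matplotlib and the cs231n helpers are already imported, and
# batch_size=3 is an arbitrary choice):
batch_size = 3
captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for caption, url in zip(captions, urls):
    plt.imshow(image_from_url(url))
    plt.axis('off')
    caption_str = decode_captions(caption, data['idx_to_word'])
    plt.title(caption_str)
    plt.show()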
Example #7
#   sample_captions = small_lstm_model.sample(features)
#   sample_captions = decode_captions(sample_captions, data['idx_to_word'])
#
#   for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
#     plt.imshow(image_from_url(url))
#     plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
#     plt.axis('off')
#     plt.show()

from cs231n.rnn_layers import *
from cs231n.captioning_solver import CaptioningSolver
from cs231n.classifiers.rnn import CaptioningRNN
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url


def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


######################################################################################
# Train a good captioning model!
# To relieve overfitting:
# (1) increase max_train from 50 to 500
# (2) decrease num_epochs from 50 to 25

small_data2 = load_coco_data(max_train=500)  # test

# Load COCO data from disk; this returns a dictionary
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

good_model = CaptioningRNN(
    cell_type='lstm',
    word_to_idx=data['word_to_idx'],
    input_dim=data['train_features'].shape[1],
    hidden_dim=512,   # assumed: the original snippet was cut off here;
    wordvec_dim=256,  # values mirror the other examples in this listing
    dtype=np.float32,
)

# Toy inputs for numerically checking the LSTM step function
N, D, H = 3, 4, 5
x = np.linspace(-0.4, 1.2, num=N*D).reshape(N, D)
prev_h = np.linspace(-0.3, 0.7, num=N*H).reshape(N, H)
prev_c = np.linspace(-0.4, 0.9, num=N*H).reshape(N, H)
Wx = np.linspace(-2.1, 1.3, num=4*D*H).reshape(D, 4 * H)
Wh = np.linspace(-0.7, 2.2, num=4*H*H).reshape(H, 4 * H)
Example #9
File: main.py Project: StranotDong/RNN_tf
import matplotlib.pyplot as plt

from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.rnn_layers import *
from cs231n.captioning_solver import CaptioningSolver
from cs231n.classifiers.rnn import CaptioningRNN
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url

from configuration import Config
from RNN_image_cap import RNN_image_cap

# Load COCO data from disk; this returns a dictionary
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(max_train=50, pca_features=True)
captions = data['train_captions']
image_idxs = data['train_image_idxs']
image_features = data['train_features'][image_idxs]
word_to_idx = data['word_to_idx']

# Print out all the keys and values from the data dictionary
# for k, v in data.items():
#     if type(v) == np.ndarray:
#         print(k, type(v), v.shape, v.dtype)
#     else:
#         print(k, type(v), len(v))

config = Config()
# print(vars(config).items())
path = 'trained_model/test.ckpt'
Example #10
    # return params, forward, sample


def step(loss, optimizer):
    # loss /= batch_size
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


'''
training
'''

data = load_coco_data(max_train=10)
print(data['train_features'].shape)
print(data['val_features'].shape)

num_epochs = 100
batch_size = 25
num_train, _ = data['train_captions'].shape
_, feature_dim = data['train_features'].shape
hidden_dim = 512
wordvec_dim = 256
vocab_size = len(data['word_to_idx'])
# print(vocab_size)

iterations_per_epoch = max(num_train // batch_size, 1)
num_iterations = num_epochs * iterations_per_epoch
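
# A sketch of the loop these quantities feed. The original snippet stops
# before defining the model, so the tiny placeholder below (a single linear
# layer producing a scalar "loss") only illustrates the plumbing, not a real
# captioning objective.
import torch

model = torch.nn.Linear(feature_dim, 1)                     # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)   # assumed optimizer

for it in range(num_iterations):
    # sample a random minibatch of image features
    mask = np.random.choice(num_train, batch_size)
    idxs = data['train_image_idxs'][mask]
    features = torch.from_numpy(data['train_features'][idxs]).float()

    loss = model(features).mean()  # placeholder objective
    step(loss, optimizer)          # reuses the helper defined above
    if it % iterations_per_epoch == 0:
        print('epoch %d, loss %.4f' % (it // iterations_per_epoch, loss.item()))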
Example #11
#get_ipython().magic(u'autoreload 2')

def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


## # Load MS-COCO data
## As in the previous notebook, we will use the Microsoft COCO dataset for captioning.
#
## In[ ]:
#
## Load COCO data from disk; this returns a dictionary
## We'll work with dimensionality-reduced features for this notebook, but feel
## free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
  if type(v) == np.ndarray:
    print(k, type(v), v.shape, v.dtype)
  else:
    print(k, type(v), len(v))


## # LSTM
## If you read recent papers, you'll see that many people use a variant on the
#vanilla RNN called Long Short-Term Memory (LSTM) RNNs. Vanilla RNNs can be tough
#to train on long sequences due to vanishing and exploding gradients caused by repeated
#matrix multiplication. LSTMs solve this problem by replacing the simple update rule of
#the vanilla RNN with a gating mechanism as follows.
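
# The gating formulas referred to above were cut off in this export; here is a
# minimal NumPy sketch of the standard LSTM step, matching the assignment's
# layout where the activation vector has shape (N, 4H) and splits into the
# four gates i, f, o, g:

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def lstm_step(x, prev_h, prev_c, Wx, Wh, b):
    H = prev_h.shape[1]
    a = x.dot(Wx) + prev_h.dot(Wh) + b  # activation vector, shape (N, 4H)
    i = sigmoid(a[:, :H])               # input gate
    f = sigmoid(a[:, H:2*H])            # forget gate
    o = sigmoid(a[:, 2*H:3*H])          # output gate
    g = np.tanh(a[:, 3*H:])             # block input
    next_c = f * prev_c + i * g         # new cell state
    next_h = o * np.tanh(next_c)        # new hidden state
    return next_h, next_c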
Example #12
#=======Change These===============================
max_train = 400000
epoch = 10
lr = 1e-5
reg = 1e-5
word_vector_size = 50
result_file = 'coco_train_result' + '_lr' + str(lr) + '_reg' + str(reg) + '-2'
save_model_file = 'coco_bestModel' + '_lr' + str(lr) + '_reg' + str(reg) + '-2'
save_model_file2 = 'coco_lastestModel' + '_lr' + str(lr) + '_reg' + str(
    reg) + '-2'

loadModel = True  #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
loadModel_file = 'coco_lastestModel_lr0.001_reg1e-05-1'
#==================================================

data = load_coco_data(pca_features=False, max_train=max_train)
input_size = data['train_features'].shape[1]
maxLen = data['train_captions'].shape[1]
wordLs = []

for caption in data['train_captions']:
    for word in caption:
        wordLs.append(word)
voc_size = len(set(wordLs))  # number of distinct token IDs in the training captions

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))
Example #13
from cs231n.rnn_layers import *
from cs231n.captioning_solver import CaptioningSolver
from cs231n.classifiers.rnn import CaptioningRNN
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().magic(u'load_ext autoreload')
get_ipython().magic(u'autoreload 2')

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

data = load_coco_data(pca_features=True)

big_coco_data = load_coco_data(max_train=500)

big_lstm_model = CaptioningRNN(
    cell_type='lstm',
    word_to_idx=data['word_to_idx'],
    input_dim=data['train_features'].shape[1],
    hidden_dim=1024,
    wordvec_dim=512,
    dtype=np.float32,
)

big_lstm_solver = CaptioningSolver(big_lstm_model, big_coco_data,
                                   update_rule='adam',
                                   num_epochs=30,