Example #1
import numpy as np

from cs231n.rnn_layers import lstm_forward, lstm_backward
from cs231n.gradient_check import eval_numerical_gradient_array


def rel_error(x, y):
    """Relative error helper, as defined in the CS231n notebooks."""
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


np.random.seed(231)

N, D, T, H = 2, 3, 10, 6  # batch size, input dim, sequence length, hidden dim

x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
# LSTM weights have 4 * H columns: one block for each of the four gates.
Wx = np.random.randn(D, 4 * H)
Wh = np.random.randn(H, 4 * H)
b = np.random.randn(4 * H)

# Forward pass over the whole sequence, then backprop a random upstream gradient.
out, cache = lstm_forward(x, h0, Wx, Wh, b)

dout = np.random.randn(*out.shape)

dx, dh0, dWx, dWh, db = lstm_backward(dout, cache)

# Closures that vary a single argument, for numerical gradient checking.
fx = lambda x: lstm_forward(x, h0, Wx, Wh, b)[0]
fh0 = lambda h0: lstm_forward(x, h0, Wx, Wh, b)[0]
fWx = lambda Wx: lstm_forward(x, h0, Wx, Wh, b)[0]
fWh = lambda Wh: lstm_forward(x, h0, Wx, Wh, b)[0]
fb = lambda b: lstm_forward(x, h0, Wx, Wh, b)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)
dh0_num = eval_numerical_gradient_array(fh0, h0, dout)
dWx_num = eval_numerical_gradient_array(fWx, Wx, dout)
dWh_num = eval_numerical_gradient_array(fWh, Wh, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)

print('dx error: ', rel_error(dx_num, dx))
print('dh0 error: ', rel_error(dh0_num, dh0))
print('dWx error: ', rel_error(dWx_num, dWx))
print('dWh error: ', rel_error(dWh_num, dWh))
print('db error: ', rel_error(db_num, db))
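For reference, the quantity being checked above is the standard LSTM recurrence. Below is a minimal single-timestep sketch, assuming the gate layout the assignment uses (input, forget, output, and candidate gates stacked along the 4 * H axis); it is an illustration, not the assignment's lstm_step_forward:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_sketch(x, prev_h, prev_c, Wx, Wh, b):
    # Hypothetical helper, for illustration only.
    H = prev_h.shape[1]
    # All four gate pre-activations at once: shape (N, 4 * H).
    a = x.dot(Wx) + prev_h.dot(Wh) + b
    i = sigmoid(a[:, :H])            # input gate
    f = sigmoid(a[:, H:2 * H])       # forget gate
    o = sigmoid(a[:, 2 * H:3 * H])   # output gate
    g = np.tanh(a[:, 3 * H:])        # candidate cell values
    next_c = f * prev_c + i * g      # blend old cell state with candidates
    next_h = o * np.tanh(next_c)     # hidden state is the gated cell state
    return next_h, next_c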
Example #2
    def loss(self, features, captions):
        """
        Compute training-time loss for the RNN. We input image features and
        ground-truth captions for those images, and use an RNN (or LSTM) to compute
        loss and gradients on all parameters.

        Inputs:
        - features: Input image features, of shape (N, D)
        - captions: Ground-truth captions; an integer array of shape (N, T) where
          each element is in the range 0 <= captions[i, t] < V

        Returns a tuple of:
        - loss: Scalar loss
        - grads: Dictionary of gradients parallel to self.params
        """
        # Cut captions into two pieces: captions_in has everything but the last word
        # and will be input to the RNN; captions_out has everything but the first
        # word and this is what we will expect the RNN to generate. These are offset
        # by one relative to each other because the RNN should produce word (t+1)
        # after receiving word t. The first element of captions_in will be the START
        # token, and the first element of captions_out will be the first word.
        captions_in = captions[:, :-1]
        captions_out = captions[:, 1:]
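        # For example, if captions[i] is [<START>, w1, w2, <END>], then
        # captions_in[i] is [<START>, w1, w2] and captions_out[i] is
        # [w1, w2, <END>].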

        # Mask of positions where the target word is not <NULL>; the temporal
        # softmax loss uses this to ignore padding.
        mask = (captions_out != self._null)

        # Weight and bias for the affine transform from image features to initial
        # hidden state
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

        # Word embedding matrix
        W_embed = self.params['W_embed']

        # Input-to-hidden, hidden-to-hidden, and biases for the RNN
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

        # Weight and bias for the hidden-to-vocab transformation
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the forward and backward passes for the CaptioningRNN.   #
        # In the forward pass you will need to do the following:                   #
        # (1) Use an affine transformation to compute the initial hidden state     #
        #     from the image features. This should produce an array of shape (N, H)#
        # (2) Use a word embedding layer to transform the words in captions_in     #
        #     from indices to vectors, giving an array of shape (N, T, W).         #
        # (3) Use either a vanilla RNN or LSTM (depending on self.cell_type) to    #
        #     process the sequence of input word vectors and produce hidden state  #
        #     vectors for all timesteps, producing an array of shape (N, T, H).    #
        # (4) Use a (temporal) affine transformation to compute scores over the    #
        #     vocabulary at every timestep using the hidden states, giving an      #
        #     array of shape (N, T, V).                                            #
        # (5) Use (temporal) softmax to compute loss using captions_out, ignoring  #
        #     the points where the output word is <NULL> using the mask above.     #
        #                                                                          #
        # In the backward pass you will need to compute the gradient of the loss   #
        # with respect to all model parameters. Use the loss and grads variables   #
        # defined above to store loss and gradients; grads[k] should give the      #
        # gradients for self.params[k].                                            #
        ############################################################################
        N, _ = features.shape

        # (1) Affine transform from image features to the initial hidden state.
        h0 = np.dot(features, W_proj) + b_proj
        # (2) Embed the input caption words: (N, T) indices -> (N, T, W) vectors.
        word_vec, word_vec_cache = word_embedding_forward(captions_in, W_embed)
        # (3) Run the recurrent network over the embedded captions.
        if self.cell_type == 'rnn':
            h, h_cache = rnn_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)
        elif self.cell_type == 'lstm':
            h, h_cache = lstm_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)
        else:
            raise ValueError('Unknown cell_type "%s"' % self.cell_type)
        # (4) Project hidden states to vocabulary scores: (N, T, H) -> (N, T, V).
        out, out_cache = temporal_affine_forward(x=h, w=W_vocab, b=b_vocab)
        # (5) Softmax loss over captions_out, ignoring <NULL> positions via mask.
        loss, dx = temporal_softmax_loss(x=out, y=captions_out, mask=mask, verbose=False)

        # Backward pass: mirror the forward steps in reverse order.
        dh, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dx, out_cache)
        if self.cell_type == 'rnn':
            dword_vec, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dh, h_cache)
        else:
            dword_vec, dh0, grads['Wx'], grads['Wh'], grads['b'] = lstm_backward(dh, h_cache)
        grads['W_embed'] = word_embedding_backward(dword_vec, word_vec_cache)
        # Gradient of the initial affine projection, computed by hand since
        # h0 was computed inline rather than via affine_forward.
        grads['W_proj'] = np.dot(features.T, dh0)
        grads['b_proj'] = np.sum(dh0, axis=0)
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
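As a quick smoke test, loss() can be exercised on small random data, in the spirit of the sanity checks in the CS231n notebook. The CaptioningRNN module path and constructor arguments below are assumptions based on the assignment scaffolding:

import numpy as np
from cs231n.classifiers.rnn import CaptioningRNN  # assumed module path

N, D, W, H = 10, 20, 30, 40
word_to_idx = {'<NULL>': 0, '<START>': 1, '<END>': 2, 'cat': 3, 'dog': 4}
V = len(word_to_idx)
T = 13

# Constructor signature assumed from the assignment scaffolding.
model = CaptioningRNN(word_to_idx, input_dim=D, wordvec_dim=W,
                      hidden_dim=H, cell_type='rnn')

features = np.random.randn(N, D)
captions = (np.arange(N * T) % V).reshape(N, T)

loss, grads = model.loss(features, captions)
print('loss: ', loss)
print('grads computed for: ', sorted(grads.keys()))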