from cs231n.rnn_layers import lstm_forward, lstm_backward
import numpy as np  # NOTE(review): np was used but not visibly imported — presumably imported in an earlier cell; harmless to import here

# Numeric gradient check for the LSTM: compare analytic gradients from
# lstm_backward against numerical gradients of lstm_forward for every input.
np.random.seed(231)

N, D, T, H = 2, 3, 10, 6  # batch, input dim, timesteps, hidden dim

x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx = np.random.randn(D, 4 * H)
Wh = np.random.randn(H, 4 * H)
b = np.random.randn(4 * H)

out, cache = lstm_forward(x, h0, Wx, Wh, b)

dout = np.random.randn(*out.shape)
dx, dh0, dWx, dWh, db = lstm_backward(dout, cache)

# Each lambda varies one argument while holding the others fixed, so the
# numerical gradient is taken with respect to that argument alone.
fx = lambda x: lstm_forward(x, h0, Wx, Wh, b)[0]
fh0 = lambda h0: lstm_forward(x, h0, Wx, Wh, b)[0]
fWx = lambda Wx: lstm_forward(x, h0, Wx, Wh, b)[0]
fWh = lambda Wh: lstm_forward(x, h0, Wx, Wh, b)[0]
fb = lambda b: lstm_forward(x, h0, Wx, Wh, b)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)
dh0_num = eval_numerical_gradient_array(fh0, h0, dout)
dWx_num = eval_numerical_gradient_array(fWx, Wx, dout)
dWh_num = eval_numerical_gradient_array(fWh, Wh, dout)
db_num = eval_numerical_gradient_array(fb, b, dout)

# BUG FIX: dWx_num, dWh_num, db_num were computed but their errors were
# never printed — report all five relative errors, not just two.
print('dx error: ', rel_error(dx_num, dx))
print('dh0 error: ', rel_error(dh0_num, dh0))
print('dWx error: ', rel_error(dWx_num, dWx))
print('dWh error: ', rel_error(dWh_num, dWh))
print('db error: ', rel_error(db_num, db))
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN/LSTM captioning model.

    We input image features and ground-truth captions for those images, and
    use a recurrent network to compute loss and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T)
      where each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params

    Raises:
    - ValueError: if self.cell_type is neither 'rnn' nor 'lstm'.
    """
    # Cut captions into two pieces: captions_in has everything but the last
    # word and is fed to the RNN; captions_out has everything but the first
    # word and is what the RNN is expected to generate. They are offset by
    # one because the RNN should produce word (t+1) after receiving word t.
    # The first element of captions_in is the START token; the first element
    # of captions_out is the first real word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # Mask out positions where the target word is <NULL> (padding).
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to the
    # initial hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN/LSTM
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # Select the recurrent forward/backward pair once; the rest of the
    # computation is identical for both cell types (previously the whole
    # forward/backward pass was duplicated in each branch).
    if self.cell_type == 'rnn':
        recurrent_forward, recurrent_backward = rnn_forward, rnn_backward
    elif self.cell_type == 'lstm':
        recurrent_forward, recurrent_backward = lstm_forward, lstm_backward
    else:
        # Previously this printed 'Unknow type' and silently returned a
        # zero loss and empty grads; fail loudly instead.
        raise ValueError('Unknown cell_type "%s"' % self.cell_type)

    # Forward pass
    h0 = features.dot(W_proj) + b_proj                                   # (1) (N, H)
    word_vec, word_vec_cache = word_embedding_forward(captions_in, W_embed)  # (2) (N, T, W)
    h, h_cache = recurrent_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)     # (3) (N, T, H)
    out, out_cache = temporal_affine_forward(x=h, w=W_vocab, b=b_vocab)      # (4) (N, T, V)
    loss, dout = temporal_softmax_loss(x=out, y=captions_out, mask=mask,
                                       verbose=False)                        # (5)

    # Backward pass: propagate dout through each layer in reverse order.
    dh, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dout, out_cache)
    dword_vec, dh0, grads['Wx'], grads['Wh'], grads['b'] = recurrent_backward(dh, h_cache)
    grads['W_embed'] = word_embedding_backward(dword_vec, word_vec_cache)
    # The initial-state affine (h0 = features @ W_proj + b_proj) is
    # differentiated by hand here rather than via a cached layer.
    grads['W_proj'] = features.T.dot(dh0)
    grads['b_proj'] = dh0.sum(axis=0)

    return loss, grads