# Report relative errors for the single-step LSTM backward pass
# (numeric vs. analytic gradients computed in an earlier cell).
print('dx error: ', rel_error(dx_num, dx))
print('dh error: ', rel_error(dh_num, dh))
print('dc error: ', rel_error(dc_num, dc))
print('dWx error: ', rel_error(dWx_num, dWx))
print('dWh error: ', rel_error(dWh_num, dWh))
print('db error: ', rel_error(db_num, db))

# Full-sequence LSTM forward check: small deterministic inputs built
# with linspace so the expected hidden states are reproducible.
N, D, H, T = 2, 5, 4, 3
x = np.linspace(-0.4, 0.6, num=N*T*D).reshape(N, T, D)
h0 = np.linspace(-0.4, 0.8, num=N*H).reshape(N, H)
# LSTM weights pack the four gates side by side, hence the 4*H columns.
Wx = np.linspace(-0.2, 0.9, num=4*D*H).reshape(D, 4 * H)
Wh = np.linspace(-0.3, 0.6, num=4*H*H).reshape(H, 4 * H)
b = np.linspace(0.2, 0.7, num=4*H)

h, cache = lstm_forward(x, h0, Wx, Wh, b)

# Reference hidden states for the inputs above, shape (N, T, H).
expected_h = np.asarray([
 [[ 0.01764008, 0.01823233, 0.01882671, 0.0194232 ],
  [ 0.11287491, 0.12146228, 0.13018446, 0.13902939],
  [ 0.31358768, 0.33338627, 0.35304453, 0.37250975]],
 [[ 0.45767879, 0.4761092, 0.4936887, 0.51041945],
  [ 0.6704845, 0.69350089, 0.71486014, 0.7346449 ],
  [ 0.81733511, 0.83677871, 0.85403753, 0.86935314]]])
print('h error: ', rel_error(expected_h, h))

# Imports and seed for the lstm_backward check that follows in the next cell.
from cs231n.rnn_layers import lstm_forward, lstm_backward
np.random.seed(231)
def loss(self, features, captions):
    """
    Compute training-time loss for the captioning RNN/LSTM.

    We input image features and ground-truth captions for those images, and
    use an RNN (or LSTM) to compute loss and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T)
      where each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params

    Raises:
    - ValueError: if self.cell_type is neither 'rnn' nor 'lstm'.
    """
    # Cut captions into two pieces: captions_in has everything but the last
    # word and will be input to the RNN; captions_out has everything but the
    # first word and is what we expect the RNN to generate. These are offset
    # by one relative to each other because the RNN should produce word (t+1)
    # after receiving word t. The first element of captions_in will be the
    # START token, and the first element of captions_out will be the first
    # real word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # Mask out positions where the target word is <NULL> (padding).
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to the
    # initial hidden state.
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix.
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN.
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # Fail loudly on an unsupported cell type instead of silently returning
    # zero loss and empty gradients.
    if self.cell_type not in ('rnn', 'lstm'):
        raise ValueError('Invalid cell_type "%s"' % self.cell_type)

    # Forward pass.
    # (1) Affine transform: image features -> initial hidden state, (N, H).
    h0 = np.dot(features, W_proj) + b_proj
    # (2) Word embedding: caption indices -> vectors, (N, T, W).
    word_vec, word_vec_cache = word_embedding_forward(captions_in, W_embed)
    # (3) Recurrent net over the sequence -> hidden states, (N, T, H).
    # Only this step differs between the two cell types; everything else
    # below is shared rather than duplicated per branch.
    if self.cell_type == 'rnn':
        h, h_cache = rnn_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)
    else:
        h, h_cache = lstm_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)
    # (4) Temporal affine: hidden states -> vocabulary scores, (N, T, V).
    out, out_cache = temporal_affine_forward(x=h, w=W_vocab, b=b_vocab)
    # (5) Temporal softmax loss over captions_out, ignoring <NULL> targets.
    loss, dx = temporal_softmax_loss(x=out, y=captions_out, mask=mask,
                                     verbose=False)

    # Backward pass, mirroring the forward steps in reverse.
    dh, grads['W_vocab'], grads['b_vocab'] = \
        temporal_affine_backward(dx, out_cache)
    if self.cell_type == 'rnn':
        dcaption_in, dh0, grads['Wx'], grads['Wh'], grads['b'] = \
            rnn_backward(dh, h_cache)
    else:
        dcaption_in, dh0, grads['Wx'], grads['Wh'], grads['b'] = \
            lstm_backward(dh, h_cache)
    grads['W_embed'] = word_embedding_backward(dcaption_in, word_vec_cache)
    # Gradient of the initial affine projection, by hand since the forward
    # step (1) was written inline rather than via affine_forward.
    grads['W_proj'] = np.dot(features.T, dh0)
    grads['b_proj'] = np.sum(dh0, axis=0)

    return loss, grads