def test_output_dropout_forward(self):
    print_test("Testing dropout forward function:")
    seed = 395
    np.random.seed(seed)
    p = 0.7
    x = np.linspace(-0.7, 0.5, num=10).reshape(2, 5)

    out_train, _ = layers.dropout_forward(x, p=p, seed=seed, train=True)
    out_test, _ = layers.dropout_forward(x, p=p, seed=seed, train=False)

    correct_out = np.asarray([[-0., -0., -0., -1., -0.],
                              [-0., 0., 0., 0., 0.]])

    e_train = rel_error(out_train, correct_out)
    e_test = rel_error(out_test, x)
    print("Relative difference train:", e_train)
    print("Relative difference test:", e_test)
    self.assertTrue(e_train < 1e-12)
    self.assertTrue(e_test < 1e-12)
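# The hard-coded `correct_out` above is consistent with inverted dropout in
# which `p` is the drop probability: the only entry of `x` that survives the
# mask is x[0, 3] (approximately -0.3), rescaled by 1 / (1 - p) = 1 / 0.3,
# which gives the -1.0 in the expected output. A quick check of that
# arithmetic:

import numpy as np

x = np.linspace(-0.7, 0.5, num=10).reshape(2, 5)
p = 0.7
print(x[0, 3])            # ~ -0.3, the only entry kept in correct_out
print(x[0, 3] / (1 - p))  # ~ -1.0, matching correct_out[0, 3]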
def test_output_dropout_backward(self):
    print_test("Testing dropout backward function:")
    seed = 395
    np.random.seed(seed)
    x = np.random.randn(16, 16) + 8
    dout = np.random.randn(*x.shape)
    p = 0.7

    # Test for train
    out, mask = layers.dropout_forward(x, train=True, p=p, seed=seed)
    dx = layers.dropout_backward(dout, mask, p=p, train=True)
    dx_num = eval_numerical_gradient_array(
        lambda xx: layers.dropout_forward(xx, p=p, train=True, seed=seed)[0],
        x, dout)
    e_train = rel_error(dx, dx_num)
    print('dx train relative error: ', e_train)
    self.assertTrue(e_train <= 5e-11)

    # Test for test
    dx_test = layers.dropout_backward(dout, mask, train=False, p=p)
    e_test = rel_error(dout, dx_test)
    print('dx test relative error: ', e_test)
    self.assertTrue(e_test <= 1e-12)
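# The two tests above pin down the contract of layers.dropout_forward and
# layers.dropout_backward without showing their bodies. The sketch below is a
# minimal implementation consistent with those call sites, assuming inverted
# dropout with `p` as the drop probability; the internals (in particular how
# the mask is drawn) are an assumption, so it is not guaranteed to reproduce
# the hard-coded `correct_out` values used by the forward test.

import numpy as np

def dropout_forward(X, p=0.5, train=True, seed=None):
    """Sketch of inverted dropout; `p` is the probability of dropping a unit."""
    if seed is not None:
        np.random.seed(seed)
    if train:
        # Keep each unit with probability (1 - p) and rescale so that the
        # expected activation matches the identity behaviour at test time.
        mask = (np.random.rand(*X.shape) >= p) / (1 - p)
        out = X * mask
    else:
        mask = None
        out = X
    return out, mask

def dropout_backward(dout, mask, p=0.5, train=True):
    """Gradient of the sketch above: dropout is the identity at test time."""
    return dout * mask if train else dout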
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    hidden_num = self.num_layers - 1
    scores = X
    cache_history = []
    L2reg = 0
    # Input to the first linear layer.
    cache_history.append(scores)

    for i in range(hidden_num):
        # Linear -> ReLU (-> dropout) for every hidden layer.
        scores = linear_forward(scores, self.params['W%d' % (i + 1)],
                                self.params['b%d' % (i + 1)])
        cache_history.append(scores)      # pre-activation, for relu_backward
        scores = relu_forward(scores)
        if self.use_dropout:
            scores, cache = dropout_forward(scores,
                                            self.dropout_params["p"],
                                            self.dropout_params["train"],
                                            self.dropout_params["seed"])
            cache_history.append(cache)   # dropout mask, for dropout_backward
        cache_history.append(scores)      # input seen by the next linear layer
        L2reg += np.sum(self.params['W%d' % (i + 1)]**2)

    # Final (output) linear layer.
    i += 1
    scores = linear_forward(scores, self.params['W%d' % (i + 1)],
                            self.params['b%d' % (i + 1)])
    L2reg += np.sum(self.params['W%d' % (i + 1)]**2)
    L2reg *= 0.5 * self.reg
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    loss, dout = softmax(scores, y)
    loss += L2reg

    # Output layer: the top of cache_history is the input to the final linear.
    dout, grads['W%d' % (i + 1)], grads['b%d' % (i + 1)] = linear_backward(
        dout, cache_history.pop(), self.params['W%d' % (i + 1)],
        self.params['b%d' % (i + 1)])
    grads['W%d' % (i + 1)] += self.reg * self.params['W%d' % (i + 1)]
    i -= 1

    while i >= 0:
        if self.use_dropout:
            dout = dropout_backward(dout, cache_history.pop(),
                                    self.dropout_params["p"],
                                    self.dropout_params["train"])
        dout = relu_backward(dout, cache_history.pop())
        dout, grads['W%d' % (i + 1)], grads['b%d' % (i + 1)] = linear_backward(
            dout, cache_history.pop(), self.params['W%d' % (i + 1)],
            self.params['b%d' % (i + 1)])
        grads['W%d' % (i + 1)] += self.reg * self.params['W%d' % (i + 1)]
        i -= 1
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    inp = X
    for i in range(1, self.num_layers):
        W, b = self.params["W%d" % i], self.params["b%d" % i]
        linOut = linear_forward(inp, W, b)
        linear_cache["O%d" % i] = inp        # input to this linear layer
        reluOut = relu_forward(linOut)
        relu_cache["O%d" % i] = linOut       # pre-activation, for relu_backward
        inp = reluOut
        if self.use_dropout:
            dropOut, dropMask = dropout_forward(reluOut, **self.dropout_params)
            dropout_cache["O%d" % i] = dropMask
            inp = dropOut

    W, b = self.params["W%d" % (i + 1)], self.params["b%d" % (i + 1)]
    scores = linear_forward(inp, W, b)
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    loss, dout = softmax(scores, y)
    for k in range(1, self.num_layers + 1):
        loss += 0.5 * self.reg * (self.params["W%d" % k]**2).sum()

    # Output layer: `inp`, `W` and `b` still refer to the final linear layer.
    dX, dW, db = linear_backward(dout, inp, W, b)
    grads["W%d" % (i + 1)], grads["b%d" % (i + 1)] = dW + self.reg * W, db

    for i in range(self.num_layers - 1, 0, -1):
        if self.use_dropout:
            dX = dropout_backward(dX, dropout_cache["O%d" % i],
                                  **self.dropout_params)
        reluBack = relu_backward(dX, relu_cache["O%d" % i])
        W, b = self.params["W%d" % i], self.params["b%d" % i]
        dX, dW, db = linear_backward(reluBack, linear_cache["O%d" % i], W, b)
        grads["W%d" % i], grads["b%d" % i] = dW + self.reg * W, db
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    x_in = X

    # If y is None then we are in test mode so just return scores
    if y is None:
        for i in range(1, self.num_layers):
            linear_out = linear_forward(x_in, self.params['W' + str(i)],
                                        self.params['b' + str(i)])
            relu_out = relu_forward(linear_out)
            x_in = relu_out
        logits = linear_forward(x_in, self.params['W' + str(self.num_layers)],
                                self.params['b' + str(self.num_layers)])
        scores = softmax(logits, y)
        # Test mode: return the scores without a backward pass.
        return scores
    else:
        self.last_grads_ = copy.deepcopy(self.grads_)
        loss, grads = 0, dict()
        self.grads_ = grads

        for i in range(1, self.num_layers):
            linear_out = linear_forward(x_in, self.params['W' + str(i)],
                                        self.params['b' + str(i)])
            linear_cache["out" + str(i)] = linear_out
            relu_out = relu_forward(linear_out)
            relu_cache["out" + str(i)] = relu_out
            dropout_out = relu_out
            if self.use_dropout:
                seed = 42 if self.dropout_params["seed"] is None \
                    else self.dropout_params["seed"]
                dropout_out, mask = dropout_forward(relu_out,
                                                    self.dropout_params["p"],
                                                    True, seed)
                dropout_cache["out" + str(i)] = dropout_out
                dropout_cache["mask" + str(i)] = mask
            x_in = dropout_out

        logits = linear_forward(x_in, self.params['W' + str(self.num_layers)],
                                self.params['b' + str(self.num_layers)])
        loss, dlogits = softmax(logits, y)
        ###################################################################
        #                         END OF YOUR CODE                        #
        ###################################################################

        # """
        # TODO: Implement the backward pass for the fully-connected net. Store
        # the loss in the loss variable and all gradients in the grads
        # dictionary. Compute the loss with softmax. grads[k] has the gradients
        # for self.params[k]. Add L2 regularisation to the loss function.
        # NOTE: To ensure that your implementation matches ours and you pass the
        # automated tests, make sure that your L2 regularization includes a
        # factor of 0.5 to simplify the expression for the gradient.
        # """
        ###################################################################
        #                        BEGIN OF YOUR CODE                       #
        ###################################################################
        dout = dlogits

        # Final (output) linear layer.
        if self.use_dropout:
            input_x = dropout_cache["out" + str(self.num_layers - 1)]
        else:
            input_x = relu_cache["out" + str(self.num_layers - 1)]
        W = self.params['W' + str(self.num_layers)]
        b = self.params['b' + str(self.num_layers)]
        dout, dW, db = linear_backward(dout, input_x, W, b)
        grads['W' + str(self.num_layers)] = dW
        grads['b' + str(self.num_layers)] = db

        # Hidden layers down to layer 2.
        for i in range(self.num_layers - 1, 1, -1):
            if self.use_dropout:
                mask = dropout_cache['mask' + str(i)]
                dout = dropout_backward(dout, mask,
                                        self.dropout_params["p"], True)
            input_x = linear_cache["out" + str(i)]
            dout = relu_backward(dout, input_x)
            if self.use_dropout:
                input_x = dropout_cache["out" + str(i - 1)]
            else:
                input_x = relu_cache["out" + str(i - 1)]
            W = self.params['W' + str(i)]
            b = self.params['b' + str(i)]
            dout, dW, db = linear_backward(dout, input_x, W, b)
            grads['W' + str(i)] = dW
            grads['b' + str(i)] = db

        # First layer takes X itself as its input.
        if self.use_dropout:
            mask = dropout_cache['mask' + str(1)]
            dout = dropout_backward(dout, mask,
                                    self.dropout_params["p"], True)
        input_x = linear_cache["out" + str(1)]
        dout = relu_backward(dout, input_x)
        W = self.params['W' + str(1)]
        b = self.params['b' + str(1)]
        dout, dW, db = linear_backward(dout, X, W, b)
        grads['W' + str(1)] = dW
        grads['b' + str(1)] = db

        # add L2 regularisation
        regularisation = 0.0
        for i in range(1, self.num_layers + 1):
            tempW = 0.5 * self.reg * np.square(self.params['W' + str(i)])
            regularisation += np.sum(tempW)
            grads['W' + str(i)] += self.reg * self.params['W' + str(i)]
        loss += regularisation
        ###################################################################
        #                          END OF YOUR CODE                       #
        ###################################################################
        return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    if self.use_dropout:
        self.dropout_params["train"] = False if y is None else True

    Xi = linear_cache['0'] = X
    if self.use_dropout:
        p, t = self.dropout_params["p"], self.dropout_params["train"]

    for i in range(self.num_layers):
        W, b = self.params['W' + str(i + 1)], self.params['b' + str(i + 1)]
        # relu_cache holds the pre-activation of layer i + 1.
        Xi = relu_cache[str(i)] = linear_forward(Xi, W, b)
        if i != self.num_layers - 1:
            Xi = relu_forward(Xi)
            if self.use_dropout:
                # dropout_forward returns (out, mask)
                Xi, dropout_cache[str(i)] = dropout_forward(Xi, p, t, None)
            # linear_cache holds the input the next linear layer actually sees
            # (the dropout output when dropout is used).
            linear_cache[str(i + 1)] = Xi
    scores = Xi

    # If y is None then we are in test mode, so just return scores
    if y is None:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    loss, dX = softmax(scores, y)
    for i in reversed(range(self.num_layers)):
        if i != self.num_layers - 1:
            if self.use_dropout:
                dX = dropout_backward(dX, dropout_cache[str(i)], p=p, train=t)
            dX = relu_backward(dX, relu_cache[str(i)])
        W, b = self.params['W' + str(i + 1)], self.params['b' + str(i + 1)]
        dX, dW, db = linear_backward(dX, linear_cache[str(i)], W, b)
        grads['W' + str(i + 1)], grads['b' + str(i + 1)] = \
            dW + self.reg * self.params['W' + str(i + 1)], db
        loss += 0.5 * self.reg * np.sum(W**2)

    return loss, grads
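# The layer primitives used by every variant (linear_forward, linear_backward,
# relu_forward, relu_backward) are defined elsewhere. The sketches below are
# assumptions reconstructed from the call sites only: linear_forward(X, W, b),
# linear_backward(dout, X, W, b) -> (dX, dW, db), and relu_backward(dout, X)
# where X is the cached pre-activation input.

import numpy as np

def linear_forward(X, W, b):
    # Flatten each example to a row vector before the affine transform.
    return X.reshape(X.shape[0], -1) @ W + b

def linear_backward(dout, X, W, b):
    dX = (dout @ W.T).reshape(X.shape)
    dW = X.reshape(X.shape[0], -1).T @ dout
    db = dout.sum(axis=0)
    return dX, dW, db

def relu_forward(X):
    return np.maximum(0, X)

def relu_backward(dout, X):
    # X is the input that was fed to the ReLU (the cached pre-activation).
    return dout * (X > 0)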
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    # get num of hidden layers (do not count the output layer)
    num_hidden_layers = self.num_layers - 1

    # store the first layer's input in the linear cache
    linear_cache[1] = X

    # iterate through all layers, including the class layer
    for i in range(1, self.num_layers + 1):
        # compute this layer's W and b keys
        Wkey, bkey = "W" + str(i), "b" + str(i)

        # perform linear pass; store the pre-activation in relu_cache
        relu_cache[i] = linear_forward(linear_cache[i], self.params[Wkey],
                                       self.params[bkey])

        # perform relu -> dropout for hidden layers only
        if i < self.num_layers:
            # perform ReLU
            relu_out = relu_forward(relu_cache[i])

            out = relu_out
            # perform dropout only when the network is configured to use it
            if self.use_dropout:
                out, mask = dropout_forward(relu_out,
                                            self.dropout_params["p"],
                                            self.dropout_params["train"],
                                            self.dropout_params["seed"])
                # add the mask to the dropout cache
                dropout_cache[i] = mask

            # cache this layer's output as input to the next layer in the
            # linear cache
            linear_cache[i + 1] = out

    # the final layer output is stored in the relu_cache since it did not go
    # through dropout or ReLU
    scores = relu_cache[self.num_layers]

    # if y is None then we are in test mode so just return scores
    if y is None:
        return scores

    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    # used to store gradients
    grads = dict()

    # perform softmax to get the loss and the gradient of the scores
    loss, dlogits = softmax(scores, y)

    # used to store the upstream derivative of the next layer
    dout = dlogits

    for i in range(self.num_layers, 0, -1):
        # compute this layer's W and b names
        Wkey, bkey = "W" + str(i), "b" + str(i)

        # add L2 regularisation: square each weight of this layer and add it
        # to the loss
        loss += 0.5 * self.reg * np.sum(self.params[Wkey]**2)

        if i < self.num_layers:
            # backprop through dropout only when it was applied
            if self.use_dropout:
                dout = dropout_backward(dout, dropout_cache[i],
                                        self.dropout_params["p"],
                                        self.dropout_params["train"])
            # backprop through ReLU
            dout = relu_backward(dout, relu_cache[i])

        # perform linear backward and store the gradients
        dX, dW, db = linear_backward(dout, linear_cache[i],
                                     self.params[Wkey], self.params[bkey])

        # d(E_0 + 0.5 * reg * W_all^2)/dW_i = d(E_0)/dW_i + reg * W_i
        # dW holds d(E_0)/dW_i, so we must add the reg * W_i term ourselves
        grads.update({Wkey: dW + self.reg * self.params[Wkey], bkey: db})

        # set dout equal to dX
        dout = dX

    return loss, grads
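# The reg * W term that every variant adds to dW comes from differentiating
# the 0.5 * reg * sum(W ** 2) penalty; the factor of 0.5 required by the NOTE
# in the docstring exists precisely so that the 2 from the square cancels. A
# quick numerical check of that identity on a single weight entry:

import numpy as np

reg, h = 0.1, 1e-6
W = np.random.randn(4, 3)
penalty = lambda M: 0.5 * reg * np.sum(M ** 2)
E = np.zeros_like(W)
E[1, 2] = h  # perturb a single entry
numeric = (penalty(W + E) - penalty(W - E)) / (2 * h)
print(numeric, reg * W[1, 2])  # the two agree up to floating point error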
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()
    out = X.copy()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    if y is None:
        train = False
    else:
        train = True

    scores = out
    for i in range(1, self.num_layers):
        linear_cache[str(i)] = scores
        scores = linear_forward(X=scores,
                                W=self.params['W' + str(i)],
                                b=self.params['b' + str(i)])
        relu_cache[str(i)] = scores
        scores = relu_forward(scores)
        if self.use_dropout:
            if "seed" in self.dropout_params:
                scores, mask = dropout_forward(
                    scores,
                    p=self.dropout_params["p"],
                    train=train,
                    seed=self.dropout_params["seed"])
            else:
                scores, mask = dropout_forward(scores,
                                               p=self.dropout_params["p"],
                                               train=train)
            dropout_cache[str(i)] = mask

    linear_cache[str(self.num_layers)] = scores
    scores = linear_forward(X=scores,
                            W=self.params['W' + str(self.num_layers)],
                            b=self.params['b' + str(self.num_layers)])
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    loss, dscore = softmax(scores, y)

    # add regularisation to the loss
    regularization = 0
    for i in range(1, self.num_layers + 1):
        regularization += np.sum(self.params['W' + str(i)]**2)
    loss += 0.5 * self.reg * regularization

    # backward through the last W and b
    dhidden_layer, grads['W' + str(self.num_layers)], grads['b' + str(self.num_layers)] = \
        linear_backward(dscore, linear_cache[str(self.num_layers)],
                        self.params['W' + str(self.num_layers)],
                        self.params['b' + str(self.num_layers)])
    # add regularisation to W
    grads['W' + str(self.num_layers)] += self.reg * self.params[
        'W' + str(self.num_layers)]

    # backward through all other layers
    for i in range(self.num_layers - 1, 0, -1):
        if self.use_dropout:
            dhidden_layer = dropout_backward(dhidden_layer,
                                             dropout_cache[str(i)],
                                             self.dropout_params["p"],
                                             self.dropout_params["train"])
        dhidden_layer = relu_backward(dhidden_layer, relu_cache[str(i)])
        dhidden_layer, grads['W' + str(i)], grads['b' + str(i)] = \
            linear_backward(dhidden_layer, linear_cache[str(i)],
                            self.params['W' + str(i)],
                            self.params['b' + str(i)])
        grads['W' + str(i)] += self.reg * self.params['W' + str(i)]
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    linear_cache[0] = relu_cache[0] = X
    # keep the same (out, mask) structure as dropout_forward for layer 0
    dropout_cache[0] = (X, None)

    for i in range(1, self.num_layers + 1):
        curr_id = str(i)
        W_id = 'W' + curr_id
        b_id = 'b' + curr_id

        # if current layer is not the output layer
        if i < self.num_layers:
            # linear transform
            if not self.use_dropout:
                prev_layer_output = relu_cache[i - 1]
            else:
                prev_layer_output = dropout_cache[i - 1][0]
            linear_cache[i] = linear_forward(prev_layer_output,
                                             self.params[W_id],
                                             self.params[b_id])

            # relu activation
            relu_cache[i] = relu_forward(linear_cache[i])

            # dropout regularisation; cache the (out, mask) pair
            if self.use_dropout:
                dropout_cache[i] = dropout_forward(
                    relu_cache[i], self.dropout_params['p'],
                    self.dropout_params['train'], self.dropout_params['seed'])

        # if current layer is the output layer
        else:
            # the output layer produces the network's estimate, scores
            if not self.use_dropout:
                prev_layer_output = relu_cache[i - 1]
            else:
                prev_layer_output = dropout_cache[i - 1][0]
            scores = linear_forward(prev_layer_output, self.params[W_id],
                                    self.params[b_id])
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    # use softmax to produce the loss and dlogits
    loss, dlogits = softmax(scores, y)

    # iterate backward, output layer to input layer via hidden layers
    for i in range(self.num_layers, 0, -1):
        # set variable names
        curr_id = str(i)
        W_id = 'W' + curr_id
        b_id = 'b' + curr_id

        # L2 regularisation
        loss += 0.5 * self.reg * np.sum(self.params[W_id]**2)

        # retrieve the output of the previous layer from the cache
        prev_layer_output = relu_cache[i - 1] if not self.use_dropout \
            else dropout_cache[i - 1][0]

        # if current layer is the output layer
        if i == self.num_layers:
            # perform the linear backward pass to get grads for W and b
            dX, grads[W_id], grads[b_id] = linear_backward(
                dlogits, prev_layer_output, self.params[W_id],
                self.params[b_id])
        # if current layer is not the output layer
        else:
            # dropout
            if self.use_dropout:
                mask = dropout_cache[i][1]
                p, train = self.dropout_params['p'], \
                    self.dropout_params['train']
                dX = dropout_backward(dX, mask, p, train)
            # relu
            dX = relu_backward(dX, linear_cache[i])
            # linear
            dX, grads[W_id], grads[b_id] = linear_backward(
                dX, prev_layer_output, self.params[W_id], self.params[b_id])

        # add the regularisation term to this layer's weight gradient
        grads[W_id] += self.reg * self.params[W_id]
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    train = True if y is not None else False
    if self.use_dropout:
        p = self.dropout_params["p"]
        seed = self.dropout_params["seed"]

    for layer in range(1, self.num_layers + 1):
        W = self.params["W" + str(layer)]
        b = self.params["b" + str(layer)]
        layer_input = X if layer == 1 else (
            dropout_cache[layer - 1][0] if self.use_dropout
            else relu_cache[layer - 1])
        if layer != self.num_layers:
            linear_cache[layer] = linear_forward(layer_input, W, b)
            relu_cache[layer] = relu_forward(linear_cache[layer])
            if self.use_dropout:
                dropout_cache[layer] = dropout_forward(relu_cache[layer],
                                                       p, train, seed)
        else:
            scores = linear_forward(layer_input, W, b)
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if not train:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    loss, dscores = softmax(scores, y)
    loss += self.l2regular()

    for layer in reversed(range(1, self.num_layers + 1)):
        w_key = "W" + str(layer)
        b_key = "b" + str(layer)
        W = self.params[w_key]
        b = self.params[b_key]

        if layer == self.num_layers:
            input_linear = dropout_cache[layer - 1][0] \
                if self.use_dropout else relu_cache[layer - 1]
            dX, dW, db = linear_backward(dscores, input_linear, W, b)
            dW += self.reg * W
        else:
            if self.use_dropout:
                mask = dropout_cache[layer][1]
                dX = dropout_backward(dX, mask, p, train)
            input_relu = linear_cache[layer]
            dX = relu_backward(dX, input_relu)
            if layer == 1:
                input_linear = X
            else:
                input_linear = dropout_cache[layer - 1][0] \
                    if self.use_dropout else relu_cache[layer - 1]
            dout = dX
            dX, dW, db = linear_backward(dout, input_linear, W, b)
            dW += self.reg * W

        grads[w_key] = dW
        grads[b_key] = db
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Args:
    - X: Input data, numpy array of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    scores = None
    X = X.astype(self.dtype)
    linear_cache = dict()
    relu_cache = dict()
    dropout_cache = dict()

    """
    TODO: Implement the forward pass for the fully-connected neural
    network, compute the scores and store them in the scores variable.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    # Forward pass: loop through all hidden layers and cache the linear
    # activations, the ReLU outputs and the dropout masks.
    for i in range(1, self.num_layers):
        if i == 1:
            linear_cache["L{}".format(i)] = linear_forward(
                X, self.params["W{}".format(i)], self.params["b{}".format(i)])
        else:
            if self.use_dropout:
                linear_cache["L{}".format(i)] = linear_forward(
                    dropout_cache["D{}".format(i - 1)],
                    self.params["W{}".format(i)],
                    self.params["b{}".format(i)])
            else:
                linear_cache["L{}".format(i)] = linear_forward(
                    relu_cache["R{}".format(i - 1)],
                    self.params["W{}".format(i)],
                    self.params["b{}".format(i)])

        relu_cache["R{}".format(i)] = relu_forward(
            linear_cache["L{}".format(i)])

        if self.use_dropout:
            s = None
            if 'seed' in self.dropout_params.keys():
                s = self.dropout_params['seed']
            p = self.dropout_params["p"]
            t = self.dropout_params["train"]
            dropout_cache["D{}".format(i)], dropout_cache["M{}".format(i)] = \
                dropout_forward(relu_cache["R{}".format(i)],
                                p=p, train=t, seed=s)

    # Final linear layer
    if self.use_dropout:
        scores = linear_forward(
            dropout_cache["D{}".format(self.num_layers - 1)],
            self.params["W{}".format(self.num_layers)],
            self.params["b{}".format(self.num_layers)])
    else:
        scores = linear_forward(
            relu_cache["R{}".format(self.num_layers - 1)],
            self.params["W{}".format(self.num_layers)],
            self.params["b{}".format(self.num_layers)])
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, dict()
    """
    TODO: Implement the backward pass for the fully-connected net. Store
    the loss in the loss variable and all gradients in the grads
    dictionary. Compute the loss with softmax. grads[k] has the gradients
    for self.params[k]. Add L2 regularisation to the loss function.
    NOTE: To ensure that your implementation matches ours and you pass the
    automated tests, make sure that your L2 regularization includes a
    factor of 0.5 to simplify the expression for the gradient.
    """
    #######################################################################
    #                         BEGIN OF YOUR CODE                          #
    #######################################################################
    # Compute the loss and the gradient of the scores using softmax
    loss, dx = softmax(scores, y)

    # Apply L2 regularisation to the loss
    for i in range(1, self.num_layers + 1):
        loss += (0.5 * self.reg) * np.sum(
            np.square(self.params["W{}".format(i)]))

    # Backward pass: final linear layer
    if self.use_dropout:
        dx, dW, db = linear_backward(
            dx, dropout_cache["D{}".format(self.num_layers - 1)],
            self.params["W{}".format(self.num_layers)],
            self.params["b{}".format(self.num_layers)])
    else:
        dx, dW, db = linear_backward(
            dx, relu_cache["R{}".format(self.num_layers - 1)],
            self.params["W{}".format(self.num_layers)],
            self.params["b{}".format(self.num_layers)])
    grads["W{}".format(self.num_layers)] = dW
    # L2 regularisation
    grads["W{}".format(self.num_layers)] += \
        self.reg * self.params["W{}".format(self.num_layers)]
    grads["b{}".format(self.num_layers)] = db

    # Loop backwards through the layers to the first layer
    for j in reversed(range(1, self.num_layers)):
        # Reverse dropout
        if self.use_dropout:
            dx = dropout_backward(dx,
                                  mask=dropout_cache["M{}".format(j)],
                                  p=self.dropout_params["p"],
                                  train=self.dropout_params["train"])
        # ReLU backward pass with the cached pre-activation value
        dx = relu_backward(dx, linear_cache["L{}".format(j)])

        # Linear backward pass with the activation that fed this layer
        if j == 1:
            dx, dW, db = linear_backward(dx, X,
                                         self.params["W{}".format(j)],
                                         self.params["b{}".format(j)])
        else:
            if self.use_dropout:
                dx, dW, db = linear_backward(
                    dx, dropout_cache["D{}".format(j - 1)],
                    self.params["W{}".format(j)],
                    self.params["b{}".format(j)])
            else:
                dx, dW, db = linear_backward(
                    dx, relu_cache["R{}".format(j - 1)],
                    self.params["W{}".format(j)],
                    self.params["b{}".format(j)])

        grads["W{}".format(j)] = dW
        # Regularisation
        grads["W{}".format(j)] += self.reg * self.params["W{}".format(j)]
        grads["b{}".format(j)] = db
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads