def conv_relu_backward(dout, cache):
    """
    Backward pass for the conv-relu convenience layer.

    Inputs:
    - dout: Upstream gradient, handed to relu_backward.
    - cache: Pair (conv_cache, relu_cache) with the caches of the two
      sub-layers, in that order.

    Returns a tuple of:
    - dx, dw, db: The three values produced by conv_backward_fast.
    """
    conv_cache, relu_cache = cache
    # Walk the layers in reverse: ReLU first, then the convolution.
    grad_x, grad_w, grad_b = conv_backward_fast(
        relu_backward(dout, relu_cache), conv_cache)
    return grad_x, grad_w, grad_b
def affine_relu_backward(dout, cache):
    """
    Backward pass for the affine-relu convenience layer.

    Inputs:
    - dout: Upstream gradient, handed to relu_backward.
    - cache: Pair (fc_cache, relu_cache) with the caches of the two
      sub-layers, in that order.

    Returns a tuple of:
    - dx, dw, db: The three values produced by affine_backward.
    """
    fc_cache, relu_cache = cache
    # Walk the layers in reverse: ReLU first, then the affine transform.
    grad_x, grad_w, grad_b = affine_backward(
        relu_backward(dout, relu_cache), fc_cache)
    return grad_x, grad_w, grad_b
def affine_relu_backward(dout, cache):
    """
    Backward pass for the affine-relu convenience layer.

    Inputs:
    - dout: Upstream gradient, handed to layers.relu_backward.
    - cache: Pair (fc_cache, relu_cache) with the caches of the two
      sub-layers, in that order.

    Returns a tuple of:
    - dx, dw, db: The three values produced by layers.affine_backward.
    """
    fc_cache, relu_cache = cache
    # Walk the layers in reverse: ReLU first, then the affine transform.
    relu_grad = layers.relu_backward(dout, relu_cache)
    grad_x, grad_w, grad_b = layers.affine_backward(relu_grad, fc_cache)
    return grad_x, grad_w, grad_b
def affine_batchnorm_relu_backward(dout, cache):
    """
    Backward pass for the Affine->BatchNorm->ReLU convenience layer.

    Inputs:
    - dout: Upstream gradient, handed to relu_backward.
    - cache: Triple (fc_cache, bn_cache, relu_cache) with the caches of the
      three sub-layers, in that order.

    Returns a tuple of:
    - dx, dw, db: The three values produced by affine_backward.
    - dgamma, dbeta: The second and third values produced by
      batchnorm_backward.
    """
    fc_cache, bn_cache, relu_cache = cache
    # Undo the stages in the reverse of their forward order.
    relu_grad = relu_backward(dout, relu_cache)
    bn_grad, dgamma, dbeta = batchnorm_backward(relu_grad, bn_cache)
    grad_x, grad_w, grad_b = affine_backward(bn_grad, fc_cache)
    return grad_x, grad_w, grad_b, dgamma, dbeta
def conv_relu_pool_backward(dout, cache):
    """
    Backward pass for the conv-relu-pool convenience layer.

    Inputs:
    - dout: Upstream gradient, handed to max_pool_backward_fast.
    - cache: Triple (conv_cache, relu_cache, pool_cache) with the caches of
      the three sub-layers, in that order.

    Returns a tuple of:
    - dx, dw, db: The three values produced by conv_backward_fast.
    """
    conv_cache, relu_cache, pool_cache = cache
    # Undo the stages in the reverse of their forward order:
    # pooling, then ReLU, then the convolution.
    pool_grad = max_pool_backward_fast(dout, pool_cache)
    relu_grad = relu_backward(pool_grad, relu_cache)
    grad_x, grad_w, grad_b = conv_backward_fast(relu_grad, conv_cache)
    return grad_x, grad_w, grad_b
def affine_bn_relu_backward(dout, cache):
    """
    Backward pass for the affine-bn-relu convenience layer.

    Inputs:
    - dout: Upstream gradient, handed to layers.relu_backward.
    - cache: Triple (fc_cache, bn_cache, relu_cache) with the caches of the
      three sub-layers, in that order.

    Returns a tuple of:
    - dx, dw, db: The three values produced by layers.affine_backward.
    - dgamma, dbeta: The second and third values produced by
      layers.batchnorm_backward_alt.
    """
    fc_cache, bn_cache, relu_cache = cache
    # Undo the stages in the reverse of their forward order.
    relu_grad = layers.relu_backward(dout, relu_cache)
    bn_grad, dgamma, dbeta = layers.batchnorm_backward_alt(relu_grad, bn_cache)
    grad_x, grad_w, grad_b = layers.affine_backward(bn_grad, fc_cache)
    return grad_x, grad_w, grad_b, dgamma, dbeta
def combo_backward(dout, cache):
    """
    Backward pass for the affine - [batchnorm] - relu combo layer.

    The batchnorm stage is optional: it is skipped when the batchnorm cache
    stored in `cache` is None.

    Inputs:
    - dout: Upstream gradient, handed to relu_backward.
    - cache: Triple (fc_cache, bn_cache, relu_cache); bn_cache may be None.

    Returns a tuple of:
    - dx, dw, db: The three values produced by affine_backward.
    - dgamma, dbeta: The second and third values produced by
      batchnorm_backward, or 0 and 0 when bn_cache is None.
    """
    fc_cache, bn_cache, relu_cache = cache
    grad = relu_backward(dout, relu_cache)
    if bn_cache is None:
        # No batchnorm stage was run, so report zero scale/shift gradients.
        dgamma, dbeta = 0, 0
    else:
        grad, dgamma, dbeta = batchnorm_backward(grad, bn_cache)
    grad_x, grad_w, grad_b = affine_backward(grad, fc_cache)
    return grad_x, grad_w, grad_b, dgamma, dbeta
0.40909091, 0.5, ]]) # Compare your output with ours. The error should be around 5e-8 print('Testing relu_forward function:') print('difference: ', rel_error(out, correct_out)) np.random.seed(231) x = np.random.randn(10, 10) dout = np.random.randn(*x.shape) dx_num = eval_numerical_gradient_array(lambda x: relu_forward(x)[0], x, dout) _, cache = relu_forward(x) dx = relu_backward(dout, cache) # The error should be around 3e-12 print('Testing relu_backward function:') print('dx error: ', rel_error(dx_num, dx)) np.random.seed(231) x = np.random.randn(2, 3, 4) w = np.random.randn(12, 10) b = np.random.randn(10) dout = np.random.randn(2, 10) out, cache = affine_relu_forward(x, w, b) dx, dw, db = affine_relu_backward(dout, cache) dx_num = eval_numerical_gradient_array(
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    When y is None the method runs in 'test' mode and returns only the
    scores; otherwise it runs in 'train' mode and returns (loss, grads),
    where grads is keyed like self.params.
    """
    X = X.astype(self.dtype)
    mode = 'train' if y is not None else 'test'

    # Batchnorm and dropout behave differently during training and testing,
    # so push the current mode into their parameter dictionaries.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # ------------------------------------------------------------------
    # Forward pass: (num_layers - 1) hidden blocks, then a final affine
    # layer that produces the class scores. Caches are grouped by kind.
    # ------------------------------------------------------------------
    caches = collections.defaultdict(list)
    hidden = X
    for idx in range(self.num_layers - 1):
        suffix = str(idx + 1)
        weight = self.params["W" + suffix]
        bias = self.params["b" + suffix]
        if self.use_batchnorm:
            # (zy) The learned parameters are for the BN affine
            # transformation used in training, while the running average
            # is used for prediction.
            hidden, block_cache = affine_bn_relu_forward(
                hidden, weight, bias,
                self.params["gamma" + suffix],
                self.params["beta" + suffix],
                self.bn_params[idx])
            caches["affine_bn_relu"].append(block_cache)
        else:
            hidden, block_cache = layers.affine_forward(hidden, weight, bias)
            caches["affine"].append(block_cache)
            hidden, block_cache = layers.relu_forward(hidden)
            caches["relu"].append(block_cache)
        if self.use_dropout:
            hidden, block_cache = layers.dropout_forward(
                hidden, self.dropout_param)
            caches["drop"].append(block_cache)

    last = str(self.num_layers)
    scores, final_cache = layers.affine_forward(
        hidden, self.params["W" + last], self.params["b" + last])

    # If test mode return early
    if mode == 'test':
        return scores

    # ------------------------------------------------------------------
    # Backward pass: softmax data loss plus L2 regularization (factor 0.5)
    # on the weight matrices only; gamma, beta and b are not regularized.
    # ------------------------------------------------------------------
    grads = {}
    loss, dscores = layers.softmax_loss(scores, y)
    if self.reg != 0:
        for name, value in self.params.items():
            if name.startswith("W"):
                loss += 0.5 * self.reg * np.sum(value**2)

    # Final affine layer. Element 1 of its cache is used as the weight
    # matrix for the regularization gradient.
    final_grads = layers.affine_backward(dscores, final_cache)
    dout, grads["W" + last], grads["b" + last] = final_grads
    grads["W" + last] += self.reg * final_cache[1]

    # Hidden blocks, from the last one back to the first.
    for idx in reversed(range(self.num_layers - 1)):
        suffix = str(idx + 1)
        if self.use_dropout:
            dout = layers.dropout_backward(dout, caches["drop"][idx])
        if self.use_batchnorm:
            block_grads = affine_bn_relu_backward(
                dout, caches["affine_bn_relu"][idx])
            (dout, grads["W" + suffix], grads["b" + suffix],
             grads["gamma" + suffix], grads["beta" + suffix]) = block_grads
            # Adds 0 when regularization is switched off.
            reg_term = (self.reg * self.params["W" + suffix]
                        if self.reg else 0)
            grads["W" + suffix] += reg_term
        else:
            dout = layers.relu_backward(dout, caches["relu"][idx])
            block_grads = layers.affine_backward(dout, caches["affine"][idx])
            dout, grads["W" + suffix], grads["b" + suffix] = block_grads
            # Regularization gradient, again using element 1 of the cache.
            grads["W" + suffix] += self.reg * caches["affine"][idx][1]

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    Inputs:
    - X: Input data; cast to self.dtype before use.
    - y: Labels, or None. When None the method runs in 'test' mode.

    Returns:
    - In 'test' mode: the scores produced by the final affine layer.
    - Otherwise: a tuple (loss, grads), where loss is the softmax data loss
      plus 0.5 * reg * sum(W * W) over every weight matrix, and grads maps
      each parameter name to its gradient.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Batchnorm and dropout behave differently during training and testing,
    # so the mode has to be refreshed on every call. Without the dropout
    # assignment the dropout layers would keep whatever mode they were
    # created with, even when predicting.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # ------------------------------------------------------------------
    # Forward pass: {affine - [batchnorm] - relu - [dropout]} for each of
    # the (num_layers - 1) hidden blocks, then a final affine layer.
    # ------------------------------------------------------------------
    affine_caches = []
    bn_caches = []
    relu_caches = []
    dropout_caches = []

    activation = X
    for layer in range(1, self.num_layers):
        name = str(layer)
        activation, fc_cache = affine_forward(
            activation, self.params['W' + name], self.params['b' + name])
        affine_caches.append(fc_cache)
        if self.use_batchnorm:
            activation, bn_cache = batchnorm_forward(
                activation,
                self.params['gamma' + name],
                self.params['beta' + name],
                self.bn_params[layer - 1])
            bn_caches.append(bn_cache)
        activation, relu_cache = relu_forward(activation)
        relu_caches.append(relu_cache)
        if self.use_dropout:
            activation, dropout_cache = dropout_forward(
                activation, self.dropout_param)
            dropout_caches.append(dropout_cache)

    # Feeding `activation` (which is X when there are no hidden blocks)
    # keeps a single-layer network working.
    last = str(self.num_layers)
    scores, fc_cache = affine_forward(
        activation, self.params['W' + last], self.params['b' + last])
    affine_caches.append(fc_cache)

    if mode == 'test':
        return scores

    # ------------------------------------------------------------------
    # Backward pass: softmax loss plus L2 regularization (factor 0.5) on
    # the weight matrices only.
    # ------------------------------------------------------------------
    grads = {}
    loss, dscores = softmax_loss(scores, y)

    weight_decay_sum = 0
    for layer in range(1, self.num_layers + 1):
        W = self.params['W' + str(layer)]
        weight_decay_sum = weight_decay_sum + np.sum(W * W)
    loss = loss + 0.5 * self.reg * weight_decay_sum

    dout = dscores
    for layer in range(self.num_layers, 0, -1):
        name = str(layer)
        dx, dw, db = affine_backward(dout, affine_caches[layer - 1])
        grads['W' + name] = dw + self.reg * self.params['W' + name]
        grads['b' + name] = db

        if layer == 1:
            # The first affine layer reads X directly; nothing precedes it.
            break

        # Undo hidden block (layer - 1), whose output fed this affine
        # layer, in the reverse of its forward order.
        block = layer - 1
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_caches[block - 1])
        dout = relu_backward(dx, relu_caches[block - 1])
        if self.use_batchnorm:
            dout, dgamma, dbeta = batchnorm_backward(
                dout, bn_caches[block - 1])
            grads['gamma' + str(block)] = dgamma
            grads['beta' + str(block)] = dbeta

    return loss, grads
def two_layer_net(X, model, y=None, reg=0.0):
    """
    Loss and gradients for a two-layer fully connected neural network.

    The net has an input dimension of D, a hidden layer dimension of H, and
    performs classification over C classes. It uses a softmax loss and L2
    regularization on the weight matrices, with a ReLU nonlinearity after
    the first affine layer:

      input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the class scores.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - model: Dictionary mapping parameter names to arrays of parameter
      values. It should contain:
      - W1: First layer weights; has shape (D, H)
      - b1: First layer biases; has shape (H,)
      - W2: Second layer weights; has shape (H, C)
      - b2: Second layer biases; has shape (C,)
    - y: Vector of training labels. y[i] is the label for X[i], and each
      y[i] is an integer in the range 0 <= y[i] < C. Optional: if omitted,
      only the scores are returned; if given, the loss and gradients are
      returned instead.
    - reg: Regularization strength.

    Returns:
    If y is not passed, a matrix scores of shape (N, C) where scores[i, c]
    is the score for class c on input X[i].

    If y is passed, a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of
      training samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function. It has the same keys
      as model.
    """
    # Pull the parameters out of the model dictionary.
    W1, b1 = model['W1'], model['b1']
    W2, b2 = model['W2'], model['b2']
    N, D = X.shape  # X is expected to be two-dimensional

    from cs231n.layers import (affine_forward, relu_forward, softmax_loss,
                               affine_backward, relu_backward)

    # Forward pass; each stage consumes the output of the previous one.
    hidden_pre, fc1_cache = affine_forward(X, W1, b1)
    hidden_act, relu_cache = relu_forward(hidden_pre)
    scores, fc2_cache = affine_forward(hidden_act, W2, b2)

    # Without targets there is nothing more to do.
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization, scaled by 0.5 so that its
    # gradient is simply reg * W.
    loss, dscores = softmax_loss(scores, y)
    loss += 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)

    # Backward pass through the same three stages in reverse.
    dhidden_act, dW2, db2 = affine_backward(dscores, fc2_cache)
    dhidden_pre = relu_backward(dhidden_act, relu_cache)
    dX, dW1, db1 = affine_backward(dhidden_pre, fc1_cache)

    # The weight gradients also need the regularization term.
    dW2 += reg * W2
    dW1 += reg * W1

    grads = {'W2': dW2, 'b2': db2, 'W1': dW1, 'b1': db1}
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    W1, b1 = self.params["W1"], self.params["b1"]

    # Forward pass: affine - relu - affine.
    hidden_pre, fc1_cache = layers.affine_forward(X, W1, b1)
    hidden_act, relu_cache = layers.relu_forward(hidden_pre)
    W2, b2 = self.params["W2"], self.params["b2"]
    scores, fc2_cache = layers.affine_forward(hidden_act, W2, b2)

    # At prediction time the raw scores are enough; no loss is needed.
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization with the 0.5 factor.
    loss, dscores = layers.softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(self.params["W1"]**2) +
                              np.sum(self.params["W2"]**2))

    # Backprop through the layers in reverse order; grads are filled in
    # as each layer is processed.
    grads = {}
    fc2_grads = layers.affine_backward(dscores, fc2_cache)
    grads["W2"] = fc2_grads[1] + self.reg * self.params["W2"]
    grads["b2"] = fc2_grads[2]

    drelu = layers.relu_backward(fc2_grads[0], relu_cache)
    fc1_grads = layers.affine_backward(drelu, fc1_cache)
    grads["W1"] = fc1_grads[1] + self.reg * self.params["W1"]
    grads["b1"] = fc1_grads[2]

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # Forward pass: affine - relu - affine.
    hidden1_out, h1_cache = affine_forward(X, W1, b1)
    relu_out, relu_cache = relu_forward(hidden1_out)
    scores, h2_cache = affine_forward(relu_out, W2, b2)

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization; the 0.5 factor makes the
    # regularization gradient simply reg * W.
    loss, dscore = softmax_loss(scores, y)
    loss = loss + self.reg * 0.5 * (np.sum(W2 * W2) + np.sum(W1 * W1))

    # Backward pass through the same layers in reverse order.
    grads_h2, grads_w2, grads_b2 = affine_backward(dscore, h2_cache)
    grads_relu = relu_backward(grads_h2, relu_cache)
    grads_h1, grads_w1, grads_b1 = affine_backward(grads_relu, h1_cache)

    grads = {
        'W1': grads_w1 + self.reg * W1,
        'W2': grads_w2 + self.reg * W2,
        'b1': grads_b1,
        'b2': grads_b2,
    }
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # Flatten every sample into a row vector before the first affine layer.
    num_samples = X.shape[0]
    flat_dim = np.prod(X.shape[1:])
    X_flat = X.reshape(num_samples, flat_dim)

    # Forward pass: affine - relu - affine.
    hidden_pre, fc1_cache = affine_forward(X_flat, W1, b1)
    hidden_act, relu_cache = relu_forward(hidden_pre)
    scores, fc2_cache = affine_forward(hidden_act, W2, b2)

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    # Data loss and its gradient, pushed back through the layers in
    # reverse order.
    loss, dscores = softmax_loss(scores, y)
    dhidden_act, dW2, db2 = affine_backward(dscores, fc2_cache)
    dhidden_pre = relu_backward(dhidden_act, relu_cache)
    dX, dW1, db1 = affine_backward(dhidden_pre, fc1_cache)

    # L2 regularization with the 0.5 factor, and its gradient contribution.
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
    dW2 += self.reg * W2
    dW1 += self.reg * W1

    return loss, {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    The network is {affine - [norm] - relu - [dropout]} for each of the
    (num_layers - 1) hidden blocks, followed by a plain affine layer whose
    output is returned as the class scores.

    Inputs:
    - X: Input data; cast to self.dtype before use.
    - y: Labels, or None. When None the method runs in 'test' mode.

    Returns:
    - In 'test' mode: the scores.
    - Otherwise: a tuple (loss, grads), where loss is the softmax data loss
      plus 0.5 * reg * sum(W * W) over every weight matrix, and grads maps
      each parameter name to its gradient. Scale and shift parameters are
      not regularized.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode
    scores = None
    ############################################################################
    # Forward pass: compute the class scores for X.                            #
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    out, caches = X, []
    for i in range(1, self.num_layers + 1):
        cache = {}
        W = self.params[f"W{i}"]
        b = self.params[f"b{i}"]
        out, cache['fc_cache'] = affine_forward(out, W, b)

        # Normalization, ReLU and dropout belong to the hidden blocks only;
        # the last affine layer produces the raw class scores.
        if i != self.num_layers:
            if self.normalization:
                gamma = self.params[f"gamma{i}"]
                beta = self.params[f"beta{i}"]
                # Compare strings by value; identity comparison against a
                # literal is implementation-dependent.
                if self.normalization == 'batchnorm':
                    normalize_forward = batchnorm_forward
                else:
                    normalize_forward = layernorm_forward
                out, cache['bn_cache'] = normalize_forward(
                    out, gamma, beta, self.bn_params[i - 1])
            out, cache['relu_cache'] = relu_forward(out)
            if self.use_dropout:
                out, cache['dropout_cache'] = dropout_forward(
                    out, self.dropout_param)
        caches.append(cache)
    scores = out

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # Backward pass: softmax data loss plus L2 regularization (factor 0.5)     #
    # on the weight matrices; grads[k] holds the gradient for self.params[k].  #
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    loss, dout = softmax_loss(scores, y)
    for i in range(self.num_layers, 0, -1):
        W = self.params[f"W{i}"]
        cache = caches[i - 1]

        # Mirror the forward pass: only hidden blocks have dropout, ReLU
        # and normalization stages to undo, in reverse order.
        if i != self.num_layers:
            if self.use_dropout:
                dout = dropout_backward(dout, cache['dropout_cache'])
            dout = relu_backward(dout, cache['relu_cache'])
            if self.normalization:
                # Use the backward function matching the forward choice.
                if self.normalization == 'batchnorm':
                    normalize_backward = batchnorm_backward
                else:
                    normalize_backward = layernorm_backward
                dout, dgamma, dbeta = normalize_backward(
                    dout, cache['bn_cache'])
                grads[f"gamma{i}"] = dgamma
                grads[f"beta{i}"] = dbeta

        dout, dw, db = affine_backward(dout, cache['fc_cache'])
        grads[f"W{i}"] = dw + self.reg * W
        grads[f"b{i}"] = db
        loss += 0.5 * self.reg * np.sum(W * W)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads