def test_softmax(num_classes, samples=random.randrange(1, 10)):
    """Gradient-check softmax_loss on small random scores.

    Draws a (50, num_classes) score matrix scaled by 0.001 together with
    random integer labels, then asserts that

    * the analytic gradient has the same shape as the numerical one,
    * the loss lies strictly between -log(1.2 / num_classes) and
      -log(0.8 / num_classes),
    * the relative error between both gradients is below 5e-7.

    NOTE(review): `samples` is never read, and its default is evaluated once
    when the function is defined rather than on every call.
    """
    num_inputs = 50
    scores = 0.001 * np.random.randn(num_inputs, num_classes)
    labels = np.random.randint(num_classes, size=num_inputs)

    def loss_only(candidate):
        # Only the scalar loss is needed for the numerical gradient.
        return softmax_loss(candidate, labels)[0]

    numeric_grad = eval_numerical_gradient(loss_only, scores, verbose=False)
    loss, analytic_grad = softmax_loss(scores, labels)

    upper_bound = -np.log(0.8 / num_classes)
    lower_bound = -np.log(1.2 / num_classes)

    assert numeric_grad.shape == analytic_grad.shape
    assert loss < upper_bound and loss > lower_bound
    assert rel_error(numeric_grad, analytic_grad) < 5e-7
def make_fooling_image(X, target_y, model, reg=5e-5, step_size=6000,
                       max_iters=100):
    # a method based on softmax loss and regularization
    """
    Generate a fooling image that is close to X, but that the model classifies
    as target_y.

    The image is updated by gradient descent on the softmax loss of the target
    class plus an L2 penalty 0.5 * reg * ||X_fooling - X||^2 that keeps the
    result close to the original image. Iteration stops as soon as the arg-max
    of the scores equals target_y, or after max_iters steps.

    Inputs:
    - X: Input image, of shape (1, 3, 64, 64)
    - target_y: An integer in the range [0, 100)
    - model: A PretrainedCNN
    - reg: Weight of the L2 penalty on the perturbation.
    - step_size: Multiplier applied to the gradient at every update.
    - max_iters: Maximum number of update steps.

    Returns:
    - X_fooling: An image that is close to X, but that is classified as
      target_y by the model. If max_iters is exhausted first, the last iterate
      is returned even though it is not classified as target_y.
    """
    from cs231n.layers import softmax_loss

    X_fooling = X.copy()
    for i in range(max_iters):
        # Current perturbation relative to the original image.
        R = X_fooling - X
        scores, cache = model.forward(X_fooling, mode='test')
        loss, dscores = softmax_loss(scores, target_y)
        loss += 0.5 * reg * np.sum(R * R)
        print('softmax loss:', loss)

        y_pred = np.argmax(scores)
        print('target class index', target_y, 'current class index:', y_pred)
        if target_y == y_pred:
            print('iter num:', i)
            break

        # Gradient of (softmax loss + L2 penalty) with respect to the image.
        df, _ = model.backward(dscores, cache)
        dX = reg * R + df
        X_fooling -= step_size * dX
    return X_fooling
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    Each hidden layer is affine -> [batchnorm] -> relu -> [dropout]; the last
    layer is affine followed by softmax. When y is None the scores are
    returned; otherwise a (loss, grads) tuple is returned, where grads maps
    the parameter names used in self.params to their gradients.
    """
    X = X.astype(self.dtype)
    mode = "test" if y is None else "train"

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param["mode"] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param["mode"] = mode

    num_layers = self.num_layers
    # NOTE(review): the enclosing class is not visible from this block; the
    # regularization strength is read from `self.reg` and treated as 0 when
    # the instance does not define it.
    reg = getattr(self, "reg", 0.0)

    # ---- forward pass -------------------------------------------------------
    # `out` always holds the output of the most recent sub-layer, so the next
    # affine layer receives the dropout output whenever dropout is enabled.
    # This keeps the forward pass consistent with the backward pass, which
    # back-propagates through dropout at every hidden layer.
    out = X
    hidden_caches = []  # hidden_caches[i - 1] belongs to hidden layer i
    for i in range(1, num_layers):
        out, fc_cache = affine_forward(
            out, self.params["W" + str(i)], self.params["b" + str(i)])

        bn_cache = None
        if self.use_batchnorm:
            out, bn_cache = batchnorm_forward(
                out,
                gamma=self.params["gamma" + str(i)],
                beta=self.params["beta" + str(i)],
                bn_param=self.bn_params[i - 1],
            )

        out, relu_cache = relu_forward(out)

        dropout_cache = None
        if self.use_dropout:
            out, dropout_cache = dropout_forward(
                out, dropout_param=self.dropout_param)

        hidden_caches.append((fc_cache, bn_cache, relu_cache, dropout_cache))

    # Last layer: affine only; softmax is applied inside the loss.
    last_W = self.params["W" + str(num_layers)]
    scores, last_cache = affine_forward(
        out, last_W, self.params["b" + str(num_layers)])

    # If test mode return early
    if mode == "test":
        return scores

    # ---- backward pass ------------------------------------------------------
    grads = {}
    loss, dout = softmax_loss(scores, y)

    dout, dW, db = affine_backward(dout, last_cache)
    grads["W" + str(num_layers)] = dW + reg * last_W
    grads["b" + str(num_layers)] = db
    # L2 penalty with the 0.5 factor; only the weight matrices are penalized.
    loss += 0.5 * reg * (last_W * last_W).sum()

    for i in range(num_layers - 1, 0, -1):
        fc_cache, bn_cache, relu_cache, dropout_cache = hidden_caches[i - 1]

        if self.use_dropout:
            dout = dropout_backward(dout, dropout_cache)

        dout = relu_backward(dout, relu_cache)

        if self.use_batchnorm:
            dout, dgamma, dbeta = batchnorm_backward(dout, cache=bn_cache)
            grads["gamma" + str(i)] = dgamma
            grads["beta" + str(i)] = dbeta

        dout, dW, db = affine_backward(dout, fc_cache)
        W = self.params["W" + str(i)]
        grads["W" + str(i)] = dW + reg * W
        grads["b" + str(i)] = db
        loss += 0.5 * reg * (W * W).sum()

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    The network is affine -> relu -> affine; the softmax loss is applied to
    the output of the second affine layer.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # Forward pass.
    hidden, fc1_cache = affine_forward(X, W1, b1)
    activated, act_cache = relu_forward(hidden)
    scores, fc2_cache = affine_forward(activated, W2, b2)

    # Without labels we are in test mode: only the scores are needed.
    if y is None:
        return scores

    # Data loss plus L2 penalty (with the 0.5 factor) on both weight matrices.
    data_loss, dscores = softmax_loss(scores, y)
    l2_penalty = self.reg * 0.5 * (np.sum(W2 * W2) + np.sum(W1 * W1))
    loss = data_loss + l2_penalty

    # Backward pass, mirroring the forward pass in reverse order.
    d_activated, dW2, db2 = affine_backward(dout=dscores, cache=fc2_cache)
    d_hidden = relu_backward(d_activated, act_cache)
    _, dW1, db1 = affine_backward(d_hidden, fc1_cache)

    grads = {
        'W1': dW1 + self.reg * W1,
        'W2': dW2 + self.reg * W2,
        'b1': db1,
        'b2': db2,
    }
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    Each hidden layer is affine -> [batchnorm] -> relu -> [dropout]; the last
    layer is a plain affine layer whose output is the class scores. When y is
    None the scores are returned; otherwise a (loss, grads) tuple is returned.
    The loss includes 0.5 * self.reg * sum(W ** 2) over all weight matrices;
    batchnorm scale and shift parameters are not regularized.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing. The dropout switch must
    # be active: otherwise dropout keeps whatever mode it was created with,
    # including during test-time forward passes.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    num_layers = self.num_layers

    # ---- forward pass -------------------------------------------------------
    out = X
    affine_caches = []   # one entry per layer, index = layer - 1
    bn_caches = []       # one entry per hidden layer (only with batchnorm)
    relu_caches = []     # one entry per hidden layer
    dropout_caches = []  # one entry per hidden layer (only with dropout)
    for layer in range(1, num_layers):
        out, cache = affine_forward(
            out, self.params['W' + str(layer)], self.params['b' + str(layer)])
        affine_caches.append(cache)
        if self.use_batchnorm:
            out, cache = batchnorm_forward(
                out,
                self.params['gamma' + str(layer)],
                self.params['beta' + str(layer)],
                self.bn_params[layer - 1])
            bn_caches.append(cache)
        out, cache = relu_forward(out)
        relu_caches.append(cache)
        if self.use_dropout:
            out, cache = dropout_forward(out, self.dropout_param)
            dropout_caches.append(cache)

    # `out` is still X when there are no hidden layers, so a single-layer
    # network works as well.
    scores, cache = affine_forward(
        out,
        self.params['W' + str(num_layers)],
        self.params['b' + str(num_layers)])
    affine_caches.append(cache)

    if mode == 'test':
        return scores

    # ---- loss ---------------------------------------------------------------
    grads = {}
    loss, dout = softmax_loss(scores, y)
    weight_decay_sum = sum(
        np.sum(self.params['W' + str(layer)] * self.params['W' + str(layer)])
        for layer in range(1, num_layers + 1))
    loss = loss + 0.5 * self.reg * weight_decay_sum

    # ---- backward pass ------------------------------------------------------
    for layer in range(num_layers, 0, -1):
        dx, dw, db = affine_backward(dout, affine_caches[layer - 1])
        grads['W' + str(layer)] = dw + self.reg * self.params['W' + str(layer)]
        grads['b' + str(layer)] = db
        if layer == 1:
            # Nothing precedes the first affine layer.
            break

        # Undo the sub-layers of hidden layer `layer - 1` in reverse order.
        hidden = layer - 2
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_caches[hidden])
        dout = relu_backward(dx, relu_caches[hidden])
        if self.use_batchnorm:
            dout, dgamma, dbeta = batchnorm_backward(dout, bn_caches[hidden])
            grads['gamma' + str(layer - 1)] = dgamma
            grads['beta' + str(layer - 1)] = dbeta

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    The forward pass is a fused affine+relu layer followed by an affine layer
    that produces the class scores.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    params = self.params
    W1, b1 = params["W1"], params["b1"]
    W2, b2 = params["W2"], params["b2"]

    # Forward pass.
    hidden, hidden_cache = affine_relu_forward(X, W1, b1)
    scores, out_cache = affine_forward(hidden, W2, b2)

    # No labels means test mode: return the scores only.
    if y is None:
        return scores

    # Softmax data loss plus L2 penalty (0.5 factor) on both weight matrices.
    loss, dscores = softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

    # Backward pass in reverse layer order.
    grads = {}
    dhidden, dW2, db2 = affine_backward(dscores, out_cache)
    grads["W2"] = dW2 + self.reg * W2
    grads["b2"] = db2

    _, dW1, db1 = affine_relu_backward(dhidden, hidden_cache)
    grads["W1"] = dW1 + self.reg * W1
    grads["b1"] = db1

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Hidden layers are affine -> [batchnorm] -> relu, each optionally followed
    by dropout; the last layer is a plain affine layer producing the scores.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, return a tuple of:
    - loss: Softmax data loss plus 0.5 * self.reg * sum(W * W) over all
      weight matrices.
    - grads: Dictionary mapping parameter names to gradients; batchnorm
      scale and shift gradients are not regularized.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.dropout_param is not None:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            # The key is the literal string 'mode'. Indexing with the
            # variable `mode` would create a 'train'/'test' key instead and
            # leave the batchnorm mode untouched.
            bn_param['mode'] = mode

    num_layers = self.num_layers

    # ---- forward pass -------------------------------------------------------
    out = X
    caches = {}          # caches[l] belongs to hidden layer l + 1
    dropout_caches = {}  # filled only when dropout is enabled
    for l in range(num_layers - 1):
        W = self.params["W{}".format(l + 1)]
        b = self.params["b{}".format(l + 1)]
        if self.use_batchnorm:
            gamma = self.params["gamma{}".format(l + 1)]
            beta = self.params["beta{}".format(l + 1)]
            out, caches[l] = affine_batchnorm_relu_forward(
                out, W, b, gamma, beta, self.bn_params[l])
        else:
            out, caches[l] = affine_relu_forward(out, W, b)
        if self.use_dropout:
            out, dropout_caches[l] = dropout_forward(out, self.dropout_param)

    # Last layer: affine only.
    name_W_last = "W{}".format(num_layers)
    name_b_last = "b{}".format(num_layers)
    W_last = self.params[name_W_last]
    scores, cache_last = affine_forward(
        out, W_last, self.params[name_b_last])

    # If test mode return early
    if mode == 'test':
        return scores

    # ---- loss ---------------------------------------------------------------
    grads = {}
    loss, dscores = softmax_loss(scores, y)
    for l in range(num_layers):
        W = self.params["W{}".format(l + 1)]
        loss += 0.5 * self.reg * np.sum(W * W)

    # ---- backward pass ------------------------------------------------------
    dx, dw, db = affine_backward(dscores, cache_last)
    grads[name_W_last] = dw + self.reg * W_last
    grads[name_b_last] = db

    for l in reversed(range(num_layers - 1)):
        name_W = "W{}".format(l + 1)
        name_b = "b{}".format(l + 1)
        # Dropout was applied last in the forward pass, so undo it first.
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_caches[l])
        if self.use_batchnorm:
            dx, dw, db, dgamma, dbeta = affine_batchnorm_relu_backward(
                dx, caches[l])
            grads["gamma{}".format(l + 1)] = dgamma
            grads["beta{}".format(l + 1)] = dbeta
        else:
            dx, dw, db = affine_relu_backward(dx, caches[l])
        grads[name_W] = dw + self.reg * self.params[name_W]
        grads[name_b] = db

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    The input is flattened to (N, D) and passed through num_layers - 1
    `combo_forward` layers followed by a final affine layer. When y is None
    the scores are returned; otherwise a (loss, grads) tuple is returned,
    with an L2 penalty of 0.5 * self.reg * sum(W ** 2) for every weight
    matrix included in the loss.

    NOTE(review): `combo_forward` / `combo_backward` are defined outside this
    block; what they do with gamma, beta and the batchnorm parameter when
    normalization is not 'batchnorm' cannot be confirmed from here.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    last = self.num_layers

    # ---- forward pass -------------------------------------------------------
    num_samples = X.shape[0]
    flat_dim = np.prod(X.shape[1:])
    activations = X.reshape(num_samples, flat_dim)

    hidden_caches = []  # hidden_caches[layer - 1] belongs to hidden `layer`
    for layer in range(1, last):  # 1, 2, ..., L-1
        weights = self.params['W%d' % layer]
        bias = self.params['b%d' % layer]
        if self.normalization == 'batchnorm':
            gamma = self.params['gamma%d' % layer]
            beta = self.params['beta%d' % layer]
            layer_bn_param = self.bn_params[layer - 1]  # zero based
        else:
            # Placeholder values handed to combo_forward without batchnorm.
            gamma, beta, layer_bn_param = 1., 0, None
        activations, cache = combo_forward(
            activations, weights, bias, gamma, beta, layer_bn_param)
        hidden_caches.append(cache)

    # Final fully connected layer.
    last_weights = self.params['W%d' % last]
    scores, fc_cache = affine_forward(
        activations, last_weights, self.params['b%d' % last])

    # If test mode return early
    if mode == 'test':
        return scores

    # ---- backward pass ------------------------------------------------------
    grads = {}
    loss, upstream = softmax_loss(scores, y)

    upstream, dW, db = affine_backward(upstream, fc_cache)
    grads['W%d' % last] = dW + self.reg * last_weights
    grads['b%d' % last] = db
    # L2 term of the final layer's weights.
    loss += 0.5 * self.reg * np.sum(last_weights ** 2)

    for layer in reversed(range(1, last)):  # L-1, L-2, ..., 1
        upstream, dW, db, dgamma, dbeta = combo_backward(
            upstream, hidden_caches[layer - 1])
        weights = self.params['W%d' % layer]
        grads['W%d' % layer] = dW + self.reg * weights
        grads['b%d' % layer] = db
        if self.normalization == 'batchnorm':
            grads['gamma%d' % layer] = dgamma
            grads['beta%d' % layer] = dbeta
        # L2 term of this layer's weights.
        loss += 0.5 * self.reg * np.sum(weights ** 2)

    return loss, grads
def two_layer_net(X, model, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network. The net has an input dimension of D, a hidden layer dimension of
    H, and performs classification over C classes. We use a softmax loss
    function and L2 regularization on the weight matrices. The two layer net
    uses a ReLU nonlinearity after the first affine layer.

    The two layer net has the following architecture:

    input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the scores for each
    class.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - model: Dictionary mapping parameter names to arrays of parameter values.
      It should contain the following:
      - W1: First layer weights; has shape (D, H)
      - b1: First layer biases; has shape (H,)
      - W2: Second layer weights; has shape (H, C)
      - b2: Second layer biases; has shape (C,)
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i]
      is an integer in the range 0 <= y[i] < C. This parameter is optional; if
      it is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is not passed, return a matrix scores of shape (N, C) where
    scores[i, c] is the score for class c on input X[i].

    If y is passed, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of
      training samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function. This has the same keys
      as model.
    """
    from cs231n.layers import (affine_backward, affine_forward, relu_backward,
                               relu_forward, softmax_loss)

    W1, b1 = model['W1'], model['b1']
    W2, b2 = model['W2'], model['b2']

    # The values are unused; the unpacking only requires X to be 2-D.
    num_samples, num_features = X.shape

    # Forward pass: each layer is a function of the previous layer's output.
    hidden, hidden_cache = affine_forward(X, W1, b1)
    activated, relu_cache = relu_forward(hidden)
    scores, out_cache = affine_forward(activated, W2, b2)

    # If the targets are not given then jump out, we're done
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization (factor 0.5) on W1 and W2.
    loss, dscores = softmax_loss(scores, y)
    penalty_w1 = 0.5 * reg * np.sum(W1 * W1)
    penalty_w2 = 0.5 * reg * np.sum(W2 * W2)
    loss += penalty_w1 + penalty_w2

    # Backward pass in reverse layer order.
    d_activated, dW2, db2 = affine_backward(dscores, out_cache)
    d_hidden = relu_backward(d_activated, relu_cache)
    _, dW1, db1 = affine_backward(d_hidden, hidden_cache)

    grads = {'W2': dW2, 'b2': db2, 'W1': dW1, 'b1': db1}
    # Gradients of the regularization term, added in place.
    grads['W2'] += reg * W2
    grads['W1'] += reg * W1

    return loss, grads
def loss(self, X, y=None):
    """
    Evaluate the two-layer network on a minibatch.

    The forward pass is affine -> ReLU -> affine. When labels are given, the
    softmax data loss plus an L2 penalty on W1 and W2 (scaled by 0.5 *
    self.reg) is computed and backpropagated.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, only the forward pass is run and the result is:
    - scores: Array of shape (N, C) where scores[i, c] is the classification
      score for X[i] and class c.

    If y is not None, a forward and backward pass is run and the result is a
    tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the keys 'W1', 'b1', 'W2', 'b2', mapping each
      parameter name to the gradient of the loss with respect to it.
    """
    W1, b1 = self.params["W1"], self.params["b1"]
    W2, b2 = self.params["W2"], self.params["b2"]

    # Forward pass: hidden affine layer, ReLU, then the output affine layer.
    hidden_pre, fc1_cache = layers.affine_forward(X, W1, b1)
    hidden, relu_cache = layers.relu_forward(hidden_pre)
    scores, fc2_cache = layers.affine_forward(hidden, W2, b2)

    # Without labels there is nothing to backpropagate; callers take the
    # argmax of the raw scores themselves.
    if y is None:
        return scores

    # Softmax data loss plus the 0.5 * reg * ||W||^2 penalty on both weights.
    data_loss, dscores = layers.softmax_loss(scores, y)
    loss = data_loss + 0.5 * self.reg * (np.sum(W1**2) + np.sum(W2**2))

    # Backward pass, mirroring the forward pass in reverse order.
    dhidden, dW2, db2 = layers.affine_backward(dscores, fc2_cache)
    dhidden_pre = layers.relu_backward(dhidden, relu_cache)
    _, dW1, db1 = layers.affine_backward(dhidden_pre, fc1_cache)

    # Only the weight matrices receive the regularization gradient.
    grads = {
        "W2": dW2 + self.reg * W2,
        "b2": db2,
        "W1": dW1 + self.reg * W1,
        "b1": db1,
    }
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Each of the first num_layers - 1 layers is affine -> [batchnorm] -> ReLU
    -> [dropout]; the last layer is a plain affine layer producing the class
    scores. Batchnorm and dropout are used when self.use_batchnorm /
    self.use_dropout are set.

    Inputs:
    - X: Array of input data; it is cast to self.dtype before use.
    - y: Array of labels, or None to run in test mode.

    Returns:
    If y is None, the class scores produced by the last affine layer.

    If y is not None, a tuple of:
    - loss: Softmax data loss plus 0.5 * self.reg * sum(W**2) over every
      parameter whose name starts with "W".
    - grads: Dictionary mapping parameter names to gradients. Weight
      gradients include the self.reg * W term; biases, gamma and beta are
      not regularized.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Batchnorm and dropout behave differently in training and testing, so
    # the mode flag is pushed into their parameter dicts before each pass.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # ---- forward pass ----
    # Caches are grouped by layer kind; entry i belongs to hidden layer i + 1.
    caches = collections.defaultdict(list)
    out_layer = X
    for i in range(self.num_layers - 1):
        n = str(i + 1)
        W, b = self.params["W" + n], self.params["b" + n]
        if self.use_batchnorm:
            out_layer, cache = affine_bn_relu_forward(
                out_layer, W, b,
                self.params["gamma" + n], self.params["beta" + n],
                self.bn_params[i])
            caches["affine_bn_relu"].append(cache)
        else:
            out_layer, cache = layers.affine_forward(out_layer, W, b)
            caches["affine"].append(cache)
            out_layer, cache = layers.relu_forward(out_layer)
            caches["relu"].append(cache)
        if self.use_dropout:
            out_layer, cache = layers.dropout_forward(
                out_layer, self.dropout_param)
            caches["drop"].append(cache)

    # The output layer is affine only. Its cache gets a dedicated name so it
    # cannot be confused with the per-layer loop variable above.
    nn = str(self.num_layers)
    scores, out_cache = layers.affine_forward(
        out_layer, self.params["W" + nn], self.params["b" + nn])

    # If test mode return early
    if mode == 'test':
        return scores

    # ---- loss ----
    grads = {}
    loss, dloss = layers.softmax_loss(scores, y)
    if self.reg != 0:
        for k, v in self.params.items():
            # Only the weight matrices are penalized; gamma, beta and the
            # biases are excluded.
            if k.startswith("W"):
                loss += 0.5 * self.reg * np.sum(v**2)

    # ---- backward pass ----
    # The regularization gradient always uses the weights in self.params
    # rather than digging them out of a layer cache by position, so this
    # method does not depend on the cache layout of the layer helpers.
    dout, dW, db = layers.affine_backward(dloss, out_cache)
    grads["W" + nn] = dW + self.reg * self.params["W" + nn]
    grads["b" + nn] = db

    for i in range(self.num_layers - 2, -1, -1):
        n = str(i + 1)
        if self.use_dropout:
            dout = layers.dropout_backward(dout, caches["drop"][i])
        if self.use_batchnorm:
            dout, dW, db, dgamma, dbeta = affine_bn_relu_backward(
                dout, caches["affine_bn_relu"][i])
            grads["gamma" + n] = dgamma
            grads["beta" + n] = dbeta
        else:
            dout = layers.relu_backward(dout, caches["relu"][i])
            dout, dW, db = layers.affine_backward(dout, caches["affine"][i])
        grads["W" + n] = dW + self.reg * self.params["W" + n]
        grads["b" + n] = db

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    The network is affine -> ReLU -> affine -> softmax. The output layer is a
    plain affine layer: putting a ReLU on it would clamp every class score to
    be non-negative and zero the gradient of any class whose raw score is
    negative.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss (softmax data loss plus
      0.5 * self.reg * (sum(W1**2) + sum(W2**2)))
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    W1, W2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']

    # Hidden layer: affine followed by ReLU.
    hidden, hidden_cache = affine_relu_forward(X, W1, b1)
    # Output layer: affine only, so the scores are unconstrained logits.
    scores, out_cache = affine_forward(hidden, W2, b2)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    grads = {}
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    loss, dscores = softmax_loss(scores, y)
    # L2 penalty with the 0.5 factor, so its gradient is simply reg * W.
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

    # Backward pass mirrors the forward pass: affine, then affine + ReLU.
    dhidden, dW2, db2 = affine_backward(dscores, out_cache)
    _, dW1, db1 = affine_relu_backward(dhidden, hidden_cache)

    grads['W2'] = dW2 + self.reg * W2
    grads['W1'] = dW1 + self.reg * W1
    grads['b2'] = db2
    grads['b1'] = db1

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Every layer except the last is affine -> [normalization] -> ReLU ->
    [dropout]; the last layer is a plain affine layer whose output is the
    class scores. Normalization is batchnorm when self.normalization ==
    'batchnorm' and layernorm for any other truthy value of
    self.normalization.

    Inputs:
    - X: Array of input data; it is cast to self.dtype before use.
    - y: Array of labels, or None to run in test mode.

    Returns:
    If y is None, the class scores from the last affine layer.

    If y is not None, a tuple of:
    - loss: Softmax data loss plus 0.5 * self.reg * sum(W * W) for every
      layer's weight matrix.
    - grads: Dictionary mapping parameter names to gradients. Weight
      gradients include the self.reg * W term; biases and the normalization
      scale/shift parameters are not regularized.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # The normalization function does not change between layers, so it is
    # chosen once. Strings are compared with ==, not identity.
    normalize_forward = None
    if self.normalization:
        if self.normalization == 'batchnorm':
            normalize_forward = batchnorm_forward
        else:
            normalize_forward = layernorm_forward

    out, caches = X, []
    for i in range(1, self.num_layers + 1):
        cache = {}
        W = self.params[f"W{i}"]
        b = self.params[f"b{i}"]
        out, cache['fc_cache'] = affine_forward(out, W, b)

        # Normalization, ReLU and dropout belong to hidden layers only; the
        # last affine output is left untouched so the scores are raw logits.
        if i != self.num_layers:
            if self.normalization:
                gamma = self.params[f"gamma{i}"]
                beta = self.params[f"beta{i}"]
                out, cache['bn_cache'] = normalize_forward(
                    out, gamma, beta, self.bn_params[i - 1])
            out, cache['relu_cache'] = relu_forward(out)
            if self.use_dropout:
                out, cache['dropout_cache'] = dropout_forward(
                    out, self.dropout_param)

        caches.append(cache)
    scores = out

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # The backward function must match the forward one: layernorm caches
    # have to go through layernorm_backward, not batchnorm_backward.
    normalize_backward = None
    if self.normalization:
        if self.normalization == 'batchnorm':
            normalize_backward = batchnorm_backward
        else:
            normalize_backward = layernorm_backward

    loss, dout = softmax_loss(scores, y)
    for i in range(self.num_layers, 0, -1):
        W = self.params[f"W{i}"]
        cache = caches[i - 1]

        # Undo the hidden-layer stages in reverse order of the forward pass.
        if i != self.num_layers:
            if self.use_dropout:
                dout = dropout_backward(dout, cache['dropout_cache'])
            dout = relu_backward(dout, cache['relu_cache'])
            if self.normalization:
                dout, dgamma, dbeta = normalize_backward(
                    dout, cache['bn_cache'])
                grads[f"gamma{i}"] = dgamma
                grads[f"beta{i}"] = dbeta

        dout, dw, db = affine_backward(dout, cache['fc_cache'])
        # Only the weights are regularized; the 0.5 factor in the loss makes
        # the gradient of the penalty exactly reg * W.
        grads[f"W{i}"] = dw + self.reg * W
        grads[f"b{i}"] = db
        loss += 0.5 * self.reg * np.sum(W * W)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    The network is affine -> ReLU -> affine, followed by a softmax loss when
    labels are supplied.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss (softmax data loss plus
      0.5 * self.reg * (sum(W1**2) + sum(W2**2)))
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1 = self.params["W1"]
    W2 = self.params["W2"]
    b1 = self.params["b1"]
    b2 = self.params["b2"]

    # Forward pass. The output of the second affine layer is used directly
    # as the scores; nothing else keeps a reference to it, so no defensive
    # copy is needed.
    fc_1, cache_fc_1 = affine_forward(X, W1, b1)  # (N, H)
    relu_1, cache_relu_1 = relu_forward(fc_1)  # (N, H)
    scores, cache_fc_2 = affine_forward(relu_1, W2, b2)  # (N, C)

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    # Data loss and its gradient with respect to the scores.
    loss, d_scores = softmax_loss(scores, y)

    # Backward pass in reverse order of the forward pass.
    d_relu_1, d_W2, d_b2 = affine_backward(d_scores, cache_fc_2)
    d_fc_1 = relu_backward(d_relu_1, cache_relu_1)
    _, d_W1, d_b1 = affine_backward(d_fc_1, cache_fc_1)

    # L2 regularization with the 0.5 factor, so its gradient is reg * W.
    loss += 0.5 * self.reg * (np.sum(np.square(W1)) + np.sum(np.square(W2)))

    grads = {
        "W1": d_W1 + self.reg * W1,
        "W2": d_W2 + self.reg * W2,
        "b1": d_b1,
        "b2": d_b2,
    }
    return loss, grads
print('db error: ', rel_error(db_num, db)) np.random.seed(231) num_classes, num_inputs = 10, 50 x = 0.001 * np.random.randn(num_inputs, num_classes) y = np.random.randint(num_classes, size=num_inputs) dx_num = eval_numerical_gradient(lambda x: svm_loss(x, y)[0], x, verbose=False) loss, dx = svm_loss(x, y) # Test svm_loss function. Loss should be around 9 and dx error should be 1e-9 print('Testing svm_loss:') print('loss: ', loss) print('dx error: ', rel_error(dx_num, dx)) dx_num = eval_numerical_gradient(lambda x: softmax_loss(x, y)[0], x, verbose=False) loss, dx = softmax_loss(x, y) # Test softmax_loss function. Loss should be 2.3 and dx error should be 1e-8 print('\nTesting softmax_loss:') print('loss: ', loss) print('dx error: ', rel_error(dx_num, dx)) """ np.random.seed(231) N, D, H, C = 3, 5, 50, 7 X = np.random.randn(N, D) y = np.random.randint(C, size=N) std = 1e-3
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Layers are looked up as 'W0'/'b0', 'W1'/'b1', ... up to
    self.num_layers - 1. Every layer but the last is affine + ReLU; the last
    is affine only. The dropout and batchnorm mode flags are updated, but no
    dropout or batchnorm layer is applied in this implementation.

    Inputs:
    - X: Array of input data; it is cast to self.dtype before use.
    - y: Array of labels, or None to run in test mode.

    Returns:
    If y is None, the class scores from the last layer.

    If y is not None, a tuple (loss, grads): the softmax loss plus
    0.5 * self.reg * sum(w * w) over every parameter whose name starts with
    'W', and a dictionary of gradients keyed like self.params.
    """
    X = X.astype(self.dtype)
    mode = 'train' if y is not None else 'test'

    # Dropout and batchnorm read their mode from these dicts, so keep them
    # in sync with the current pass.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # ---- forward pass ----
    last = self.num_layers - 1
    activations = X
    layer_caches = []
    for idx in range(self.num_layers):
        weight = self.params['W%d' % idx]
        bias = self.params['b%d' % idx]
        # Only hidden layers get the ReLU; the output layer is affine only.
        forward = (layer_utils.affine_forward if idx == last
                   else layer_utils.affine_relu_forward)
        activations, layer_cache = forward(activations, weight, bias)
        layer_caches.append(layer_cache)
    scores = activations

    # If test mode return early
    if mode == 'test':
        return scores

    # ---- loss ----
    data_loss, upstream = layers.softmax_loss(scores, y)
    penalty = sum(
        self.reg * np.sum(weight * weight) * 0.5
        for name, weight in self.params.items()
        if name.startswith('W')
    )
    loss = data_loss + penalty

    # ---- backward pass ----
    grads = {}
    for idx in range(last, -1, -1):
        backward = (layer_utils.affine_backward if idx == last
                    else layer_utils.affine_relu_backward)
        upstream, dw, db = backward(upstream, layer_caches[idx])
        grads['W%d' % idx] = dw
        grads['b%d' % idx] = db

    # Add the regularization gradient to every weight matrix.
    for name, weight in self.params.items():
        if name.startswith('W'):
            grads[name] += self.reg * weight

    return loss, grads
def loss(self, X, y=None):
    """
    Run the two-layer network on a minibatch and, given labels, backprop.

    The input is flattened to two dimensions, then passed through
    affine -> ReLU -> affine. With labels, the softmax loss plus an L2
    penalty on W1 and W2 is computed together with all four gradients.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, the test-time result:
    - scores: Array of shape (N, C) where scores[i, c] is the classification
      score for X[i] and class c.

    If y is not None, a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with keys 'W1', 'b1', 'W2', 'b2' holding the gradient
      of the loss with respect to each parameter.
    """
    params = self.params
    W1, b1, W2, b2 = (params[key] for key in ('W1', 'b1', 'W2', 'b2'))

    # Collapse every trailing dimension so each sample becomes one row.
    num_samples = X.shape[0]
    flat_dim = np.prod(X.shape[1:])
    flat_X = X.reshape(num_samples, flat_dim)

    # Forward pass.
    pre_act, first_cache = affine_forward(flat_X, W1, b1)
    hidden, act_cache = relu_forward(pre_act)
    scores, second_cache = affine_forward(hidden, W2, b2)

    # Test mode: nothing to differentiate.
    if y is None:
        return scores

    # Backward pass, in reverse order of the forward pass.
    loss, dscores = softmax_loss(scores, y)
    dhidden, dW2, db2 = affine_backward(dscores, second_cache)
    dpre_act = relu_backward(dhidden, act_cache)
    _, dW1, db1 = affine_backward(dpre_act, first_cache)

    # L2 regularization: 0.5 * reg * ||W||^2 in the loss, reg * W in the
    # weight gradients (added in place).
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
    dW1 += self.reg * W1
    dW2 += self.reg * W2

    grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    The network is affine -> ReLU -> affine -> softmax. The output layer is a
    plain affine layer: putting a ReLU on it would clamp every class score to
    be non-negative and zero the gradient of any class whose raw score is
    negative.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss (softmax data loss plus
      0.5 * self.reg * (sum(W1**2) + sum(W2**2)))
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # Hidden layer: affine followed by ReLU.
    hidden, hidden_cache = layer_utils.affine_relu_forward(X, W1, b1)
    # Output layer: affine only, so the scores are unconstrained logits.
    scores, out_cache = layers.affine_forward(hidden, W2, b2)

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    grads = {}
    loss, dsoftmax = layers.softmax_loss(scores, y)

    # Backward pass mirrors the forward pass: affine, then affine + ReLU.
    dhidden, dw2, db2 = layers.affine_backward(dsoftmax, out_cache)
    _, dw1, db1 = layer_utils.affine_relu_backward(dhidden, hidden_cache)

    # L2 penalty with the 0.5 factor, so its gradient is simply reg * W.
    for w in (W1, W2):
        loss += self.reg * np.sum(w * w) * 0.5

    grads['W1'] = dw1 + self.reg * W1
    grads['W2'] = dw2 + self.reg * W2
    grads['b1'] = db1
    grads['b2'] = db2

    return loss, grads