import random

import numpy as np

from cs231n.layer_utils import affine_relu_forward, affine_relu_backward


def test_sandwich_layers(samples=random.randrange(1, 10)):
    for _ in range(samples):
        x = np.random.randn(2, 3, 4)
        w = np.random.randn(12, 10)
        b = np.random.randn(10)
        # dout: upstream gradient being backpropagated into this layer
        dout = np.random.randn(2, 10)

        out, cache = affine_relu_forward(x, w, b)
        dx, dw, db = affine_relu_backward(dout, cache)

        dx_num = eval_numerical_gradient_array(
            lambda x: affine_relu_forward(x, w, b)[0], x, dout)
        dw_num = eval_numerical_gradient_array(
            lambda w: affine_relu_forward(x, w, b)[0], w, dout)
        db_num = eval_numerical_gradient_array(
            lambda b: affine_relu_forward(x, w, b)[0], b, dout)

        assert rel_error(dx_num, dx) < 5e-7
        assert rel_error(dw_num, dw) < 5e-7
        assert rel_error(db_num, db) < 5e-7
        assert dx.shape == dx_num.shape
        assert dw.shape == dw_num.shape
        assert db.shape == db_num.shape
        assert out.shape == dout.shape
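# The test above relies on two numerical-gradient helpers that are not defined
# in this file: rel_error (normally defined at the top of the course notebooks)
# and eval_numerical_gradient_array (from cs231n/gradient_check.py). A minimal
# sketch of both, assuming the standard CS231n definitions:

def rel_error(x, y):
    """Relative error between two arrays, robust to values near zero."""
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """Centered-difference gradient of f at x, chained with upstream df."""
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval  # restore the original value
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad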
def test_connect_layer():
    x = np.random.randn(2, 12)
    w = np.random.randn(12, 10)
    b = np.random.randn(10)
    dout = np.random.randn(2, 10)

    out, cache = affine_relu_forward(x, w, b)
    dx, dw, db = affine_relu_backward(dout, cache)

    # This variant returns NDArray-like outputs, so convert them with
    # .asnumpy() before comparing against the numerical gradients.
    dx_num = eval_numerical_gradient_array(
        lambda x: affine_relu_forward(x, w, b)[0].asnumpy(), x, dout)
    dw_num = eval_numerical_gradient_array(
        lambda w: affine_relu_forward(x, w, b)[0].asnumpy(), w, dout)
    db_num = eval_numerical_gradient_array(
        lambda b: affine_relu_forward(x, w, b)[0].asnumpy(), b, dout)

    print('Testing affine_relu_forward:')
    print('dx error: ', rel_error(dx_num, dx.asnumpy()))
    print('dw error: ', rel_error(dw_num, dw.asnumpy()))
    print('db error: ', rel_error(db_num, db.asnumpy()))
_, cache = relu_forward(x)
dx = relu_backward(dout, cache)

# The error should be around 1e-12
print('Testing relu_backward function:')
print('dx error: ', rel_error(dx_num, dx))
# # "Sandwich" layers # There are some common patterns of layers that are frequently used in neural nets. For example, affine layers are frequently followed by a ReLU nonlinearity. To make these common patterns easy, we define several convenience layers in the file `cs231n/layer_utils.py`. # # For now take a look at the `affine_relu_forward` and `affine_relu_backward` functions, and run the following to numerically gradient check the backward pass: # In[ ]: from cs231n.layer_utils import affine_relu_forward, affine_relu_backward x = np.random.randn(2, 3, 4) w = np.random.randn(12, 10) b = np.random.randn(10) dout = np.random.randn(2, 10) out, cache = affine_relu_forward(x, w, b) dx, dw, db = affine_relu_backward(dout, cache) dx_num = eval_numerical_gradient_array(lambda x: affine_relu_forward(x, w, b)[0], x, dout) dw_num = eval_numerical_gradient_array(lambda w: affine_relu_forward(x, w, b)[0], w, dout) db_num = eval_numerical_gradient_array(lambda b: affine_relu_forward(x, w, b)[0], b, dout) print 'Testing affine_relu_forward:' print 'dx error: ', rel_error(dx_num, dx) print 'dw error: ', rel_error(dw_num, dw) print 'db error: ', rel_error(db_num, db) # # Loss layers: Softmax and SVM # You implemented these loss functions in the last assignment, so we'll give them to you for free here. You should still make sure you understand how they work by looking at the implementations in `cs231n/layers.py`. # # You can make sure that the implementations are correct by running the following:
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    #######################################################################
    # TODO: Implement the forward pass for the two-layer net, computing   #
    # the class scores for X and storing them in the scores variable.     #
    #######################################################################
    W1 = self.params["W1"]
    b1 = self.params["b1"]
    W2 = self.params["W2"]
    b2 = self.params["b2"]

    N = X.shape[0]
    C = W2.shape[1]
    scores = np.zeros((N, C))

    X_hidden, cache1 = affine_relu_forward(X, W1, b1)
    scores, cache2 = affine_forward(X_hidden, W2, b2)
    #######################################################################
    #                          END OF YOUR CODE                           #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    #######################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the  #
    # loss in the loss variable and gradients in the grads dictionary.    #
    # Compute data loss using softmax, and make sure that grads[k] holds  #
    # the gradients for self.params[k]. Don't forget to add L2            #
    # regularization!                                                     #
    #                                                                     #
    # NOTE: To ensure that your implementation matches ours and you pass  #
    # the automated tests, make sure that your L2 regularization includes #
    # a factor of 0.5 to simplify the expression for the gradient.        #
    #######################################################################
    loss, dscores = softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

    dx_hidden, dw2, db2 = affine_backward(dscores, cache2)
    grads["W2"] = dw2 + self.reg * W2
    grads["b2"] = db2

    dx, dw1, db1 = affine_relu_backward(dx_hidden, cache1)
    grads["W1"] = dw1 + self.reg * W1
    grads["b1"] = db1
    #######################################################################
    #                          END OF YOUR CODE                           #
    #######################################################################

    return loss, grads
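# A quick way to sanity-check this loss() implementation is to gradient-check
# it directly, notebook style. This is a minimal sketch, assuming a constructor
# of the form TwoLayerNet(input_dim, hidden_dim, num_classes, reg) and the
# eval_numerical_gradient helper from cs231n/gradient_check.py; the exact
# constructor signature may differ.

from cs231n.gradient_check import eval_numerical_gradient

N, D, H, C = 3, 5, 50, 7
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)

model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, reg=0.0)
loss, grads = model.loss(X, y)

for name in sorted(grads):
    # Perturbing model.params[name] in place changes the loss, so a dummy
    # lambda argument is enough here.
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False)
    print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))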
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.dropout_param is not None:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    #######################################################################
    # TODO: Implement the forward pass for the fully-connected net,       #
    # computing the class scores for X and storing them in the scores     #
    # variable.                                                           #
    #                                                                     #
    # When using dropout, you'll need to pass self.dropout_param to each  #
    # dropout forward pass.                                               #
    #                                                                     #
    # When using batch normalization, you'll need to pass                 #
    # self.bn_params[0] to the forward pass for the first batch           #
    # normalization layer, pass self.bn_params[1] to the forward pass for #
    # the second batch normalization layer, etc.                          #
    #######################################################################
    IN = X
    caches = {}
    if self.use_dropout:
        dropout_caches = {}

    for l in range(self.num_layers - 1):
        W = self.params["W{}".format(l + 1)]
        b = self.params["b{}".format(l + 1)]
        if self.use_batchnorm:
            gamma = self.params["gamma{}".format(l + 1)]
            beta = self.params["beta{}".format(l + 1)]
            IN, cache = affine_batchnorm_relu_forward(
                IN, W, b, gamma, beta, self.bn_params[l])
        else:
            IN, cache = affine_relu_forward(IN, W, b)
        caches[l] = cache
        if self.use_dropout:
            IN, d_cache = dropout_forward(IN, self.dropout_param)
            dropout_caches[l] = d_cache

    # forward pass: last affine layer
    num_last = self.num_layers
    name_W_last = "W{}".format(num_last)
    name_b_last = "b{}".format(num_last)
    W_last = self.params[name_W_last]
    b_last = self.params[name_b_last]
    scores, cache_last = affine_forward(IN, W_last, b_last)
    #######################################################################
    #                          END OF YOUR CODE                           #
    #######################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    #######################################################################
    # TODO: Implement the backward pass for the fully-connected net.      #
    # Store the loss in the loss variable and gradients in the grads      #
    # dictionary. Compute data loss using softmax, and make sure that     #
    # grads[k] holds the gradients for self.params[k]. Don't forget to    #
    # add L2 regularization!                                              #
    #                                                                     #
    # When using batch normalization, you don't need to regularize the    #
    # scale and shift parameters.                                         #
    #                                                                     #
    # NOTE: To ensure that your implementation matches ours and you pass  #
    # the automated tests, make sure that your L2 regularization includes #
    # a factor of 0.5 to simplify the expression for the gradient.        #
    #######################################################################
    # data loss
    loss, dscores = softmax_loss(scores, y)

    # regularization loss
    for l in range(self.num_layers):
        W = self.params["W{}".format(l + 1)]
        loss += 0.5 * self.reg * np.sum(W * W)

    # backprop through the last affine layer
    dx, dw, db = affine_backward(dscores, cache_last)
    grads[name_W_last] = dw + self.reg * W_last
    grads[name_b_last] = db

    # backprop through the affine-(batchnorm)-relu(-dropout) layers
    for l in reversed(range(self.num_layers - 1)):
        name_W = "W{}".format(l + 1)
        name_b = "b{}".format(l + 1)
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_caches[l])
        if self.use_batchnorm:
            dx, dw, db, dgamma, dbeta = affine_batchnorm_relu_backward(
                dx, caches[l])
            grads["gamma{}".format(l + 1)] = dgamma
            grads["beta{}".format(l + 1)] = dbeta
        else:
            dx, dw, db = affine_relu_backward(dx, caches[l])
        grads[name_W] = dw + self.reg * self.params[name_W]
        grads[name_b] = db
    #######################################################################
    #                          END OF YOUR CODE                           #
    #######################################################################

    return loss, grads
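# The forward/backward pass above relies on an affine-batchnorm-ReLU
# "sandwich" helper that is not part of the stock cs231n/layer_utils.py.
# A minimal sketch of what it would look like, assuming the standard
# affine_*, batchnorm_* and relu_* layers from cs231n/layers.py:

from cs231n.layers import (affine_forward, affine_backward,
                           batchnorm_forward, batchnorm_backward,
                           relu_forward, relu_backward)


def affine_batchnorm_relu_forward(x, w, b, gamma, beta, bn_param):
    """Affine -> batch norm -> ReLU, returning a combined cache."""
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    cache = (fc_cache, bn_cache, relu_cache)
    return out, cache


def affine_batchnorm_relu_backward(dout, cache):
    """Backward pass for the affine-batchnorm-ReLU convenience layer."""
    fc_cache, bn_cache, relu_cache = cache
    da_bn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_bn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta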
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the    #
    # class scores for X and storing them in the scores variable.              #
    ############################################################################
    out_1, cache_1 = layer_utils.affine_relu_forward(
        X, self.params['W1'], self.params['b1'])
    # The output layer is a plain affine layer: applying a ReLU here would
    # clip negative class scores and break the softmax gradient.
    out_2, cache_2 = layers.affine_forward(
        out_1, self.params['W2'], self.params['b2'])
    scores = out_2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss  #
    # in the loss variable and gradients in the grads dictionary. Compute data #
    # loss using softmax, and make sure that grads[k] holds the gradients for  #
    # self.params[k]. Don't forget to add L2 regularization!                   #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    loss, dsoftmax = layers.softmax_loss(scores, y)

    dl2, dw2, db2 = layers.affine_backward(dsoftmax, cache_2)
    _, dw1, db1 = layer_utils.affine_relu_backward(dl2, cache_1)

    # add regularization loss
    for w in [self.params['W1'], self.params['W2']]:
        loss += self.reg * np.sum(w * w) * 0.5

    grads['W1'] = dw1 + self.reg * self.params['W1']
    grads['W2'] = dw2 + self.reg * self.params['W2']
    grads['b1'] = db1
    grads['b2'] = db2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    #                                                                          #
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################
    out = X
    caches = []
    for i in range(self.num_layers):
        w_name = 'W{}'.format(i)
        b_name = 'b{}'.format(i)
        w = self.params[w_name]
        b = self.params[b_name]
        if i == self.num_layers - 1:
            out, cache = layer_utils.affine_forward(out, w, b)
        else:
            out, cache = layer_utils.affine_relu_forward(out, w, b)
        caches.append(cache)
    scores = out
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    #                                                                          #
    # When using batch normalization, you don't need to regularize the scale   #
    # and shift parameters.                                                    #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    softmax_loss, dsoftmax = layers.softmax_loss(scores, y)

    reg_loss = 0
    for key in self.params.keys():
        if key.startswith('W'):
            w = self.params[key]
            reg_loss += self.reg * np.sum(w * w) * 0.5
    loss = softmax_loss + reg_loss

    dx = dsoftmax
    for i in reversed(range(self.num_layers)):
        w_name = 'W{}'.format(i)
        b_name = 'b{}'.format(i)
        if i == self.num_layers - 1:
            dx, dw, db = layer_utils.affine_backward(dx, caches[i])
        else:
            dx, dw, db = layer_utils.affine_relu_backward(dx, caches[i])
        grads[w_name] = dw
        grads[b_name] = db

    for key in self.params.keys():
        if key.startswith('W'):
            w = self.params[key]
            grads[key] += self.reg * w
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    #####################################################################
    # TODO: Implement the forward pass for the two-layer net, computing #
    # the class scores for X and storing them in the scores variable.   #
    #####################################################################
    a, cache_relu = affine_relu_forward(X, self.params['W1'],
                                        self.params['b1'])
    scores, cache_scores = affine_forward(a, self.params['W2'],
                                          self.params['b2'])
    #####################################################################
    #                         END OF YOUR CODE                          #
    #####################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ######################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the #
    # loss in the loss variable and gradients in the grads dictionary.   #
    # Compute data loss using softmax, and make sure that grads[k] holds #
    # the gradients for self.params[k]. Don't forget to add L2           #
    # regularization!                                                    #
    #                                                                    #
    # NOTE: To ensure that your implementation matches ours and you pass #
    # the automated tests, make sure that your L2 regularization         #
    # includes a factor of 0.5 to simplify the expression for the        #
    # gradient.                                                          #
    ######################################################################
    loss, dscores = softmax_loss(scores, y)
    dx, grads['W2'], grads['b2'] = affine_backward(dscores, cache_scores)
    _, grads['W1'], grads['b1'] = affine_relu_backward(dx, cache_relu)

    # add regularization
    ss = np.sum(self.params['W1'] ** 2) + np.sum(self.params['W2'] ** 2)
    loss += 0.5 * self.reg * ss
    grads['W1'] += self.reg * self.params['W1']
    grads['W2'] += self.reg * self.params['W2']
    ######################################################################
    #                          END OF YOUR CODE                          #
    ######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ###################################################################
    # TODO: Implement the forward pass for the fully-connected net,   #
    # computing the class scores for X and storing them in the scores #
    # variable. When using dropout, you'll need to pass               #
    # self.dropout_param to each dropout forward pass.                #
    # When using batch normalization, you'll need to pass             #
    # self.bn_params[0] to the forward pass for the first batch       #
    # normalization layer, pass self.bn_params[1] to the              #
    # forward pass for the second batch normalization layer, etc.     #
    ###################################################################
    cache = [None] * self.num_layers          # for all layers but dropout
    dropout_cache = [None] * self.num_layers  # for dropout layers
    activations = [None] * self.num_layers
    activations[0] = X

    if self.normalization == 'batchnorm':
        forward = affine_bn_relu_forward
    else:
        forward = affine_relu_forward

    for i in range(1, self.num_layers):
        args = [activations[i-1], self.params['W' + str(i)],
                self.params['b' + str(i)]]
        if self.normalization == 'batchnorm':
            args += [self.params['gamma' + str(i)],
                     self.params['beta' + str(i)],
                     self.bn_params[i-1]]
        if self.use_dropout:
            a, cache[i] = forward(*args)
            activations[i], dropout_cache[i] = \
                dropout_forward(a, self.dropout_param)
        else:
            activations[i], cache[i] = forward(*args)

    last_layer_ind = str(self.num_layers)
    scores, cache_scores = affine_forward(
        activations[-1],
        self.params['W' + last_layer_ind],
        self.params['b' + last_layer_ind])
    ###################################################################
    #                        END OF YOUR CODE                         #
    ###################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    #######################################################################
    # TODO: Implement the backward pass for the fully-connected net.      #
    # Store the loss in the loss variable and gradients in the grads      #
    # dictionary. Compute data loss using softmax, and make sure that     #
    # grads[k] holds the gradients for self.params[k]. Don't forget to    #
    # add L2 regularization!                                              #
    # When using batch/layer normalization, you don't need to regularize  #
    # the scale and shift parameters.                                     #
    # NOTE: To ensure that your implementation matches ours and you pass  #
    # the automated tests, make sure that your L2 regularization          #
    # includes a factor of 0.5 to simplify the expression for the         #
    # gradient.                                                           #
    #######################################################################
    loss, dscores = softmax_loss(scores, y)

    last_layer_ind = str(self.num_layers)
    dx, grads['W' + last_layer_ind], grads['b' + last_layer_ind] = \
        affine_backward(dscores, cache_scores)

    for i in range(self.num_layers-1, 0, -1):
        Wi = 'W' + str(i)
        bi = 'b' + str(i)
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_cache[i])
        if self.normalization == 'batchnorm':
            gammai = 'gamma' + str(i)
            betai = 'beta' + str(i)
            dx, grads[Wi], grads[bi], grads[gammai], grads[betai] = \
                affine_bn_relu_backward(dx, cache[i])
        else:
            dx, grads[Wi], grads[bi] = affine_relu_backward(dx, cache[i])

    # add regularization
    sums_of_squares = [np.sum(self.params[k] ** 2)
                       for k in self.params.keys() if k.startswith('W')]
    ss = np.sum(sums_of_squares)
    loss += 0.5 * self.reg * ss
    for i in range(self.num_layers, 0, -1):
        Wi = 'W' + str(i)
        grads[Wi] += self.reg * self.params[Wi]
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
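# The fully-connected nets above call dropout_forward/dropout_backward from
# cs231n/layers.py. A minimal inverted-dropout sketch of what those helpers
# compute, assuming dropout_param['p'] is the keep probability (some
# assignment versions treat it as the drop probability instead):

def dropout_forward(x, dropout_param):
    """Inverted dropout: rescale at train time so test time is a no-op."""
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    if mode == 'train':
        mask = (np.random.rand(*x.shape) < p) / p  # keep with prob. p, rescale
        out = x * mask
    else:
        out = x

    cache = (dropout_param, mask)
    return out.astype(x.dtype, copy=False), cache


def dropout_backward(dout, cache):
    """Backward pass: gradients only flow through the kept units."""
    dropout_param, mask = cache
    if dropout_param['mode'] == 'train':
        return dout * mask
    return dout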
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    #                                                                          #
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################
    from cs231n.layer_utils import affine_relu_forward
    from cs231n.layers import softmax_loss

    activations = {}
    cache = {}
    dropout_cache = {}
    activations[0] = X

    if self.use_batchnorm:
        for i in range(1, self.num_layers):
            if self.use_dropout:
                tmp_activations, cache[i] = affine_batchnorm_relu_forward(
                    activations[i - 1],
                    self.params['W{0}'.format(i)],
                    self.params['b{0}'.format(i)],
                    self.params['gamma{0}'.format(i)],
                    self.params['beta{0}'.format(i)],
                    self.bn_params[i - 1])
                activations[i], dropout_cache[i] = dropout_forward(
                    tmp_activations, self.dropout_param)
            else:
                activations[i], cache[i] = affine_batchnorm_relu_forward(
                    activations[i - 1],
                    self.params['W{0}'.format(i)],
                    self.params['b{0}'.format(i)],
                    self.params['gamma{0}'.format(i)],
                    self.params['beta{0}'.format(i)],
                    self.bn_params[i - 1])
        scores, cache[self.num_layers] = affine_forward(
            activations[self.num_layers - 1],
            self.params['W{0}'.format(self.num_layers)],
            self.params['b{0}'.format(self.num_layers)])
    else:
        for i in range(1, self.num_layers):
            if self.use_dropout:
                tmp_activations, cache[i] = affine_relu_forward(
                    activations[i - 1],
                    self.params['W{0}'.format(i)],
                    self.params['b{0}'.format(i)])
                activations[i], dropout_cache[i] = dropout_forward(
                    tmp_activations, self.dropout_param)
            else:
                activations[i], cache[i] = affine_relu_forward(
                    activations[i - 1],
                    self.params['W{0}'.format(i)],
                    self.params['b{0}'.format(i)])
        scores, cache[self.num_layers] = affine_forward(
            activations[self.num_layers - 1],
            self.params['W{0}'.format(self.num_layers)],
            self.params['b{0}'.format(self.num_layers)])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    #                                                                          #
    # When using batch normalization, you don't need to regularize the scale   #
    # and shift parameters.                                                    #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    from cs231n.layer_utils import affine_relu_backward

    loss, dx = softmax_loss(scores, y)
    for i in range(self.num_layers):
        loss += 0.5 * self.reg * np.sum(self.params['W{0}'.format(i + 1)] *
                                        self.params['W{0}'.format(i + 1)])

    if self.use_batchnorm:
        # get grads for top layer, add in regularization term
        (activations['da{0}'.format(self.num_layers - 1)],
         grads['W{0}'.format(self.num_layers)],
         grads['b{0}'.format(self.num_layers)]) = affine_backward(
            dx, cache[self.num_layers])
        grads['W{0}'.format(self.num_layers)] += \
            self.reg * self.params['W{0}'.format(self.num_layers)]

        if self.use_dropout:
            # get grads for other layers (dropout)
            for i in range(self.num_layers - 1, 0, -1):
                activations['da{0}post_dropout'.format(i - 1)] = \
                    dropout_backward(activations['da{0}'.format(i)],
                                     dropout_cache[i])
                (activations['da{0}'.format(i - 1)],
                 grads['W{0}'.format(i)],
                 grads['b{0}'.format(i)],
                 grads['gamma{0}'.format(i)],
                 grads['beta{0}'.format(i)]) = affine_batchnorm_relu_backward(
                    activations['da{0}post_dropout'.format(i - 1)], cache[i])
                grads['W{0}'.format(i)] += \
                    self.reg * self.params['W{0}'.format(i)]
        else:
            # get grads for other layers (no dropout)
            for i in range(self.num_layers - 1, 0, -1):
                (activations['da{0}'.format(i - 1)],
                 grads['W{0}'.format(i)],
                 grads['b{0}'.format(i)],
                 grads['gamma{0}'.format(i)],
                 grads['beta{0}'.format(i)]) = affine_batchnorm_relu_backward(
                    activations['da{0}'.format(i)], cache[i])
                grads['W{0}'.format(i)] += \
                    self.reg * self.params['W{0}'.format(i)]
    else:
        # get grads for top layer, add in regularization term
        (activations['da{0}'.format(self.num_layers - 1)],
         grads['W{0}'.format(self.num_layers)],
         grads['b{0}'.format(self.num_layers)]) = affine_backward(
            dx, cache[self.num_layers])
        grads['W{0}'.format(self.num_layers)] += \
            self.reg * self.params['W{0}'.format(self.num_layers)]

        if self.use_dropout:
            # get grads for other layers (dropout)
            for i in range(self.num_layers - 1, 0, -1):
                activations['da{0}post_dropout'.format(i - 1)] = \
                    dropout_backward(activations['da{0}'.format(i)],
                                     dropout_cache[i])
                (activations['da{0}'.format(i - 1)],
                 grads['W{0}'.format(i)],
                 grads['b{0}'.format(i)]) = affine_relu_backward(
                    activations['da{0}post_dropout'.format(i - 1)], cache[i])
                grads['W{0}'.format(i)] += \
                    self.reg * self.params['W{0}'.format(i)]
        else:
            # get grads for other layers (no dropout)
            for i in range(self.num_layers - 1, 0, -1):
                (activations['da{0}'.format(i - 1)],
                 grads['W{0}'.format(i)],
                 grads['b{0}'.format(i)]) = affine_relu_backward(
                    activations['da{0}'.format(i)], cache[i])
                grads['W{0}'.format(i)] += \
                    self.reg * self.params['W{0}'.format(i)]

    # An earlier manual-backprop version, kept commented out for reference:
    # # get grads for top layer
    # grads['W{0}'.format(self.num_layers)] = activations['a{0}'.format(self.num_layers - 1)].T.dot(dx) + self.reg * self.params['W{0}'.format(self.num_layers)]
    # grads['b{0}'.format(self.num_layers)] = np.sum(dx, axis=0)
    # activations['da{0}'.format(self.num_layers - 1)] = dx.dot(self.params['W{0}'.format(self.num_layers)].T)
    # activations['da{0}'.format(self.num_layers - 1)][activations['a{0}'.format(self.num_layers - 1)] <= 0] = 0
    # # get grads for intermediate layers
    # for i in reversed(range(1, self.num_layers - 1)):
    #     activations['da{0}'.format(i)] = activations['da{0}'.format(i + 1)].dot(self.params['W{0}'.format(i + 1)].T)
    #     activations['da{0}'.format(i)][activations['a{0}'.format(i)] <= 0] = 0
    # for i in reversed(range(2, self.num_layers)):
    #     grads['W{0}'.format(i)] = activations['a{0}'.format(i - 1)].T.dot(activations['da{0}'.format(i)]) + self.reg * self.params['W{0}'.format(i)]
    #     grads['b{0}'.format(i)] = np.sum(activations['da{0}'.format(i)], axis=0)
    # # get grads for the initial layer
    # grads['W1'] = X.reshape(X.shape[0], -1).T.dot(activations['da1']) + self.reg * self.params['W1']
    # grads['b1'] = np.sum(activations['da1'], axis=0)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
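# As with the two-layer net, the most direct sanity check for any of these
# FullyConnectedNet variants is a numerical gradient check of loss(). This is
# a minimal sketch, assuming a constructor of the form
# FullyConnectedNet(hidden_dims, input_dim, num_classes, reg, weight_scale,
# dtype) and the helpers from cs231n/gradient_check.py; the exact keyword
# names (e.g. use_batchnorm vs. normalization) differ between the versions
# shown above.

from cs231n.gradient_check import eval_numerical_gradient

np.random.seed(231)
N, D, H1, H2, C = 2, 15, 20, 30, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

for reg in [0, 3.14]:
    print('Running check with reg = ', reg)
    model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
                              reg=reg, weight_scale=5e-2, dtype=np.float64)

    loss, grads = model.loss(X, y)
    print('Initial loss: ', loss)

    for name in sorted(grads):
        f = lambda _: model.loss(X, y)[0]
        grad_num = eval_numerical_gradient(f, model.params[name],
                                           verbose=False, h=1e-5)
        print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))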