def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, W2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']

    # Forward pass: affine -> ReLU -> affine
    hidden_out, cache1 = affine_relu_forward(X, W1, b1)
    scores, cache2 = affine_forward(hidden_out, W2, b2)

    # Test-time forward pass: just return the class scores
    if y is None:
        return scores

    grads = {}
    loss, dScore = softmax_loss(scores, y)
    loss += .5 * self.reg * (np.sum(W1**2) + np.sum(W2**2))

    # Backward pass
    dX2, grads['W2'], grads['b2'] = affine_backward(dScore, cache2)
    dX, grads['W1'], grads['b1'] = affine_relu_backward(dX2, cache1)

    # Add the gradient of the L2 regularization term
    grads['W2'] += self.reg * W2
    grads['W1'] += self.reg * W1

    return loss, grads
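###############################################################################
# The affine_relu_forward / affine_relu_backward helpers used above come from
# layer_utils.py and are not shown in these snippets. As a rough sketch, the
# computation such a sandwich layer performs can be inlined as follows (the
# real helpers compose affine_* and relu_* layers instead; the cache layout
# below is an illustrative choice, hence the _sketch suffix).
###############################################################################
import numpy as np

def affine_relu_forward_sketch(x, w, b):
    """Affine transform followed by ReLU; caches what the backward pass needs."""
    x_flat = x.reshape(x.shape[0], -1)
    affine_out = x_flat.dot(w) + b
    out = np.maximum(0, affine_out)
    cache = (x, x_flat, w, affine_out)
    return out, cache

def affine_relu_backward_sketch(dout, cache):
    """Backprop through the ReLU gate, then through the affine transform."""
    x, x_flat, w, affine_out = cache
    daffine = dout * (affine_out > 0)            # zero out where ReLU was inactive
    dx = daffine.dot(w.T).reshape(x.shape)       # restore the original input shape
    dw = x_flat.T.dot(daffine)
    db = daffine.sum(axis=0)
    return dx, dw, db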
def loss(self, x, y=None):
    """
    Loss function; the loss used is MSE loss.
    """
    # Forward pass: three affine+ReLU hidden layers, then a final affine layer
    scores, cache1 = affine_relu_forward(x, self.params['W1'], self.params['b1'])
    scores, cache2 = affine_relu_forward(scores, self.params['W2'], self.params['b2'])
    scores, cache3 = affine_relu_forward(scores, self.params['W3'], self.params['b3'])
    scores, cache4 = affine_forward(scores, self.params['W4'], self.params['b4'])

    # Test-time forward pass: just return the scores
    if y is None:
        return scores

    loss = mse_loss_forward(scores, y)

    # Backward pass through the four layers
    grads = {}
    dup = mse_loss_backward(scores, y)
    dup, grads['W4'], grads['b4'] = affine_backward(dup, cache4)
    dup, grads['W3'], grads['b3'] = affine_relu_backward(dup, cache3)
    dup, grads['W2'], grads['b2'] = affine_relu_backward(dup, cache2)
    dup, grads['W1'], grads['b1'] = affine_relu_backward(dup, cache1)
    return loss, grads
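###############################################################################
# mse_loss_forward / mse_loss_backward are assumed helpers that the snippet
# above relies on but never defines. A hypothetical, minimal pair consistent
# with that usage could look like this (the exact normalization the original
# helpers use is unknown; here the squared error is averaged over every
# element of scores).
###############################################################################
import numpy as np

def mse_loss_forward(scores, y):
    """Mean squared error between predictions and targets."""
    return np.mean((scores - y) ** 2)

def mse_loss_backward(scores, y):
    """Gradient of the mean squared error with respect to the predictions."""
    return 2.0 * (scores - y) / scores.size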
def train_loss(X, y, W1, W2, b1, b2):
    # Forward pass: affine -> ReLU -> affine (caches are not needed here,
    # since this function only computes the loss, not the gradients)
    l1, _ = affine_relu_forward(X, W1, b1)
    l2, _ = affine_forward(l1, W2, b2)
    scores = l2

    if y is None:
        return scores

    # [TODO]: softmax is not supported yet
    # loss, d_scores = softmax_loss(scores, y)
    loss, _ = svm_loss(scores, y)

    # self.reg is assumed to be available from the enclosing scope
    loss_with_reg = (loss + np.sum(W1 ** 2) * 0.5 * self.reg
                     + np.sum(W2 ** 2) * 0.5 * self.reg)
    return loss_with_reg
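###############################################################################
# svm_loss is another assumed helper. A common formulation, the multiclass
# hinge loss averaged over the minibatch, is sketched below; it also returns
# the gradient with respect to the scores, which train_loss above discards.
# The version the original code expects may differ in detail.
###############################################################################
import numpy as np

def svm_loss(scores, y):
    """Multiclass SVM (hinge) loss and its gradient with respect to scores."""
    N = scores.shape[0]
    correct = scores[np.arange(N), y][:, None]
    margins = np.maximum(0, scores - correct + 1.0)
    margins[np.arange(N), y] = 0                      # no margin for the true class
    loss = margins.sum() / N
    dscores = (margins > 0).astype(scores.dtype)
    dscores[np.arange(N), y] -= dscores.sum(axis=1)   # pull on the correct class
    dscores /= N
    return loss, dscores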
###############################################################################
# There are some common patterns of layers that are frequently used in       #
# neural nets. For example, affine layers are frequently followed by a       #
# ReLU nonlinearity. To make these common patterns easy, we define           #
# several convenience layers in the file layer_utils.py. For now             #
# take a look at the affine_relu_forward and affine_relu_backward            #
# functions, and run the following to numerically gradient check the         #
# backward pass.                                                              #
###############################################################################
x = np.random.randn(2, 3, 4)
theta = np.random.randn(12, 10)
theta_0 = np.random.randn(10)
dout = np.random.randn(2, 10)

if layers.affine_forward(x, theta, theta_0)[0] is not None:
    out, cache = layer_utils.affine_relu_forward(x, theta, theta_0)
    dx, dtheta, dtheta_0 = layer_utils.affine_relu_backward(dout, cache)

    dx_num = eval_numerical_gradient_array(
        lambda x: layer_utils.affine_relu_forward(x, theta, theta_0)[0], x, dout)
    dtheta_num = eval_numerical_gradient_array(
        lambda w: layer_utils.affine_relu_forward(x, w, theta_0)[0], theta, dout)
    dtheta_0_num = eval_numerical_gradient_array(
        lambda b: layer_utils.affine_relu_forward(x, theta, b)[0], theta_0, dout)

    print('Testing affine_relu_forward:')
    print('dx error: ', rel_error(dx_num, dx))
    print('dtheta error: ', rel_error(dtheta_num, dtheta))
    print('dtheta_0 error: ', rel_error(dtheta_0_num, dtheta_0))

###############################################################################
#                        Loss layers: Softmax and SVM                        #
###############################################################################
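###############################################################################
# softmax_loss is used throughout these snippets but never defined. A typical
# implementation returns the cross-entropy data loss averaged over the
# minibatch together with its gradient; note that some of the loss methods
# below divide the weight gradients by N again, which suggests their
# softmax_loss may leave the gradient unaveraged. The averaged convention is
# sketched here as a reference point.
###############################################################################
import numpy as np

def softmax_loss(scores, y):
    """Softmax cross-entropy loss averaged over the minibatch, plus gradient."""
    N = scores.shape[0]
    shifted = scores - scores.max(axis=1, keepdims=True)   # numerical stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dscores = np.exp(log_probs)
    dscores[np.arange(N), y] -= 1
    dscores /= N
    return loss, dscores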
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the    #
    # class scores for X and storing them in the scores variable.              #
    ############################################################################
    # forward pass through the 1st hidden layer (affine + ReLU)
    out, h1_cache = affine_relu_forward(
        X, self.params['W1'], self.params['b1'])
    # forward pass through the 2nd affine layer
    scores, h2_cache = affine_forward(
        out, self.params['W2'], self.params['b2'])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss  #
    # in the loss variable and gradients in the grads dictionary. Compute data #
    # loss using softmax, and make sure that grads[k] holds the gradients for  #
    # self.params[k]. Don't forget to add L2 regularization!                   #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    loss, dsoft = softmax_loss(scores, y)
    regularized_term = 0.5 * self.reg * \
        (np.sum(self.params['W1'] * self.params['W1']) +
         np.sum(self.params['W2'] * self.params['W2']))
    # regularized loss
    loss += regularized_term

    # backward pass through the 2nd affine layer
    dx2, dW2, db2 = affine_backward(dsoft, h2_cache)
    # backward pass through the 1st affine_relu layer
    _, dW1, db1 = affine_relu_backward(dx2, h1_cache)

    # number of examples
    N = X.shape[0]
    grads['W2'] = dW2 / N + self.reg * self.params['W2']
    grads['b2'] = db2 / N
    grads['W1'] = dW1 / N + self.reg * self.params['W1']
    grads['b1'] = db1 / N
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
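###############################################################################
# Quick numerical check of the NOTE above: with the 0.5 factor, the gradient
# of the L2 penalty 0.5 * reg * sum(W**2) is simply reg * W, which is exactly
# the term added to grads['W1'] and grads['W2']. The names here are purely
# illustrative.
###############################################################################
import numpy as np

np.random.seed(0)
W = np.random.randn(4, 3)
reg, h = 0.1, 1e-5

def penalty(W):
    return 0.5 * reg * np.sum(W ** 2)

analytic = reg * W
numeric = np.zeros_like(W)
for idx in np.ndindex(*W.shape):
    Wp, Wm = W.copy(), W.copy()
    Wp[idx] += h
    Wm[idx] -= h
    numeric[idx] = (penalty(Wp) - penalty(Wm)) / (2 * h)

print('max |analytic - numeric|:', np.max(np.abs(analytic - numeric)))  # ~1e-11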
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # a vector of scores
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    #                                                                          #
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################
    forward_cache = []
    input = X
    # each round is an affine_relu_forward through a hidden layer
    for layer in range(self.num_layers - 1):
        weight_key = 'W' + str(layer + 1)
        bias_key = 'b' + str(layer + 1)
        out, cache = affine_relu_forward(
            input, self.params[weight_key], self.params[bias_key])
        forward_cache.append(cache)
        input = out

    # output layer: affine_forward only
    weight_key = 'W' + str(self.num_layers)
    bias_key = 'b' + str(self.num_layers)
    scores, cache = affine_forward(
        input, self.params[weight_key], self.params[bias_key])
    forward_cache.append(cache)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    #                                                                          #
    # When using batch/layer normalization, you don't need to regularize the   #
    # scale and shift parameters.                                              #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    loss, dsoft = softmax_loss(scores, y)

    # L2 regularization over all weight matrices
    regularization_term = 0
    for L in range(self.num_layers):
        key = 'W' + str(L + 1)
        regularization_term += np.sum(self.params[key] * self.params[key])
    loss += 0.5 * self.reg * regularization_term

    # N: number of examples
    N = X.shape[0]
    dupstream = dsoft
    key_i = self.num_layers
    forward_cache.reverse()
    for index, cache in enumerate(forward_cache):
        if index == 0:
            # the last layer only needs affine_backward
            dupstream, dw, db = affine_backward(dupstream, cache)
        else:
            # hidden layer backpropagation: affine_relu_backward
            dupstream, dw, db = affine_relu_backward(dupstream, cache)
        grads['W' + str(key_i)] = dw / N + self.reg * self.params['W' + str(key_i)]
        grads['b' + str(key_i)] = db / N
        key_i -= 1
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
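###############################################################################
# A hypothetical helper (the name and signature are mine, not part of the
# assignment code) that numerically checks the analytic gradients returned by
# any of the loss methods above, given a model exposing .params and
# .loss(X, y) as these classes do.
###############################################################################
import numpy as np

def check_model_gradients(model, X, y, h=1e-5):
    """Compare analytic grads from model.loss against centered differences."""
    _, grads = model.loss(X, y)
    for name, param in model.params.items():
        num_grad = np.zeros_like(param)
        for idx in np.ndindex(*param.shape):
            old = param[idx]
            param[idx] = old + h
            loss_plus, _ = model.loss(X, y)
            param[idx] = old - h
            loss_minus, _ = model.loss(X, y)
            param[idx] = old                      # restore the parameter
            num_grad[idx] = (loss_plus - loss_minus) / (2 * h)
        err = np.max(np.abs(num_grad - grads[name]) /
                     np.maximum(1e-8, np.abs(num_grad) + np.abs(grads[name])))
        print('%s relative error: %.2e' % (name, err))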
###############################################################################
# Test the sandwich layers
# - If you implemented the affine and relu functions properly,
#   you will have no problem with this code
###############################################################################
from layer_utils import affine_relu_forward
from layer_utils import affine_relu_backward

np.random.seed(231)
x = np.random.randn(2, 3, 4)
w = np.random.randn(12, 10)
b = np.random.randn(10)
dout = np.random.randn(2, 10)

out, cache = affine_relu_forward(x, w, b)
dx, dw, db = affine_relu_backward(dout, cache)

dx_num = eval_numerical_gradient_array(
    lambda x: affine_relu_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(
    lambda w: affine_relu_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(
    lambda b: affine_relu_forward(x, w, b)[0], b, dout)

print('Testing affine_relu_forward:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
###############################################################################
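###############################################################################
# rel_error is used by the checks above but not defined in these snippets; it
# is conventionally the maximum elementwise relative error, for example:
###############################################################################
import numpy as np

def rel_error(x, y):
    """Maximum relative error between two arrays, guarded against division by zero."""
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))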
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    cache = self.num_layers * [None]
    dropout_cache = (self.num_layers - 1) * [None]

    ############################################################################
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################
    for i in np.arange(self.num_layers - 1):
        if not self.use_batchnorm:
            scores, cache[i] = affine_relu_forward(
                X if i == 0 else scores,
                self.params['W%d' % (i + 1)], self.params['b%d' % (i + 1)])
        else:
            scores, cache[i] = affine_bn_relu_forward(
                X if i == 0 else scores,
                self.params['W%d' % (i + 1)], self.params['b%d' % (i + 1)],
                self.params['gamma%d' % (i + 1)], self.params['beta%d' % (i + 1)],
                self.bn_params[i])
        if self.use_dropout:
            scores, dropout_cache[i] = dropout_forward(scores, self.dropout_param)

    scores, cache[self.num_layers - 1] = affine_forward(
        scores, self.params['W%d' % self.num_layers],
        self.params['b%d' % self.num_layers])

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    loss, dscore = softmax_loss(scores, y)
    dx, grads['W%d' % self.num_layers], grads['b%d' % self.num_layers] = \
        affine_backward(dscore, cache[self.num_layers - 1])

    for i in reversed(np.arange(self.num_layers - 1)):
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_cache[i])
        if not self.use_batchnorm:
            dx, grads['W%d' % (i + 1)], grads['b%d' % (i + 1)] = \
                affine_relu_backward(dx, cache[i])
        else:
            dx, grads['W%d' % (i + 1)], grads['b%d' % (i + 1)], \
                grads['gamma%d' % (i + 1)], grads['beta%d' % (i + 1)] = \
                affine_bn_relu_backward(dx, cache[i])

    ############################################################################
    # When using batch normalization, you don't need to regularize the scale   #
    # and shift parameters.                                                    #
    ############################################################################
    for i in np.arange(self.num_layers):
        loss += .5 * self.reg * np.sum(np.square(self.params['W%d' % (i + 1)]))
        grads['W%d' % (i + 1)] += self.reg * self.params['W%d' % (i + 1)]

    return loss, grads
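###############################################################################
# affine_bn_relu_forward / affine_bn_relu_backward are sandwich layers that
# the snippet above relies on but does not define. A sketch, assuming the
# usual affine_*, batchnorm_* and relu_* layer functions with (out, cache) /
# (dx, ...) signatures are importable from the same layers module:
###############################################################################
def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    """affine -> batchnorm -> ReLU, caching each stage for the backward pass."""
    a, fc_cache = affine_forward(x, w, b)
    bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(bn)
    return out, (fc_cache, bn_cache, relu_cache)

def affine_bn_relu_backward(dout, cache):
    """Backprop through ReLU, then batchnorm, then the affine layer."""
    fc_cache, bn_cache, relu_cache = cache
    dbn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(dbn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta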