def test_softmax_loss_vectorized_gradient(self):
    loss_naive, grad_naive = softmax_loss_naive(self.weights, self.x, self.y, self.reg)
    loss_vect, grad_vect = softmax_loss_vectorized(self.weights, self.x, self.y, self.reg)
    np.testing.assert_allclose(loss_naive, loss_vect, 1e-04)
    np.testing.assert_allclose(grad_naive, grad_vect, 1e-04)
def loss(self, X, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i]
      is an integer in the range 0 <= y[i] < C. This parameter is optional; if
      it is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
    the score for class c on input X[i].

    If y is not None, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of
      training samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function; has the same keys as
      self.params.
    """
    # Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Compute the forward pass
    h1, h2, scores = self.compute_score(X, W1, b1, W2, b2)
    if y is None:
        return scores

    # Compute the loss
    loss = softmax_loss_vectorized(
        W=None, X=None, y=y, scores=scores)[0] + reg * (
            np.sum(np.square(W1)) + np.sum(np.square(W2)))

    # Backward pass: compute gradients
    grads = {}

    exp_scores = np.exp(scores)
    sum_scores = np.sum(exp_scores, axis=1)
    prob_scores = (exp_scores.T / sum_scores).T
    prob_scores[np.arange(N), y] -= 1
    dLoss_Scores = prob_scores / N

    dLoss_b2 = np.sum(dLoss_Scores, axis=0)
    grads['b2'] = dLoss_b2
    dLoss_W2 = h2.T.dot(dLoss_Scores) + 2 * reg * W2
    grads['W2'] = dLoss_W2

    dLoss_h2 = dLoss_Scores.dot(W2.T)
    dh2_h1 = np.zeros_like(h1)
    dh2_h1[h1 <= 0] = 0
    dh2_h1[h1 > 0] = 1
    dLoss_h1 = dh2_h1 * dLoss_h2

    dLoss_b1 = np.sum(dLoss_h1, axis=0)
    grads['b1'] = dLoss_b1
    dLoss_W1 = X.T.dot(dLoss_h1) + 2 * reg * W1
    grads['W1'] = dLoss_W1

    return loss, grads
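# Aside (not part of the original file): the `prob_scores[np.arange(N), y] -= 1`
# step above implements the softmax gradient dL/dscores = softmax(scores) - onehot(y),
# averaged over the batch. A minimal standalone sketch with made-up scores and
# labels that checks the analytic expression against a central-difference estimate:
import numpy as np

scores = np.array([[1.0, 2.0, 0.5, -1.0],
                   [0.3, 0.1, 2.2, 0.0],
                   [-0.5, 1.5, 0.2, 0.7]])   # 3 samples, 4 classes (illustrative)
y = np.array([1, 2, 0])
N = scores.shape[0]

def data_loss(s):
    # Mean cross-entropy of softmax(s) against the labels y.
    shifted = s - s.max(axis=1, keepdims=True)          # numerical stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(N), y].mean()

# Analytic gradient: probabilities minus 1 at the correct class, divided by N.
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
probs[np.arange(N), y] -= 1
analytic = probs / N

# Numeric gradient by central differences.
h = 1e-5
numeric = np.zeros_like(scores)
for idx in np.ndindex(scores.shape):
    bumped = scores.copy(); bumped[idx] += h
    dropped = scores.copy(); dropped[idx] -= h
    numeric[idx] = (data_loss(bumped) - data_loss(dropped)) / (2 * h)

print(np.max(np.abs(analytic - numeric)))   # should be tiny, around 1e-9 or smaller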
def loss(self, X_batch, y_batch, reg):
    return softmax_loss_vectorized(self.theta, X_batch, y_batch, reg)
# from gradient_check import grad_check_sparse
# f = lambda th: softmax_loss_naive(th, X_train, y_train, 0.0)[0]
# grad_numerical = grad_check_sparse(f, theta, grad, 10)

# Now that we have a naive implementation of the softmax loss function and its gradient,
# implement a vectorized version in softmax_loss_vectorized.
# The two versions should compute the same results, but the vectorized version should be
# much faster.
# tic = time.time()
# loss_naive, grad_naive = softmax_loss_naive(theta, X_train, y_train, 0.00001)
# toc = time.time()
# print 'naive loss: %e computed in %fs' % (loss_naive, toc - tic)

tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(theta, X_train, y_train, 0.00001)
toc = time.time()
print 'vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic)

# We use the Frobenius norm to compare the two versions of the gradient.
# grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
# print 'Loss difference: %f' % np.abs(loss_naive - loss_vectorized)
# print 'Gradient difference: %f' % grad_difference

learning_rates = [1e-7]
regularization_strengths = [1e8]
# best_softmax, results, best_val = pick_hyperparams(X_train, y_train, X_val, y_val, learning_rates, regularization_strengths)
from util import plt, np, load_data, grad_check_sparse, time_elapse
from softmax import softmax_loss_vectorized
from linear_classifier import Softmax

cifar_dir = '../cifar-10-batches-py'
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = load_data(
    cifar_dir, num_test=500)

# initialize W
W = np.random.randn(3073, 10) * 0.0001

# test loss
loss, grad = softmax_loss_vectorized(W, X_dev, y_dev, 0.0)
#print('loss: %f' % loss)
#print('sanity check: %f' % (-np.log(0.1)))

# test gradient without regularization
#def f(w): return softmax_loss_vectorized(W, X_dev, y_dev, 0.0)[0]
#grad_numerical = grad_check_sparse(f, W, grad, 10)

# test gradient with regularization
#def f(w): return softmax_loss_vectorized(W, X_dev, y_dev, 1e2)[0]
#grad_numerical = grad_check_sparse(f, W, grad, 10)

softmax = Softmax()
loss_history = softmax.train(X_train, y_train, learning_rate=1e-7, reg=5e4,
                             num_iters=1500, verbose=True)
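# A possible follow-up to the training call above (a sketch, not the original
# script): plot the SGD loss curve and measure validation accuracy. It assumes
# the Softmax classifier exposes a predict() method returning predicted class
# labels, which is not shown in this excerpt; plt and np come from util.
plt.plot(loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.show()

y_val_pred = softmax.predict(X_val)           # assumed API
val_accuracy = np.mean(y_val_pred == y_val)   # fraction of correct predictions
print('validation accuracy: %f' % val_accuracy)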
def test_softmax_loss_vectorized_loss(self):
    loss, _ = softmax_loss_vectorized(self.weights, self.x, self.y, self.reg)
    np.testing.assert_allclose(loss, self.expected, 1e-04)
X_val = np.vstack((np.ones(X_val.shape[0]), X_val.T)).T

theta = np.random.randn(3073, 10) * 0.0001
loss, grad = softmax_loss_naive(theta, X_train, y_train, 0.0)

# Loss should be something close to -log(0.1)
print 'loss:', loss, ' should be close to ', -np.log(0.1)

tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(theta, X_train, y_train, 0.00001)
toc = time.time()
print 'naive loss: %e computed in %fs' % (loss_naive, toc - tic)

tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(
    theta, X_train, y_train, 0.00001)
toc = time.time()
print 'vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic)

# We use the Frobenius norm to compare the two versions of the gradient.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print 'Loss difference: %f' % np.abs(loss_naive - loss_vectorized)
print 'Gradient difference: %f' % grad_difference

results = {}
best_val = -1
best_softmax = None
#learning_rates = [1e-7, 5e-7, 1e-6, 5e-6]
#regularization_strengths = [5e4, 1e5, 5e5, 1e8]
loss, grad = softmax.softmax_loss_naive(W, X_dev, Y_dev, 5e1)
f = lambda w: softmax.softmax_loss_naive(w, X_dev, Y_dev, 5e1)[0]
Tools.grad_check_sparse(f, W, grad, 10)

# Now that we have a naive implementation of the softmax loss function and its gradient,
# implement a vectorized version in softmax_loss_vectorized.
# The two versions should compute the same results, but the vectorized version should be
# much faster.
tic = time.time()
loss_naive, grad_naive = softmax.softmax_loss_naive(W, X_dev, Y_dev, 0.000005)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))

tic = time.time()
loss_vectorized, grad_vectorized = softmax.softmax_loss_vectorized(
    W, X_dev, Y_dev, 0.000005)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))

# As we did for the SVM, we use the Frobenius norm to compare the two versions
# of the gradient.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)

# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of over 0.35 on the validation set.
results = {}
best_val = -1
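# One way the tuning loop described above is commonly written (a sketch, not the
# original author's code). It assumes a Softmax classifier class with train()
# and predict() methods and the usual X_train/y_train/X_val/y_val splits; the
# learning-rate and regularization grids below are illustrative guesses.
learning_rates = [1e-7, 5e-7]
regularization_strengths = [2.5e4, 5e4]

best_softmax = None
for lr in learning_rates:
    for reg in regularization_strengths:
        clf = Softmax()
        clf.train(X_train, y_train, learning_rate=lr, reg=reg, num_iters=1500)
        train_acc = np.mean(clf.predict(X_train) == y_train)
        val_acc = np.mean(clf.predict(X_val) == y_val)
        results[(lr, reg)] = (train_acc, val_acc)
        if val_acc > best_val:
            best_val = val_acc
            best_softmax = clf

for (lr, reg), (train_acc, val_acc) in sorted(results.items()):
    print('lr %e reg %e train accuracy: %f val accuracy: %f'
          % (lr, reg, train_acc, val_acc))
print('best validation accuracy achieved during cross-validation: %f' % best_val)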
def loss(self, X, y, reg):
    # softmax_loss_vectorized takes the weight matrix as its first argument in
    # the rest of this collection; self.W is assumed to hold it here.
    return softmax_loss_vectorized(self.W, X, y, reg)
def loss(self, X, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i]
      is an integer in the range 0 <= y[i] < C. This parameter is optional; if
      it is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is None, return a matrix scores of shape (N, C) where scores[i, c] is
    the score for class c on input X[i].

    If y is not None, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of
      training samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function; has the same keys as
      self.params.
    """
    # Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input. #
    # Store the result in the scores variable, which should be an array of      #
    # shape (N, C).                                                              #
    #############################################################################
    # Fold the biases into the weight matrices by prepending a column of ones
    # to the data and a row of biases to the weights (the "bias trick").
    X1 = np.insert(X, 0, 1, axis=1)
    W11 = np.insert(W1, 0, b1, axis=0)
    W21 = np.insert(W2, 0, b2, axis=0)

    # Compute the forward pass
    z1 = X1.dot(W11)
    Layer1ub = np.maximum(0, z1)                # hidden ReLU activations, no bias column
    Layer1 = np.insert(Layer1ub, 0, 1, axis=1)  # same activations with bias column prepended
    scores = Layer1.dot(W21)
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    # If the targets are not given then jump out, we're done
    if y is None:
        return scores

    #############################################################################
    # TODO: Finish the forward pass, and compute the loss. This should include  #
    # both the data loss and L2 regularization for W1 and W2. Store the result  #
    # in the variable loss, which should be a scalar. Use the Softmax           #
    # classifier loss. So that your results match ours, multiply the            #
    # regularization loss by 0.5.                                               #
    #############################################################################
    # Compute the loss
    loss = 0.0
    L = np.exp(scores)
    for i in range(X.shape[0]):
        loss -= scores[i][y[i]]
        loss += math.log(sum(L[i]))
    loss /= X.shape[0]
    loss += 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    #############################################################################
    # TODO: Compute the backward pass, computing the derivatives of the weights #
    # and biases. Store the results in the grads dictionary. For example,       #
    # grads['W1'] should store the gradient on W1, and be a matrix of same size #
    #############################################################################
    # Backward pass: compute gradients
    l1, w2gradient = softmax_loss_vectorized(W21, Layer1, y, reg)
    derror = error_class(Layer1ub, y, W2, b2)  # error at the output layer
    dhidden = np.dot(derror, W2.T)             # backprop the error to the hidden layer
    dhidden[Layer1ub <= 0] = 0                 # apply the ReLU gradient mask
    w1grads = np.dot(X.T, dhidden)             # gradient with respect to W1

    grads = {}
    grads['W2'] = w2gradient[1:] + reg * W2    # drop the bias row, add the L2 term
    grads['W1'] = w1grads + reg * W1
    grads['b2'] = w2gradient[0]
    grads['b1'] = np.sum(dhidden, axis=0)
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    return loss, grads
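# Aside (not part of the original file): the np.insert calls above implement the
# "bias trick" -- prepending a column of ones to the inputs and a row of biases
# to the weights so that a single matrix product computes X.dot(W) + b. A tiny
# standalone check with made-up shapes:
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 4))      # 5 samples, 4 features (illustrative)
W1 = rng.standard_normal((4, 3))     # 4 features -> 3 hidden units
b1 = rng.standard_normal(3)

X1 = np.insert(X, 0, 1, axis=1)      # shape (5, 5): ones column prepended
W11 = np.insert(W1, 0, b1, axis=0)   # shape (5, 3): bias row prepended

print(np.allclose(X1.dot(W11), X.dot(W1) + b1))   # True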
x_val = np.reshape(x_val, (x_val.shape[0], -1))
x_dev = np.reshape(x_dev, (x_dev.shape[0], -1))

# Normalize the data: center each feature by subtracting the mean image
mean_image = np.mean(x_train, axis=0)
x_train -= mean_image
x_val -= mean_image
x_test -= mean_image
x_dev -= mean_image

# The weight matrix W actually holds both W and b, so append an extra bias
# dimension of ones to x
x_train = np.hstack([x_train, np.ones((x_train.shape[0], 1))])
x_val = np.hstack([x_val, np.ones((x_val.shape[0], 1))])
x_test = np.hstack([x_test, np.ones((x_test.shape[0], 1))])
x_dev = np.hstack([x_dev, np.ones((x_dev.shape[0], 1))])

# Loss and gradient computation
w = np.random.randn(3073, 10) * 0.0001  # (3073, 10) array of normally distributed values
loss, grad = softmax.softmax_loss_vectorized(
    w, x_dev, y_dev, 0.00001)  # the gradient can also be computed analytically, as in svm_loss_vectorized
print('loss is : %f' % loss)

# Gradient check: the analytic gradient is fast to compute but easy to get
# wrong, so compare it against a numerically estimated gradient
def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5):
    for i in range(num_checks):
        ix = tuple([random.randrange(m) for m in x.shape])
        oldval = x[ix]
        x[ix] = oldval + h
        fxph = f(x)
        x[ix] = oldval - h
        fxmh = f(x)
        x[ix] = oldval
        grad_numerical = (fxph - fxmh) / (2 * h)
def loss(self, X_batch, Y_batch, regularization):
    return softmax.softmax_loss_vectorized(self.W, X_batch, Y_batch, regularization)
    return X_train, y_train, X_val, y_val, X_test, y_test

X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
##print 'Train data shape: ', X_train.shape
##print 'Train labels shape: ', y_train.shape
##print 'Validation data shape: ', X_val.shape
##print 'Validation labels shape: ', y_val.shape
##print 'Test data shape: ', X_test.shape
##print 'Test labels shape: ', y_test.shape

W = np.random.randn(10, 3073) * 0.0001
##loss, grad = softmax_loss_naive(W, X_train, y_train, 0.0)
##print 'loss: %f' % loss
loss, grad = softmax_loss_vectorized(W, X_train, y_train, 0.0)
print 'loss: %f' % loss

from gradient_check import grad_check_sparse
f = lambda w: softmax_loss_naive(w, X_train, y_train, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad, 10)

##import numpy as np
##a = np.arange(15).reshape(3, 5)*0.1
##probs = a / np.sum(a, axis=0)
##y = np.random.choice(3, 5)

# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of over 0.35 on the validation set.
X = np.concatenate((X, np.ones([X.shape[0], 1])), axis=1)
X_test = np.concatenate((X_test, np.ones([X_test.shape[0], 1])), axis=1)
X_train, X_val, y_train, y_val = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)

# First implement the naive softmax loss function with nested loops.
# Open the file softmax.py and implement the softmax_loss_naive function.

# Generate a random softmax theta matrix and use it to compute the loss.
theta = np.random.randn(3073, 10) * 0.0001
loss, grad = softmax_loss_vectorized(theta, X_train, y_train, 0.0)

# Loss should be something close to -log(0.1); see the sanity-check note after
# this snippet.
print 'loss:', loss, ' should be close to ', -np.log(0.1)

# Use numeric gradient checking as a debugging tool.
# The numeric gradient should be close to the analytic gradient (within 1e-7).

# Now that we have a naive implementation of the softmax loss function and its gradient,
# implement a vectorized version in softmax_loss_vectorized.
# The two versions should compute the same results, but the vectorized version should be
# much faster.
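# Sanity-check note (illustrative aside, not from the original file): with
# near-zero initial weights every class score is about equal, so the softmax
# assigns probability ~1/10 to each of the 10 CIFAR-10 classes and the expected
# cross-entropy loss of a random example is about -log(0.1).
expected_initial_loss = -np.log(0.1)   # ~= 2.3026
print 'expected initial loss:', expected_initial_loss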
def loss1(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N = X.shape[0]

    # Flatten each input to a single feature vector
    feat = 1
    for num_wei in X.shape[1:]:
        feat *= num_wei
    Xfeat = X.reshape([X.shape[0], feat])

    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the    #
    # class scores for X and storing them in the scores variable.              #
    ############################################################################
    # Fold the biases into the weights (bias trick) and compute the forward pass
    X1 = np.insert(Xfeat, 0, 1, axis=1)
    W11 = np.insert(W1, 0, b1, axis=0)
    W21 = np.insert(W2, 0, b2, axis=0)

    z1 = X1.dot(W11)
    Layer1ub = np.maximum(0, z1)                  # hidden ReLU activations, no bias column
    scoresub = np.exp(Layer1ub.dot(W2) + b2)      # computed but unused below
    Layer1 = np.insert(Layer1ub, 0, 1, axis=1)    # same activations with bias column prepended
    scores = Layer1.dot(W21)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss  #
    # in the loss variable and gradients in the grads dictionary. Compute data #
    # loss using softmax, and make sure that grads[k] holds the gradients for  #
    # self.params[k]. Don't forget to add L2 regularization!                   #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    loss, grads = 0, {}

    # Compute the loss
    L = np.exp(scores)
    for i in range(X.shape[0]):
        loss -= scores[i][y[i]]
        loss += math.log(sum(L[i]))
    loss /= X.shape[0]
    loss += 0.5 * self.reg * np.sum(W1 * W1) + 0.5 * self.reg * np.sum(W2 * W2)

    # Backward pass: compute gradients
    l1, w2gradient = softmax_loss_vectorized(W21, Layer1, y, self.reg)
    derror = error_class(Layer1ub, y, W2, b2)  # error at the output layer
    dhidden = np.dot(derror, W2.T)             # backprop the error to the hidden layer
    dhidden[Layer1ub <= 0] = 0                 # apply the ReLU gradient mask
    w1grads = np.dot(Xfeat.T, dhidden)         # gradient with respect to W1

    grads['W2'] = w2gradient[1:]               # drop the bias row
    grads['W1'] = w1grads
    grads['b2'] = w2gradient[0]
    grads['b1'] = np.sum(dhidden, axis=0)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
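# Aside (not from the original file): the `dhidden[Layer1ub <= 0] = 0` line above
# is the ReLU backward pass -- upstream gradients are zeroed wherever the forward
# activation was clipped to zero. A tiny standalone illustration with made-up values:
import numpy as np

z = np.array([[-1.0, 2.0], [0.5, -0.3]])   # pre-activations
relu_out = np.maximum(0, z)                # forward: [[0, 2], [0.5, 0]]
upstream = np.array([[10.0, 10.0], [10.0, 10.0]])

dz = upstream.copy()
dz[relu_out <= 0] = 0                      # gradient flows only where relu_out > 0
print(dz)                                  # [[ 0. 10.] [10.  0.]]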