import numpy as np


def mlp_loss(weights, X, y, reg):
    """Compute the loss and gradients of the neural network (softmax output)."""
    L = len(weights)  # The index of the output layer
    z = []
    a = []
    err_tol = 1e-10  # Small tolerance to keep log() away from zero
    # Number of samples
    m = X.shape[0]

    # Forward pass
    z.append(0)  # Dummy element
    a.append(X)  # Input activation
    for i in range(L):
        W = weights[i]['W']
        b = weights[i]['b']
        z.append(np.dot(a[-1], W) + b)
        a.append(ac_func(z[-1]))  # Note: the final element of a will not be used
    zL_max = np.max(z[-1], axis=1, keepdims=True)
    z[-1] -= zL_max  # Avoid numerical overflow due to large values of exp(z[-1])
    # Add err_tol so the probabilities never get too close to zero
    proba = np.exp(z[-1]) / np.sum(np.exp(z[-1]), axis=1, keepdims=True) + err_tol

    # Target matrix of labels
    Y = to_binary_class_matrix(y)

    # Loss function: cross-entropy plus L2 regularization
    sum_squared_weights = 0.0  # Sum of squared weights
    for i in range(L):
        W = weights[i]['W']
        sum_squared_weights += np.sum(W * W)
    loss = -1.0 / m * np.sum(Y * np.log(proba)) + 0.5 * reg * sum_squared_weights

    # Backpropagation
    delta = [-1.0 * (Y - proba)]
    for i in reversed(range(L)):
        # Note that delta[0] will not be used
        W = weights[i]['W']
        d = np.dot(delta[0], W.T) * ac_func_deriv(z[i])
        delta.insert(0, d)  # Insert element at beginning

    # Gradients
    grad = [{} for i in range(L)]
    for i in range(L):
        W = weights[i]['W']
        grad[i]['W'] = np.dot(a[i].T, delta[i + 1]) / m + reg * W
        grad[i]['b'] = np.mean(delta[i + 1], axis=0)

    return loss, grad
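# The loss functions in this file rely on a few helpers that are not shown
# here: an activation function ac_func with its derivative ac_func_deriv, and
# a one-hot label encoder to_binary_class_matrix. A minimal sketch, assuming a
# sigmoid activation and integer labels 0..K-1 (these implementations are
# illustrative, not necessarily the original ones):

def ac_func(z):
    """Sigmoid activation (assumed; any differentiable activation would do)."""
    return 1.0 / (1.0 + np.exp(-z))


def ac_func_deriv(z):
    """Derivative of the sigmoid activation."""
    s = ac_func(z)
    return s * (1.0 - s)


def to_binary_class_matrix(y):
    """One-hot encode integer labels into an m x K indicator matrix (assumed)."""
    y = np.asarray(y, dtype=int)
    Y = np.zeros((y.shape[0], int(y.max()) + 1))
    Y[np.arange(y.shape[0]), y] = 1.0
    return Y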
def neural_net_loss(weights, X, y, reg):
    """Compute the loss and gradients of the neural network (squared-error loss)."""
    Y = to_binary_class_matrix(y)
    L = len(weights)  # The index of the output layer
    z = []
    a = []
    # Number of samples
    m = X.shape[0]

    # Forward pass
    z.append(0)  # Dummy element
    a.append(X)  # Input activation
    for i in range(L):
        W = weights[i]['W']
        b = weights[i]['b']
        z.append(np.dot(a[-1], W) + b)
        a.append(ac_func(z[-1]))

    # Loss function: mean squared error plus L2 regularization
    sum_weight_square = 0.0  # Sum of squared weights
    for i in range(L):
        W = weights[i]['W']
        sum_weight_square += np.sum(W * W)
    loss = 1.0 / (2.0 * m) * np.sum((a[-1] - Y)**2) + 0.5 * reg * sum_weight_square

    # Backpropagation
    delta = [(a[-1] - Y) * ac_func_deriv(z[-1])]
    for i in reversed(range(L)):
        # Note that delta[0] will not be used
        W = weights[i]['W']
        d = np.dot(delta[0], W.T) * ac_func_deriv(z[i])
        delta.insert(0, d)  # Insert element at beginning

    # Gradients
    grad = [{} for i in range(L)]
    for i in range(L):
        W = weights[i]['W']
        grad[i]['W'] = np.dot(a[i].T, delta[i + 1]) / m + reg * W
        grad[i]['b'] = np.mean(delta[i + 1], axis=0)

    return loss, grad
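# Illustrative usage of the loss functions above; the 4-8-3 layer sizes, the
# random data, and the 0.1 initialization scale are assumptions for this demo,
# not values from the original code.

def demo_two_layer_loss(reg=1e-3):
    """Build a small two-layer network on random data and evaluate its loss."""
    rng = np.random.RandomState(0)
    m, n_in, n_hidden, n_out = 20, 4, 8, 3
    X = rng.randn(m, n_in)
    y = rng.randint(0, n_out, size=m)
    weights = [
        {'W': 0.1 * rng.randn(n_in, n_hidden), 'b': np.zeros(n_hidden)},
        {'W': 0.1 * rng.randn(n_hidden, n_out), 'b': np.zeros(n_out)},
    ]
    loss, grad = neural_net_loss(weights, X, y, reg)
    return loss, grad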
def softmax_loss(weights, X, y, reg):
    """Compute the loss and derivative of a softmax classifier.

    weights: list containing a single dict with weight matrix 'W' and bias 'b'
    X: the m x n input matrix, where each row X[i, :] is a single sample
    y: labels corresponding to the input data
    reg: L2 regularization strength
    """
    # Small constant used to avoid numerical problems
    eps = 1e-10
    # Weighting parameters
    W0 = weights[0]['W']
    b0 = weights[0]['b']
    # Number of samples
    m = X.shape[0]

    # Forward pass
    a0 = X  # Input activation
    z1 = np.dot(a0, W0) + b0
    z1_max = np.max(z1, axis=1, keepdims=True)
    z1 -= z1_max  # Avoid numerical overflow due to large values of exp(z1)
    # Add eps so the probabilities never get too close to zero
    proba = np.exp(z1) / np.sum(np.exp(z1), axis=1, keepdims=True) + eps

    # Target matrix of labels
    target = to_binary_class_matrix(y)

    # Loss function: cross-entropy plus L2 regularization
    loss = -1.0 / m * np.sum(target * np.log(proba)) + 0.5 * reg * np.sum(W0 * W0)

    # Gradients
    delta1 = -1.0 * (target - proba)
    grad = [{}]
    grad[0]['W'] = np.dot(a0.T, delta1) / m + reg * W0
    grad[0]['b'] = np.mean(delta1, axis=0)

    return loss, grad
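# Sanity check: whichever loss function is used, the analytic gradients can be
# compared against centered finite differences of the loss. This checker and
# the illustrative call below are additions assumed for testing, not part of
# the original code.

def numerical_grad_check(loss_fn, weights, X, y, reg, h=1e-5, n_checks=5):
    """Compare a few entries of the layer-0 weight gradient with finite differences."""
    rng = np.random.RandomState(1)
    _, grad = loss_fn(weights, X, y, reg)
    W = weights[0]['W']
    for _ in range(n_checks):
        i = rng.randint(W.shape[0])
        j = rng.randint(W.shape[1])
        old = W[i, j]
        W[i, j] = old + h
        loss_plus, _ = loss_fn(weights, X, y, reg)
        W[i, j] = old - h
        loss_minus, _ = loss_fn(weights, X, y, reg)
        W[i, j] = old  # Restore the original value
        numeric = (loss_plus - loss_minus) / (2.0 * h)
        analytic = grad[0]['W'][i, j]
        print('W[%d, %d]: analytic=%g numeric=%g' % (i, j, analytic, numeric))


if __name__ == '__main__':
    # Illustrative check on random data; the 5-input, 3-class sizes are assumptions.
    rng = np.random.RandomState(0)
    X_check = rng.randn(10, 5)
    y_check = rng.randint(0, 3, size=10)
    weights_check = [{'W': 0.1 * rng.randn(5, 3), 'b': np.zeros(3)}]
    numerical_grad_check(softmax_loss, weights_check, X_check, y_check, reg=1e-3)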