import numpy as np

import autoencoder
import util


def cost(theta, input_size, hidden_size, num_classes, netconfig, lamb, data, labels):
    # We first extract the part which corresponds to the softmax parameters.
    softmax_theta = theta[:hidden_size * num_classes].reshape(
        [num_classes, hidden_size], order='F')

    # Extract out the "stack".
    stack = params2stack(theta[hidden_size * num_classes:], netconfig)
    depth = len(stack)
    num_cases = data.shape[1]

    # One-hot encoding of the labels, one column per example.
    ground_truth = np.zeros([num_classes, num_cases])
    ground_truth[labels.ravel(), np.arange(num_cases)] = 1

    # Compute the cost function and gradient vector for the stacked
    # autoencoder.
    #
    # `stack` is a cell-array of the weights and biases for every
    # layer. In particular, the weights of layer d are `stack[d].w`
    # and the biases are `stack[d].b`.
    #
    # The last layer of the network is connected to the softmax
    # classification layer, `softmax_theta`.
    #
    # Compute the gradients for `softmax_theta`, storing them in
    # `softmax_theta_grad`. Similarly, compute the gradients for each
    # layer in the stack, storing them in `stack_grad[d].w` and
    # `stack_grad[d].b`. Note that the sizes of the matrices in
    # stack_grad should exactly match those of the matrices in stack.

    # Forward pass. Index 0 of `z` is a placeholder so that z[l] and
    # a[l] both refer to layer l.
    z = [0]
    a = [data]
    for layer in range(depth):
        z.append(stack[layer].w.dot(a[layer]) + stack[layer].b)
        a.append(autoencoder.sigmoid(z[layer + 1]))

    # Softmax probabilities; subtracting the column maximum keeps the
    # exponentials numerically stable without changing the result.
    M = softmax_theta.dot(a[depth])
    M = M - M.max(0)
    p = np.exp(M) / np.exp(M).sum(0)

    # Cross-entropy cost plus weight decay on the softmax parameters.
    # The dot product yields a (1, 1) array, so index it to get a scalar.
    gt_vec = ground_truth.reshape([1, -1], order='F')
    p_vec = p.reshape([-1, 1], order='F')
    cost = (-1.0 / num_cases * gt_vec.dot(np.log(p_vec))[0, 0]
            + lamb / 2 * (softmax_theta ** 2).sum())
    softmax_theta_grad = (-1.0 / num_cases * (ground_truth - p).dot(a[depth].T)
                          + lamb * softmax_theta)

    # Backward pass: delta terms, starting from the layer feeding the
    # softmax classifier and propagating down through the stack.
    d = [0 for _ in range(depth + 1)]
    d[depth] = -(softmax_theta.T.dot(ground_truth - p)) * a[depth] * (1 - a[depth])
    for layer in range(depth - 1, 0, -1):
        d[layer] = stack[layer].w.T.dot(d[layer + 1]) * a[layer] * (1 - a[layer])

    # Gradients for every layer in the stack.
    stack_grad = [util.Empty() for _ in range(depth)]
    for layer in range(depth - 1, -1, -1):
        stack_grad[layer].w = (1.0 / num_cases) * d[layer + 1].dot(a[layer].T)
        stack_grad[layer].b = (1.0 / num_cases) * np.sum(d[layer + 1], 1)[:, np.newaxis]

    # Flatten everything back into a single parameter vector.
    grad = np.append(softmax_theta_grad.ravel('F'), stack2params(stack_grad)[0])
    assert grad.shape == theta.shape
    assert grad.flags['F_CONTIGUOUS']
    return cost, grad
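
# A minimal numerical gradient check for `cost`, assuming the standard
# central-difference recipe. The helper below is an illustrative sketch
# and not part of the original module; its name and tolerance are
# arbitrary choices.
def check_cost_gradient(theta, cost_args, epsilon=1e-4, num_checks=20):
    """Spot-check a few analytic gradient entries against central differences.

    `cost_args` is the tuple (input_size, hidden_size, num_classes,
    netconfig, lamb, data, labels) passed through to `cost`.
    """
    _, grad = cost(theta, *cost_args)
    rng = np.random.RandomState(0)
    for i in rng.choice(theta.size, size=min(num_checks, theta.size), replace=False):
        # Perturb a single parameter by +/- epsilon and compare the
        # resulting finite-difference slope with the analytic gradient.
        e = np.zeros_like(theta)
        e[i] = epsilon
        cost_plus, _ = cost(theta + e, *cost_args)
        cost_minus, _ = cost(theta - e, *cost_args)
        numeric = (cost_plus - cost_minus) / (2.0 * epsilon)
        assert abs(numeric - grad[i]) < 1e-6 * max(1.0, abs(grad[i])), \
            (i, numeric, grad[i])
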
def predict(theta, input_size, hidden_size, num_classes, netconfig, data):
    # We first extract the part which corresponds to the softmax parameters.
    softmax_theta = theta[:hidden_size * num_classes].reshape(
        [num_classes, hidden_size], order='F')

    # Extract out the "stack".
    stack = params2stack(theta[hidden_size * num_classes:], netconfig)
    depth = len(stack)

    # Forward pass through the sigmoid layers, identical to the one in
    # `cost`.
    z = [0]
    a = [data]
    for layer in range(depth):
        z.append(stack[layer].w.dot(a[layer]) + stack[layer].b)
        a.append(autoencoder.sigmoid(z[layer + 1]))

    # Each column of the softmax scores corresponds to one example;
    # return the index of the highest-scoring class per column.
    return softmax_theta.dot(a[depth]).argmax(0)
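
# Example usage (a sketch; `opt_theta`, `netconfig`, `test_data`, and
# `test_labels` are assumed to come from the fine-tuning step elsewhere
# in the repository):
#
#     pred = predict(opt_theta, input_size, hidden_size, num_classes,
#                    netconfig, test_data)
#     accuracy = (pred == test_labels.ravel()).mean()
#     print('Test accuracy: {:.2%}'.format(accuracy))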