import numpy as np

import neurolib


def feedforward_autoencoder(theta, hidden_size, visible_size, data):
    """Accepts the theta vector, hidden and visible size integers, and the data.

    theta has shape (hidden_size*visible_size*2 + hidden_size + visible_size,).
    data has shape (visible_size, num_examples).
    Returns the activations of the hidden layer,
    with shape (hidden_size, num_examples).
    """
    hv = hidden_size * visible_size
    assert theta.shape == (2*hv + hidden_size + visible_size,)
    # W1 maps visible -> hidden; b1 is the hidden-layer bias,
    # stored after W1 and W2 in the flat parameter vector.
    W1 = theta[:hv].reshape(hidden_size, visible_size)
    b1 = theta[2*hv:2*hv + hidden_size]
    return sigmoid(np.dot(W1, data) + T(b1))
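# The forward pass above relies on two small helpers that are not defined in
# this file: `sigmoid` and `T`. The sketches below are assumptions inferred
# from how they are used (T must turn a 1-D bias of shape (n,) into a column
# of shape (n, 1) so it broadcasts across examples); the real implementations
# may live in neurolib.

def sigmoid(x):
    # Elementwise logistic function, 1 / (1 + exp(-x)).
    return 1.0 / (1.0 + np.exp(-x))


def T(v):
    # Reshape a 1-D vector of shape (n,) into a column vector of shape (n, 1)
    # so that adding a bias to an (n, num_examples) matrix broadcasts per example.
    return v.reshape(-1, 1)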
def cost(theta, visible_size, hidden_size, weight_decay, sparsity_param, beta, data):
    """Compute the sparse autoencoder cost and gradient.

    visible_size: the number of input units (e.g. 64)
    hidden_size: the number of hidden units (e.g. 25)
    weight_decay: weight decay parameter (lambda in the lecture notes)
    sparsity_param: the desired average activation for the hidden units
        (denoted in the lecture notes by the Greek letter rho)
    beta: weight of the sparsity penalty term
    data: (visible_size, num_examples) matrix of training data, e.g. 64x10000,
        so data[:, i] is the i-th training example

    theta is a flat vector (because the optimizer expects the parameters as a
    vector). It is first unpacked into the (W1, W2, b1, b2) matrix/vector
    format, following the notation convention of the lecture notes.
    """
    sparsity_param = float(sparsity_param)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    num_data = data.shape[1]

    # Feed-forward pass
    a2 = sigmoid(np.dot(W1, data) + T(b1))
    a3 = sigmoid(np.dot(W2, a2) + T(b2))
    assert a2.shape == (hidden_size, num_data)
    assert a3.shape == (visible_size, num_data)

    # Mean squared reconstruction error
    cost = 1.0 / num_data * (0.5 * np.sum((a3 - data)**2))

    # Add in weight decay
    cost += (0.5 * weight_decay) * (np.sum(W1**2) + np.sum(W2**2))

    # Add in the sparsity penalty: KL divergence between the target activation
    # and the average hidden activation over the data
    sparsity = np.sum(a2, axis=1) / float(num_data)
    assert sparsity.shape == (hidden_size,)
    s = np.sum(binary_KL_divergence(sparsity_param, sparsity))
    cost += beta * s

    # delta3: error term of the output layer (chain rule through the sigmoid)
    delta3 = -(data - a3) * a3 * (1 - a3)
    assert delta3.shape == (visible_size, num_data)

    # delta2: error term of the hidden layer
    # 1. back-propagate delta3 through W2
    delta2 = np.dot(W2.T, delta3)
    # 2. add in the sparsity term
    delta2 += T(beta * ((-sparsity_param / sparsity)
                        + ((1 - sparsity_param) / (1 - sparsity))))
    # 3. multiply by the derivative of the sigmoid
    delta2 *= a2 * (1 - a2)
    assert delta2.shape == (hidden_size, num_data)

    # Compute the final gradients
    W1grad = np.dot(delta2, data.T) / float(num_data)
    W2grad = np.dot(delta3, a2.T) / float(num_data)

    # Add weight decay
    W1grad += weight_decay * W1
    W2grad += weight_decay * W2

    b1grad = np.sum(delta2, axis=1) / float(num_data)
    b2grad = np.sum(delta3, axis=1) / float(num_data)

    assert W1grad.shape == W1.shape
    assert W2grad.shape == W2.shape
    assert b1grad.shape == b1.shape
    assert b2grad.shape == b2.shape

    grad = neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad)
    return cost, grad
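# `binary_KL_divergence` is also assumed rather than defined here: for the
# sparsity penalty it should be the KL divergence between two Bernoulli
# distributions with means rho (the target) and rho_hat (the measured average
# activation). The version below is a sketch of that formula, and the
# gradient-check helper after it is one way to sanity-check cost() with
# centered finite differences on a tiny random problem; neither is taken from
# the original source.

def binary_KL_divergence(rho, rho_hat):
    # KL(rho || rho_hat) for Bernoulli distributions, applied elementwise.
    return (rho * np.log(rho / rho_hat)
            + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))


def check_cost_gradient(visible_size=8, hidden_size=3, epsilon=1e-4):
    # Compare the analytic gradient from cost() against a centered
    # finite-difference approximation on random data and random parameters.
    rng = np.random.RandomState(0)
    data = rng.rand(visible_size, 10)
    n_params = 2 * hidden_size * visible_size + hidden_size + visible_size
    theta = 0.01 * rng.randn(n_params)

    _, grad = cost(theta, visible_size, hidden_size,
                   weight_decay=1e-4, sparsity_param=0.1, beta=3.0, data=data)

    num_grad = np.zeros_like(theta)
    for i in range(n_params):
        bump = np.zeros_like(theta)
        bump[i] = epsilon
        c_plus, _ = cost(theta + bump, visible_size, hidden_size,
                         weight_decay=1e-4, sparsity_param=0.1, beta=3.0,
                         data=data)
        c_minus, _ = cost(theta - bump, visible_size, hidden_size,
                          weight_decay=1e-4, sparsity_param=0.1, beta=3.0,
                          data=data)
        num_grad[i] = (c_plus - c_minus) / (2 * epsilon)

    # The relative difference should be very small (on the order of 1e-9)
    # if the analytic gradient is correct.
    return np.linalg.norm(num_grad - grad) / np.linalg.norm(num_grad + grad)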