def cost(params, data, weight_penalty=0, dropout=None, rng=np.random):
    inputs, targets = data.inputs, data.targets
    W1, b1, W2, b2, W3, b3 = params
    if dropout is None:
        dropout = DEFAULT_DROPOUT
    num_cases = inputs.shape[0]

    a1 = inputs
    if dropout[0] > 0:
        mask = rng.random_sample(a1.shape) > dropout[0]
        a1 = a1 * mask
    z2 = a1.dot(W1.T) + b1
    a2 = logistic(z2)
    # Note that at present every single activation is computed even
    # though we throw half of that work away when using dropout.
    if dropout[1] > 0:
        mask = rng.random_sample(a2.shape) > dropout[1]
        a2 = a2 * mask
    z3 = a2.dot(W2.T) + b2
    a3 = logistic(z3)
    if dropout[2] > 0:
        mask = rng.random_sample(a3.shape) > dropout[2]
        a3 = a3 * mask
    # Un-normalized log-prob.
    U = a3.dot(W3.T) + b3
    # Normalize.
    log_prob = U - np.log(np.sum(np.exp(U), 1))[:, np.newaxis]
    # Compute probabilities over classes.
    prob = np.exp(log_prob)

    weight_cost = (0.5 * weight_penalty *
                   (np.sum(W1 ** 2) + np.sum(b1 ** 2) +
                    np.sum(W2 ** 2) + np.sum(b2 ** 2) +
                    np.sum(W3 ** 2) + np.sum(b3 ** 2)))
    cost = weight_cost - (np.sum(log_prob * targets) / num_cases)

    delta4 = prob - targets
    delta3 = delta4.dot(W3) * a3 * (1 - a3)
    delta2 = delta3.dot(W2) * a2 * (1 - a2)

    W1_grad = (delta2.T.dot(a1) / num_cases) + (weight_penalty * W1)
    W2_grad = (delta3.T.dot(a2) / num_cases) + (weight_penalty * W2)
    W3_grad = (delta4.T.dot(a3) / num_cases) + (weight_penalty * W3)
    b1_grad = (np.sum(delta2, 0) / num_cases) + (weight_penalty * b1)
    b2_grad = (np.sum(delta3, 0) / num_cases) + (weight_penalty * b2)
    b3_grad = (np.sum(delta4, 0) / num_cases) + (weight_penalty * b3)

    return cost, (W1_grad, b1_grad, W2_grad, b2_grad, W3_grad, b3_grad)

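# A hedged sanity-check sketch for the analytic gradients returned by cost():
# it compares them against centred finite differences, with dropout switched
# off so the cost is deterministic. `params` (the usual (W1, b1, ..., b3)
# tuple) and `batch` (an object with .inputs and .targets) are assumed to be
# supplied by the caller; the names here are illustrative, not defined above.
def check_grad(params, batch, index=0, eps=1e-5):
    _, grads = cost(params, batch, dropout=(0, 0, 0))
    p = params[index]
    numeric = np.zeros_like(p)
    for idx in np.ndindex(*p.shape):
        orig = p[idx]
        p[idx] = orig + eps
        c_plus, _ = cost(params, batch, dropout=(0, 0, 0))
        p[idx] = orig - eps
        c_minus, _ = cost(params, batch, dropout=(0, 0, 0))
        p[idx] = orig
        numeric[idx] = (c_plus - c_minus) / (2 * eps)
    # Largest absolute discrepancy between analytic and numeric gradients;
    # it should be very small for correctly implemented backprop.
    return np.max(np.abs(numeric - grads[index]))
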
def log_prob(params, data, dropout=None):
    """
    Compute the log probability over classes for each input.
    """
    W1, b1, W2, b2, W3, b3 = params
    if dropout is None:
        dropout = DEFAULT_DROPOUT
    a1 = data.inputs
    z2 = a1.dot(W1.T * (1 - dropout[0])) + b1
    a2 = logistic(z2)
    z3 = a2.dot(W2.T * (1 - dropout[1])) + b2
    a3 = logistic(z3)
    U = a3.dot(W3.T * (1 - dropout[2])) + b3
    log_prob = U - np.log(np.sum(np.exp(U), 1))[:, np.newaxis]
    return log_prob

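# A small usage sketch: turning log_prob() into hard predictions and an error
# rate. `params` and `data` (with one-hot .targets) are assumed to come from
# the surrounding training code; the names below are illustrative only.
def error_rate(params, data, dropout=None):
    predictions = np.argmax(log_prob(params, data, dropout), 1)
    true_labels = np.argmax(data.targets, 1)
    return np.mean(predictions != true_labels)
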
def sample_h(rbm, v, end_of_chain):
    h_mean = logistic(v.dot(rbm.W.T) + rbm.h_bias)
    if not end_of_chain:
        h = sample_bernoulli(h_mean)
    else:
        # Don't sample the states of the hidden units, because:
        # a) We're at the end of the Gibbs chain so we don't need h
        #    for future computations of p(v|h).
        # b) h isn't required to compute neg_free_energy_grad.
        h = None
    return h, h_mean

def sample_v_softmax(rbm, h, k, labels=None):
    """
    Sample the visible units of an RBM treating the k left-most units
    as a softmax group. If labels is given, the softmax group is
    clamped to those values.
    """
    # Top-down activity for all units.
    a = h.dot(rbm.W) + rbm.v_bias
    # Softmax units.
    if labels is None:
        u = a[:, 0:k]
        # Activities are un-normalized log probabilities.
        log_prob = u - np.log(np.sum(np.exp(u), 1))[:, np.newaxis]
        prob = np.exp(log_prob)
        labels = sample_softmax(prob)
    else:
        # Use labels as probs if clamped.
        prob = labels
    # Logistic units.
    v_mean = logistic(a[:, k:])
    v = sample_bernoulli(v_mean)
    return np.hstack((labels, v)), np.hstack((prob, v_mean))

def sample_v(rbm, h):
    v_mean = logistic(h.dot(rbm.W) + rbm.v_bias)
    v = sample_bernoulli(v_mean)
    return v, v_mean

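# A minimal CD-1 (one-step contrastive divergence) sketch showing how
# sample_h and sample_v chain into a Gibbs step and a parameter update.
# It assumes an `rbm` object with mutable W, v_bias and h_bias arrays and a
# mini-batch `v0` of binary visible vectors; `learning_rate` and the use of
# mean activities in the statistics are choices made here, not taken from
# the code above.
def cd1_update(rbm, v0, learning_rate=0.1):
    num_cases = v0.shape[0]
    # Positive phase: hidden activity driven by the data.
    h0, h0_mean = sample_h(rbm, v0, end_of_chain=False)
    # Negative phase: one step of alternating Gibbs sampling.
    v1, v1_mean = sample_v(rbm, h0)
    _, h1_mean = sample_h(rbm, v1_mean, end_of_chain=True)
    # Approximate log-likelihood gradient: data statistics minus model statistics.
    rbm.W += learning_rate * (h0_mean.T.dot(v0) - h1_mean.T.dot(v1_mean)) / num_cases
    rbm.v_bias += learning_rate * np.mean(v0 - v1_mean, 0)
    rbm.h_bias += learning_rate * np.mean(h0_mean - h1_mean, 0)
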