def backpropagation(self, train_set_y):
    """Compute the weight and bias gradients of every layer.

    Assumes a linear output layer with a squared-error cost, so the error at
    the output is ``self.final_layer_output - train_set_y``.  Hidden layers
    are differentiated as ``1 - a ** 2`` (presumably tanh units -- confirm
    against the forward pass).

    Reads ``self.final_layer_output``, ``self.activations``,
    ``self.W_params`` and ``self.n_layers``.  Writes ``self.W_grads`` and
    ``self.b_grads``, ordered from the first layer to the output layer.

    train_set_y: targets subtracted from ``self.final_layer_output``.
    """
    observation_error = self.final_layer_output - train_set_y

    # Output layer: it is linear, so its local gradient is one.
    last_activation = self.activations[-1]
    # Left-multiplying by a row of ones sums the error over its rows.
    self.W_grads = [gnp.dot(last_activation.T, observation_error)]
    self.b_grads = [gnp.dot(gnp.ones((1, observation_error.shape[0])),
                            observation_error)]

    propagate_error = gnp.dot(observation_error,
                              self.W_params[self.n_layers].T)

    # Walk the remaining layers from last to first, prepending each gradient
    # so that both lists end up in forward order.
    for i in reversed(range(self.n_layers)):
        current_activation = self.activations[i]
        current_gradient = 1.0 - current_activation ** 2

        current_W_grad = gnp.dot(current_activation.T, propagate_error)
        current_b_grad = gnp.dot(gnp.ones((1, propagate_error.shape[0])),
                                 propagate_error)

        propagate_error = gnp.dot(propagate_error,
                                  self.W_params[i].T) * current_gradient

        self.W_grads.insert(0, current_W_grad)
        self.b_grads.insert(0, current_b_grad)
def forward(self, X, test=False):
    """
    Feed-forward pass through the model

    X: ('batchsize' x 'context') matrix of word indices
    test: accepted but not used by this method

    Returns (words, acts, preds): the word features gathered for each
    context position, the hidden layer with a column of ones appended, and
    the softmax output (also with a column of ones appended) as a numpy
    array.
    """
    n_cases = X.shape[0]
    embeddings = self.R
    context_weights = self.C
    output_bias = self.bw

    # Gather the feature column of every word index; after the reshape each
    # row holds the concatenated feature vectors of one case
    flat = embeddings.as_numpy_array()[:, X.flatten()].flatten(order='F')
    flat = flat.reshape((n_cases, self.K * self.context))
    words = np.zeros((n_cases, self.K, self.context))
    for row in range(n_cases):
        words[row, :, :] = flat[row, :].reshape((self.K, self.context),
                                                order='F')
    words = gpu.garray(words)

    # Hidden layer: sum of the per-position linear maps, plus a bias column
    acts = gpu.zeros((n_cases, self.K))
    for pos in range(self.context):
        acts = acts + gpu.dot(words[:, :, pos], context_weights[pos, :, :])
    acts = gpu.concatenate((acts, gpu.ones((n_cases, 1))), 1)

    # Softmax over the output scores, shifted by the row maximum
    scores = gpu.dot(acts, gpu.concatenate((embeddings, output_bias)))
    row_max = scores.max(1).reshape(n_cases, 1)
    exp_scores = gpu.exp(scores - row_max)
    denom = exp_scores.sum(1).reshape(n_cases, 1)
    preds = gpu.concatenate((exp_scores / denom, gpu.ones((n_cases, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def init_samples(self, num):
    """Draw `num` exact samples from a model whose weights are all zero.

    With zero weights every unit is independent, so visible and hidden
    units are sampled separately from their biases.  Asserts that
    ``self.weights`` is (numerically) zero and returns an ``RBMState``.
    """
    assert np.allclose(self.weights.as_numpy_array(), 0.)

    def sample_from(bias):
        # One copy of the bias per requested sample, then sample each unit.
        return rbm_utils.sample_units(gnp.outer(gnp.ones(num), bias))

    vis = sample_from(self.vbias)
    hid = sample_from(self.hbias)
    return RBMState(vis, hid)
def feedforward(theta, data):
    """Return the rectified-linear hidden activations for `data`.

    theta: weight matrix whose first column multiplies the bias row.
    data: input matrix with one case per column.
    """
    n_cases = shape(data)[1]
    bias_row = gpu.ones((1, n_cases))
    augmented = gpu.concatenate((bias_row, data), axis=0)

    pre_activation = gpu.dot(theta, augmented)
    # 1 where the pre-activation is positive, 0 elsewhere
    positive_mask = gpu.ones(shape(pre_activation)) * (pre_activation > 0)
    return pre_activation * positive_mask
def costfunc_gpu_ReLU(x, *args):
    """Cost of a sparse autoencoder with softplus hidden units and a linear decoder.

    x: flat parameter vector.  The first ``(num_input + 1) * num_hidden``
        entries are the encoder weights, the rest the decoder weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
        sparsityParam, beta)`` where ``inputs`` holds one case per column.

    Returns the mean squared reconstruction error (halved) plus the L2
    penalty on the non-bias weights plus ``beta`` times the KL sparsity
    penalty.  The cost is also printed.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    hidden_sum = gpu.dot(weights1, data)
    # Softplus activation.
    hidden_activation = gpu.log(1 + hidden_sum.exp())
    # Mean activation of every hidden unit over the cases.
    # NOTE(review): softplus is unbounded, so p_avg can exceed 1 and make
    # the second KL term below undefined -- confirm this is intended.
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # L2 penalty skips column 0 (the bias weights).
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - inputs) * (output - inputs)
    KL = gpu.sum(
        sparsityParam * gpu.log(sparsityParam / p_avg)
        + (1 - sparsityParam) * gpu.log((1 - sparsityParam) / (1 - p_avg)))

    cost = (gpu.sum(output_target_diff) / (2 * nData)
            + 0.5 * lambda_val * (gpu.sum(regularized_penalty1)
                                  + gpu.sum(regularized_penalty2))
            + beta * KL)

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('ReLU Linear Decoder Cost:  %s' % (cost,))
    return cost
def mlpSingleOutput1Layer_costfunc(x, *args):
    """Squared-error cost of a one-hidden-layer MLP with a single logistic output.

    x: flat parameter vector.  The first ``l1Size * (inputSize + 1)``
        entries are the hidden-layer weights, the rest the output weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(inputSize, l1Size, lambda_hidden, inputs, targets)`` where
        ``inputs`` holds one case per column.

    Returns the halved mean squared error plus the L2 penalty on the
    non-bias weights of both layers.  The cost is also printed.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)

    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # Forward pass; a row of ones is prepended before each layer for the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()

    output_target_diff = (outputs - targets) ** 2

    # L2 penalty skips column 0 (the bias weights).
    regularized_penalty_output = theta_output[:, 1:shape(theta_output)[1]]
    regularized_penalty_output = (regularized_penalty_output
                                  * regularized_penalty_output)
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1

    cost = (gpu.sum(output_target_diff) / (2 * numCases)
            + 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1)
                                     + gpu.sum(regularized_penalty_output)))

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('Multilayer Preceptron Cost: %s' % (cost,))

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    del regularized_penalty_output
    del regularized_penalty_L1
    gpu.free_reuse_cache()
    return cost
def __init__(self, layer_sizes, scale=0.05, verbose=1, l2=0.0001,
             momentum=0.9, epochs=20, batch_size=256, dropouts=0.0,
             learning_rate=0.01, learning_rate_decays=0.9):
    """Store the hyper-parameters and allocate parameters and gradient buffers.

    layer_sizes: sequence of layer widths; consecutive pairs give the
        weight-matrix shapes.
    scale: passed to ``init_weights_matrix``.
    verbose: stored as ``self.verbose``.
    l2, momentum, epochs, batch_size, learning_rate, learning_rate_decays:
        stored unchanged on the instance.
    dropouts: a single value, repeated once per weight layer.
    """
    self.layer_sizes = layer_sizes
    self.scale = scale
    # Previously hard-coded to 1, which ignored the caller's argument.
    self.verbose = verbose
    self.l2 = l2
    self.momentum = momentum
    self.epochs = epochs
    self.batch_size = batch_size
    self.dropouts = [dropouts] * (len(layer_sizes) - 1)
    self.learning_rate = learning_rate
    self.learning_rate_decays = learning_rate_decays

    shapes = [(layer_sizes[i - 1], layer_sizes[i])
              for i in range(1, len(layer_sizes))]
    self.biases = init_biases_matrix(layer_sizes)
    self.weights = init_weights_matrix(shapes, scale)

    self.rms_limits = [None] * len(self.weights)
    # One hidden function per weight layer except the last.
    self.hidden_functions = [self.hidden_function
                             for _ in range(len(self.weights) - 1)]

    # Running gradient norms start at one, accumulated gradients at zero.
    self.weight_grads_l2_norm = [gnp.ones(weight.shape)
                                 for weight in self.weights]
    # Attribute name (including its spelling) kept for existing users.
    self.bias_gradis_l2_norm = [gnp.ones(bias.shape) for bias in self.biases]
    self.weight_grads = [gnp.zeros(weight.shape) for weight in self.weights]
    self.bias_grads = [gnp.zeros(bias.shape) for bias in self.biases]
def backpropagation(self, train_set_y):
    """Backpropagate the output error and store per-layer gradients.

    The output layer is taken to be linear with a squared-error cost, which
    makes the output error ``self.final_layer_output - train_set_y``.  The
    hidden-layer derivative used is ``1 - a ** 2`` (presumably tanh units --
    confirm against the forward pass).

    Reads ``self.final_layer_output``, ``self.activations``,
    ``self.W_params`` and ``self.n_layers``.  Fills ``self.W_grads`` and
    ``self.b_grads`` in first-layer-to-output order.

    train_set_y: targets subtracted from ``self.final_layer_output``.
    """
    observation_error = self.final_layer_output - train_set_y

    self.W_grads = []
    self.b_grads = []

    # Gradients of the (linear) output layer.
    current_activation = self.activations[-1]
    # A row of ones times the error sums the error over its rows.
    row_of_ones = gnp.ones((1, observation_error.shape[0]))
    self.W_grads.append(gnp.dot(current_activation.T, observation_error))
    self.b_grads.append(gnp.dot(row_of_ones, observation_error))

    # Final layer is linear output, so its gradient is one.
    propagate_error = gnp.dot(observation_error,
                              self.W_params[self.n_layers].T)

    for i in reversed(range(self.n_layers)):
        current_activation = self.activations[i]
        current_gradient = 1.0 - current_activation ** 2

        current_W_grad = gnp.dot(current_activation.T, propagate_error)
        current_b_grad = gnp.dot(gnp.ones((1, propagate_error.shape[0])),
                                 propagate_error)

        propagate_error = gnp.dot(propagate_error,
                                  self.W_params[i].T) * current_gradient

        # Prepend so the lists stay in forward layer order.
        self.W_grads.insert(0, current_W_grad)
        self.b_grads.insert(0, current_b_grad)
def forward(self, X, test=False):
    """
    Feed-forward pass through the model

    X: ('batchsize' x 'context') matrix of word indices
    test: accepted but not used by this method

    Returns the tuple (words, acts, preds).  `acts` and `preds` each carry
    an extra trailing column of ones; `preds` is a numpy array.
    """
    batchsize = X.shape[0]

    # Obtain word features.  flatten(order='F') walks the selected columns
    # in column-major order, so each word's K features stay contiguous; the
    # row-major reshape then puts one case per row.
    selected = self.R.as_numpy_array()[:, X.flatten()]
    per_case = selected.flatten(order='F').reshape(
        (batchsize, self.K * self.context))
    words = np.zeros((batchsize, self.K, self.context))
    for case in range(batchsize):
        words[case, :, :] = per_case[case, :].reshape(
            (self.K, self.context), order='F')
    words = gpu.garray(words)

    # Compute the hidden layer (predicted next word representation); dot()
    # on 2-D arrays is an ordinary matrix product
    acts = gpu.zeros((batchsize, self.K))
    for position in range(self.context):
        acts = acts + gpu.dot(words[:, :, position],
                              self.C[position, :, :])
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

    # Compute softmax (row maximum subtracted before exponentiating)
    logits = gpu.dot(acts, gpu.concatenate((self.R, self.bw)))
    unnormalised = gpu.exp(logits - logits.max(1).reshape(batchsize, 1))
    denom = unnormalised.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate(
        (unnormalised / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def grad_costfunc_gpu_ReLU(x, *args):
    """Gradient of the ReLU autoencoder cost with a linear decoder.

    x: flat parameter vector.  The first ``(num_input + 1) * num_hidden``
        entries are the encoder weights, the rest the decoder weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
        sparsityParam, beta)`` where ``inputs`` holds one case per column.
        ``sparsityParam`` and ``beta`` are unpacked but not used here.

    Returns a flat numpy vector: encoder gradient followed by decoder
    gradient, in the same layout as ``x``.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    # Forward pass.  The 0/1 mask is both the ReLU gate and its derivative.
    hidden_sum = gpu.dot(weights1, data)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    hidden_derivative = relu_mask_hidden1
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    hidden_derivative = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_derivative), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    # Backward pass.  Transposes stay on the device via ``.T`` instead of a
    # round trip through numpy.
    p = outputs - inputs
    weights2_grad = gpu.dot(p, hidden_activation.T) / nData
    q_temp = gpu.dot(weights2.T, p)
    q = q_temp * hidden_derivative
    delta2 = gpu.dot(q, data.T)
    # Row 0 of delta2 belongs to the prepended bias unit and is dropped.
    weights1_grad = delta2[1:shape(delta2)[0], :] / nData

    # Weight decay on everything except column 0 (the bias weights).
    weights1_grad[:, 1:shape(weights1_grad)[1]] = (
        weights1_grad[:, 1:shape(weights1_grad)[1]]
        + weights1[:, 1:shape(weights1)[1]] * lambda_val)
    weights2_grad[:, 1:shape(weights2_grad)[1]] = (
        weights2_grad[:, 1:shape(weights2_grad)[1]]
        + weights2[:, 1:shape(weights2)[1]] * lambda_val)

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del x
    del inputs
    del data
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()

    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
def smooth(self, eps=0.001):
    """Return these moments mixed with independent and uniform moments.

    The result is ``(1 - eps)^2 * self`` plus ``eps * (1 - eps)`` times each
    of two ``from_independent`` objects (one with the visible expectations
    replaced by 0.5, one with the hidden expectations replaced by 0.5),
    plus ``eps^2`` times ``uniform``.
    """
    keep = 1. - eps
    cls = self.__class__

    moments = keep ** 2 * self

    half_vis = 0.5 * gnp.ones(moments.expect_vis.size)
    moments += eps * keep * cls.from_independent(half_vis, self.expect_hid)

    half_hid = 0.5 * gnp.ones(moments.expect_hid.size)
    moments += eps * keep * cls.from_independent(self.expect_vis, half_hid)

    moments += eps ** 2 * cls.uniform(moments.expect_vis.size,
                                      moments.expect_hid.size)
    return moments
def mlpSoftmax1Layer_grad(x, *args):
    """Gradient of the one-hidden-layer (ReLU) softmax classifier cost.

    x: flat parameter vector.  The first ``l1Size * (inputSize + 1)``
        entries are the hidden-layer weights (column 0 multiplies the bias
        row), the rest the ``(numClasses, l1Size)`` softmax weights.
    args: ``(numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
        inputs, groundTruth)`` where ``inputs`` holds one case per column.

    Returns a flat numpy vector: hidden-layer gradient followed by softmax
    gradient, in the same layout as ``x``.
    """
    (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
     inputs, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_softmax = numClasses * l1Size

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size)))

    # Forward pass.  The 0/1 mask is both the ReLU gate and its derivative.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1

    hidden_sum_softmax_imd = gpu.dot(theta_softmax, hidden_activation_L1)
    # Subtract the per-case maximum before exponentiating.
    hidden_sum_softmax = (hidden_sum_softmax_imd
                          - hidden_sum_softmax_imd.max(axis=0))
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)

    # Backward pass.  Transposes stay on the device via ``.T`` instead of a
    # round trip through numpy.
    softmax_imd = groundTruth - predictions
    theta_softmax_grad = (
        -1 * gpu.dot(softmax_imd, hidden_activation_L1.T) / numCases
        + lambda_softmax * theta_softmax)

    deltaOut = -softmax_imd
    delta_L1_imd = gpu.dot(theta_softmax.T, deltaOut)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1 = gpu.dot(delta_L1_imd2, inputs.T)

    theta_L1_grad = delta_L1 / numCases
    # Weight decay on everything except column 0 (the bias weights).
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = (
        theta_L1_grad[:, 1:shape(theta_L1_grad)[1]]
        + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden)

    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(),
                                 num_weights_softmax)

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del inputs
    del theta_L1
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_softmax
    del predictions
    del softmax_imd
    del deltaOut
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    gpu.free_reuse_cache()

    return hstack((theta_L1_grad, theta_softmax_grad))
def forwardProp(X, theta1, theta2):
    """Forward pass of a two-layer sigmoid network.

    X: input matrix with one case per row.
    theta1, theta2: weight matrices of the hidden and output layers.

    Returns (a1, a2, a3): the input with a trailing column of ones, the
    hidden activations with a trailing row of ones, and the output
    activations.
    """
    n_rows = np.size(X[:, 0])
    a1 = gpu.concatenate((X, gpu.ones((n_rows, 1))), axis=1)

    hidden = sigmoid(theta1.dot(a1.T))
    n_cols = np.size(hidden[0, :])
    a2 = gpu.concatenate((hidden, gpu.ones((1, n_cols))), axis=0)

    a3 = sigmoid(theta2.dot(a2))
    return a1, a2, a3
def mlpSoftmax_costfunc(x, *args):
    """Cross-entropy cost of a two-hidden-layer logistic MLP with a softmax output.

    x: flat parameter vector holding, in order, the first hidden layer
        ``(l1Size, inputSize + 1)``, the second hidden layer
        ``(l2Size, l1Size + 1)`` and the softmax weights
        ``(numClasses, l2Size)``.  Column 0 of the hidden-layer matrices
        multiplies the bias row.
    args: ``(numClasses, inputSize, l1Size, l2Size, lambda_softmax,
        lambda_hidden, inputs, labels, groundTruth)`` where ``inputs`` holds
        one case per column.  ``labels`` is unpacked but not used here.

    Returns the mean cross-entropy plus L2 penalties on the non-bias hidden
    weights and on all softmax weights.  The cost is also printed.
    """
    (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden,
     inputs, labels, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    softmax_start = num_weights_L1 + num_weights_L2

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:softmax_start], (l2Size, l1Size + 1)))
    theta_softmax = gpu.garray(
        reshape(x[softmax_start:shape(x)[0]], (numClasses, l2Size)))

    # Forward pass; a row of ones is prepended before each hidden layer.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()

    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    # Subtract the per-case maximum before exponentiating.
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    temp = groundTruth * gpu.log(predictions)

    # L2 penalty skips column 0 (the bias weights) of the hidden layers.
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2

    cost = (-1 * gpu.sum(temp) / numCases
            + 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1)
                                     + gpu.sum(regularized_penalty_L2))
            + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax))

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('Multilayer Softmax Cost: %s' % (cost,))

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del inputs
    del theta_L1
    del theta_L2
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
def costfunc_gpu(x, *args):
    """Cost of a sparse denoising autoencoder with logistic hidden units and a linear decoder.

    x: flat parameter vector.  The first ``(num_input + 1) * num_hidden``
        entries are the encoder weights, the rest the decoder weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(num_input, num_hidden, num_output, inputs, noNoiseData,
        lambda_val, sparsityParam, beta)``.  ``inputs`` (one case per
        column) is fed through the network and the reconstruction is
        compared against ``noNoiseData``.

    Returns the halved mean squared reconstruction error plus the L2
    penalty on the non-bias weights plus ``beta`` times the KL sparsity
    penalty.  The cost is also printed.
    """
    (num_input, num_hidden, num_output, inputs, noNoiseData,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    noNoiseData = gpu.garray(noNoiseData)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    # Mean activation of every hidden unit over the cases.
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # L2 penalty skips column 0 (the bias weights).
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - noNoiseData) * (output - noNoiseData)
    KL = gpu.sum(
        sparsityParam * gpu.log(sparsityParam / p_avg)
        + (1 - sparsityParam) * gpu.log((1 - sparsityParam) / (1 - p_avg)))

    cost = (gpu.sum(output_target_diff) / (2 * nData)
            + 0.5 * lambda_val * (gpu.sum(regularized_penalty1)
                                  + gpu.sum(regularized_penalty2))
            + beta * KL)

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('GPU Linear Denoising Decoder Cost:  %s' % (cost,))

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del x
    del inputs
    del noNoiseData
    del data
    del hidden_sum
    del hidden_activation
    del p_avg
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
def grad_costfunc_gpu(x, *args):
    """Gradient of the sparse autoencoder cost (logistic hidden units, linear decoder).

    x: flat parameter vector.  The first ``(num_input + 1) * num_hidden``
        entries are the encoder weights, the rest the decoder weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
        sparsityParam, beta)`` where ``inputs`` holds one case per column.

    Returns a flat numpy vector: encoder gradient followed by decoder
    gradient, in the same layout as ``x``.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    p_avg = gpu.sum(hidden_activation, axis=1) / nData

    # Derivative of the KL sparsity penalty with respect to each unit's mean
    # activation, computed on the host.  A leading 0 is added for the bias
    # unit and the vector is repeated for every case.
    p_avg_host = p_avg.as_numpy_array()
    grad_sparse = (-1 * sparsityParam / p_avg_host
                   + (1 - sparsityParam) / (1 - p_avg_host))
    grad_sparse = append(0, grad_sparse)
    grad_sparse = tile(grad_sparse, (nData, 1))
    grad_sparse = gpu.garray(transpose(grad_sparse))

    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    # Backward pass.  Device-side transposes use ``.T`` instead of a round
    # trip through numpy.
    p = outputs - inputs
    weights2_grad = gpu.dot(p, hidden_activation.T) / nData
    q_temp = gpu.dot(weights2.T, p) + beta * grad_sparse
    # Logistic derivative a * (1 - a).
    q = (q_temp * hidden_activation) * (1 - hidden_activation)
    delta2 = gpu.dot(q, data.T)
    # Row 0 of delta2 belongs to the prepended bias unit and is dropped.
    weights1_grad = delta2[1:shape(delta2)[0], :] / nData

    # Weight decay on everything except column 0 (the bias weights).
    weights1_grad[:, 1:shape(weights1_grad)[1]] = (
        weights1_grad[:, 1:shape(weights1_grad)[1]]
        + weights1[:, 1:shape(weights1)[1]] * lambda_val)
    weights2_grad[:, 1:shape(weights2_grad)[1]] = (
        weights2_grad[:, 1:shape(weights2_grad)[1]]
        + weights2[:, 1:shape(weights2)[1]] * lambda_val)

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del x
    del inputs
    del data
    del grad_sparse
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()

    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
def clip(a, a_min, a_max):
    """Limit the values of an array to the interval [a_min, a_max].

    Values below `a_min` become `a_min` and values above `a_max` become
    `a_max`; e.g. with the interval [0, 1], negative values become 0 and
    values larger than 1 become 1.  ``gp.garray`` inputs are clipped with
    mask arithmetic, everything else is passed to ``np.clip``.
    """
    if not isinstance(a, gp.garray):
        return np.clip(a, a_min, a_max)

    too_large = (a > a_max)
    upper = gp.ones(a.shape) * a_max
    too_small = (a < a_min)
    lower = gp.ones(a.shape) * a_min

    # Keep in-range entries, substitute the bounds for the others.
    in_range = 1 - too_large - too_small
    return a * in_range + upper * too_large + lower * too_small
def KL(rho, rho_target, KL_flat):
    """Element-wise KL divergence between Bernoulli(rho_target) and Bernoulli(rho).

    When `KL_flat` is true, entries of `rho` below `rho_target` are first
    raised to `rho_target` (on a copy), so they contribute zero.
    """
    y = rho.copy()
    if KL_flat:
        below = gp.where(y < rho_target)
        y[below] = rho_target * gp.ones(y[below].shape)

    on_term = rho_target * gp.log(rho_target / y)
    off_term = (1 - rho_target) * gp.log((1 - rho_target) / (1 - y))
    return on_term + off_term
def build_layer(self, in_dim, out_dim, nonlin, dropout=0, sparsity=0,
                sparsity_weight=0, init_scale=1, loss=None, params=None,
                loss_after_nonlin=False, init_bias=0,
                use_batch_normalization=False):
    """Configure this layer: parameters, sparsity, loss and batch normalization.

    If `params` is None a fresh ``LayerParams`` is built from `in_dim`,
    `out_dim`, `init_scale`, `dropout` and `init_bias`.  Sparsity tracking
    state is only created when `sparsity_weight` is positive, and a
    ``BatchNormalizationLayer`` only when `use_batch_normalization` is set.
    """
    self.nonlin = nonlin

    if params is None:
        params = LayerParams(in_dim, out_dim, init_scale, dropout,
                             init_bias=init_bias)
    self.set_params(params)

    # Sparsity settings and, when enabled, its running state.
    self.sparsity = sparsity
    self.sparsity_weight = sparsity_weight
    if self.sparsity_weight > 0:
        self._sparsity_current = gnp.ones(out_dim) * sparsity
        self._sparsity_smoothing = 0.9
        self._sparsity_objective = 0

    # Loss bookkeeping.
    self.loss = loss
    self.loss_value = 0
    self.noise_added = False
    self.loss_computed = False
    self.loss_after_nonlin = loss_after_nonlin

    self.use_batch_normalization = use_batch_normalization
    if use_batch_normalization:
        self.bn_layer = BatchNormalizationLayer(out_dim, init_bias=init_bias)
        self._bn_layer_param_id = self.bn_layer._param_id
def dbn_supervised_predict_sample(ws_vh, ws_v, ws_h, x, k=20):
    """
    Predict the class label of input x from supervised DBN by sampling.

    WARNING: THIS IS PRETTY SLOW AND LESS RELIABLE THAN THE EXACT METHOD
    Uses the sampling method mentioned in section 6.2 of Hinton, Osindero,
    Teh 2006.

    ws_vh, ws_v, ws_h: per-layer weights, visible biases and hidden biases
    x: Input data. (NxD matrix)
    k: Number of Gibbs steps

    Returns the supervised (label) part of the sampled visible units of the
    top-level RBM, one case per row.
    """
    n_layers = len(ws_vh)
    n_cases = x.shape[0]

    # Deterministic bottom-up pass: activations (not stochastic samples) are
    # passed from layer to layer, up to the visible layer of the top RBM.
    h_prev = x.T
    for layer in range(n_layers - 1):
        pre_activation = gnp.dot(ws_vh[layer].T, h_prev) + ws_h[layer]
        h_prev = gnp.logistic(pre_activation)

    top_visible = ws_vh[-1].shape[0]   # visible units of the top level RBM
    n_features = h_prev.shape[0]       # units of the penultimate layer
    n_labels = top_visible - n_features  # supervised inputs to the top RBM

    # The supervised part starts out uniform (1 / n_labels per unit).
    label_init = gnp.ones((n_labels, n_cases)) / n_labels
    v = gnp.concatenate((label_init, h_prev))

    # Gibbs sampling in the top RBM; the `clamped` range covers the feature
    # units so only the label units change.
    h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], v, k,
                      clamped=(n_labels, top_visible))

    return v[0:n_labels, :].T
def backprop(self):
    """Backpropagate the output error through the stored activations.

    Fills ``self.results['grads']`` and ``self.results['bias_grads']`` and
    leaves the last propagated error in ``self.results['error']``.
    ``timer_logger`` is called at the start and at the end.
    """
    self.timer_logger('backprop', time.time())
    results = self.results
    results['grads'] = []
    results['bias_grads'] = []

    # In both cases the output error reduces to y - t:
    #  - classification assumes softmax + cross entropy,
    #  - otherwise a linear unit + squared error cost is assumed.
    if self.problem == 'classification':
        results['error'] = results['current'] - gpu.garray(
            self.util.create_t_dataset(self.batch_y))
    else:
        results['error'] = (results['current'] - gpu.garray(self.batch_y))

    for pair in results['activations']:
        activation, weight = pair[0], pair[1]
        gradient = self.activation_gradient(activation)

        error = results['error']
        results['grads'].insert(0, gpu.dot(activation.T, error))
        # A row of ones times the error sums it over its rows.
        results['bias_grads'].insert(
            0, gpu.dot(gpu.ones((1, error.shape[0])), error))

        results['error'] = gpu.dot(error, weight.T) * gradient

    self.timer_logger('backprop', time.time())
def clip(a, a_min, a_max):
    """Limit the values of an array to the interval [a_min, a_max].

    Values below `a_min` become `a_min` and values above `a_max` become
    `a_max`; e.g. with the interval [0, 1], negative values become 0 and
    values larger than 1 become 1.  numpy arrays go through ``np.clip``;
    any other input is clipped with mask arithmetic.
    """
    if isinstance(a, np.ndarray):
        return np.clip(a, a_min, a_max)

    too_large = (a > a_max)
    upper = gp.ones(a.shape) * a_max
    too_small = (a < a_min)
    lower = gp.ones(a.shape) * a_min

    # Keep in-range entries, substitute the bounds for the others.
    in_range = 1 - too_large - too_small
    return (a * in_range + upper * too_large + lower * too_small)
def forward(self, X, Im, test=False):
    """
    Feed-forward pass through the model

    X: ('batchsize' x 'context') matrix of word indices
    Im: image features, one row per case
    test: accepted but not used by this method

    Returns (words, acts, IF, F, preds): the gathered word features, the
    hidden layer, the rectified image features, the factor layer with a
    column of ones appended, and the softmax output (with a column of ones
    appended) as a numpy array.
    """
    n_cases = X.shape[0]
    Im = gpu.garray(Im)

    # Forwardprop images: affine map followed by rectification
    Im = gpu.concatenate((Im, gpu.ones((n_cases, 1))), 1)
    IF = gpu.dot(Im, gpu.concatenate((self.J, self.bj)))
    IF = IF * (IF > 0)

    # Obtain word features from the factored embedding matrix
    R = gpu.dot(self.Wfx, self.Whf)
    flat = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
    flat = flat.reshape((n_cases, self.K * self.context))
    words = np.zeros((n_cases, self.K, self.context))
    for row in range(n_cases):
        words[row, :, :] = flat[row, :].reshape((self.K, self.context),
                                                order='F')
    words = gpu.garray(words)

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((n_cases, self.K))
    for pos in range(self.context):
        acts = acts + gpu.dot(words[:, :, pos], self.C[pos, :, :])
    acts = acts + gpu.dot(IF, self.M)

    # Multiplicative interaction between word and image projections
    F = gpu.dot(acts, self.Wfx) * gpu.dot(IF, self.Wfv)
    F = gpu.concatenate((F, gpu.ones((n_cases, 1))), 1)

    # Compute softmax (row maximum subtracted before exponentiating)
    scores = gpu.dot(F, gpu.concatenate((self.Whf, self.bw)))
    exp_scores = gpu.exp(scores - scores.max(1).reshape(n_cases, 1))
    denom = exp_scores.sum(1).reshape(n_cases, 1)
    preds = gpu.concatenate((exp_scores / denom, gpu.ones((n_cases, 1))), 1)

    return (words, acts, IF, F, preds.as_numpy_array())
def mlpSoftmax_costfunc(x, *args):
    """Cross-entropy cost of a two-hidden-layer logistic MLP with a softmax output.

    x: flat parameter vector holding, in order, the first hidden layer
        ``(l1Size, inputSize + 1)``, the second hidden layer
        ``(l2Size, l1Size + 1)`` and the softmax weights
        ``(numClasses, l2Size)``.  Column 0 of the hidden-layer matrices
        multiplies the bias row.
    args: ``(numClasses, inputSize, l1Size, l2Size, lambda_softmax,
        lambda_hidden, inputs, labels, groundTruth)`` where ``inputs`` holds
        one case per column.  ``labels`` is unpacked but not used here.

    Returns the mean cross-entropy plus L2 penalties on the non-bias hidden
    weights and on all softmax weights.  The cost is also printed.
    """
    (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden,
     inputs, labels, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    end_L2 = num_weights_L2 + num_weights_L1

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:end_L2], (l2Size, l1Size + 1)))
    theta_softmax = gpu.garray(
        reshape(x[end_L2:shape(x)[0]], (numClasses, l2Size)))

    # Forward pass; a row of ones is prepended before each hidden layer.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()

    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    # Subtract the per-case maximum before exponentiating.
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    temp = groundTruth * gpu.log(predictions)

    # L2 penalty skips column 0 (the bias weights) of the hidden layers.
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2

    data_cost = -1 * gpu.sum(temp) / numCases
    hidden_decay = 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1)
                                          + gpu.sum(regularized_penalty_L2))
    softmax_decay = (0.5 * lambda_softmax
                     * gpu.sum(theta_softmax * theta_softmax))
    cost = data_cost + hidden_decay + softmax_decay

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('Multilayer Softmax Cost: %s' % (cost,))

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del inputs
    del theta_L1
    del theta_L2
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
def mlpSingleOutput1Layer_grad(x, *args):
    """Gradient of the one-hidden-layer, single-logistic-output MLP cost.

    x: flat parameter vector.  The first ``l1Size * (inputSize + 1)``
        entries are the hidden-layer weights, the rest the output weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(inputSize, l1Size, lambda_hidden, inputs, targets)`` where
        ``inputs`` holds one case per column.

    Returns a flat numpy vector: hidden-layer gradient followed by output
    gradient, in the same layout as ``x``.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_output = 1 * (l1Size + 1)

    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # Forward pass; a row of ones is prepended before each layer for the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()

    # Backward pass.  Transposes stay on the device via ``.T`` instead of a
    # round trip through numpy.
    # Squared error through the logistic output: (y - t) * y * (1 - y).
    a = (outputs - targets) * outputs * (1 - outputs)
    theta_output_grad = gpu.dot(a, hidden_activation_L1.T) / numCases
    b_temp = gpu.dot(theta_output.T, a)
    b = (b_temp * hidden_activation_L1) * (1 - hidden_activation_L1)
    delta2 = gpu.dot(b, inputs.T)
    # Row 0 of delta2 belongs to the prepended bias unit and is dropped.
    theta_L1_grad = delta2[1:shape(delta2)[0], :] / numCases

    # Weight decay on everything except column 0 (the bias weights).
    theta_output_grad[:, 1:shape(theta_output_grad)[1]] = (
        theta_output_grad[:, 1:shape(theta_output_grad)[1]]
        + theta_output[:, 1:shape(theta_output)[1]] * lambda_hidden)
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = (
        theta_L1_grad[:, 1:shape(theta_L1_grad)[1]]
        + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden)

    theta_output_grad = reshape(theta_output_grad.as_numpy_array(),
                                num_weights_output)
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    gpu.free_reuse_cache()

    return hstack((theta_L1_grad, theta_output_grad))
def __init__(self, layer_dim=None, init_bias=0, mean_std_update_rate=0.01):
    """Allocate the scale/shift parameters and their gradient buffers.

    When `layer_dim` is None nothing is initialised.  Otherwise `gamma`
    starts at one, `beta` at `init_bias`, both gradients at zero, and the
    instance takes the next id from ``LayerParams._param_count``.
    """
    if layer_dim is None:
        return

    # Parameters and matching gradient buffers.
    self.gamma = gnp.ones(layer_dim)
    self.gamma_grad = gnp.zeros(layer_dim)
    self.beta = gnp.ones(layer_dim) * init_bias
    self.beta_grad = gnp.zeros(layer_dim)
    self.param_size = self.gamma.size + self.beta.size

    # mu and sigma keep a moving average of mean and standard deviation
    self.mu = None
    self.sigma = None
    self.mean_std_update_rate = mean_std_update_rate

    # Take the next id from the counter shared with LayerParams.
    self._param_id = LayerParams._param_count
    LayerParams._param_count += 1
def costfunc_gpu_ReLU(x, *args):
    """Cost of an autoencoder with ReLU hidden units and a linear decoder.

    x: flat parameter vector.  The first ``(num_input + 1) * num_hidden``
        entries are the encoder weights, the rest the decoder weights;
        column 0 of each matrix multiplies the bias row.
    args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
        sparsityParam, beta)`` where ``inputs`` holds one case per column.
        ``sparsityParam`` and ``beta`` are unpacked but not used here.

    Returns the halved mean squared reconstruction error plus the L2
    penalty on the non-bias weights.  The cost is also printed.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    hidden_sum = gpu.dot(weights1, data)
    # 1 where the pre-activation is positive, 0 elsewhere.
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # L2 penalty skips column 0 (the bias weights).
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - inputs) * (output - inputs)
    cost = (gpu.sum(output_target_diff) / (2 * nData)
            + 0.5 * lambda_val * (gpu.sum(regularized_penalty1)
                                  + gpu.sum(regularized_penalty2)))

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('GPU ReLU Linear Decoder Cost:  %s' % (cost,))

    # Drop references to the temporaries before clearing gnumpy's reuse cache.
    del x
    del inputs
    del data
    del hidden_sum
    del hidden_activation
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
def mlpSoftmax1Layer_costfunc(x, *args): numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) theta_softmax = gpu.garray( reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size))) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0) hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1 #hidden_activation_L1 = hidden_sum_L1.logistic() hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L1) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions, axis=0) temp = groundTruth * gpu.log(predictions) temp = temp.as_numpy_array() temp[temp == -inf] = -200.0 temp = nan_to_num(temp) regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 cost = -1 * sum(temp) / numCases + 0.5 * lambda_hidden * ( gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum( theta_softmax * theta_softmax) print 'Multilayer Softmax Cost:', cost del inputs del theta_L1 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_softmax del predictions del temp del regularized_penalty_L1 gpu.free_reuse_cache() return cost
def backward(self, Y, preds, F, IF, acts, words, X, Im):
    """
    Backward pass through the network.

    Computes the gradients of the objective and stores them on the instance
    as db, dC, dM, dJ, dBj, dWhf, dWfv and dWfx.  Nothing is returned.

    Y: targets, subtracted from preds with its last column dropped
    preds: predictions; the number of rows is taken as the batch size
    F, IF, acts, words: intermediate values used in the matrix products
        below (presumably saved by the forward pass -- confirm with caller)
    X: integer index matrix; X[j, i] selects the column of dR that receives
        the contribution of example j at context position i
    Im: input moved to the GPU and given a trailing ones column for df/dJ
    """
    batchsize = preds.shape[0]
    Im = gpu.garray(Im)

    # Compute part of df/dR
    # output error, averaged over the batch
    Ix = gpu.garray(preds[:,:-1] - Y) / batchsize
    delta = gpu.dot(F.T, Ix)
    # last row of delta is the bias gradient, the rest belongs to Whf
    dWhf = delta[:-1,:] + self.gamma_r * self.Whf
    db = delta[-1,:]

    # Compute df/Wfv and part of df/Wfx
    Ix = gpu.dot(Ix, self.Whf.T)
    dWfv = gpu.dot(IF.T, Ix * gpu.dot(acts, self.Wfx)) + self.gamma_r * self.Wfv
    dWfx = gpu.dot(acts.T, Ix * gpu.dot(IF, self.Wfv)) + self.gamma_r * self.Wfx

    # Compute df/dC and word inputs for df/dR
    Ix_word = gpu.dot(Ix * gpu.dot(IF, self.Wfv), self.Wfx.T)
    dC = gpu.zeros(np.shape(self.C))
    # dR is accumulated on the host because it is indexed per example
    dR = np.zeros((self.K, self.V))
    for i in range(self.context):
        delta = gpu.dot(words[:,:,i].T, Ix_word)
        dC[i,:,:] = delta + self.gamma_c * self.C[i,:,:]
        delta = gpu.dot(Ix_word, self.C[i,:,:].T)
        delta = delta.as_numpy_array()
        # scatter-add one example at a time so repeated indices accumulate
        for j in range(X.shape[0]):
            dR[:,X[j,i]] = dR[:,X[j,i]] + delta.T[:,j]
    dR = gpu.garray(dR)
    # fold the word-representation gradient into the factor matrices
    dWfx = dWfx + gpu.dot(dR, self.Whf.T)
    dWhf = dWhf + gpu.dot(self.Wfx.T, dR)

    # Compute df/dM
    dM = gpu.dot(IF.T, Ix_word) + self.gamma_c * self.M

    # Compute df/dJ
    # (IF > 0) masks units whose value in IF is not positive
    Ix = gpu.dot(Ix * gpu.dot(acts, self.Wfx), self.Wfv.T) * (IF > 0) + gpu.dot(Ix_word, self.M.T) * (IF > 0)
    # trailing ones column so the last row of delta is the bias gradient
    Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
    delta = gpu.dot(Im.T, Ix)
    dJ = delta[:-1,:] + self.gamma_c * self.J
    dBj = delta[-1,:]

    self.db = db
    self.dC = dC
    self.dM = dM
    self.dJ = dJ
    self.dBj = dBj
    self.dWhf = dWhf
    self.dWfv = dWfv
    self.dWfx = dWfx
def __init__(self, in_dim=1, out_dim=1, init_scale=1e-1, dropout=0, init_bias=0):
    """Create weights, bias and zeroed gradients for one dense layer.

    W is drawn from a normal distribution scaled by init_scale; every bias
    entry equals init_bias.  The instance takes the next LayerParams id.
    """
    weights = gnp.randn(in_dim, out_dim) * init_scale
    offsets = gnp.ones(out_dim) * init_bias

    self.W, self.b = weights, offsets
    # gradients start as zero arrays with the parameters' shapes
    self.W_grad = weights * 0
    self.b_grad = offsets * 0

    self.param_size = weights.size + offsets.size
    self.dropout = dropout

    # get an ID for this param variable.
    self._param_id = LayerParams._param_count
    LayerParams._param_count += 1
def __init__(self, in_dim=(1,), out_dim=1, init_scale=1.0, dropout=(0,), init_bias=0):
    """Create parameters for a layer fed by several inputs.

    in_dim: sequence of input widths, one weight matrix per entry
    out_dim: width of the layer output
    init_scale: each weight matrix is scaled by sqrt(init_scale / fan_in)
    dropout: sequence of per-input dropout rates; when its length does not
        match in_dim, its first entry is repeated for every input
    init_bias: initial value of every bias entry

    The defaults are immutable tuples and the dropout rates are copied into
    a fresh list, so instances never share (or alias the caller's) list.
    """
    self.n_inputs = len(in_dim)
    self.W = [gnp.randn(d, out_dim) * math.sqrt(float(init_scale) / d)
              for d in in_dim]
    self.b = gnp.ones(out_dim) * init_bias

    self.W_grad = [W * 0 for W in self.W]
    self.b_grad = self.b * 0

    self.param_size = sum([W.size for W in self.W]) + self.b.size

    if len(dropout) == self.n_inputs:
        self.dropout = list(dropout)
    else:
        # broadcast the first rate to every input
        self.dropout = list(dropout[:1]) * self.n_inputs

    # get an ID for this param variable.
    self._param_id = LayerParams._param_count
    LayerParams._param_count += 1
def __init__(self, layer_dim=None):
    """Create batch-normalization scale/shift parameters of width layer_dim.

    When layer_dim is None nothing is initialized.  Otherwise gamma starts
    at one, beta and both gradients at zero, and the instance takes the next
    LayerParams id.
    """
    if layer_dim is not None:
        scale = gnp.ones(layer_dim)
        shift = gnp.zeros(layer_dim)

        self.gamma, self.beta = scale, shift
        self.gamma_grad = gnp.zeros(layer_dim)
        self.beta_grad = gnp.zeros(layer_dim)

        self.param_size = scale.size + shift.size

        # claim a unique id from the shared LayerParams counter
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
def mlpSingleOutput1Layer_costfunc(x, *args): inputSize, l1Size, lambda_hidden, inputs, targets = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) inputs = gpu.garray(inputs) targets = gpu.garray(targets) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) theta_output = gpu.garray( reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1))) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) hidden_activation_L1 = hidden_sum_L1.logistic() hidden_activation_L1 = gpu.concatenate((gpu.ones( (1, numCases)), hidden_activation_L1), axis=0) #hidden_activation_L1 = hidden_activation_L1 * dropout_prob hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1) outputs = hidden_sum_output.logistic() output_target_diff = (outputs - targets)**2 regularized_penalty_output = theta_output[:, 1:shape(theta_output)[1]] regularized_penalty_output = regularized_penalty_output * regularized_penalty_output regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 cost = gpu.sum(output_target_diff) / ( 2 * numCases) + 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_output)) print 'Multilayer Preceptron Cost:', cost del inputs del theta_L1 del hidden_sum_L1 del hidden_activation_L1 del regularized_penalty_output del regularized_penalty_L1 gpu.free_reuse_cache() return cost
def mlpSingleOutput1Layer_grad(x, *args):
    """Gradient of the one-hidden-layer, single-logistic-output network cost.

    x is the flat parameter vector: hidden weights followed by the output
    weights, each with a leading bias column.  args unpacks to (inputSize,
    l1Size, lambda_hidden, inputs, targets).  Returns the flat gradient in
    the same layout as x (hidden weights first, then output weights).

    Transposes use garray.T directly instead of copying each array to the
    host, transposing with numpy and uploading it again.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_output = 1 * (l1Size + 1)

    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # forward pass (bias rows are prepended to the input and hidden layers)
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()

    # output delta: error times the logistic derivative
    a = (outputs - targets) * outputs * (1 - outputs)
    theta_output_grad = gpu.dot(a, hidden_activation_L1.T)

    # hidden delta: back-propagated error times the logistic derivative
    b_temp = gpu.dot(theta_output.T, a)
    b = (b_temp * hidden_activation_L1) * (1 - hidden_activation_L1)
    delta2 = gpu.dot(b, inputs.T)
    # row 0 belongs to the constant bias unit, which has no incoming weights
    theta_L1_grad = delta2[1:shape(delta2)[0], :]

    theta_L1_grad = theta_L1_grad / numCases
    theta_output_grad = theta_output_grad / numCases

    # weight decay on everything except the bias column
    theta_output_grad[:, 1:] = (theta_output_grad[:, 1:] +
                                theta_output[:, 1:] * lambda_hidden)
    theta_L1_grad[:, 1:] = (theta_L1_grad[:, 1:] +
                            theta_L1[:, 1:] * lambda_hidden)

    theta_output_grad = reshape(theta_output_grad.as_numpy_array(),
                                num_weights_output)
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)

    # drop references so free_reuse_cache can release the GPU buffers
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_output_grad))
def test_gnumpy(dat, num_epochs):
    """Train an RBM with one step of contrastive divergence using gnumpy.

    dat: 2-dimensional training data, one example per row
        (the original benchmark used 60000 X 784)
    num_epochs: number of passes over the data

    Prints the mean reconstruction error and elapsed time for every epoch
    and returns (w_vh, w_v, w_h): the weight matrix, visible biases and
    hidden biases.  Trailing examples that do not fill a batch are ignored.
    """
    import gnumpy as gpu
    import numpy
    import time

    # time.clock() no longer exists on Python 3.8+; fall back to it only
    # where perf_counter is unavailable (Python 2).
    timer = time.perf_counter if hasattr(time, 'perf_counter') else time.clock

    # training parameters
    epsilon = 0.1
    momentum = 0.9
    batch_size = 128
    # floor division keeps this an int even under true division
    num_batches = dat.shape[0] // batch_size

    # model parameters
    num_vis = dat.shape[1]
    num_hid = 4096

    # initialize weights
    w_vh = 0.1 * gpu.randn(num_vis, num_hid)
    w_v = gpu.zeros(num_vis)
    w_h = -4. * gpu.ones(num_hid)

    # initialize weight updates
    wu_vh = gpu.zeros((num_vis, num_hid))
    wu_v = gpu.zeros(num_vis)
    wu_h = gpu.zeros(num_hid)

    for epoch in range(num_epochs):
        err = []
        tic = timer()
        for batch in range(num_batches):
            # positive phase
            v1 = dat[batch * batch_size:(batch + 1) * batch_size]
            h1 = (gpu.dot(v1, w_vh) + w_h).logistic()
            # sample hiddens
            hSampled = h1.rand() < h1

            # negative phase
            v2 = (gpu.dot(hSampled, w_vh.T) + w_v).logistic()
            h2 = (gpu.dot(v2, w_vh) + w_h).logistic()

            # update weights
            wu_vh = wu_vh * momentum + gpu.dot(v1.T, h1) - gpu.dot(v2.T, h2)
            wu_v = wu_v * momentum + v1.sum(0) - v2.sum(0)
            wu_h = wu_h * momentum + h1.sum(0) - h2.sum(0)

            w_vh += wu_vh * (epsilon / batch_size)
            w_v += wu_v * (epsilon / batch_size)
            w_h += wu_h * (epsilon / batch_size)

            # calculate reconstruction error
            err.append((v2 - v1).euclid_norm()**2 / (num_vis * batch_size))
        toc = timer()
        # single parenthesized argument: same output on Python 2 and 3
        print("Mean squared error: %.4f, takes time: %d" % (numpy.mean(err), toc - tic))
    return w_vh, w_v, w_h
def test_gnumpy(dat, num_epochs):
    """Train an RBM with one step of contrastive divergence using gnumpy.

    dat: 2-dimensional training data, one example per row
        (the original benchmark used 60000 X 784)
    num_epochs: number of passes over the data

    Prints the mean reconstruction error and elapsed time for every epoch
    and returns (w_vh, w_v, w_h): the weight matrix, visible biases and
    hidden biases.  Trailing examples that do not fill a batch are ignored.
    """
    import gnumpy as gpu
    import numpy
    import time

    # time.clock() no longer exists on Python 3.8+; fall back to it only
    # where perf_counter is unavailable (Python 2).
    timer = time.perf_counter if hasattr(time, 'perf_counter') else time.clock

    # training parameters
    epsilon = 0.1
    momentum = 0.9
    batch_size = 128
    # floor division keeps this an int even under true division
    num_batches = dat.shape[0] // batch_size

    # model parameters
    num_vis = dat.shape[1]
    num_hid = 4096

    # initialize weights
    w_vh = 0.1 * gpu.randn(num_vis, num_hid)
    w_v = gpu.zeros(num_vis)
    w_h = -4. * gpu.ones(num_hid)

    # initialize weight updates
    wu_vh = gpu.zeros((num_vis, num_hid))
    wu_v = gpu.zeros(num_vis)
    wu_h = gpu.zeros(num_hid)

    for epoch in range(num_epochs):
        err = []
        tic = timer()
        for batch in range(num_batches):
            # positive phase
            v1 = dat[batch * batch_size:(batch + 1) * batch_size]
            h1 = (gpu.dot(v1, w_vh) + w_h).logistic()
            # sample hiddens
            hSampled = h1.rand() < h1

            # negative phase
            v2 = (gpu.dot(hSampled, w_vh.T) + w_v).logistic()
            h2 = (gpu.dot(v2, w_vh) + w_h).logistic()

            # update weights
            wu_vh = wu_vh * momentum + gpu.dot(v1.T, h1) - gpu.dot(v2.T, h2)
            wu_v = wu_v * momentum + v1.sum(0) - v2.sum(0)
            wu_h = wu_h * momentum + h1.sum(0) - h2.sum(0)

            w_vh += wu_vh * (epsilon / batch_size)
            w_v += wu_v * (epsilon / batch_size)
            w_h += wu_h * (epsilon / batch_size)

            # calculate reconstruction error
            err.append((v2 - v1).euclid_norm()**2 / (num_vis * batch_size))
        toc = timer()
        # single parenthesized argument: same output on Python 2 and 3
        print("Mean squared error: %.4f, takes time: %d" % (numpy.mean(err), toc - tic))
    return w_vh, w_v, w_h
def fine_tuning_cost_gpu(x, *args): inputSize, l1Size, l2Size, l3Size, l4Size, l5Size, lambda_val, inputs = args num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) num_weights_L3 = l3Size * (l2Size + 1) num_weights_L4 = l4Size * (l3Size + 1) num_weights_L5 = l5Size * (l4Size + 1) #num_weights_L6 = inputSize * (l5Size + 1) x = gpu.garray(x) inputs = gpu.garray(inputs) #weights1 = reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)) weights1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) #weights2 = reshape(x[num_weights_L1:num_weights_L1+num_weights_L2], (l2Size, l1Size + 1)) weights2 = x[num_weights_L1:num_weights_L1+num_weights_L2].reshape((l2Size, l1Size + 1)) #weights3 = reshape(x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3], (l3Size, l2Size + 1)) weights3 = x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3].reshape((l3Size, l2Size + 1)) #weights4 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4], (l4Size, l3Size + 1)) weights4 = x[num_weights_L1+num_weights_L2+num_weights_L3:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4].reshape((l4Size, l3Size + 1)) #weights5 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5], (l5Size, l4Size + 1)) weights5 = x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5].reshape((l5Size, l4Size + 1)) #weights6 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5:shape(x)[0]], (inputSize, l5Size+1)) weights6 = x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5:shape(x)[0]].reshape((inputSize, l5Size+1)) nData = shape(inputs)[1] x = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0) hidden1_sum = gpu.dot(weights1, x) 
hidden1_activation = hidden1_sum.logistic() hidden1_activation = gpu.concatenate((gpu.ones((1,nData)), hidden1_activation), axis = 0) hidden2_sum = gpu.dot(weights2, hidden1_activation) hidden2_activation = hidden2_sum.logistic() hidden2_activation = gpu.concatenate((gpu.ones((1,nData)), hidden2_activation), axis = 0) hidden3_sum = gpu.dot(weights3, hidden2_activation) hidden3_activation = hidden3_sum.logistic() hidden3_activation = gpu.concatenate((gpu.ones((1,nData)), hidden3_activation), axis = 0) hidden4_sum = gpu.dot(weights4, hidden3_activation) hidden4_activation = hidden4_sum.logistic() hidden4_activation = gpu.concatenate((gpu.ones((1,nData)), hidden4_activation), axis = 0) hidden5_sum = gpu.dot(weights5, hidden4_activation) hidden5_activation = hidden5_sum.logistic() hidden5_activation = gpu.concatenate((gpu.ones((1,nData)), hidden5_activation), axis = 0) output_sum = gpu.dot(weights6, hidden5_activation) outputs = output_sum.logistic() regularized_penalty4 = weights4[:,1:shape(weights4)[1]] regularized_penalty5 = weights5[:,1:shape(weights5)[1]] regularized_penalty6 = weights6[:,1:shape(weights6)[1]] regularized_penalty4 = regularized_penalty4 ** 2 regularized_penalty5 = regularized_penalty5 ** 2 regularized_penalty6 = regularized_penalty6 ** 2 output_target_diff = (outputs - inputs)**2 cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty4) + gpu.sum(regularized_penalty5) + gpu.sum(regularized_penalty6)) print 'Fine Tuning Cost: ', cost return cost
def __init__(self, in_dim=1, out_dim=1, init_scale=1.0, dropout=0, init_bias=0):
    """Create weights, bias and zeroed gradients for one dense layer.

    W is drawn from a normal distribution scaled by sqrt(init_scale / in_dim);
    every bias entry equals init_bias.  The instance takes the next
    LayerParams id.
    """
    fan_in_scale = math.sqrt(float(init_scale) / in_dim)
    weights = gnp.randn(in_dim, out_dim) * fan_in_scale
    offsets = gnp.ones(out_dim) * init_bias

    self.W, self.b = weights, offsets
    # gradients start as zero arrays with the parameters' shapes
    self.W_grad = weights * 0
    self.b_grad = offsets * 0

    self.param_size = weights.size + offsets.size
    self.dropout = dropout

    # get an ID for this param variable.
    self._param_id = LayerParams._param_count
    LayerParams._param_count += 1
def backward(self, Y, preds, IF, acts, words, X, Im):
    """
    Backward pass through the network.

    Computes the gradients and stores them on the instance as dR, dM, db,
    dC, dJ and dBj.  Nothing is returned.
    """
    batchsize = preds.shape[0]
    Im = gpu.garray(Im)

    # Output error averaged over the batch, and the part of df/dR that
    # comes from the output layer (last row of the product is the bias)
    Ix = gpu.garray(preds[:, :-1] - Y) / batchsize
    out_grad = gpu.dot(acts.T, Ix)
    db = out_grad[-1, :]
    # kept on the host so per-example columns can be accumulated below
    dR = (out_grad[:-1, :] + self.gamma_r * self.R).as_numpy_array()

    # Compute df/dC and word inputs for df/dR
    Ix = gpu.dot(Ix, self.R.T)
    dC = gpu.zeros(np.shape(self.C))
    for pos in range(self.context):
        context_weights = self.C[pos, :, :]
        dC[pos, :, :] = gpu.dot(words[:, :, pos].T, Ix) + self.gamma_c * context_weights
        word_grad = gpu.dot(Ix, context_weights.T).as_numpy_array()
        # scatter-add one example at a time so repeated indices accumulate
        for row in range(X.shape[0]):
            column = X[row, pos]
            dR[:, column] = dR[:, column] + word_grad[row, :]

    # Compute df/dM
    dM = gpu.dot(IF.T, Ix) + self.gamma_c * self.M

    # Compute df/dJ; (IF > 0) masks units whose value in IF is not positive
    Ix = gpu.dot(Ix, self.M.T) * (IF > 0)
    Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
    in_grad = gpu.dot(Im.T, Ix)
    dJ = in_grad[:-1, :] + self.gamma_c * self.J
    dBj = in_grad[-1, :]

    self.dR = gpu.garray(dR)
    self.dM = dM
    self.db = db
    self.dC = dC
    self.dJ = dJ
    self.dBj = dBj
def backprop(self):
    """Back-propagate the output error and collect weight and bias gradients.

    Fills self.results['grads'] and self.results['bias_grads'], inserting
    each new gradient at the front, and leaves the last propagated error in
    self.results['error'].
    """
    self.timer_logger('backprop', time.time())
    results = self.results
    results['grads'] = []
    results['bias_grads'] = []

    if self.problem == 'classification':
        # assumes softmax + cross entropy so that both gradients cancel out
        # to give: error = y-t
        results['error'] = results['current'] - gpu.garray(
            self.util.create_t_dataset(self.batch_y))
    else:
        # assumes linear unit + squared error cost function so that both
        # gradients cancel out to give: error = y-t
        results['error'] = (results['current'] - gpu.garray(self.batch_y))

    for pair in results['activations']:
        activation, weight = pair[0], pair[1]
        gradient = self.activation_gradient(activation)

        error = results['error']
        # a row of ones times the error sums it over the batch
        ones_row = gpu.ones((1, error.shape[0]))
        results['grads'].insert(0, gpu.dot(activation.T, error))
        results['bias_grads'].insert(0, gpu.dot(ones_row, error))
        results['error'] = gpu.dot(error, weight.T) * gradient

    self.timer_logger('backprop', time.time())
def feedforward(self, X, return_on_gpu=False):
    """Perform feedforward through this layer.

    Applies a dropout mask (and optional Gaussian noise) to X, records the
    mask in self.dYdX and the result in self.Y, and returns self.Y -- as a
    numpy array unless return_on_gpu is true.
    """
    # Cleanup debris from any previous feedforward
    self._cleanup()
    # Record (a pointer to) the passed input
    self.X = gp.garray(X)
    in_shape = (self.X.shape[0], self.X.shape[1])

    # Generate a dropout mask; all ones when dropout is effectively off
    if (self.drop_rate > 1e-4):
        drop_mask = self.drop_scale * (gp.rand(in_shape) > self.drop_rate)
    else:
        drop_mask = gp.ones(in_shape)
    self.dYdX = drop_mask

    # Optionally perturb the input with Gaussian noise before masking
    noisy_input = self.X
    if (self.fuzz_scale > 1e-4):
        fuzz_bump = (self.fuzz_scale / self.drop_scale) * gp.randn(in_shape)
        noisy_input = self.X + fuzz_bump
    self.Y = drop_mask * noisy_input

    if not return_on_gpu:
        self.Y = gp.as_numpy_array(self.Y)
    return self.Y
def l1svm_x(z, targets, predict=False, error=False, addon=0):
    """
    l1-SVM for the hinge loss, cross (mutually exclusive).

    z: score matrix, one row per example and one column per class
    targets: class index of each example (a single label per example)
    predict: when true, return only the arg-max class per row
    error: when true, also return the derivative of the loss w.r.t. z
    addon: extra value (e.g. a weight cost) added to the loss

    Internally the labels are expanded to a +1/-1 matrix.
    """
    if predict:
        # argmax(z)
        return gpu.argmax(z, axis=1)

    rows, cols = z.shape
    # +1 at the true class of each row, -1 everywhere else
    signs = -1 * gpu.ones((rows, cols))
    signs[np.arange(rows), targets] += 2

    margins = (1 - z * signs)
    violated = margins > 0
    loss = gpu.sum(violated * margins) + addon

    if not error:
        return loss
    return loss, -signs * violated
def l2svm_x(z, targets, predict=False, error=False, addon=0):
    """
    l2-SVM for the (squared) hinge loss, cross (mutually exclusive).

    z: score matrix, one row per example and one column per class
    targets: class index of each example (a single label per example)
    predict: when true, return only the arg-max class per row
    error: when true, also return the derivative of the loss w.r.t. z
    addon: extra value (e.g. a weight cost) added to the loss

    Internally the labels are expanded to a +1/-1 matrix.
    """
    if predict:
        # argmax(z)
        return gpu.argmax(z, axis=1)

    rows, cols = z.shape
    # +1 at the true class of each row, -1 everywhere else
    signs = -1 * gpu.ones((rows, cols))
    signs[np.arange(rows), targets] += 2

    margins = (1 - z * signs)
    # hinge: keep only the positive margins
    hinge = (margins > 0) * margins
    loss = gpu.sum(hinge**2) + addon

    if not error:
        return loss
    return loss, -2 * signs * hinge
def __init__(self, in_dim=(1,), out_dim=1, init_scale=1.0, dropout=(0,), init_bias=0):
    """Create parameters for a layer fed by several inputs.

    in_dim: sequence of input widths, one weight matrix per entry
    out_dim: width of the layer output
    init_scale: each weight matrix is scaled by sqrt(init_scale / fan_in)
    dropout: sequence of per-input dropout rates; when its length does not
        match in_dim, its first entry is repeated for every input
    init_bias: initial value of every bias entry

    The defaults are immutable tuples and the dropout rates are copied into
    a fresh list, so instances never share (or alias the caller's) list.
    """
    self.n_inputs = len(in_dim)
    self.W = [gnp.randn(d, out_dim) * math.sqrt(float(init_scale) / d)
              for d in in_dim]
    self.b = gnp.ones(out_dim) * init_bias

    self.W_grad = [W * 0 for W in self.W]
    self.b_grad = self.b * 0

    self.param_size = sum([W.size for W in self.W]) + self.b.size

    if len(dropout) == self.n_inputs:
        self.dropout = list(dropout)
    else:
        # broadcast the first rate to every input
        self.dropout = list(dropout[:1]) * self.n_inputs

    # get an ID for this param variable.
    self._param_id = LayerParams._param_count
    LayerParams._param_count += 1
def build_layer(self, in_dim, out_dim, nonlin, dropout=0, sparsity=0,
                sparsity_weight=0, init_scale=1e-1, loss=None, params=None,
                loss_after_nonlin=False, init_bias=0,
                use_batch_normalization=False):
    """Configure this layer: nonlinearity, parameters, sparsity, loss, batch norm.

    Fresh LayerParams are created from (in_dim, out_dim, init_scale,
    dropout, init_bias) unless params is supplied, in which case params is
    used as-is.  Sparsity tracking state is created only when
    sparsity_weight is positive, and a BatchNormalizationLayer only when
    use_batch_normalization is true.
    """
    self.nonlin = nonlin

    if params is None:
        params = LayerParams(in_dim, out_dim, init_scale, dropout,
                             init_bias=init_bias)
    self.set_params(params)

    # sparsity settings and running statistics
    self.sparsity = sparsity
    self.sparsity_weight = sparsity_weight
    if self.sparsity_weight > 0:
        self._sparsity_current = gnp.ones(out_dim) * sparsity
        self._sparsity_smoothing = 0.9
        self._sparsity_objective = 0

    # loss bookkeeping
    self.loss = loss
    self.loss_value = 0
    self.noise_added = False
    self.loss_computed = False
    self.loss_after_nonlin = loss_after_nonlin

    # optional batch normalization
    self.use_batch_normalization = use_batch_normalization
    if use_batch_normalization:
        self.bn_layer = BatchNormalizationLayer(out_dim)
        self._bn_layer_param_id = self.bn_layer._param_id
def fine_tuning_cost_gpu(x, *args): inputSize, l1Size, l2Size, l3Size, lambda_val, inputs = args num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) num_weights_L3 = l3Size * (l2Size + 1) x = gpu.garray(x) inputs = gpu.garray(inputs) weights1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) weights2 = x[num_weights_L1:num_weights_L1+num_weights_L2].reshape((l2Size, l1Size + 1)) weights3 = x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3].reshape((l3Size, l2Size + 1)) weights4 = x[num_weights_L1+num_weights_L2+num_weights_L3:shape(x)[0]].reshape((inputSize, l3Size + 1)) nData = shape(inputs)[1] x = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0) hidden1_sum = gpu.dot(weights1, x) #hidden1_activation = gpu.log(1+hidden1_sum.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden1_sum)) * (hidden1_sum>0) hidden1_activation = hidden1_sum*relu_mask_hidden1 hidden1_activation = gpu.concatenate((gpu.ones((1,nData)), hidden1_activation), axis = 0) hidden2_sum = gpu.dot(weights2, hidden1_activation) #hidden2_activation = gpu.log(1+hidden2_sum.exp()) relu_mask_hidden2 = gpu.ones(shape(hidden2_sum)) * (hidden2_sum>0) hidden2_activation = hidden2_sum*relu_mask_hidden2 hidden2_activation = gpu.concatenate((gpu.ones((1,nData)), hidden2_activation), axis = 0) hidden3_sum = gpu.dot(weights3, hidden2_activation) hidden3_activation = hidden3_sum hidden3_activation = gpu.concatenate((gpu.ones((1,nData)), hidden3_activation), axis = 0) output_sum = gpu.dot(weights4, hidden3_activation) outputs = output_sum regularized_penalty3 = weights3[:,1:shape(weights3)[1]] regularized_penalty4 = weights4[:,1:shape(weights4)[1]] regularized_penalty3 = regularized_penalty3 ** 2 regularized_penalty4 = regularized_penalty4 ** 2 output_target_diff = (outputs - inputs)**2 cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty3) + gpu.sum(regularized_penalty4)) print 'Fine Tuning Cost: ', cost return 
cost
# Fragment of a softmax-regression training script; w, b, m, X, t and
# batch_size are defined before this point (not visible here).
mb = gpu.zeros((1,10))  # momentum buffer for the bias
alpha = 0.1             # learning rate
momentum = 0.5
momentum_type = 1       # 1 = Nesterov momentum, 2 = classic momentum
# NOTE(review): both loops use `i`; the inner loop overwrites the outer
# counter, which is harmless only because the outer value is never read.
for i in xrange(200):
    for i in xrange(X.shape[0]):
        if momentum_type == 1:
            '''Use nesterov momentum to train the weights '''
            # evaluate the gradient at the look-ahead position
            n = w + (m*momentum)
            nb = b + (mb*momentum)
            out = gpu.softmax(gpu.dot(X[i],n)+nb)
            # a row of ones times the error sums it over the batch
            gradb = gpu.dot(gpu.ones((1,batch_size)),out - t[i])
            grad = gpu.dot(X[i].T,out - t[i])
            # NOTE(review): 128. looks like a hard-coded batch size --
            # confirm it matches batch_size
            m = m*momentum - (alpha*grad/128.)
            mb = mb*momentum - (alpha*gradb/128.)
            w += m
            b += mb
        elif momentum_type == 2:
            '''Use classic momentum to train the weights '''
            out = gpu.softmax(gpu.dot(X[i],w)+b)
            gradb = gpu.dot(gpu.ones((1,batch_size)),out - t[i])
            grad = gpu.dot(X[i].T,out - t[i])
            m = m*momentum - (alpha*grad/128.)
            # NOTE(review): the fragment ends here; no bias-momentum update
            # or weight update for this branch is visible.
def bias(X, bias_val=1.0):
    """Append a bias column of magnitude bias_val to X.

    X: 2-dimensional array with one example per row
    bias_val: value placed in every entry of the appended column
        (previously ignored, so the column was always ones)

    Returns a new array with one more column than X.
    """
    bias_column = bias_val * gp.ones((X.shape[0], 1))
    return gp.concatenate((X, bias_column), axis=1)
def __init__(self, config, name):
    """Initialize the base class and precompute a table of factorials.

    self.factor[i] holds i! for i in 0..999, built with the recurrence
    factor[i] = factor[i - 1] * i.  self.N starts out as None.
    """
    super(PAE, self).__init__(config, name)
    self.factor = gp.ones(1000)
    # NOTE(review): if gp stores single-precision floats, entries beyond
    # roughly 34! will overflow -- confirm this is acceptable.
    index = 1
    while index < self.factor.size:
        self.factor[index] = self.factor[index - 1] * index
        index += 1
    self.N = None
def backprop_gradient(self, v, network, X, targets, weights):
    '''
    Calculates the value of the cost function and the gradient for CG
    optimization.

    args:
        array v:            the 1d vector of weights
        list[obj] network:  the network
        array X:            training data
        array targets:      the training targets
        array weights:      the backprop weights

    returns:
        array cost: the value of the cost function
        array grad: the value of the gradient

    This function is called by scipy's minimize function during
    optimization.  Note that it overwrites W and hbias of every layer in
    `network` with the values unpacked from v.
    '''
    # work with v as a column vector
    if len(v.shape) == 1:
        v = v.reshape((v.shape[0],1))

    # initialize variables
    n = X.shape[0]
    numHiddenLayers = len(network)

    # put the v weights back into the network
    # (layout per layer: flattened W first, then hbias)
    ind =0
    for i in range(numHiddenLayers):
        h,w = network[i].W.shape
        network[i].W = gp.garray((v[ind:(ind+h*w)]).reshape((h,w)))
        ind += h*w
        b = network[i].hbias.shape[0]
        network[i].hbias = gp.garray(v[ind:(ind+b)]).reshape((b,1))
        ind += b

    # Run data through the network, keeping activations of each layer
    acts = [X] # a list of numpy arrays
    hid = X
    for layer in network:
        vis = gp.garray(hid)
        hid = self.get_activation(layer, vis)
        acts.append(hid)
        gp.free_reuse_cache()

    # store the gradients
    dW = []
    db = []

    # Compute the value of the cost function
    if self.targetCost == 'crossEntropy':
        # see www.stanford.edu/group/pdplab/pdphandbook/handbookch6.html
        cost = (-1.0/n) * np.sum(np.sum(targets * np.log(acts[-1]) + \
            (1.0 - targets) * np.log(1.0 - acts[-1]), axis=1) * weights.T)
        Ix = (acts[-1] - targets) / n
    else: #self.targetCost == 'linSquaredErr':
        cost = 0.5 * np.sum(np.sum(np.square(acts[-1] - targets), axis=1) * \
            weights.T)
        Ix = (acts[-1] - targets)
    # scale each example's error by its backprop weight
    Ix *= np.tile(weights, (1, Ix.shape[1])).reshape((Ix.shape[0],Ix.shape[1]))
    Ix = gp.garray(Ix)

    # Compute the gradients, walking from the last layer to the first
    for i in range(numHiddenLayers-1,-1,-1):
        # augment activations with ones
        acts[i] = gp.garray(acts[i])
        acts[i] = gp.concatenate((acts[i], gp.ones((n,1))), axis=1)

        # compute delta in next layer
        delta = gp.dot(acts[i].T, Ix)

        # split delta into weights and bias parts
        dW.append(delta[:-1,:].T)
        db.append(delta[-1,:].T)

        # backpropagate the error
        if i > 0:
            if network[i-1].hidtype == 'sigmoid':
                Ix = gp.dot(Ix,gp.concatenate((network[i].W,network[i].hbias), axis=1)) * acts[i] * (1.0 - acts[i])
            elif network[i-1].hidtype == 'gaussian':
                Ix = gp.dot(Ix,gp.concatenate((network[i].W,network[i].hbias), axis=1))
            # drop the column that corresponds to the appended bias unit
            Ix = Ix[:,:-1]
        gp.free_reuse_cache()

    # gradients were collected last layer first; restore network order
    dW.reverse()
    db.reverse()

    # Convert gradient information back into the flat layout of v
    grad = np.zeros_like(v)
    ind = 0
    for i in range(numHiddenLayers):
        grad[ind:(ind+dW[i].size)] = \
            (dW[i].reshape((dW[i].shape[0]*dW[i].shape[1],1))).as_numpy_array()
        ind += dW[i].size
        grad[ind:(ind+db[i].size),0] = db[i].as_numpy_array()
        ind += db[i].size
    grad = grad.reshape((grad.shape[0],))
    return cost, grad
# One training step of a two-layer network (logistic hidden layer, softmax
# output) with input/hidden dropout, Nesterov momentum and L2 weight decay.
# w1, w2, b1, b2, the momentum buffers, X, t, d02, d05, rng, L2, alpha,
# batch_size and time_softmax are defined before this point.

# look-ahead parameters for Nesterov momentum
n1 = w1+(m1*momentum)  # nesterov updates 2.2 sec
n2 = w2+(m2*momentum)
nb1 = b1+(mb1*momentum)
nb2 = b2+(mb2*momentum)

# forward pass with randomly chosen precomputed dropout masks
z0 = X[i]*d02[rng.randint(0,75)]
z1 = (gpu.dot(z0,n1)+nb1).logistic()*d05[rng.randint(0,75)]  # dropout and activations 7.1 sec
t0 = time.time()
feedforward = gpu.softmax(gpu.dot(z1,n2)+nb2)
time_softmax += time.time() - t0  # softmax 0.48 sec

# gradients
e1 = (feedforward - t[i])
# hidden-layer delta, computed once and shared by the weight and bias grads
delta1 = gpu.dot(e1,n2.T)*z1*(1-z1)
grad2 = gpu.dot(z1.T,e1)
# the first layer saw the dropped-out input z0 in the forward pass, so its
# weight gradient must use z0 as well (previously the raw X[i] was used)
grad1 = gpu.dot(z0.T,delta1)  # grads 6 sec
gradb2 = gpu.dot(gpu.ones((1, batch_size)),e1)
gradb1 = gpu.dot(gpu.ones((1, batch_size)),delta1)

# momentum and weight updates
m1 = (momentum*m1) - ((grad1 + n1*L2)*alpha/(batch_size*1.0))  # momentum and weight updates 7.4 sec
m2 = (momentum*m2) - ((grad2 + n2*L2)*alpha/(batch_size*1.0))
mb1 = (momentum*mb1) - ((gradb1 + nb1*L2)*alpha/(batch_size*1.0))
mb2 = (momentum*mb2) - ((gradb2 + nb2*L2)*alpha/(batch_size*1.0))
w1 = w1 + m1
w2 = w2 + m2
b1 = b1 + mb1
b2 = b2 + mb2

# momentum schedule: grow slowly and cap at 0.95
# NOTE(review): the nesting level of this schedule (per batch vs per epoch)
# cannot be determined from this fragment -- confirm against the full loop.
momentum = momentum + 0.001
if momentum > 0.95:
    momentum = 0.95
def uniform(cls, nvis, nhid):
    """Build an instance via from_independent with every entry set to 0.5.

    nvis and nhid give the sizes of the two constant vectors that are
    passed, in that order, to cls.from_independent.
    """
    visible_half = 0.5 * gnp.ones(nvis)
    hidden_half = 0.5 * gnp.ones(nhid)
    return cls.from_independent(visible_half, hidden_half)
def mlpSoftmax_costfunc(x, *args):
    """Cost and gradient of a three-hidden-layer ReLU network with softmax output.

    x is the flat parameter vector: three hidden weight matrices (each with
    a leading bias column) followed by the softmax weights (no bias).  args
    unpacks to (numClasses, inputSize, l1Size, l2Size, l3Size,
    lambda_softmax, lambda_hidden, inputs, labels, groundTruth,
    dropout_probability).

    Fresh random dropout masks are drawn on every call.  Prints the
    prediction accuracy and the cost components, and returns (cost, grad)
    where grad is flat and laid out like x.
    """
    numClasses, inputSize, l1Size, l2Size, l3Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth, dropout_probability = args
    numCases = shape(inputs)[1]

    # number of entries each weight matrix occupies in x
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_L3 = l3Size * (l2Size + 1)
    num_weights_softmax = numClasses * l3Size

    inputs = gpu.garray(inputs)

    # unpack the flat parameter vector
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1], (l2Size, l1Size + 1)))
    theta_L3 = gpu.garray(reshape(x[num_weights_L2 + num_weights_L1:num_weights_L2 + num_weights_L1 + num_weights_L3], (l3Size, l2Size + 1)))
    theta_softmax = gpu.garray(reshape(x[num_weights_L2 + num_weights_L1 + num_weights_L3:shape(x)[0]], (numClasses, l3Size)))

    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_L2_grad = gpu.zeros(shape(theta_L2))
    theta_L3_grad = gpu.zeros(shape(theta_L3))

    # dropout masks; the first two also cover the bias row
    # (presumably scipy.stats.bernoulli, so entries are 1 with probability
    # dropout_probability -- i.e. it acts as a keep probability; confirm)
    dropout_l1 = gpu.garray(bernoulli.rvs(dropout_probability, size=(l1Size + 1, numCases)))
    dropout_l2 = gpu.garray(bernoulli.rvs(dropout_probability, size=(l2Size + 1, numCases)))
    dropout_l3 = gpu.garray(bernoulli.rvs(dropout_probability, size=(l3Size, numCases)))

    # ---- forward pass ----
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)

    # layer 1: ReLU through a 0/1 mask; the mask doubles as the derivative
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1
    hidden_derivative_L1 = gpu.concatenate((gpu.ones((1, numCases)), hidden_derivative_L1), axis=0)
    hidden_activation_L1 = gpu.concatenate((gpu.ones((1, numCases)), hidden_activation_L1), axis=0) * dropout_l1

    # layer 2
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    relu_mask_hidden2 = gpu.ones(shape(hidden_sum_L2)) * (hidden_sum_L2 > 0)
    hidden_activation_L2 = hidden_sum_L2 * relu_mask_hidden2
    hidden_derivative_L2 = relu_mask_hidden2
    hidden_derivative_L2 = gpu.concatenate((gpu.ones((1, numCases)), hidden_derivative_L2), axis=0)
    hidden_activation_L2 = gpu.concatenate((gpu.ones((1, numCases)), hidden_activation_L2), axis=0) * dropout_l2

    # layer 3 (no bias row: the softmax layer has no bias column)
    hidden_sum_L3 = gpu.dot(theta_L3, hidden_activation_L2)
    relu_mask_hidden3 = gpu.ones(shape(hidden_sum_L3)) * (hidden_sum_L3 > 0)
    hidden_derivative_L3 = relu_mask_hidden3
    hidden_activation_L3 = hidden_sum_L3 * relu_mask_hidden3 * dropout_l3

    # softmax, shifted by the column maximum before exponentiating
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L3)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)

    # predicted class index is shifted by one before comparing with labels
    pred = predictions.argmax(axis=0) + 1
    accuracy = mean(pred == labels) * 100

    # log-likelihood on the host; clamp log(0) and scrub NaNs
    temp = groundTruth * gpu.log(predictions)
    temp = temp.as_numpy_array()
    temp[temp == -inf] = -200.0
    temp = nan_to_num(temp)

    # weight decay on the hidden layers skips the bias column
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L3 = theta_L3[:, 1:shape(theta_L3)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2
    regularized_penalty_L3 = regularized_penalty_L3 * regularized_penalty_L3

    pred_cost = -1 * sum(temp) / numCases
    l2norm_cost = 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L3) + gpu.sum(regularized_penalty_L2) + gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax)
    cost = pred_cost + l2norm_cost

    print 'Prediction Accuracy: ', accuracy, '%'
    print 'Multilayer Softmax Prediction Cost: ', pred_cost
    print 'Multilayer Softmax L2 Normalisation Cost: ', l2norm_cost
    print 'Multilayer Softmax Cost: ', cost
    print '--------------------------------------------------------------------'

    # ---- backward pass ----
    # NOTE(review): the deltas are multiplied by the ReLU masks only; the
    # dropout masks applied in the forward pass do not appear here --
    # confirm whether that is intended.
    softmax_imd = groundTruth - predictions
    theta_softmax_grad = -1 * gpu.dot(softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array()))) / numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd

    # layer 3
    delta_L3_imd = gpu.dot(gpu.garray(transpose(theta_softmax.as_numpy_array())), deltaOut)
    delta_L3_imd2 = delta_L3_imd * hidden_derivative_L3
    delta_L3 = gpu.dot(delta_L3_imd2, gpu.garray(transpose(hidden_activation_L2.as_numpy_array())))
    theta_L3_grad += delta_L3

    # layer 2 (row 0 is the bias unit and is dropped from the delta)
    delta_L2_imd = gpu.dot(gpu.garray(transpose(theta_L3.as_numpy_array())), delta_L3_imd2)
    delta_L2_imd2 = delta_L2_imd * hidden_derivative_L2
    delta_L2_imd2 = delta_L2_imd2[1:shape(delta_L2_imd2)[0] + 1, :]
    delta_L2 = gpu.dot(delta_L2_imd2, gpu.garray(transpose(hidden_activation_L1.as_numpy_array())))
    theta_L2_grad += delta_L2

    # layer 1 (row 0 is the bias unit and is dropped from the delta)
    delta_L1_imd = gpu.dot(gpu.garray(transpose(theta_L2.as_numpy_array())), delta_L2_imd2)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1_imd2 = delta_L1_imd2[1:shape(delta_L1_imd2)[0] + 1, :]
    delta_L1 = gpu.dot(delta_L1_imd2, gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta_L1

    # average over the batch
    theta_L1_grad = theta_L1_grad / numCases
    theta_L2_grad = theta_L2_grad / numCases
    theta_L3_grad = theta_L3_grad / numCases

    # weight-decay gradient on everything except the bias column
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden
    theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] = theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] + theta_L2[:, 1:shape(theta_L2)[1]] * lambda_hidden
    theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] = theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] + theta_L3[:, 1:shape(theta_L3)[1]] * lambda_hidden

    # flatten back to host vectors in the layout of x
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_L2_grad = reshape(theta_L2_grad.as_numpy_array(), num_weights_L2)
    theta_L3_grad = reshape(theta_L3_grad.as_numpy_array(), num_weights_L3)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(), num_weights_softmax)

    # drop references so free_reuse_cache can release the GPU buffers
    del inputs
    del theta_L1
    del theta_L2
    del theta_L3
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_activation_L3
    del hidden_sum_L3
    del hidden_sum_softmax
    del predictions
    del temp
    del softmax_imd
    del deltaOut
    del delta_L3_imd
    del delta_L3_imd2
    del delta_L3
    del delta_L2_imd
    del delta_L2_imd2
    del delta_L2
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    gpu.free_reuse_cache()
    return cost, hstack((theta_L1_grad, theta_L2_grad, theta_L3_grad, theta_softmax_grad))
def ones(shape):
    """Return an all-ones array of the given shape from the active backend.

    Uses gp when gpu.GPU is truthy and np otherwise.
    """
    backend = gp if gpu.GPU else np
    return backend.ones(shape)