def mlpSingleOutput1Layer_costfunc(x, *args):
    """Cost of a one-hidden-layer MLP with a single logistic output unit.

    The cost is half the mean squared error between the network output and
    ``targets`` plus an L2 penalty (weighted by ``lambda_hidden``) on every
    weight that does not multiply a bias input.

    Args:
        x: flat parameter vector.  The first ``l1Size * (inputSize + 1)``
            entries are reshaped into the hidden-layer weights and the rest
            into the ``(1, l1Size + 1)`` output weights; column 0 of each
            matrix multiplies the constant bias input.
        *args: ``(inputSize, l1Size, lambda_hidden, inputs, targets)``.
            ``inputs`` stores one case per column.  ``targets`` is presumably
            ``(1, numCases)`` so that it lines up with the network output --
            TODO confirm against the caller.

    Returns:
        The scalar cost.

    Side effects:
        Prints the cost and calls ``gpu.free_reuse_cache()``.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)

    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # Forward pass; a row of ones is prepended at each layer for the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()
    output_target_diff = (outputs - targets) ** 2

    # Squared weights, skipping column 0 (bias weights are not penalised).
    regularized_penalty_output = theta_output[:, 1:shape(theta_output)[1]]
    regularized_penalty_output = (
        regularized_penalty_output * regularized_penalty_output)
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1

    cost = gpu.sum(output_target_diff) / (2 * numCases) + \
        0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) +
                               gpu.sum(regularized_penalty_output))

    # Single-argument print: same text as the former print statement, but
    # valid syntax on both Python 2 and Python 3.
    print('%s %s' % ('Multilayer Preceptron Cost:', cost))

    # Drop every GPU temporary before flushing the reuse cache; a garray that
    # is still referenced keeps its buffer alive across the flush.
    del inputs, targets, theta_L1, theta_output
    del hidden_sum_L1, hidden_activation_L1, hidden_sum_output, outputs
    del output_target_diff
    del regularized_penalty_output, regularized_penalty_L1
    gpu.free_reuse_cache()
    return cost
def next(self):
    """
    Return the next minibatch, wrapping around the end of the data.

    When the remaining cases of the current pass cannot fill a whole
    minibatch, the leftover cases are kept, ``self.shuffle_data()`` is called
    (presumably re-permuting ``self.idx`` -- verify in the class), and the
    batch is topped up from the start of the new ordering.

    Returns ``(minibatch_x, minibatch_t)`` if ``self.t`` is not None,
    otherwise only ``minibatch_x``.
    """
    has_targets = self.t is not None
    stop = self.i_ptr + self.minibatch_size

    if stop <= self.n_cases:
        # The whole batch fits inside the current pass.
        picks = self.idx[self.i_ptr:stop]
        minibatch_x = self.x[picks]
        minibatch_t = self.t[picks] if has_targets else None
        self.i_ptr = stop
    else:
        # Keep whatever is left of the current pass.  It is copied because
        # the data is reshuffled before the batch is assembled.
        tail_x = None
        tail_t = None
        if self.i_ptr < self.n_cases:
            leftover = self.idx[self.i_ptr:]
            tail_x = self.x[leftover].copy()
            if has_targets:
                tail_t = self.t[leftover].copy()

        head_size = self.minibatch_size - (self.n_cases - self.i_ptr)
        self.shuffle_data()

        def stitch(tail, source):
            # Join the saved tail with the first rows of the new ordering.
            if tail is None:
                return source[self.idx[:head_size]]
            if isinstance(source, gnp.garray):
                return gnp.concatenate(
                    [tail, source[self.idx[:head_size]]], axis=0)
            return np.r_[tail, source[self.idx[:head_size]]]

        minibatch_x = stitch(tail_x, self.x)
        minibatch_t = stitch(tail_t, self.t) if has_targets else None
        self.i_ptr = head_size

    if has_targets:
        return minibatch_x, minibatch_t
    return minibatch_x
def costfunc_gpu_ReLU(x, *args):
    """Cost of a sparse auto-encoder with a linear decoder.

    The hidden activation computed here is ``log(1 + exp(z))``.  The cost is
    half the mean squared reconstruction error, plus an L2 penalty on the
    non-bias weights, plus ``beta`` times a KL sparsity term on the mean
    hidden activation.

    Args:
        x: flat parameter vector; the first ``(num_input + 1) * num_hidden``
            entries are the encoder weights, the rest the decoder weights.
            Column 0 of each reshaped matrix multiplies the bias input.
        *args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
            sparsityParam, beta)``; ``inputs`` stores one case per column and
            is also the reconstruction target.

    Returns:
        The scalar cost.

    Side effects:
        Prints the cost and calls ``gpu.free_reuse_cache()``.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    hidden_sum = gpu.dot(weights1, data)
    # NOTE(review): this is the soft-plus function, whereas
    # grad_costfunc_gpu_ReLU uses a hard ReLU mask and has no sparsity term.
    # The mean of a soft-plus activation can also exceed 1, which would make
    # the second log below undefined.  Confirm cost and gradient are meant
    # to pair up before relying on them together.
    hidden_activation = gpu.log(1 + hidden_sum.exp())
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # Squared weights, skipping column 0 (bias weights are not penalised).
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - inputs) * (output - inputs)
    KL = gpu.sum(sparsityParam * gpu.log(sparsityParam / p_avg) +
                 (1 - sparsityParam) *
                 gpu.log((1 - sparsityParam) / (1 - p_avg)))
    cost = gpu.sum(output_target_diff) / (2 * nData) + \
        0.5 * lambda_val * (gpu.sum(regularized_penalty1) +
                            gpu.sum(regularized_penalty2)) + beta * KL

    # Single-argument print: same text as the former print statement, but
    # valid syntax on both Python 2 and Python 3.
    print('%s %s' % ('ReLU Linear Decoder Cost: ', cost))

    # Release the GPU temporaries the same way costfunc_gpu and
    # grad_costfunc_gpu_ReLU do.
    del x, inputs, data, weights1, weights2
    del hidden_sum, hidden_activation, p_avg, output
    del regularized_penalty1, regularized_penalty2, output_target_diff
    gpu.free_reuse_cache()
    return cost
def forward(self, X, test=False):
    """
    Feed-forward pass through the model

    X: ('batchsize' x 'context') matrix of word indices
    test: not used by this method

    Returns a tuple (words, acts, preds):
      words: garray of shape (batchsize, K, context); words[i, :, j] is the
             column of R selected by X[i, j]
      acts:  garray of shape (batchsize, K + 1); the summed context
             projections followed by a column of ones
      preds: numpy array; row-wise softmax of the output scores followed by
             a column of ones
    """
    batchsize = X.shape[0]
    R = self.R
    C = self.C
    bw = self.bw

    # Obtain word features.  After the column-major flatten every run of K
    # values is one word vector, so a plain reshape gives
    # (batchsize, context, K); swapping the last two axes yields the
    # (batchsize, K, context) layout without a Python loop over the batch.
    tmp = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
    tmp = tmp.reshape((batchsize, self.context, self.K))
    words = gpu.garray(np.ascontiguousarray(tmp.transpose(0, 2, 1)))

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

    # Compute softmax; the row maximum is subtracted before exponentiating
    preds = gpu.dot(acts, gpu.concatenate((R, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def forward(self, X, test=False):
    """
    Feed-forward pass through the model

    X: ('batchsize' x 'context') matrix of word indices
    test: not used by this method

    Returns a tuple (words, acts, preds):
      words: garray of shape (batchsize, K, context); words[i, :, j] is the
             column of R selected by X[i, j]
      acts:  garray of shape (batchsize, K + 1); the summed context
             projections followed by a column of ones
      preds: numpy array; row-wise softmax of the output scores followed by
             a column of ones
    """
    batchsize = X.shape[0]
    R = self.R
    C = self.C
    bw = self.bw

    # Obtain word features.  After the column-major flatten every run of K
    # values is one word vector, so a plain reshape gives
    # (batchsize, context, K); swapping the last two axes yields the
    # (batchsize, K, context) layout without a Python loop over the batch.
    tmp = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
    tmp = tmp.reshape((batchsize, self.context, self.K))
    words = gpu.garray(np.ascontiguousarray(tmp.transpose(0, 2, 1)))

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

    # Compute softmax; the row maximum is subtracted before exponentiating
    preds = gpu.dot(acts, gpu.concatenate((R, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def grad_costfunc_gpu_ReLU(x, *args):
    """Gradient for an auto-encoder with ReLU hidden units and linear output.

    Differentiates half the mean squared reconstruction error plus an L2
    penalty on the non-bias weights.  ``sparsityParam`` and ``beta`` are
    unpacked but not used here.

    Args:
        x: flat parameter vector; the first ``(num_input + 1) * num_hidden``
            entries are the encoder weights, the rest the decoder weights.
            Column 0 of each reshaped matrix multiplies the bias input.
        *args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
            sparsityParam, beta)``; ``inputs`` stores one case per column and
            is also the reconstruction target.

    Returns:
        A flat numpy array: encoder gradient followed by decoder gradient.

    Side effects:
        Calls ``gpu.free_reuse_cache()``.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    # Forward pass.  The 0/1 mask is both the ReLU gate and its derivative.
    hidden_sum = gpu.dot(weights1, data)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    hidden_derivative = gpu.concatenate(
        (gpu.ones((1, nData)), relu_mask_hidden1), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    # Backward pass.  Transposes stay on the GPU instead of being copied to
    # host memory and back.
    p = outputs - inputs
    weights2_grad = gpu.dot(p, hidden_activation.T) / nData
    q = gpu.dot(weights2.T, p) * hidden_derivative
    delta2 = gpu.dot(q, data.T)
    # Row 0 of delta2 belongs to the hidden bias unit, which has no
    # incoming weights.
    weights1_grad = delta2[1:shape(delta2)[0], :] / nData

    # L2 penalty on everything except the bias column.
    weights1_grad[:, 1:shape(weights1_grad)[1]] = \
        weights1_grad[:, 1:shape(weights1_grad)[1]] + \
        weights1[:, 1:shape(weights1)[1]] * lambda_val
    weights2_grad[:, 1:shape(weights2_grad)[1]] = \
        weights2_grad[:, 1:shape(weights2_grad)[1]] + \
        weights2[:, 1:shape(weights2)[1]] * lambda_val

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)

    # Drop every GPU temporary before flushing the reuse cache.
    del x, inputs, data, weights1, weights2
    del hidden_sum, relu_mask_hidden1, hidden_activation, hidden_derivative
    del outputs, p, q, delta2
    gpu.free_reuse_cache()
    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
def forwardProp(X, theta1, theta2):
    """Forward pass through a two-layer sigmoid network.

    A column of ones is appended to ``X`` and a row of ones to the hidden
    activations before each weight matrix is applied.

    Returns ``(a1, a2, a3)``: the augmented input, the augmented hidden
    activations and the output activations.
    """
    n_rows = np.size(X[:, 0])
    input_bias = gpu.ones((n_rows, 1))
    a1 = gpu.concatenate((X, input_bias), axis=1)

    hidden = sigmoid(theta1.dot(a1.T))
    n_cols = np.size(hidden[0, :])
    hidden_bias = gpu.ones((1, n_cols))
    a2 = gpu.concatenate((hidden, hidden_bias), axis=0)

    a3 = sigmoid(theta2.dot(a2))
    return a1, a2, a3
def mlpSoftmax_costfunc(x, *args):
    """Cost of a two-hidden-layer logistic MLP with a softmax output layer.

    The cost is the mean cross-entropy against ``groundTruth`` plus L2
    penalties: ``lambda_hidden`` on the non-bias weights of both hidden
    layers and ``lambda_softmax`` on all softmax weights.

    Args:
        x: flat parameter vector holding, in order, the layer-1 weights
            ``(l1Size, inputSize + 1)``, the layer-2 weights
            ``(l2Size, l1Size + 1)`` and the softmax weights
            ``(numClasses, l2Size)``.
        *args: ``(numClasses, inputSize, l1Size, l2Size, lambda_softmax,
            lambda_hidden, inputs, labels, groundTruth)``.  ``inputs`` stores
            one case per column; ``labels`` is not used here.
            ``groundTruth`` is presumably a ``(numClasses, numCases)``
            indicator matrix -- TODO confirm against the caller.

    Returns:
        The scalar cost.

    Side effects:
        Prints the cost and calls ``gpu.free_reuse_cache()``.
    """
    (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden,
     inputs, labels, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]],
                (numClasses, l2Size)))

    # Forward pass; a bias row of ones is prepended for both hidden layers.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)

    # Log-softmax is computed as score - log(sum(exp(score))) rather than
    # log(exp(score) / sum).  When a probability underflows to zero the
    # latter gives log(0) = -inf, and 0 * -inf in the product below turns
    # the whole cost into NaN.
    exp_scores = hidden_sum_softmax.exp()
    log_predictions = hidden_sum_softmax - \
        gpu.log(gpu.sum(exp_scores, axis=0))
    temp = groundTruth * log_predictions

    # Squared weights, skipping column 0 (bias weights are not penalised).
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2

    cost = -1 * gpu.sum(temp) / numCases + \
        0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) +
                               gpu.sum(regularized_penalty_L2)) + \
        0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax)

    # Single-argument print: same text as the former print statement, but
    # valid syntax on both Python 2 and Python 3.
    print('%s %s' % ('Multilayer Softmax Cost:', cost))

    # Drop every GPU temporary before flushing the reuse cache.
    del inputs, theta_L1, theta_L2, theta_softmax
    del hidden_sum_L1, hidden_activation_L1
    del hidden_sum_L2, hidden_activation_L2
    del hidden_sum_softmax, exp_scores, log_predictions, temp
    del regularized_penalty_L1, regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
def getMMReps(self, data):
    """Return the multi-modal representation of ``data`` as a numpy array.

    ``data[i]`` is fed through ``self.saes[i]`` and the last element of each
    ``forward2Top`` result is kept.  These are concatenated along axis 1 and,
    when ``self.has_joint`` is set, additionally passed through
    ``self.jsae``.
    """
    tops = [self.saes[i].forward2Top(data[i])[-1]
            for i in range(self.modalsCnt)]
    fused = gp.concatenate(tuple(tops), axis=1)
    if not self.has_joint:
        return fused.as_numpy_array()
    joint_layers = self.jsae.forward2Top(fused)
    return joint_layers[-1].as_numpy_array()
def grad_costfunc_gpu(x, *args):
    """Gradient for a sparse auto-encoder with logistic hidden units.

    The output layer is linear.  The gradient covers half the mean squared
    reconstruction error, an L2 penalty on the non-bias weights, and a
    sparsity term on the mean hidden activation weighted by ``beta``.

    Args:
        x: flat parameter vector; the first ``(num_input + 1) * num_hidden``
            entries are the encoder weights, the rest the decoder weights.
            Column 0 of each reshaped matrix multiplies the bias input.
        *args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
            sparsityParam, beta)``; ``inputs`` stores one case per column and
            is also the reconstruction target.

    Returns:
        A flat numpy array: encoder gradient followed by decoder gradient.

    Side effects:
        Calls ``gpu.free_reuse_cache()``.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)

    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    p_avg = gpu.sum(hidden_activation, axis=1) / nData

    # Sparsity derivative per hidden unit, built on the host (p_avg is
    # downloaded once).  A leading 0 is added for the bias unit and the
    # vector is repeated for every case so it lines up with q_temp below.
    p_avg_host = p_avg.as_numpy_array()
    grad_sparse = -1 * sparsityParam / p_avg_host + \
        (1 - sparsityParam) / (1 - p_avg_host)
    grad_sparse = append(0, grad_sparse)
    grad_sparse = tile(grad_sparse, (nData, 1))
    grad_sparse = gpu.garray(transpose(grad_sparse))

    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    # Backward pass.  Transposes stay on the GPU instead of being copied to
    # host memory and back.
    p = outputs - inputs
    weights2_grad = gpu.dot(p, hidden_activation.T) / nData
    q_temp = gpu.dot(weights2.T, p) + beta * grad_sparse
    q = (q_temp * hidden_activation) * (1 - hidden_activation)
    delta2 = gpu.dot(q, data.T)
    # Row 0 of delta2 belongs to the hidden bias unit, which has no
    # incoming weights.
    weights1_grad = delta2[1:shape(delta2)[0], :] / nData

    # L2 penalty on everything except the bias column.
    weights1_grad[:, 1:shape(weights1_grad)[1]] = \
        weights1_grad[:, 1:shape(weights1_grad)[1]] + \
        weights1[:, 1:shape(weights1)[1]] * lambda_val
    weights2_grad[:, 1:shape(weights2_grad)[1]] = \
        weights2_grad[:, 1:shape(weights2_grad)[1]] + \
        weights2[:, 1:shape(weights2)[1]] * lambda_val

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)

    # Drop every GPU temporary before flushing the reuse cache.
    del x, inputs, data, weights1, weights2
    del hidden_sum, hidden_activation, p_avg, grad_sparse, outputs
    del p, q_temp, q, delta2
    gpu.free_reuse_cache()
    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
def costfunc_gpu(x, *args):
    """Cost of a sparse denoising auto-encoder with a linear decoder.

    The network encodes ``inputs`` with logistic hidden units and is scored
    on how well it reconstructs ``noNoiseData``.  The cost is half the mean
    squared reconstruction error, plus an L2 penalty on the non-bias weights,
    plus ``beta`` times a KL sparsity term on the mean hidden activation.

    ``args`` is ``(num_input, num_hidden, num_output, inputs, noNoiseData,
    lambda_val, sparsityParam, beta)``; ``inputs`` stores one case per
    column.

    Returns the scalar cost.  Prints the cost and calls
    ``gpu.free_reuse_cache()`` as side effects.
    """
    (num_input, num_hidden, num_output, inputs, noNoiseData,
     lambda_val, sparsityParam, beta) = args
    n_encoder = (num_input + 1) * num_hidden

    params = gpu.garray(x)
    noisy = gpu.garray(inputs)
    clean = gpu.garray(noNoiseData)

    w_enc = params[0:n_encoder].reshape((num_hidden, num_input + 1))
    w_dec = params[n_encoder:shape(params)[0]].reshape(
        (num_output, num_hidden + 1))

    # Forward pass; a row of ones is prepended at each layer for the bias.
    n_cases = shape(noisy)[1]
    biased_in = gpu.concatenate((gpu.ones((1, n_cases)), noisy), axis=0)
    pre_act = gpu.dot(w_enc, biased_in)
    hidden = pre_act.logistic()
    mean_act = gpu.sum(hidden, axis=1) / n_cases
    hidden = gpu.concatenate((gpu.ones((1, n_cases)), hidden), axis=0)
    recon = gpu.dot(w_dec, hidden)

    # Squared weights, skipping column 0 (bias weights are not penalised).
    sq_enc = w_enc[:, 1:shape(w_enc)[1]]
    sq_dec = w_dec[:, 1:shape(w_dec)[1]]
    sq_enc = sq_enc * sq_enc
    sq_dec = sq_dec * sq_dec

    err = recon - clean
    sq_err = err * err

    kl_a = sparsityParam * gpu.log(sparsityParam / mean_act)
    kl_b = (1 - sparsityParam) * gpu.log(
        (1 - sparsityParam) / (1 - mean_act))
    KL = gpu.sum(kl_a + kl_b)

    cost = gpu.sum(sq_err) / (2 * n_cases) + \
        0.5 * lambda_val * (gpu.sum(sq_enc) + gpu.sum(sq_dec)) + beta * KL

    # Prints the label, a separating space and the cost.
    print('%s %s' % ('GPU Linear Denoising Decoder Cost: ', cost))

    del params, noisy, clean, biased_in, pre_act, hidden, mean_act, recon
    del sq_enc, sq_dec, w_enc, w_dec, err, sq_err, kl_a, kl_b
    gpu.free_reuse_cache()
    return cost
def next(self):
    """
    Return the next minibatch, wrapping around the end of the data.

    When the remaining cases of the current pass cannot fill a whole
    minibatch, the leftover cases are kept, ``self.shuffle_data()`` is called
    (presumably re-permuting ``self.idx`` -- verify in the class), and the
    batch is topped up from the start of the new ordering.

    Returns ``(minibatch_x, minibatch_t)`` if ``self.t`` is not None,
    otherwise only ``minibatch_x``.
    """
    has_targets = self.t is not None
    stop = self.i_ptr + self.minibatch_size

    if stop <= self.n_cases:
        # The whole batch fits inside the current pass.
        picks = self.idx[self.i_ptr:stop]
        minibatch_x = self.x[picks]
        minibatch_t = self.t[picks] if has_targets else None
        self.i_ptr = stop
    else:
        # Keep whatever is left of the current pass.  It is copied because
        # the data is reshuffled before the batch is assembled.
        tail_x = None
        tail_t = None
        if self.i_ptr < self.n_cases:
            leftover = self.idx[self.i_ptr:]
            tail_x = self.x[leftover].copy()
            if has_targets:
                tail_t = self.t[leftover].copy()

        head_size = self.minibatch_size - (self.n_cases - self.i_ptr)
        self.shuffle_data()

        def stitch(tail, source):
            # Join the saved tail with the first rows of the new ordering.
            if tail is None:
                return source[self.idx[:head_size]]
            if isinstance(source, gnp.garray):
                return gnp.concatenate(
                    [tail, source[self.idx[:head_size]]], axis=0)
            return np.r_[tail, source[self.idx[:head_size]]]

        minibatch_x = stitch(tail_x, self.x)
        minibatch_t = stitch(tail_t, self.t) if has_targets else None
        self.i_ptr = head_size

    if has_targets:
        return minibatch_x, minibatch_t
    return minibatch_x
def feedforward(theta, data):
    """Return the ReLU hidden activations of one layer.

    A row of ones is prepended to ``data`` (one case per column) so that
    column 0 of ``theta`` acts as the bias, then negative pre-activations
    are zeroed.
    """
    n_cases = shape(data)[1]
    bias_row = gpu.ones((1, n_cases))
    augmented = gpu.concatenate((bias_row, data), axis=0)
    pre_activation = gpu.dot(theta, augmented)
    # 0/1 mask marking the strictly positive entries.
    positive = gpu.ones(shape(pre_activation)) * (pre_activation > 0)
    return pre_activation * positive
def pack(self):
    """Flatten h_init, W_hh, W_vh and W_ho (in that order) into one array."""
    blocks = (self.h_init, self.W_hh, self.W_vh, self.W_ho)
    flat_parts = [block.ravel() for block in blocks]
    return g.concatenate(flat_parts)
def setup_training_data(params, midi_dir, verbose=False):
    '''
    Load the data under midi_dir and build the training matrix.

    params  - dict; the keys 'Tv', 'Th', 'vis_scale' and 'Nl' are read
    verbose - not used by this function

    Sequences are cut into subsequences of length max(Tv, Th) + 1, the data
    is scaled by params['vis_scale'], and the framed data and binary labels
    are concatenated along axis 1 into one garray, which is returned.
    '''
    # load training data (the third value returned by load_data is unused)
    sequences, label_sequences, _num_labels = load_data(midi_dir)

    # max look-behind
    lag = max(params['Tv'], params['Th'])

    # convert sequences into subsequences of length lag + 1
    subseq_data, subseq_labels = frame_subseqs(lag + 1, sequences,
                                               label_sequences)

    # put training data at correct scale
    subseq_data *= params['vis_scale']

    data_frames = subseq_to_frames(subseq_data)
    n_label_units = params['Nl']
    label_frames = subseq_to_frames(subseq_labels)
    binary_labels = compute_binary_labels(label_frames, n_label_units)

    return gp.concatenate(
        (gp.garray(data_frames), gp.garray(binary_labels)), axis=1)
def apply_update(self, pos_moments, neg_moments, rbm, weight_decay, lrate):
    """Apply one parameter update to ``rbm`` from positive/negative moments.

    While ``self.count`` is below ``self.params.start_after`` this defers to
    ``rbm.sgd_update``.  Afterwards the bias steps come from ``self.Lambda``
    applied to the stacked moment differences, and the weight step from the
    residuals scaled by ``1 / self.sigma_sq``, with ``self.beta`` correcting
    both the residuals and the bias steps.

    Requires ``lrate.vbias`` and ``lrate.hbias`` to be (numerically) equal.
    """
    assert np.allclose(lrate.vbias, lrate.hbias)

    if self.count < self.params.start_after:
        rbm.sgd_update(pos_moments, neg_moments, lrate)
        return

    vis_gap = pos_moments.expect_vis - neg_moments.expect_vis
    hid_gap = pos_moments.expect_hid - neg_moments.expect_hid
    beta_vis = self.beta[:, :, 0]
    beta_hid = self.beta[:, :, 1]

    # base rates
    stacked = gnp.concatenate([vis_gap, hid_gap])
    dbias = lrate.vbias * gnp.dot(self.Lambda, stacked.as_numpy_array())
    da = dbias[:rbm.nvis]
    db = dbias[rbm.nvis:]

    residuals = pos_moments.expect_prod - neg_moments.expect_prod + \
        -weight_decay * rbm.weights + \
        -beta_vis * vis_gap[:, nax] + \
        -beta_hid * hid_gap[nax, :]

    lam = 1. / self.sigma_sq
    dw = lrate.weights * lam * residuals
    da -= lrate.weights * (lam * residuals * beta_vis).sum(1)
    db -= lrate.weights * (lam * residuals * beta_hid).sum(0)

    update = binary_rbms.Update(da, db, dw)
    # NOTE(review): this only changes the caller's object if the rbm type
    # implements in-place addition; otherwise it merely rebinds the local
    # name -- confirm.
    rbm += update
def apply_update(self, pos_moments, neg_moments, rbm, weight_decay, lrate):
    """Apply one parameter update to ``rbm`` from positive/negative moments.

    While ``self.count`` is below ``self.params.start_after`` this defers to
    ``rbm.sgd_update``.  Afterwards the bias steps come from ``self.Lambda``
    applied to the stacked moment differences, and the weight step from the
    residuals scaled by ``1 / self.sigma_sq``, with ``self.beta`` correcting
    both the residuals and the bias steps.

    Requires ``lrate.vbias`` and ``lrate.hbias`` to be (numerically) equal.
    """
    assert np.allclose(lrate.vbias, lrate.hbias)

    if self.count < self.params.start_after:
        rbm.sgd_update(pos_moments, neg_moments, lrate)
        return

    vis_gap = pos_moments.expect_vis - neg_moments.expect_vis
    hid_gap = pos_moments.expect_hid - neg_moments.expect_hid
    beta_vis = self.beta[:, :, 0]
    beta_hid = self.beta[:, :, 1]

    # base rates
    stacked = gnp.concatenate([vis_gap, hid_gap])
    dbias = lrate.vbias * gnp.dot(self.Lambda, stacked.as_numpy_array())
    da = dbias[:rbm.nvis]
    db = dbias[rbm.nvis:]

    residuals = pos_moments.expect_prod - neg_moments.expect_prod + \
        -weight_decay * rbm.weights + \
        -beta_vis * vis_gap[:, nax] + \
        -beta_hid * hid_gap[nax, :]

    lam = 1. / self.sigma_sq
    dw = lrate.weights * lam * residuals
    da -= lrate.weights * (lam * residuals * beta_vis).sum(1)
    db -= lrate.weights * (lam * residuals * beta_hid).sum(0)

    update = binary_rbms.Update(da, db, dw)
    # NOTE(review): this only changes the caller's object if the rbm type
    # implements in-place addition; otherwise it merely rebinds the local
    # name -- confirm.
    rbm += update
def dbn_forward_pass(ws_vh, ws_v, ws_h, x, y=None):
    """
    Deep belief net forward pass.

    x: input data (N x D matrix)
    y: Class label (1-of-K coded, N x K matrix). If not None, it is stacked
        in front of the input of the top layer RBM.
    ws_vh: list of layer weights (L x D x H)
    ws_v: list of layer input biases (L x D x 1); not used by this function
    ws_h: list of layer output biases (L x H x 1)

    Returns the top layer activations (continuous) and outputs
    (sigmoid(activations)), both transposed back to one case per row.
    """
    n_layers = len(ws_vh)
    signal = x.T

    # bottom-up pass through every layer below the top one
    for layer in range(n_layers - 1):
        signal = gnp.logistic(gnp.dot(ws_vh[layer].T, signal) + ws_h[layer])

    # supervised case: class labels go in front of the top layer input
    top_input = signal if y is None else gnp.concatenate((y.T, signal))

    top_act = gnp.dot(ws_vh[-1].T, top_input) + ws_h[-1]
    top_out = gnp.logistic(top_act)
    return top_act.T, top_out.T
def dbn_supervised_predict_sample(ws_vh, ws_v, ws_h, x, k=20):
    """
    Predict the class label of input x from supervised DBN
    WARNING: THIS IS PRETTY SLOW AND LESS RELIABLE THAN THE EXACT METHOD
    Uses the sampling method mentioned in section 6.2 of Hinton, Osindero,
    Teh 2006

    x: Input data. (NxD matrix)
    k: Number of Gibbs steps

    Returns the label part of the sampled top-level visible units, one case
    per row.
    """
    n_layers = len(ws_vh)
    n_cases = x.shape[0]

    # Deterministic bottom-up pass (activation probabilities, not samples)
    # up to the visible layer of the top level RBM.
    features = x.T
    for depth in range(n_layers - 1):
        pre_act = gnp.dot(ws_vh[depth].T, features) + ws_h[depth]
        features = gnp.logistic(pre_act)

    n_top_visible = ws_vh[-1].shape[0]  # visible units of the top level RBM
    n_features = features.shape[0]      # units in the penultimate layer
    # the remaining visible units are the supervised (label) inputs
    n_label_units = n_top_visible - n_features

    # the label units start from a uniform value of 1 / n_label_units
    label_init = gnp.ones((n_label_units, n_cases)) / n_label_units
    visible = gnp.concatenate((label_init, features))

    # the feature part of the visible units stays clamped while sampling
    _hidden, visible = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], visible, k,
                                  clamped=(n_label_units, n_top_visible))
    return visible[0:n_label_units, :].T
def forward(self, X, Im, test=False):
    """
    Feed-forward pass through the model

    X: ('batchsize' x 'context') matrix of word indices
    Im: image features, one row per case (converted to a garray here)
    test: not used by this method

    Returns a tuple (words, acts, IF, F, preds):
      words: garray of shape (batchsize, K, context); words[i, :, j] is the
             column of R = Wfx . Whf selected by X[i, j]
      acts:  garray of shape (batchsize, K); summed context projections
             plus the image contribution
      IF:    garray of rectified image features
      F:     garray of multiplicative factors followed by a column of ones
      preds: numpy array; row-wise softmax of the output scores followed by
             a column of ones
    """
    batchsize = X.shape[0]
    Im = gpu.garray(Im)
    C = self.C
    M = self.M
    bw = self.bw
    J = self.J
    bj = self.bj
    Wfx = self.Wfx
    Whf = self.Whf
    Wfv = self.Wfv

    # Forwardprop images (linear map with bias, then rectification)
    Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
    IF = gpu.dot(Im, gpu.concatenate((J, bj)))
    IF = IF * (IF > 0)

    # Obtain word features.  After the column-major flatten every run of K
    # values is one word vector, so a plain reshape gives
    # (batchsize, context, K); swapping the last two axes yields the
    # (batchsize, K, context) layout without a Python loop over the batch.
    R = gpu.dot(Wfx, Whf)
    tmp = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
    tmp = tmp.reshape((batchsize, self.context, self.K))
    words = gpu.garray(np.ascontiguousarray(tmp.transpose(0, 2, 1)))

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = acts + gpu.dot(IF, M)

    # Multiplicative interaction
    F = gpu.dot(acts, Wfx) * gpu.dot(IF, Wfv)
    F = gpu.concatenate((F, gpu.ones((batchsize, 1))), 1)

    # Compute softmax; the row maximum is subtracted before exponentiating
    preds = gpu.dot(F, gpu.concatenate((Whf, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, IF, F, preds.as_numpy_array())
def mlpSoftmax_costfunc(x, *args):
    """Cost of a two-hidden-layer logistic MLP with a softmax output layer.

    The cost is the mean cross-entropy against ``groundTruth`` plus L2
    penalties: ``lambda_hidden`` on the non-bias weights of both hidden
    layers and ``lambda_softmax`` on all softmax weights.

    Args:
        x: flat parameter vector holding, in order, the layer-1 weights
            ``(l1Size, inputSize + 1)``, the layer-2 weights
            ``(l2Size, l1Size + 1)`` and the softmax weights
            ``(numClasses, l2Size)``.
        *args: ``(numClasses, inputSize, l1Size, l2Size, lambda_softmax,
            lambda_hidden, inputs, labels, groundTruth)``.  ``inputs`` stores
            one case per column; ``labels`` is not used here.
            ``groundTruth`` is presumably a ``(numClasses, numCases)``
            indicator matrix -- TODO confirm against the caller.

    Returns:
        The scalar cost.

    Side effects:
        Prints the cost and calls ``gpu.free_reuse_cache()``.
    """
    (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden,
     inputs, labels, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]],
                (numClasses, l2Size)))

    # Forward pass; a bias row of ones is prepended for both hidden layers.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)

    # Log-softmax is computed as score - log(sum(exp(score))) rather than
    # log(exp(score) / sum).  When a probability underflows to zero the
    # latter gives log(0) = -inf, and 0 * -inf in the product below turns
    # the whole cost into NaN.
    exp_scores = hidden_sum_softmax.exp()
    log_predictions = hidden_sum_softmax - \
        gpu.log(gpu.sum(exp_scores, axis=0))
    temp = groundTruth * log_predictions

    # Squared weights, skipping column 0 (bias weights are not penalised).
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2

    cost = -1 * gpu.sum(temp) / numCases + \
        0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) +
                               gpu.sum(regularized_penalty_L2)) + \
        0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax)

    # Single-argument print: same text as the former print statement, but
    # valid syntax on both Python 2 and Python 3.
    print('%s %s' % ('Multilayer Softmax Cost:', cost))

    # Drop every GPU temporary before flushing the reuse cache.
    del inputs, theta_L1, theta_L2, theta_softmax
    del hidden_sum_L1, hidden_activation_L1
    del hidden_sum_L2, hidden_activation_L2
    del hidden_sum_softmax, exp_scores, log_predictions, temp
    del regularized_penalty_L1, regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
def mlpSingleOutput1Layer_grad(x, *args):
    """Gradient for a one-hidden-layer MLP with a single logistic output.

    Differentiates half the mean squared error between the network output
    and ``targets`` plus an L2 penalty (weighted by ``lambda_hidden``) on the
    non-bias weights.

    Args:
        x: flat parameter vector.  The first ``l1Size * (inputSize + 1)``
            entries are the hidden-layer weights and the remaining
            ``l1Size + 1`` the output weights; column 0 of each reshaped
            matrix multiplies the bias input.
        *args: ``(inputSize, l1Size, lambda_hidden, inputs, targets)``;
            ``inputs`` stores one case per column.

    Returns:
        A flat numpy array: hidden-layer gradient followed by output-layer
        gradient, in the same layout as ``x``.

    Side effects:
        Calls ``gpu.free_reuse_cache()``.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_output = 1 * (l1Size + 1)
    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # Forward pass; a row of ones is prepended at each layer for the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()

    # Backward pass.  Transposes stay on the GPU instead of being copied to
    # host memory and back.
    a = (outputs - targets) * outputs * (1 - outputs)
    theta_output_grad = gpu.dot(a, hidden_activation_L1.T) / numCases
    b_temp = gpu.dot(theta_output.T, a)
    b = (b_temp * hidden_activation_L1) * (1 - hidden_activation_L1)
    delta2 = gpu.dot(b, inputs.T)
    # Row 0 of delta2 belongs to the hidden bias unit, which has no
    # incoming weights.
    theta_L1_grad = delta2[1:shape(delta2)[0], :] / numCases

    # L2 penalty on everything except the bias column.
    theta_output_grad[:, 1:shape(theta_output_grad)[1]] = \
        theta_output_grad[:, 1:shape(theta_output_grad)[1]] + \
        theta_output[:, 1:shape(theta_output)[1]] * lambda_hidden
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = \
        theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] + \
        theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden

    theta_output_grad = reshape(theta_output_grad.as_numpy_array(),
                                num_weights_output)
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)

    # Drop every GPU temporary before flushing the reuse cache.
    del inputs, targets, theta_L1, theta_output
    del hidden_sum_L1, hidden_activation_L1, hidden_sum_output, outputs
    del a, b_temp, b, delta2
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_output_grad))
def dbn_train(train_x, H, batch_size, epoch_count, epsilon, momentum,
              train_y=None, return_hidden=True, verbose=True):
    """
    NOTE: SUPERVISED TRAINING IS NOT REALLY TESTED WELL. TEST IT SOMEDAY!!!
    Layerwise training of a sigmoidal Deep Belief Net, one RBM per layer.

    train_x: Training data. NxD matrix.
    train_y: Training labels NxK matrix (1-of-K coded). If provided, labels
        are concatenated in front of the inputs to the top layer RBM
        (See Hinton, Osindero, Teh 2006)
    H: Number of hidden units in each layer. e.g. [100, 2000, 300]
    batch_size: Batch size. Either a scalar or a list (one per layer).
    epoch_count: Epoch count. Either a scalar or a list (one per layer).
    epsilon: Learning rate. Either a scalar or a list of lists (one value
        for each layer and epoch).
    momentum: Momentum. Either a scalar or a list of lists (one value for
        each layer and epoch).
    return_hidden: Passed to rbm_train for the top layer; lower layers are
        always trained with return_hidden=True because their output feeds
        the next layer.
    verbose: Passed through to rbm_train.

    Returns (ws_vh, ws_v, ws_h, h, error): the per-layer weight matrices,
    input biases and output biases, the fourth value returned by the last
    rbm_train call, and the list of per-layer error values reported by
    rbm_train.
    """
    layer_count = len(H)

    def per_layer(value):
        # scalar -> the same value for every layer
        if isinstance(value, list):
            return value
        return [value] * layer_count

    def per_layer_and_epoch(value):
        # scalar -> the same value for every epoch of every layer
        if isinstance(value, list):
            return value
        return [[value] * e_c for e_c in epoch_count]

    # epoch_count must be expanded first: the per-epoch expansion reads it
    epoch_count = per_layer(epoch_count)
    batch_size = per_layer(batch_size)
    epsilon = per_layer_and_epoch(epsilon)
    momentum = per_layer_and_epoch(momentum)

    ws_vh, ws_v, ws_h, error = [], [], [], []
    top = layer_count - 1

    # train layer by layer, feeding each layer's output to the next
    h = train_x
    for i, h_count in enumerate(H):
        is_top = (i == top)
        rh = not (is_top and not return_hidden)

        if is_top and train_y is not None:
            h = gnp.concatenate((train_y, h), axis=1)

        w_vh, w_v, w_h, h, l_error = rbm_train(
            h, h_count, batch_size[i], epoch_count[i], epsilon[i],
            momentum[i], return_hidden=rh, verbose=verbose)

        ws_vh.append(w_vh)
        ws_v.append(w_v)
        ws_h.append(w_h)
        error.append(l_error)

    return ws_vh, ws_v, ws_h, h, error
def _init_plus(X, K, dist='euclidean'):
    """Pick K initial centers from the rows of X by farthest-point selection.

    The first center is a uniformly random row of X.  Each further center is
    the row whose distance to its nearest already-chosen center is largest,
    using the metric returned by choose_distance_metric(dist).

    Returns the centers stacked along axis 0.
    """
    f_dist = choose_distance_metric(dist)
    first = np.random.randint(X.shape[0])
    centers = X[first].reshape(1, -1)
    for _ in xrange(1, K):
        nearest = f_dist(X, centers).min(axis=1)
        farthest = nearest.argmax()
        new_center = X[farthest].reshape(1, -1)
        centers = gnp.concatenate([centers, new_center], axis=0)
    return centers
def mlpSoftmax1Layer_grad(x, *args):
    """
    Gradient of the one-hidden-layer ReLU + softmax cost w.r.t. the flat
    parameter vector x.

    args is (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
    inputs, groundTruth). `inputs` is indexed so that shape(inputs)[1] is the
    number of cases; a row of ones is prepended as the bias input.

    x holds the hidden weights (l1Size x (inputSize + 1)) followed by the
    softmax weights (numClasses x l1Size).

    Returns a flat numpy vector: hidden-layer gradient followed by the
    softmax gradient, in the same layout as x.
    """
    numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_softmax = numClasses * l1Size

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size)))

    # Forward pass: bias row + ReLU hidden layer.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1

    # Softmax with the per-column maximum subtracted before exponentiation.
    hidden_sum_softmax_imd = gpu.dot(theta_softmax, hidden_activation_L1)
    hidden_sum_softmax = hidden_sum_softmax_imd - hidden_sum_softmax_imd.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)

    # Backward pass. Transposes stay on the device (.T) instead of being
    # copied to the host, transposed with numpy and uploaded again.
    softmax_imd = groundTruth - predictions
    theta_softmax_grad = -1 * gpu.dot(
        softmax_imd, hidden_activation_L1.T) / numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd
    delta_L1_imd = gpu.dot(theta_softmax.T, deltaOut)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1 = gpu.dot(delta_L1_imd2, inputs.T)

    theta_L1_grad = delta_L1 / numCases
    # Weight decay is applied to every column except the bias column 0.
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(
        theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden

    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(), num_weights_softmax)

    # Drop device references before asking gnumpy to release its cache.
    del inputs
    del theta_L1
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_softmax
    del predictions
    del softmax_imd
    del deltaOut
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_softmax_grad))
def pack(self):
    """
    Flatten the model parameters into one concatenated array.

    Order: h_init, W_hf, W_fh, f_bias, W_vh, W_vf, W_ho. W_hh is not part of
    the packed vector.
    """
    params = (self.h_init, self.W_hf, self.W_fh, self.f_bias,
              self.W_vh, self.W_vf, self.W_ho)
    return g.concatenate([p.ravel() for p in params])
def costfunc_gpu_ReLU(x, *args): num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta = args num_weights1 = (num_input + 1) * num_hidden x = gpu.garray(x) inputs = gpu.garray(inputs) #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1))) weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1)) #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1))) weights2 = x[num_weights1:shape(x)[0]].reshape( (num_output, num_hidden + 1)) nData = shape(inputs)[1] data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0) hidden_sum = gpu.dot(weights1, data) #hidden_activation = gpu.log(1+hidden_sum.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0) hidden_activation = hidden_sum * relu_mask_hidden1 hidden_activation = gpu.concatenate((gpu.ones( (1, nData)), hidden_activation), axis=0) output = gpu.dot(weights2, hidden_activation) regularized_penalty1 = weights1[:, 1:shape(weights1)[1]] regularized_penalty2 = weights2[:, 1:shape(weights2)[1]] regularized_penalty1 = regularized_penalty1 * regularized_penalty1 regularized_penalty2 = regularized_penalty2 * regularized_penalty2 output_target_diff = (output - inputs) * (output - inputs) cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * ( gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)) print 'GPU ReLU Linear Decoder Cost: ', cost del x del inputs del data del hidden_sum del hidden_activation del output del regularized_penalty1 del regularized_penalty2 del weights1 del weights2 del output_target_diff gpu.free_reuse_cache() return cost
def trainClassifierOneBatch(self, trainbatch, labelbatch, epoch, diff_cost=1.0, recf=1.0):
    """
    Run one classification training pass over a multi-modal batch.

    trainbatch: per-modality inputs; trainbatch[m] is fed to self.saes[m]
    labelbatch: labels handed to getClassificationGrad
    epoch: accepted but not used in this method
    diff_cost: forwarded to getClassificationGrad as diff_factor
    recf: reconstruction factor, forwarded to getClassificationGrad

    Returns (perf, g, jg): the value of getDiffLoss between the concatenated
    first and last activations of all modalities, and the first two values
    returned by getClassificationGrad.
    """
    # Per-modality forward pass up to the top of each modality's stack.
    a = [self.saes[m].forward2Top(trainbatch[m]) for m in xrange(self.modalsCnt)]

    # Joint network on the concatenated top-level activations.
    top = self.depth - 1
    jinp = gp.concatenate(tuple(acts[top] for acts in a), axis=1)
    ja = self.jsae.forward(jinp)

    # Hand each modality its column slice of the joint output and run that
    # modality's top-down pass.
    for m in xrange(self.modalsCnt):
        lo = self.dims[m]
        hi = self.dims[m + 1]
        a[m].append(ja[-1][:, lo:hi])
        self.saes[m].backward2Bottom(a[m])

    g, jg, rl = self.getClassificationGrad(a, ja, labelbatch,
                                           diff_factor=diff_cost, recf=recf)

    # Debug metric only: loss between the first and the last activations.
    perfaf = gp.concatenate(tuple(acts[0] for acts in a), axis=1)
    perfal = gp.concatenate(tuple(acts[-1] for acts in a), axis=1)
    perf = self.getDiffLoss(perfaf, perfal)
    return perf, g, jg
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Predict class probabilities for x with a supervised DBN.

    Uses the exact method of section 6.2 of Hinton, Osindero, Teh 2006: each
    class in turn is assumed to be the label and the (negated) free energy
    of the top level RBM is computed for it. The free energy formula follows
    http://deeplearning.net/tutorial/rbm.html

    ws_vh, ws_v, ws_h: per-layer weights, visible biases and hidden biases
    x: Input data. (NxD matrix)

    Returns an NxK matrix whose rows are normalised over the K classes.
    """
    L = len(ws_vh)
    N = x.shape[0]

    # Deterministic bottom-up pass (activations, not samples) up to the
    # visible layer of the top level RBM.
    h_prev = x.T
    for l in range(L - 1):
        h_prev = gnp.logistic(gnp.dot(ws_vh[l].T, h_prev) + ws_h[l])

    top_w = ws_vh[-1]
    top_v_bias = ws_v[-1]
    top_h_bias = ws_h[-1]

    H = top_w.shape[0]    # visible units of the top level RBM
    Hx = h_prev.shape[0]  # units coming from the layer below
    K = H - Hx            # remaining visible units are the label units

    y = gnp.zeros((K, N))
    neg_free_energy = gnp.zeros((N, K))
    for k in range(K):
        # Clamp label k on for every case.
        y[k, :] = 1.0
        v = gnp.concatenate((y, h_prev))
        e_v = gnp.dot(top_v_bias.T, v)  # bias energy term
        ah = gnp.dot(top_w.T, v) + top_h_bias
        e_h = gnp.sum(gnp.log(gnp.exp(ah) + 1.0), axis=0)
        neg_free_energy[:, k] = e_v + e_h
        # Clear the labels for the next iteration.
        y[:, :] = 0.0

    # Subtract the row maximum before exponentiating, then normalise rows.
    row_max = gnp.max(neg_free_energy, axis=1)[:, gnp.newaxis]
    unnormalised = gnp.exp(neg_free_energy - row_max)
    pred_y = unnormalised / gnp.sum(unnormalised, axis=1)[:, gnp.newaxis]
    return pred_y
def basic_gradient_descent(): digits = datasets.load_digits() # iris = datasets.load_iris() X = digits.images.reshape((digits.images.shape[0], -1)) scaler = pre.Scaler() X = scaler.fit_transform(X) y = ut.all_to_sparse(digits.target, max(digits.target) + 1) X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets( gpu.as_garray(X), gpu.as_garray(y), "digits") X_val = gpu.concatenate([X_val, X_test]) y_val = gpu.concatenate([y_val, y_test]) thetas, costs, val_costs = neur.gradient_decent( gpu.as_garray(X), gpu.as_garray(y), #hidden_layer_sz = 11, hidden_layer_sz=45, iter=500, wd_coef=0.0, learning_rate=0.25, momentum_multiplier=0.9, rand_init_epsilon=0.012, do_early_stopping=True, #do_dropout = True, dropout_percentage=0.7, #do_learning_adapt = True, X_val=gpu.as_garray(X_val), y_val=gpu.as_garray(y_val)) h_x, a = neur.forward_prop(X_test, thetas) h_x = map(lambda x: x.as_numpy_array(), h_x) print "percentage correct predictions: ", ut.percent_equal( ut.map_to_max_binary_result(h_x), y_test.as_numpy_array()) print "training error:", costs[-1:][0] print "validation error:", val_costs[-1:][0] print "lowest validation error:", min(val_costs) plt.plot(costs, label='cost') plt.plot(val_costs, label='val cost') plt.legend() plt.ylabel('error rate')
def backward(self, Y, preds, F, IF, acts, words, X, Im):
    """
    Backward pass through the network

    Computes the gradients for Whf, Wfv, Wfx, C, M, J and the two vectors
    taken from the last row of a product (db, dBj) and stores them on self
    as dWhf, dWfv, dWfx, dC, dM, dJ, db and dBj. Nothing is returned.

    Y: targets; subtracted from preds with its last column dropped
    preds: predictions; preds.shape[0] is used as the batch size
    F, IF, acts, words, Im: forward-pass quantities (presumably produced by
        the matching forward method -- TODO confirm shapes there)
    X: integer index matrix; X[j, i] selects the column of dR that case j
        contributes to at context position i
    """
    batchsize = preds.shape[0]
    Im = gpu.garray(Im)

    # Compute part of df/dR
    # Error signal, averaged over the batch.
    Ix = gpu.garray(preds[:,:-1] - Y) / batchsize
    delta = gpu.dot(F.T, Ix)
    # All rows but the last go to Whf (with gamma_r decay); the last row is
    # kept separately as db.
    dWhf = delta[:-1,:] + self.gamma_r * self.Whf
    db = delta[-1,:]

    # Compute df/Wfv and part of df/Wfx
    # From here on Ix is the error propagated back through Whf.
    Ix = gpu.dot(Ix, self.Whf.T)
    dWfv = gpu.dot(IF.T, Ix * gpu.dot(acts, self.Wfx)) + self.gamma_r * self.Wfv
    dWfx = gpu.dot(acts.T, Ix * gpu.dot(IF, self.Wfv)) + self.gamma_r * self.Wfx

    # Compute df/dC and word inputs for df/dR
    Ix_word = gpu.dot(Ix * gpu.dot(IF, self.Wfv), self.Wfx.T)
    dC = gpu.zeros(np.shape(self.C))
    # dR is accumulated on the host because it is filled by index below.
    dR = np.zeros((self.K, self.V))
    for i in range(self.context):
        delta = gpu.dot(words[:,:,i].T, Ix_word)
        dC[i,:,:] = delta + self.gamma_c * self.C[i,:,:]
        delta = gpu.dot(Ix_word, self.C[i,:,:].T)
        delta = delta.as_numpy_array()
        # One case at a time, so that repeated indices in X accumulate
        # instead of overwriting each other.
        for j in range(X.shape[0]):
            dR[:,X[j,i]] = dR[:,X[j,i]] + delta.T[:,j]
    dR = gpu.garray(dR)
    # dR is not stored itself; it is folded into the Wfx / Whf gradients.
    dWfx = dWfx + gpu.dot(dR, self.Whf.T)
    dWhf = dWhf + gpu.dot(self.Wfx.T, dR)

    # Compute df/dM
    dM = gpu.dot(IF.T, Ix_word) + self.gamma_c * self.M

    # Compute df/dJ
    # (IF > 0) masks the entries where IF is not positive.
    Ix = gpu.dot(Ix * gpu.dot(acts, self.Wfx), self.Wfv.T) * (IF > 0) + gpu.dot(Ix_word, self.M.T) * (IF > 0)
    # Append a column of ones to Im; the matching last row of delta is dBj.
    Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
    delta = gpu.dot(Im.T, Ix)
    dJ = delta[:-1,:] + self.gamma_c * self.J
    dBj = delta[-1,:]

    self.db = db
    self.dC = dC
    self.dM = dM
    self.dJ = dJ
    self.dBj = dBj
    self.dWhf = dWhf
    self.dWfv = dWfv
    self.dWfx = dWfx
def gradDebug(self, inputBatch, targetBatch):
    """
    Compute the gradients for one batch and return them as a flat vector.

    inputBatch, targetBatch: converted to gnumpy garrays if they are not
        garrays already.

    Side effect: self.WGrads[i] and self.biasGrads[i] are overwritten with
    the gradients yielded by self.gradients.

    Returns a numpy array holding every weight gradient, then every bias
    gradient, each raveled and concatenated.
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    if not isinstance(targetBatch, gnp.garray):
        targetBatch = gnp.garray(targetBatch)

    outputActs = self.fprop(inputBatch)
    outputErrSignal = -self.outputActFunct.dErrordNetInput(
        targetBatch, self.state[-1], outputActs)
    errSignals = self.bprop(outputErrSignal)

    for i, (WGrad, biasGrad) in enumerate(self.gradients(self.state, errSignals)):
        self.WGrads[i] = WGrad
        self.biasGrads[i] = biasGrad

    flat_grads = [dw.ravel() for dw in itertools.chain(self.WGrads, self.biasGrads)]
    return gnp.as_numpy_array(gnp.concatenate(flat_grads))
def basic_gradient_descent(): digits = datasets.load_digits() # iris = datasets.load_iris() X = digits.images.reshape((digits.images.shape[0], -1)) scaler = pre.Scaler() X = scaler.fit_transform(X) y = ut.all_to_sparse( digits.target, max(digits.target) + 1 ) X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(gpu.as_garray(X), gpu.as_garray(y), "digits") X_val = gpu.concatenate([X_val, X_test]) y_val = gpu.concatenate([y_val, y_test]) thetas, costs, val_costs = neur.gradient_decent(gpu.as_garray(X), gpu.as_garray(y), #hidden_layer_sz = 11, hidden_layer_sz = 45, iter = 500, wd_coef = 0.0, learning_rate = 0.25, momentum_multiplier = 0.9, rand_init_epsilon = 0.012, do_early_stopping = True, #do_dropout = True, dropout_percentage = 0.7, #do_learning_adapt = True, X_val = gpu.as_garray(X_val), y_val = gpu.as_garray(y_val)) h_x, a = neur.forward_prop(X_test, thetas) h_x = map(lambda x: x.as_numpy_array(), h_x) print "percentage correct predictions: ", ut.percent_equal(ut.map_to_max_binary_result(h_x), y_test.as_numpy_array()) print "training error:", costs[-1:][0] print "validation error:", val_costs[-1:][0] print "lowest validation error:", min(val_costs) plt.plot(costs, label='cost') plt.plot(val_costs, label='val cost') plt.legend() plt.ylabel('error rate')
def mlpSingleOutput1Layer_costfunc(x, *args): inputSize, l1Size, lambda_hidden, inputs, targets = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) inputs = gpu.garray(inputs) targets = gpu.garray(targets) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) theta_output = gpu.garray( reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1))) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) hidden_activation_L1 = hidden_sum_L1.logistic() hidden_activation_L1 = gpu.concatenate((gpu.ones( (1, numCases)), hidden_activation_L1), axis=0) #hidden_activation_L1 = hidden_activation_L1 * dropout_prob hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1) outputs = hidden_sum_output.logistic() output_target_diff = (outputs - targets)**2 regularized_penalty_output = theta_output[:, 1:shape(theta_output)[1]] regularized_penalty_output = regularized_penalty_output * regularized_penalty_output regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 cost = gpu.sum(output_target_diff) / ( 2 * numCases) + 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_output)) print 'Multilayer Preceptron Cost:', cost del inputs del theta_L1 del hidden_sum_L1 del hidden_activation_L1 del regularized_penalty_output del regularized_penalty_L1 gpu.free_reuse_cache() return cost
def gradDebug(self, inputBatch, targetBatch):
    """
    Run one forward/backward pass and return all gradients as one vector.

    inputBatch, targetBatch: anything gnp.garray accepts; garrays are used
        as they are.

    Side effect: the gradients yielded by self.gradients are stored in
    self.WGrads and self.biasGrads, index by index.

    Returns a numpy array: all weight gradients followed by all bias
    gradients, each raveled, concatenated in that order.
    """
    as_garray = lambda batch: batch if isinstance(batch, gnp.garray) else gnp.garray(batch)
    inputBatch = as_garray(inputBatch)
    targetBatch = as_garray(targetBatch)

    outputActs = self.fprop(inputBatch)
    # Negated derivative of the error w.r.t. the output layer's net input.
    outputErrSignal = -self.outputActFunct.dErrordNetInput(targetBatch, self.state[-1], outputActs)
    errSignals = self.bprop(outputErrSignal)

    for layer, grads in enumerate(self.gradients(self.state, errSignals)):
        self.WGrads[layer], self.biasGrads[layer] = grads

    pieces = []
    for dw in itertools.chain(self.WGrads, self.biasGrads):
        pieces.append(dw.ravel())
    return gnp.as_numpy_array(gnp.concatenate(pieces))
def fine_tuning_cost_gpu(x, *args): inputSize, l1Size, l2Size, l3Size, lambda_val, inputs = args num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) num_weights_L3 = l3Size * (l2Size + 1) x = gpu.garray(x) inputs = gpu.garray(inputs) weights1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) weights2 = x[num_weights_L1:num_weights_L1+num_weights_L2].reshape((l2Size, l1Size + 1)) weights3 = x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3].reshape((l3Size, l2Size + 1)) weights4 = x[num_weights_L1+num_weights_L2+num_weights_L3:shape(x)[0]].reshape((inputSize, l3Size + 1)) nData = shape(inputs)[1] x = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0) hidden1_sum = gpu.dot(weights1, x) #hidden1_activation = gpu.log(1+hidden1_sum.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden1_sum)) * (hidden1_sum>0) hidden1_activation = hidden1_sum*relu_mask_hidden1 hidden1_activation = gpu.concatenate((gpu.ones((1,nData)), hidden1_activation), axis = 0) hidden2_sum = gpu.dot(weights2, hidden1_activation) #hidden2_activation = gpu.log(1+hidden2_sum.exp()) relu_mask_hidden2 = gpu.ones(shape(hidden2_sum)) * (hidden2_sum>0) hidden2_activation = hidden2_sum*relu_mask_hidden2 hidden2_activation = gpu.concatenate((gpu.ones((1,nData)), hidden2_activation), axis = 0) hidden3_sum = gpu.dot(weights3, hidden2_activation) hidden3_activation = hidden3_sum hidden3_activation = gpu.concatenate((gpu.ones((1,nData)), hidden3_activation), axis = 0) output_sum = gpu.dot(weights4, hidden3_activation) outputs = output_sum regularized_penalty3 = weights3[:,1:shape(weights3)[1]] regularized_penalty4 = weights4[:,1:shape(weights4)[1]] regularized_penalty3 = regularized_penalty3 ** 2 regularized_penalty4 = regularized_penalty4 ** 2 output_target_diff = (outputs - inputs)**2 cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty3) + gpu.sum(regularized_penalty4)) print 'Fine Tuning Cost: ', cost return 
cost
def mlpSingleOutput1Layer_grad(x, *args):
    """
    Gradient of the one-hidden-layer / single-logistic-output squared-error
    cost w.r.t. the flat parameter vector x.

    args is (inputSize, l1Size, lambda_hidden, inputs, targets).
    shape(inputs)[1] is used as the number of cases. x holds the hidden
    weights (l1Size x (inputSize + 1)) followed by the output weights
    (1 x (l1Size + 1)).

    Returns a flat numpy vector: hidden-layer gradient followed by the
    output-layer gradient, in the same layout as x.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_output = 1 * (l1Size + 1)

    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # Forward pass; a row of ones is prepended before each layer.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()

    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_output_grad = gpu.zeros(shape(theta_output))

    # Backward pass. Transposes stay on the device (.T) instead of being
    # copied to the host, transposed with numpy and uploaded again.
    a = (outputs - targets) * outputs * (1 - outputs)
    theta_output_grad += gpu.dot(a, hidden_activation_L1.T)
    b_temp = gpu.dot(theta_output.T, a)
    b = (b_temp * hidden_activation_L1) * (1 - hidden_activation_L1)
    delta2 = gpu.dot(b, inputs.T)
    # Row 0 of delta2 belongs to the constant bias unit and is dropped.
    theta_L1_grad += delta2[1:shape(delta2)[0], :]

    theta_L1_grad = theta_L1_grad / numCases
    theta_output_grad = theta_output_grad / numCases

    # Weight decay on every column except the bias column 0.
    theta_output_grad[:, 1:shape(theta_output_grad)[1]] = theta_output_grad[:, 1:shape(theta_output_grad)[1]] + theta_output[:, 1:shape(theta_output)[1]] * lambda_hidden
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden

    theta_output_grad = reshape(theta_output_grad.as_numpy_array(), num_weights_output)
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)

    # Drop device references before asking gnumpy to release its cache.
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_output_grad))
def forward_prop_setup_bn_mean_std_on_big_set(self, X, minibatch_size=1000):
    """
    Forward-propagate a large data set through this layer in minibatches.

    X.dot(W) is computed chunk by chunk: first the leading remainder of
    X.shape[0] % minibatch_size rows, then full minibatches, and the pieces
    are concatenated row-wise. Without batch normalization the bias is then
    added; with it, the batch-norm layer first sets up its mean/std
    statistics from the full pre-activation and is then applied with
    is_test=True.

    Returns the output of the nonlinearity.
    """
    n_rows = X.shape[0]
    W = self.params.W

    row = n_rows % minibatch_size
    pieces = [X[:row].dot(W)]
    while row < n_rows:
        pieces.append(X[row:row + minibatch_size].dot(W))
        row += minibatch_size
    a = gnp.concatenate(pieces, axis=0)

    if self.use_batch_normalization:
        self.bn_layer.setup_mean_std_stats(a)
        a = self.bn_layer.forward_prop(a, is_test=True)
    else:
        a += self.params.b

    return self.nonlin.forward_prop(a)
def plot_samples(init_samples, samples, save_to_file=False, epoch=None):
    """
    Tile sampling chains into one uint8 image, one row of tiles per step.

    init_samples: 2-D array of initial samples; it becomes step 0
    samples: 3-D array appended after the initial samples along axis 0
    save_to_file: if true, the image is also written to
        "samples-<epoch>.png" (epoch must then be given)
    epoch: number used in the file name

    Each step is rendered by dlutil.tile_raster_images as 28x28 tiles with
    1 pixel spacing and written into a 29-pixel-high band of the image.

    Returns the image as a numpy uint8 array.
    """
    n_rows, n_cols = init_samples.shape[0], init_samples.shape[1]
    first_step = init_samples.reshape((1, n_rows, n_cols))
    all_samples = gp.concatenate((first_step, samples))

    n_samples = all_samples.shape[0]
    n_chains = all_samples.shape[1]

    img = np.zeros((29 * n_samples + 1, 29 * n_chains - 1), dtype="uint8")
    for step in range(n_samples):
        top = 29 * step
        tiles = dlutil.tile_raster_images(
            gp.as_numpy_array(all_samples[step, :, :]),
            img_shape=(28, 28),
            tile_shape=(1, n_chains),
            tile_spacing=(1, 1))
        img[top:top + 28, :] = tiles

    if save_to_file:
        assert epoch is not None
        pil.fromarray(img).save("samples-%02i.png" % epoch)

    return img
def extractTrainReps(self, datahandler, numBatch):
    """
    extract representations for (big) training data through DataHandler

    datahandler: sequence of handlers, one per modality. Every handler is
        reset first; each round one batch is read from every handler, the
        batches are concatenated column-wise, and the representations from
        self.getReps are written through datahandler[0].
    numBatch: maximum number of rounds.

    Reading stops early as soon as any handler returns None from
    getOneBatch (assumed to signal exhausted data -- TODO confirm against
    the DataHandler implementation). datahandler[0] is flushed at the end
    in every case.
    """
    for handler in datahandler:
        handler.reset()

    for _ in range(numBatch):
        batches = [handler.getOneBatch() for handler in datahandler]
        # The end-of-data check has to happen before concatenating:
        # concatenate never returns None, so testing its result could not
        # stop the loop.
        if any(part is None for part in batches):
            break
        batch = gp.concatenate(tuple(batches), axis=1)
        reps = self.getReps(batch)
        datahandler[0].write(reps)

    datahandler[0].flush()
def dbn_sample(ws_vh, ws_v, ws_h, x, y=None, k=1):
    """
    Draw a sample from a DBN.

    ws_vh, ws_v, ws_h: Lists of layer weights, visible biases and hidden
        biases of the DBN
    x: Initial sample; the input to the DBN. (1xD vector)
    y: Optional class label (1-of-K coded, row vector). When given, the
        label units of the top layer RBM are clamped to it, i.e. the sample
        comes from the class conditional.
    k: Number of Gibbs steps passed to rbm_sample

    Returns a sample from the DBN (1xD vector).
    """
    L = len(ws_vh)

    # Stochastic bottom-up pass up to the visible layer of the top RBM:
    # each layer's logistic activation is thresholded against uniform noise.
    h_prev = x.T
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)
        h_prev = h_prev > gnp.rand(h_prev.shape[0], h_prev.shape[1])

    if y is not None:
        K = y.shape[1]  # number of classes
        H = ws_vh[-1].shape[0]
        # Label units first, then the units coming from below.
        v = gnp.concatenate((y.T, h_prev))
        h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], v, k, clamped=(0, K))
        # Strip the label units from the sampled visible vector.
        v = v[K:H, :]
    else:
        # Unsupervised: nothing is clamped in the top RBM.
        h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], h_prev, k)

    # Deterministic top-down pass back to the input layer.
    for l in reversed(range(L - 1)):
        v = gnp.logistic(gnp.dot(ws_vh[l], v) + ws_v[l])

    return v.T
def fine_tuning_cost_gpu(x, *args): inputSize, l1Size, l2Size, l3Size, l4Size, l5Size, lambda_val, inputs = args num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) num_weights_L3 = l3Size * (l2Size + 1) num_weights_L4 = l4Size * (l3Size + 1) num_weights_L5 = l5Size * (l4Size + 1) #num_weights_L6 = inputSize * (l5Size + 1) x = gpu.garray(x) inputs = gpu.garray(inputs) #weights1 = reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)) weights1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) #weights2 = reshape(x[num_weights_L1:num_weights_L1+num_weights_L2], (l2Size, l1Size + 1)) weights2 = x[num_weights_L1:num_weights_L1+num_weights_L2].reshape((l2Size, l1Size + 1)) #weights3 = reshape(x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3], (l3Size, l2Size + 1)) weights3 = x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3].reshape((l3Size, l2Size + 1)) #weights4 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4], (l4Size, l3Size + 1)) weights4 = x[num_weights_L1+num_weights_L2+num_weights_L3:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4].reshape((l4Size, l3Size + 1)) #weights5 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5], (l5Size, l4Size + 1)) weights5 = x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5].reshape((l5Size, l4Size + 1)) #weights6 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5:shape(x)[0]], (inputSize, l5Size+1)) weights6 = x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5:shape(x)[0]].reshape((inputSize, l5Size+1)) nData = shape(inputs)[1] x = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0) hidden1_sum = gpu.dot(weights1, x) 
hidden1_activation = hidden1_sum.logistic() hidden1_activation = gpu.concatenate((gpu.ones((1,nData)), hidden1_activation), axis = 0) hidden2_sum = gpu.dot(weights2, hidden1_activation) hidden2_activation = hidden2_sum.logistic() hidden2_activation = gpu.concatenate((gpu.ones((1,nData)), hidden2_activation), axis = 0) hidden3_sum = gpu.dot(weights3, hidden2_activation) hidden3_activation = hidden3_sum.logistic() hidden3_activation = gpu.concatenate((gpu.ones((1,nData)), hidden3_activation), axis = 0) hidden4_sum = gpu.dot(weights4, hidden3_activation) hidden4_activation = hidden4_sum.logistic() hidden4_activation = gpu.concatenate((gpu.ones((1,nData)), hidden4_activation), axis = 0) hidden5_sum = gpu.dot(weights5, hidden4_activation) hidden5_activation = hidden5_sum.logistic() hidden5_activation = gpu.concatenate((gpu.ones((1,nData)), hidden5_activation), axis = 0) output_sum = gpu.dot(weights6, hidden5_activation) outputs = output_sum.logistic() regularized_penalty4 = weights4[:,1:shape(weights4)[1]] regularized_penalty5 = weights5[:,1:shape(weights5)[1]] regularized_penalty6 = weights6[:,1:shape(weights6)[1]] regularized_penalty4 = regularized_penalty4 ** 2 regularized_penalty5 = regularized_penalty5 ** 2 regularized_penalty6 = regularized_penalty6 ** 2 output_target_diff = (outputs - inputs)**2 cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty4) + gpu.sum(regularized_penalty5) + gpu.sum(regularized_penalty6)) print 'Fine Tuning Cost: ', cost return cost
def backward(self, Y, preds, IF, acts, words, X, Im):
    """
    Backward pass through the network

    Computes the gradients for R, C, M, J and the two vectors taken from the
    last row of a product (db, dBj) and stores them on self as dR, dC, dM,
    dJ, db and dBj. Nothing is returned.

    Y: targets; subtracted from preds with its last column dropped
    preds: predictions; preds.shape[0] is used as the batch size
    IF, acts, words, Im: forward-pass quantities (presumably produced by the
        matching forward method -- TODO confirm shapes there)
    X: integer index matrix; X[j, i] selects the column of dR that case j
        contributes to at context position i
    """
    batchsize = preds.shape[0]
    Im = gpu.garray(Im)

    # Compute part of df/dR
    # Error signal, averaged over the batch.
    Ix = gpu.garray(preds[:,:-1] - Y) / batchsize
    delta = gpu.dot(acts.T, Ix)
    # All rows but the last go to R (with gamma_r decay); the last row is
    # kept separately as db.
    dR = delta[:-1,:] + self.gamma_r * self.R
    db = delta[-1,:]
    # dR continues on the host because it is updated by index below.
    dR = dR.as_numpy_array()

    # Compute df/dC and word inputs for df/dR
    # From here on Ix is the error propagated back through R.
    Ix = gpu.dot(Ix, self.R.T)
    dC = gpu.zeros(np.shape(self.C))
    for i in range(self.context):
        delta = gpu.dot(words[:,:,i].T, Ix)
        dC[i,:,:] = delta + self.gamma_c * self.C[i,:,:]
        delta = gpu.dot(Ix, self.C[i,:,:].T)
        delta = delta.as_numpy_array()
        # One case at a time, so that repeated indices in X accumulate
        # instead of overwriting each other.
        for j in range(X.shape[0]):
            dR[:,X[j,i]] = dR[:,X[j,i]] + delta.T[:,j]

    # Compute df/dM
    dM = gpu.dot(IF.T, Ix) + self.gamma_c * self.M

    # Compute df/dJ
    # (IF > 0) masks the entries where IF is not positive.
    Ix = gpu.dot(Ix, self.M.T) * (IF > 0)
    # Append a column of ones to Im; the matching last row of delta is dBj.
    Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
    delta = gpu.dot(Im.T, Ix)
    dJ = delta[:-1,:] + self.gamma_c * self.J
    dBj = delta[-1,:]

    # dR goes back to the device before it is stored.
    self.dR = gpu.garray(dR)
    self.dM = dM
    self.db = db
    self.dC = dC
    self.dJ = dJ
    self.dBj = dBj
def mlpSoftmax1Layer_costfunc(x, *args): numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) theta_softmax = gpu.garray( reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size))) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0) hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1 #hidden_activation_L1 = hidden_sum_L1.logistic() hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L1) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions, axis=0) temp = groundTruth * gpu.log(predictions) temp = temp.as_numpy_array() temp[temp == -inf] = -200.0 temp = nan_to_num(temp) regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 cost = -1 * sum(temp) / numCases + 0.5 * lambda_hidden * ( gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum( theta_softmax * theta_softmax) print 'Multilayer Softmax Cost:', cost del inputs del theta_L1 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_softmax del predictions del temp del regularized_penalty_L1 gpu.free_reuse_cache() return cost
def from_activations(cls, v, h):
    """
    Alternate constructor: build the unary and pairwise moment arrays from
    visible activations v and hidden activations h (one case per row).

    m_unary / S_unary: mean and uncentred second moment of the concatenated
        [v, h] rows; the mean of a * (1 - a) is added on the diagonal of the
        visible-visible and hidden-hidden blocks.
    m_pair: (nvis, nhid, 3) array holding, per visible/hidden pair, the
        visible mean, the hidden mean and the mean product.
    S_pair: (nvis, nhid, 3, 3) array initialised from the visible-hidden
        block of S_unary, with entries [0, 0] and [1, 1] then set to the
        visible and hidden means.

    Returns cls(m_unary, S_unary, m_pair, S_pair).
    """
    nvis = v.shape[1]
    nhid = h.shape[1]
    v_mean = v.mean(0)
    h_mean = h.mean(0)

    # Unary statistics over the concatenated units.
    vh = gnp.concatenate([v, h], axis=1)
    n_cases = vh.shape[0]
    m_unary = vh.mean(0)
    S_unary = gnp.dot(vh.T, vh) / n_cases
    S_unary[:nvis, :nvis] += gnp.diagflat((v * (1. - v)).mean(0))
    S_unary[nvis:, nvis:] += gnp.diagflat((h * (1. - h)).mean(0))

    # Pairwise first moments.
    m_pair = gnp.zeros((nvis, nhid, 3))
    m_pair[:, :, 0] = v_mean[:, nax]
    m_pair[:, :, 1] = h_mean[nax, :]
    m_pair[:, :, 2] = gnp.dot(v.T, h) / h.shape[0]

    # Pairwise second moments.
    S_pair = gnp.zeros((nvis, nhid, 3, 3))
    cross_block = S_unary[:nvis, nvis:, nax, nax]
    S_pair[:] = cross_block
    S_pair[:, :, 0, 0] = v_mean[:, nax]
    S_pair[:, :, 1, 1] = h_mean[nax, :]

    return cls(m_unary, S_unary, m_pair, S_pair)
def setup_training_data(params,midi_dir,verbose=False):
    '''
    load and setup training data

    params: dict read for 'Tv' and 'Th' (their maximum T is the look-behind,
        so subsequences have length T+1), 'vis_scale' (factor the data is
        multiplied by) and 'Nl' (passed to compute_binary_labels)
    midi_dir: passed to load_data
    verbose: accepted but not used here

    returns the scaled data frames and their binary labels as garrays,
    concatenated along axis 1
    '''
    # load training data
    sequential_data, sequential_labels, num_labels = load_data(midi_dir)

    # max look-behind
    lag = max(params['Tv'], params['Th'])

    # cut the sequences into subsequences of length lag+1
    framed = frame_subseqs(lag + 1, sequential_data, sequential_labels)
    subseq_data, subseq_labels = framed

    # put training data at correct scale (in place)
    subseq_data *= params['vis_scale']
    training_data = subseq_to_frames(subseq_data)

    Nl = params['Nl']
    label_frames = subseq_to_frames(subseq_labels)
    training_labels = compute_binary_labels(label_frames, Nl)

    data_part = gp.garray(training_data)
    label_part = gp.garray(training_labels)
    return gp.concatenate((data_part, label_part), axis=1)
def mlpSoftmax_costfunc(x, *args): numClasses, inputSize, l1Size, l2Size, l3Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth, dropout_probability = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) num_weights_L3 = l3Size * (l2Size + 1) num_weights_softmax = numClasses * l3Size #x = gpu.garray(x) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) #print numClasses, l2Size theta_L2 = gpu.garray( reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1], (l2Size, l1Size + 1))) #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1)) theta_L3 = gpu.garray( reshape( x[num_weights_L2 + num_weights_L1:num_weights_L2 + num_weights_L1 + num_weights_L3], (l3Size, l2Size + 1))) theta_softmax = gpu.garray( reshape( x[num_weights_L2 + num_weights_L1 + num_weights_L3:shape(x)[0]], (numClasses, l3Size))) #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size)) theta_L1_grad = gpu.zeros(shape(theta_L1)) theta_L2_grad = gpu.zeros(shape(theta_L2)) theta_L3_grad = gpu.zeros(shape(theta_L3)) dropout_l1 = gpu.garray( bernoulli.rvs(dropout_probability, size=(l1Size + 1, numCases))) dropout_l2 = gpu.garray( bernoulli.rvs(dropout_probability, size=(l2Size + 1, numCases))) dropout_l3 = gpu.garray( bernoulli.rvs(dropout_probability, size=(l3Size, numCases))) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0) hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1 hidden_derivative_L1 = relu_mask_hidden1 #hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0) hidden_derivative_L1 = 
gpu.concatenate((gpu.ones( (1, numCases)), hidden_derivative_L1), axis=0) hidden_activation_L1 = gpu.concatenate( (gpu.ones((1, numCases)), hidden_activation_L1), axis=0) * dropout_l1 hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1) #hidden_activation_L2 = gpu.log(1+hidden_sum_L2.exp()) relu_mask_hidden2 = gpu.ones(shape(hidden_sum_L2)) * (hidden_sum_L2 > 0) hidden_activation_L2 = hidden_sum_L2 * relu_mask_hidden2 hidden_derivative_L2 = relu_mask_hidden2 #hidden_activation_L2 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L2), axis=0) hidden_derivative_L2 = gpu.concatenate((gpu.ones( (1, numCases)), hidden_derivative_L2), axis=0) hidden_activation_L2 = gpu.concatenate( (gpu.ones((1, numCases)), hidden_activation_L2), axis=0) * dropout_l2 hidden_sum_L3 = gpu.dot(theta_L3, hidden_activation_L2) #hidden_activation_L3 = gpu.log(1+hidden_sum_L3.exp()) relu_mask_hidden3 = gpu.ones(shape(hidden_sum_L3)) * (hidden_sum_L3 > 0) #hidden_activation_L3 = hidden_sum_L3*relu_mask_hidden3 hidden_derivative_L3 = relu_mask_hidden3 hidden_activation_L3 = hidden_sum_L3 * relu_mask_hidden3 * dropout_l3 #hidden_activation_L3 = hidden_sum_L3.logistic() * dropout_l3 hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L3) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions, axis=0) pred = predictions.argmax(axis=0) + 1 accuracy = mean(pred == labels) * 100 temp = groundTruth * gpu.log(predictions) temp = temp.as_numpy_array() temp[temp == -inf] = -200.0 temp = nan_to_num(temp) regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]] regularized_penalty_L3 = theta_L3[:, 1:shape(theta_L3)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2 regularized_penalty_L3 = regularized_penalty_L3 * 
regularized_penalty_L3 pred_cost = -1 * sum(temp) / numCases l2norm_cost = 0.5 * lambda_hidden * ( gpu.sum(regularized_penalty_L3) + gpu.sum(regularized_penalty_L2) + gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum( theta_softmax * theta_softmax) #l2norm_cost = 0 cost = pred_cost + l2norm_cost print 'Prediction Accuracy: ', accuracy, '%' print 'Multilayer Softmax Prediction Cost: ', pred_cost print 'Multilayer Softmax L2 Normalisation Cost: ', l2norm_cost print 'Multilayer Softmax Cost: ', cost print '--------------------------------------------------------------------' softmax_imd = groundTruth - predictions #theta_softmax_grad = -1*gpu.dot(softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array())))/numCases theta_softmax_grad = -1 * gpu.dot( softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array())) ) / numCases + lambda_softmax * theta_softmax deltaOut = -softmax_imd delta_L3_imd = gpu.dot( gpu.garray(transpose(theta_softmax.as_numpy_array())), deltaOut) delta_L3_imd2 = delta_L3_imd * hidden_derivative_L3 #delta_L3_imd2 = (delta_L3_imd * hidden_activation_L3) * (1-hidden_activation_L3) delta_L3 = gpu.dot( delta_L3_imd2, gpu.garray(transpose(hidden_activation_L2.as_numpy_array()))) theta_L3_grad += delta_L3 delta_L2_imd = gpu.dot(gpu.garray(transpose(theta_L3.as_numpy_array())), delta_L3_imd2) delta_L2_imd2 = delta_L2_imd * hidden_derivative_L2 delta_L2_imd2 = delta_L2_imd2[1:shape(delta_L2_imd2)[0] + 1, :] delta_L2 = gpu.dot( delta_L2_imd2, gpu.garray(transpose(hidden_activation_L1.as_numpy_array()))) theta_L2_grad += delta_L2 delta_L1_imd = gpu.dot(gpu.garray(transpose(theta_L2.as_numpy_array())), delta_L2_imd2) delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1 delta_L1_imd2 = delta_L1_imd2[1:shape(delta_L1_imd2)[0] + 1, :] delta_L1 = gpu.dot(delta_L1_imd2, gpu.garray(transpose(inputs.as_numpy_array()))) theta_L1_grad += delta_L1 theta_L1_grad = theta_L1_grad / numCases theta_L2_grad = theta_L2_grad / numCases 
theta_L3_grad = theta_L3_grad / numCases theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape( theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] = theta_L2_grad[:, 1:shape( theta_L2_grad)[1]] + theta_L2[:, 1:shape(theta_L2)[1]] * lambda_hidden theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] = theta_L3_grad[:, 1:shape( theta_L3_grad)[1]] + theta_L3[:, 1:shape(theta_L3)[1]] * lambda_hidden theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1) theta_L2_grad = reshape(theta_L2_grad.as_numpy_array(), num_weights_L2) theta_L3_grad = reshape(theta_L3_grad.as_numpy_array(), num_weights_L3) theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(), num_weights_softmax) del inputs del theta_L1 del theta_L2 del theta_L3 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_L2 del hidden_activation_L2 del hidden_activation_L3 del hidden_sum_L3 del hidden_sum_softmax del predictions del temp del softmax_imd del deltaOut del delta_L3_imd del delta_L3_imd2 del delta_L3 del delta_L2_imd del delta_L2_imd2 del delta_L2 del delta_L1_imd del delta_L1_imd2 del delta_L1 #del regularized_penalty_L1 #del regularized_penalty_L2 gpu.free_reuse_cache() return cost, hstack( (theta_L1_grad, theta_L2_grad, theta_L3_grad, theta_softmax_grad))
def bias(X, bias_val=1.0):
    """Append a bias column of magnitude bias_val to X.

    X is indexed (row, column); the result has one extra trailing column
    whose entries all equal bias_val.
    """
    # Scale the column of ones: previously bias_val was accepted but
    # ignored, so the appended column was always 1.0.
    bias_col = bias_val * gp.ones((X.shape[0], 1))
    return gp.concatenate((X, bias_col), axis=1)
def vector_weights(self, Ws=None):
    """Get vectorized form of weights in Ws (or current net weights).

    Each array in Ws is reshaped to a column of W.size entries and the
    columns are stacked along axis 0.  When Ws is None or empty, the
    arrays returned by self.layer_weights() are used instead.
    """
    # None replaces the former mutable [] default; an explicitly passed
    # empty sequence still falls back to the current weights as before.
    if Ws is None or len(Ws) == 0:
        Ws = self.layer_weights()
    Wv = [W.reshape((W.size, 1)) for W in Ws]
    return gp.concatenate(Wv, axis=0)
def computeGrads2(self, a, ja, diffgrad, recf):
    """Back-propagate through every modality stack and the joint stack.

    a        -- per-modality list of activation lists; a[m][0] and a[m][-1]
                are handed to computeDlast, a[m][-1 - i] is used on the
                reconstruction side and a[m][i - 1] on the latent side
    ja       -- activation list of the joint stack, indexed the same way
    diffgrad -- gradient arriving at the top code, or None to skip that
                term
    recf     -- factor passed to computeDlast; the reconstruction part is
                only back-propagated when recf > 0

    Returns (grad, jgrad): grad[m] is the gradient list of modality m and
    jgrad that of the joint stack.  Entries are appended in
    (bias, weight) pairs while walking the layers and each list is
    reversed before returning.
    """
    # assert(recf==0)
    aes = []
    grad = []
    d = []
    for m in xrange(self.modalsCnt):
        aes.append(self.saes[m].ae)
        grad.append([])
        # one delta slot per layer; stays the int 0 until it is computed
        d.append([0] * self.depth)
    jaes = self.jsae.ae
    jgrad = []
    jd = [0] * self.jdepth
    topidx = self.depth - 1
    topjidx = self.jdepth - 1
    if recf > 0:
        for m in xrange(self.modalsCnt):
            #compute derivatives of reconstruction layers from L_r for saes
            d[m][0] = aes[m][1].computeDlast(a[m][0], a[m][-1], recf)
            for i in range(1, self.depth):
                grad[m].append(aes[m][i].getbGradient(d[m][i - 1]))
                grad[m].append(aes[m][i].getWGradient(
                    d[m][i - 1], a[m][-1 - i], aes[m][i].W2))
                if i + 1 < self.depth:
                    d[m][i] = aes[m][i + 1].computeD(
                        a[m][-1 - i], d[m][i - 1], aes[m][i].W2)
            # delta at the top code, computed once the loop above is done
            d[m][topidx] = aes[m][topidx].computeD(a[m][-1 - topidx],
                                                   d[m][topidx - 1],
                                                   aes[m][topidx].W2)
            # d[m][topidx]=gp.dot(d[m][topidx-1],aes[m][topidx].W2.T)
        #compute derivatives of reconstruction layers from L_r for jsae
        if self.has_joint:
            # the joint stack starts from the modalities' top deltas,
            # concatenated along axis 1
            jd[0] = gp.concatenate(tuple(e[self.depth - 1] for e in d),
                                   axis=1)
            # jd[0] = jaes[1].computeDlast(ja[0],ja[-1],recf)
            for i in range(1, self.jdepth):
                jgrad.append(jaes[i].getbGradient(jd[i - 1]))
                jgrad.append(jaes[i].getWGradient(jd[i - 1], ja[-1 - i],
                                                  jaes[i].W2))
                if i + 1 < self.jdepth:
                    jd[i] = jaes[i + 1].computeD(ja[-1 - i], jd[i - 1],
                                                 jaes[i].W2)
            jd[topjidx] = gp.dot(jd[topjidx - 1], jaes[topjidx].W2.T)
    #add diffgrad to generative loss
    if self.has_joint:
        if diffgrad is not None:
            # diffgrad plus the term sparsityFactor * 2x / (1 + x^2),
            # both scaled by the activation gradient at the joint top
            jd[topjidx] += (diffgrad + self.sparsityFactor *
                            (2 * ja[topjidx] / (1 + ja[topjidx] * ja[topjidx]))
                            ) * jaes[topjidx].getActivationGradient(
                                ja[topjidx])
        #backprop in jsae
        for i in range(self.jdepth - 1, 0, -1):
            #compute derivatives of latent layers
            jgrad.append(jaes[i].getbGradient(jd[i]))
            jgrad.append(jaes[i].getWGradient(jd[i], ja[i - 1], jaes[i].W1))
            if i > 1:
                jd[i - 1] = jaes[i - 1].computeD(
                    ja[i - 1], jd[i], jaes[i].W1) + self.sparsityFactor * (
                        2 * ja[i - 1] / (1 + ja[i - 1] * ja[i - 1])
                    ) * jaes[topjidx].getActivationGradient(ja[i - 1])
    #propagate to isae and tsae
    if diffgrad is not None:
        if self.has_joint:
            transD = aes[0][topidx].computeD((ja[0]), jd[1], (jaes[1].W1))
        else:
            transD = (diffgrad) * aes[0][topidx].getActivationGradient(
                gp.concatenate(tuple(e[topidx] for e in a), axis=1))
            # no sparsity
    for m in xrange(self.modalsCnt):
        if diffgrad is not None:
            #combine derivatives from L_d
            # columns dims[m]:dims[m + 1] of transD belong to modality m
            d[m][topidx] += transD[:, self.dims[m]:self.dims[m + 1]]
        # NOTE(review): the nesting of this line was reconstructed from
        # flattened source; confirm that it belongs outside the `if` above.
        d[m][topidx] *= aes[m][topidx].getActivationGradient(a[m][topidx])
        for i in range(self.depth - 1, 0, -1):
            #compute derivatives of latent layers
            grad[m].append(aes[m][i].getbGradient(d[m][i]))
            grad[m].append(aes[m][i].getWGradient(d[m][i], a[m][i - 1],
                                                  aes[m][i].W1))
            if i > 1:
                d[m][i - 1] = aes[m][i - 1].computeD(
                    a[m][i - 1], d[m][i], aes[m][i].W1)
    # entries were appended from the last-visited layer backwards; flip
    # each list before returning
    for m in xrange(self.modalsCnt):
        grad[m].reverse()
    jgrad.reverse()
    return grad, jgrad
def getSinglePathGrad2(self, a, ja, sim, other, recf, sim_diff_factor,
                       dis_diff_factor):
    """Gradients for one element of a pair.

    a:  per-modality activation lists
    ja: activation list of the joint stack (read only when
        self.has_joint is set)
    sim: truthy when the pair should be pulled together, falsy when it
        should be pushed apart
    other: top-level code of the other element of the pair
    recf: reconstruction factor; reconstruction losses are only computed
        when recf > 0
    sim_diff_factor / dis_diff_factor: weight of the distance gradient for
        similar / dissimilar pairs; when both are 0 no distance gradient
        is produced

    Returns (g, jg, recloss): g and jg come from computeGrads2, recloss
    holds one reconstruction loss per modality (0 when recf <= 0).
    """
    recloss = [0] * self.modalsCnt
    if recf > 0:
        for m in xrange(self.modalsCnt):
            recloss[m] = self.saes[m].ae[1].getErrorLoss(
                a[m][0], a[m][-1], recf)

    if sim_diff_factor == 0 and dis_diff_factor == 0:
        diffgrad = None
    else:
        # This element's code: the joint top layer if there is one,
        # otherwise the modalities' top layers side by side.
        if self.has_joint:
            npj = ja[self.jdepth - 1]
        else:
            npj = gp.concatenate(tuple(e[self.depth - 1] for e in a),
                                 axis=1)
        npo = other

        # Row-wise Euclidean norms.  `other` must be normalised by its OWN
        # norm; the previous code reused the norm of npj here.
        jsum = ((npj**2).sum(axis=1))**0.5
        osum = ((npo**2).sum(axis=1))**0.5
        nj = npj / jsum[:, gp.newaxis]
        no = npo / osum[:, gp.newaxis]

        # Gradient of the cosine distance (1 - nj . no) with respect to
        # npj: ((nj . no) * nj - no) / |npj|.
        cosine = gp.sum(nj * no, axis=1)
        tmp = cosine.reshape(cosine.shape + (1, ))
        tmp = (tmp * nj - no) / jsum[:, gp.newaxis]
        dist = 1 - cosine

        if sim:
            # similar pairs contribute only when their distance is above
            # 0.034
            active = dist > 0.034
            factor = sim_diff_factor
        else:
            # dissimilar pairs are pushed apart (sign flipped) and only
            # when their distance is below 0.1
            tmp = -1 * tmp
            active = dist < 0.1
            factor = dis_diff_factor

        diffgrad = gp.zeros(nj.shape)
        for i in xrange(self.batchsize):
            if active[i]:
                diffgrad[i, :] = tmp[i, :]
        diffgrad = factor * diffgrad / self.batchsize

    g, jg = self.computeGrads2(a, ja, diffgrad, recf)
    return g, jg, recloss
def trainOnePair(self, bat1, bat2, sim, epoch, recf, sim_diffcost,
                 dis_diffcost):
    """
    Compute the summed gradients for one pair of multi-modal elements.

    bat1, bat2: per-modality input batches of the two elements
    sim: whether the pair belongs to the similar set
    epoch: accepted but not used here
    recf: reconstruction factor
    sim_diffcost, dis_diffcost: distance-gradient weights forwarded to
        getSinglePathGrad2

    Returns (perf, g, jg): perf is np.array([sim, diff loss]), g holds the
    per-modality gradients of both elements added together and jg the
    same for the joint stack (None without a joint stack).
    """

    def top_codes(acts):
        # Top-layer (index depth - 1) activations of every modality, side
        # by side along axis 1.
        return gp.concatenate(tuple(e[self.depth - 1] for e in acts), axis=1)

    # Forward pass of each modality stack, for both elements in turn.
    a1, a2 = [], []
    for m in xrange(self.modalsCnt):
        a1.append(self.saes[m].forward2Top(bat1[m]))
        a2.append(self.saes[m].forward2Top(bat2[m]))

    j1a = j2a = None
    if self.has_joint:
        j1inp = top_codes(a1)
        j2inp = top_codes(a2)
        j1a = self.jsae.forward(j1inp)
        j2a = self.jsae.forward(j2inp)

    # With a joint stack, each modality's slice of the joint output is
    # appended to its activation list first.  The decode pass runs either
    # way.
    for m in xrange(self.modalsCnt):
        for acts, joint_acts in ((a1, j1a), (a2, j2a)):
            if self.has_joint:
                acts[m].append(
                    joint_acts[-1][:, self.dims[m]:self.dims[m + 1]])
            self.saes[m].backward2Bottom(acts[m])

    # Each element is compared against the top code of the other one.
    if self.has_joint:
        other1 = j2a[self.jdepth - 1]
        other2 = j1a[self.jdepth - 1]
    else:
        other1 = top_codes(a2)
        other2 = top_codes(a1)

    g1, jg1, rl1 = self.getSinglePathGrad2(a1, j1a, sim, other1, recf,
                                           sim_diffcost, dis_diffcost)
    g2, jg2, rl2 = self.getSinglePathGrad2(a2, j2a, sim, other2, recf,
                                           sim_diffcost, dis_diffcost)

    # Add the gradients of both elements entry by entry.
    g = [[] for _ in g1]
    for m in xrange(self.modalsCnt):
        g[m] = [g1[m][i] + g2[m][i] for i in xrange(len(g1[m]))]

    if self.has_joint:
        jg = [jg1[i] + jg2[i] for i in xrange(len(jg1))]
    else:
        jg = None

    # these lines are just for debug:
    if self.has_joint:
        diff_loss = self.getDiffLoss(j1a[self.jdepth - 1],
                                     j2a[self.jdepth - 1])
    else:
        diff_loss = self.getDiffLoss(top_codes(a1), top_codes(a2))
    perf = [sim, diff_loss]

    return np.array(perf), g, jg