def backward(self, dEdY):
    """Back-propagate the output gradient to every input of this stage.

    Only two or three inputs in ``self.X`` are handled; any other count
    yields an empty list.  With two inputs the product is scaled by
    ``self.beta``; with three inputs ``self.X[2]`` takes that role and
    also receives a gradient of its own, computed from ``self.Z``.

    Args:
        dEdY: Gradient of the error w.r.t. this stage's output.  It is
            read as a 2-D array (``shape[0]`` and ``shape[1]``).

    Returns:
        A list with one gradient per input, in input order.  On the GPU
        path every entry is converted back with
        ``as_numpy_array(dtype='float32')``.
    """
    # Need to generalize, but now, let's assume it's the attention model.
    grads = []
    arity = len(self.X)
    if arity not in (2, 3):
        return grads

    if not self.gpu:
        # Middle axis of size one so the gradient broadcasts over X[1].
        col = dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1])
        if arity == 2:
            grads.append(self.beta * np.sum(col * self.X[1], axis=2))
            grads.append(self.beta * col * self.X[0])
        else:
            grads.append(self.X[2] * np.sum(col * self.X[1], axis=2))
            grads.append(
                self.X[2].reshape(self.X[2].shape[0], 1, 1) * col * self.X[0])
            grads.append(
                np.sum(dEdY * self.Z, axis=-1).reshape(self.X[2].shape[0], 1))
        return grads

    if arity == 2:
        col = gpu.as_garray(dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1]))
        device_grads = [
            self.beta * gpu.sum(col * self.X[1], axis=2),
            self.beta * col * self.X[0],
        ]
    else:
        flat = gpu.as_garray(dEdY)
        col = gpu.as_garray(flat.reshape(flat.shape[0], 1, flat.shape[1]))
        device_grads = [
            self.X[2] * gpu.sum(col * self.X[1], axis=2),
            self.X[2].reshape(self.X[2].shape[0], 1, 1) * col * self.X[0],
            gpu.sum(flat * self.Z, axis=-1).reshape(self.X[2].shape[0], 1),
        ]
    for device_grad in device_grads:
        grads.append(device_grad.as_numpy_array(dtype='float32'))
    return grads
def _updateWeights(self, dEdW):
    """Apply one momentum update to ``self.W``.

    Steps, each skipped when its controlling setting is not positive:
    clip the gradient norm (``gradientClip``), take the momentum step
    (``learningRate``, ``momentum``), shrink the weights
    (``weightRegConst``), then clip the weight norm (``weightClip``).

    Args:
        dEdW: Gradient of the error w.r.t. the weights.  It is rescaled
            IN PLACE when its norm exceeds ``self.gradientClip``.

    Side effects:
        Updates ``self.W`` in place and stores ``self.lastdW``; also
        records ``self.dEdWnorm`` / ``self.Wnorm`` when the matching
        clipping step runs.
    """
    # The GPU and CPU paths differ only in how the norm is evaluated.
    if self.gpu:
        def l2_norm(arr):
            return gpu.sqrt(gpu.sum(arr ** 2))
    else:
        def l2_norm(arr):
            return np.sqrt(np.sum(np.power(arr, 2)))

    if self.gradientClip > 0.0:
        self.dEdWnorm = l2_norm(dEdW)
        if self.dEdWnorm > self.gradientClip:
            dEdW *= self.gradientClip / self.dEdWnorm

    if self.learningRate > 0.0:
        self.lastdW = -self.learningRate * dEdW + \
            self.momentum * self.lastdW
        self.W += self.lastdW

    if self.weightRegConst > 0.0:
        # Weight decay, scaled by the learning rate.
        decay = self.learningRate * self.weightRegConst
        self.W -= decay * self.W

    if self.weightClip > 0.0:
        self.Wnorm = l2_norm(self.W)
        if self.Wnorm > self.weightClip:
            self.W *= self.weightClip / self.Wnorm
def simulate_step(bodies, dt_min, epsilon, dt_output, alpha):
    """Advance an N-body system forever, yielding the time at output points.

    ``bodies.r`` and ``bodies.v`` are updated in place.  The velocities
    first receive a half step; afterwards every loop iteration moves the
    positions by ``dt * v``, recomputes the velocity rates, picks a new
    step size and applies a full velocity step.

    Args:
        bodies: Object with array attributes ``r``, ``v`` and ``m``; the
            first axis of each indexes the body.
        dt_min: Lower bound on the step size.
        epsilon: Softening length; its square is added to every squared
            separation.
        dt_output: Spacing between the times at which a value is yielded.
        alpha: Passed through to ``calculate_dt``.

    Yields:
        The current simulation time each time it has reached the next
        multiple of ``dt_output``.
    """
    body_count = bodies.r.shape[0]
    rates = np.zeros_like(bodies.v)

    def refresh_rates():
        # Softened inverse-square sum over all bodies, one row per body.
        for k in range(body_count):
            offsets = bodies.r - bodies.r[k, :]
            dist_cubed = (gpu.sum(offsets**2, axis=1) + epsilon**2)**1.5
            rates[k, :] = gpu.sum(
                bodies.m[:, np.newaxis] * offsets / dist_cubed[:, np.newaxis],
                axis=0)

    def choose_dt():
        return max(calculate_dt(bodies.v, rates, body_count, alpha), dt_min)

    elapsed = 0
    emitted = 0

    # Initial half step for the velocities.
    refresh_rates()
    dt = choose_dt()
    bodies.v += 0.5 * dt * rates

    while True:
        bodies.r += dt * bodies.v
        refresh_rates()
        dt = choose_dt()
        bodies.v += dt * rates
        if emitted * dt_output <= elapsed:
            emitted += 1
            yield elapsed
            gpu.status()
        elapsed += dt
def l1svm_mia(z, targets, predict=False, error=False, addon=0):
    """Class-weighted L1 hinge loss over multiple independent attributes.

    Note: the targets here are coded as (1, -1).  Each column is weighted
    inversely to the frequency of its positive / negative labels to
    compensate for unbalanced data.

    Args:
        z: Raw scores.
        targets: +1 / -1 labels, unpacked as a 2-D ``(n, _)`` shape.
            Not used when ``predict`` is true.
        predict: When true, return the labels ``2 * (z > 0) - 1`` and
            skip the loss entirely.
        error: When true, also return ``-targets * indicator * weight``,
            the derivative of the weighted hinge loss w.r.t. ``z``.
        addon: Extra term added to the returned loss.

    Returns:
        The predicted labels if ``predict``; ``(loss, derivative)`` if
        ``error``; otherwise only the loss.
    """
    if predict:
        # argmax_t(z * t): the sign of the score selects the label.
        return 2 * (z > 0) - 1

    margin = 1 - z * targets
    violated = margin > 0
    hinge = violated * margin

    # Weights inversely proportional to class frequency, per column.
    # The +1 keeps a class that never occurs from dividing by zero.
    num_rows, _ = targets.shape
    pos_count = gpu.sum((targets + 1.) / 2, axis=0)
    neg_count = num_rows - pos_count
    neg_weight = float(num_rows) / (neg_count + 1)
    pos_weight = float(num_rows) / (pos_count + 1)
    class_weight = pos_weight * (targets > 0) + neg_weight * (targets < 0)

    weighted_hinge = gpu.sum(hinge * class_weight)
    if not error:
        return weighted_hinge + addon
    derivative = -targets * violated * class_weight
    return weighted_hinge + addon, derivative
def pt_grad(self, params, inpts, **kwargs):
    """Gradient of the contractive auto-encoder objective w.r.t. ``params``.

    ``params`` is a flat vector laid out as
    ``[weights (m_end entries) | hidden bias (shape[1]) | visible bias
    (shape[0])]``.  One weight matrix is used both for encoding and,
    transposed, for decoding.

    Args:
        params: Flat parameter vector.
        inpts: Input batch, unpacked as a 2-D ``(m, _)`` shape.
        **kwargs: Accepted and ignored.

    Returns:
        A vector with the same shape as ``params`` holding the gradient
        of ``self.score`` plus the ``self.cae``-weighted penalty.
    """
    grad = gzeros(params.shape)
    n_rows, _ = inpts.shape
    n_vis = self.shape[0]

    w = params[:self.m_end].reshape(self.shape)
    hddn = logistic(
        gpu.dot(inpts, w) + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, w.T) + params[-n_vis:]

    # Terms shared between the penalty value and its gradient.
    d_hddn = Dsigmoid(hddn)
    d_hddn_sq = d_hddn ** 2
    mean_d_sq = gpu.mean(d_hddn_sq, axis=0)
    w_col_sq = gpu.sum(w ** 2, axis=0)

    penalty = gpu.sum(mean_d_sq * w_col_sq)
    penalty *= self.cae

    _, delta = self.score(Z, inpts, error=True, addon=penalty)

    # Decoder contribution to the shared weights, and the visible bias.
    grad[:self.m_end] = gdot(delta.T, hddn).ravel()
    grad[-n_vis:] = delta.sum(axis=0)

    # Penalty contribution to the weights.
    cae_grad = mean_d_sq * w
    cae_grad += (gdot(inpts.T, (d_hddn_sq * (1 - 2 * hddn)))
                 / n_rows * w_col_sq)
    grad[:self.m_end] += self.cae * 2 * cae_grad.ravel()

    # Encoder contribution to the weights, and the hidden bias.
    dsc_dha = d_hddn * gdot(delta, w)
    grad[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    grad[self.m_end:-n_vis] = dsc_dha.sum(axis=0)

    # clean up: drop the large intermediates before returning
    del delta, hddn, Z
    return grad
def pt_grad(self, params, inpts, **kwargs):
    """Return d(objective)/d(params) for the contractive auto-encoder.

    Parameter layout (flat): the first ``self.m_end`` entries are the
    weight matrix of shape ``self.shape``, followed by ``self.shape[1]``
    hidden biases, with the last ``self.shape[0]`` entries being the
    visible biases.  Decoding reuses the transposed weight matrix.

    Args:
        params: Flat parameter vector.
        inpts: Input batch; its shape is unpacked as ``(m, _)``.
        **kwargs: Accepted and ignored.

    Returns:
        Gradient vector shaped like ``params``.
    """
    vis_len = self.shape[0]
    w_end = self.m_end
    batch, _ = inpts.shape
    out = gzeros(params.shape)

    weights = params[:w_end].reshape(self.shape)
    hid_bias = params[w_end:w_end + self.shape[1]]
    vis_bias = params[-vis_len:]

    hddn = logistic(gpu.dot(inpts, weights) + hid_bias)
    Z = gdot(hddn, weights.T) + vis_bias

    slope = Dsigmoid(hddn)
    slope_sq = slope ** 2
    col_energy = gpu.sum(weights ** 2, axis=0)
    mean_slope_sq = gpu.mean(slope_sq, axis=0)

    # Contractive penalty, handed to the score function as an add-on.
    cae = gpu.sum(mean_slope_sq * col_energy)
    cae *= self.cae
    _, delta = self.score(Z, inpts, error=True, addon=cae)

    out[:w_end] = gdot(delta.T, hddn).ravel()
    out[-vis_len:] = delta.sum(axis=0)

    penalty_grad = mean_slope_sq * weights
    penalty_grad += (
        gdot(inpts.T, (slope_sq * (1 - 2 * hddn))) / batch * col_energy
    )
    out[:w_end] += self.cae * 2 * penalty_grad.ravel()

    hidden_delta = slope * gdot(delta, weights)
    out[:w_end] += gdot(inpts.T, hidden_delta).ravel()
    out[w_end:-vis_len] = hidden_delta.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return out
def mlpSingleOutput1Layer_costfunc(x, *args):
    """Cost of a one-hidden-layer logistic network with a single output.

    Args:
        x: Flat parameter vector.  The first ``l1Size * (inputSize + 1)``
            entries are the hidden-layer weights (bias in column 0); the
            rest are the ``1 x (l1Size + 1)`` output weights.
        *args: ``(inputSize, l1Size, lambda_hidden, inputs, targets)``.
            ``inputs`` holds one case per column.

    Returns:
        ``sum((outputs - targets)**2) / (2 * numCases)`` plus
        ``0.5 * lambda_hidden`` times the summed squares of the non-bias
        weights of both layers.  The value is also printed.
    """
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)

    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))

    # A row of ones is prepended so column 0 of each theta acts as the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()
    output_target_diff = (outputs - targets)**2

    # Bias columns are excluded from the penalty.
    regularized_penalty_output = theta_output[:, 1:shape(theta_output)[1]]
    regularized_penalty_output = \
        regularized_penalty_output * regularized_penalty_output
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1

    cost = gpu.sum(output_target_diff) / (2 * numCases) + \
        0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) +
                               gpu.sum(regularized_penalty_output))
    # Single pre-formatted argument: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Multilayer Preceptron Cost: %s' % (cost,))

    # Drop the device arrays before asking gnumpy to free its reuse cache.
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    del regularized_penalty_output
    del regularized_penalty_L1
    gpu.free_reuse_cache()
    return cost
def _cd_update_terms(self, vis, model_vis, model_p_vis):
    """Return the contrastive-divergence update terms.

    Args:
        vis: Visible states from the data.
        model_vis: Visible states sampled from the model.
        model_p_vis: Probability of the visible units being active under
            the model.  Kept for interface compatibility; the update rule
            used here does not read it.

    Returns:
        ``(weights update, visible bias update, hidden bias update)``,
        each divided by the number of samples ``vis.shape[0]``.
    """
    # Each hidden-probability term is needed for both the weight and the
    # hidden-bias update, so evaluate it once.
    # NOTE(review): assumes p_hid_given_vis is deterministic -- confirm.
    data_p_hid = self.p_hid_given_vis(vis)
    model_p_hid = self.p_hid_given_vis(model_vis)

    # deep learning update rule: data statistics minus model statistics
    dweights = gp.dot(vis.T, data_p_hid) - gp.dot(model_vis.T, model_p_hid)
    dbias_vis = gp.sum(vis, axis=0) - gp.sum(model_vis, axis=0)
    dbias_hid = gp.sum(data_p_hid, axis=0) - gp.sum(model_p_hid, axis=0)

    n_samples = vis.shape[0]
    return (dweights / n_samples,
            dbias_vis / n_samples,
            dbias_hid / n_samples)
def _updateWeights(self, dEdW):
    """Update ``self.W`` from the gradient ``dEdW`` using momentum.

    In order: optional gradient-norm clipping, the momentum step,
    optional weight decay and optional weight-norm clipping.  A step is
    skipped when its setting (``gradientClip``, ``learningRate``,
    ``weightRegConst``, ``weightClip``) is not positive.

    Args:
        dEdW: Gradient w.r.t. the weights; scaled IN PLACE if its norm
            exceeds ``self.gradientClip``.

    Side effects:
        Mutates ``self.W`` in place, replaces ``self.lastdW`` and records
        ``self.dEdWnorm`` / ``self.Wnorm`` for the clipping steps that run.
    """
    on_gpu = self.gpu

    # 1. Gradient clipping.
    if self.gradientClip > 0.0:
        if on_gpu:
            self.dEdWnorm = gpu.sqrt(gpu.sum(dEdW**2))
        else:
            self.dEdWnorm = np.sqrt(np.sum(np.power(dEdW, 2)))
        if self.dEdWnorm > self.gradientClip:
            dEdW *= self.gradientClip / self.dEdWnorm

    # 2. Momentum step.
    if self.learningRate > 0.0:
        step = -self.learningRate * dEdW + self.momentum * self.lastdW
        self.lastdW = step
        self.W += self.lastdW

    # 3. Weight decay, scaled by the learning rate.
    if self.weightRegConst > 0.0:
        shrink = self.learningRate * self.weightRegConst
        self.W -= shrink * self.W

    # 4. Weight-norm clipping.
    if self.weightClip > 0.0:
        if on_gpu:
            self.Wnorm = gpu.sqrt(gpu.sum(self.W**2))
        else:
            self.Wnorm = np.sqrt(np.sum(np.power(self.W, 2)))
        if self.Wnorm > self.weightClip:
            self.W *= self.weightClip / self.Wnorm
def costfunc_gpu_ReLU(x, *args):
    """Cost of a sparse auto-encoder with softplus hidden units.

    The hidden activation is ``log(1 + exp(.))`` and the output layer is
    linear; the reconstruction target is the input itself.

    Args:
        x: Flat parameter vector: ``num_hidden x (num_input + 1)`` hidden
            weights followed by ``num_output x (num_hidden + 1)`` output
            weights (bias in column 0 of each).
        *args: ``(num_input, num_hidden, num_output, inputs, lambda_val,
            sparsityParam, beta)``.  ``inputs`` holds one case per column.

    Returns:
        Reconstruction error ``/ (2 * nData)`` plus ``0.5 * lambda_val``
        times the squared non-bias weights plus ``beta`` times the KL
        sparsity term.  The value is also printed.
    """
    num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    # Row of ones prepended so column 0 of each weight matrix is the bias.
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = gpu.log(1 + hidden_sum.exp())
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # Bias columns are excluded from the weight penalty.
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - inputs) * (output - inputs)
    # NOTE(review): the softplus mean is not bounded by 1, so
    # log((1 - rho) / (1 - p_avg)) may see a non-positive argument -- confirm
    # this is acceptable for the data in use.
    KL = gpu.sum(sparsityParam * gpu.log(sparsityParam / p_avg) +
                 (1 - sparsityParam) * gpu.log((1 - sparsityParam) / (1 - p_avg)))
    cost = gpu.sum(output_target_diff) / (2 * nData) + \
        0.5 * lambda_val * (gpu.sum(regularized_penalty1) +
                            gpu.sum(regularized_penalty2)) + beta * KL
    # Single pre-formatted argument: identical text under the Python 2 print
    # statement and the Python 3 print function.
    print('ReLU Linear Decoder Cost:  %s' % (cost,))
    return cost
def simulate_step(bodies, dt_min, G, epsilon, dt_output, alpha):
    """Integrate an N-body system indefinitely, yielding output times.

    ``bodies.r`` and ``bodies.v`` are modified in place.  After an
    initial half velocity step, each iteration advances the positions,
    recomputes the velocity rates and the step size, then applies a full
    velocity step.

    Args:
        bodies: Object exposing arrays ``r``, ``v`` and ``m`` whose first
            axis indexes the body.
        dt_min: Smallest step size that will be used.
        G: Constant multiplying every computed velocity rate.
        epsilon: Softening length, squared and added to each squared
            separation.
        dt_output: Spacing between yielded times.
        alpha: Forwarded to ``calculate_dt``.

    Yields:
        The simulation time whenever it has reached the next multiple of
        ``dt_output``.
    """
    count = bodies.r.shape[0]
    accel = np.zeros_like(bodies.v)

    def update_accel():
        # Fill accel row by row from the softened pairwise separations.
        for idx in range(count):
            separation = bodies.r - bodies.r[idx, :]
            denom = (gpu.sum(separation**2, axis=1) + epsilon**2)**1.5
            accel[idx, :] = G * gpu.sum(
                bodies.m[:, np.newaxis] * separation / denom[:, np.newaxis],
                axis=0)

    def next_dt():
        return max(calculate_dt(bodies.v, accel, count, alpha), dt_min)

    now = 0
    outputs_done = 0

    update_accel()
    dt = next_dt()
    bodies.v += 0.5 * dt * accel  # initial half step

    while True:
        bodies.r += dt * bodies.v
        update_accel()
        dt = next_dt()
        bodies.v += dt * accel
        if outputs_done * dt_output <= now:
            outputs_done += 1
            yield now
            gpu.status()
        now += dt
def forward(self):
    """Perform a forward pass to calculate the activation (objective).

    Stores in ``self.objective`` the cross-entropy between the target
    port and the output port, divided by the number of examples
    (``shape[0]`` of the output).

    Each port is read once; the original re-read them for every term.
    """
    outputs = self.output_port.getOutput()
    targets = self.target_port.getOutput()
    numExamples = outputs.shape[0]

    objective = -gpu.sum(gpu.garray(targets) * gpu.log(gpu.garray(outputs)))
    # 1.000001 rather than 1.0 keeps the log argument positive when an
    # output is exactly 1.
    # NOTE(review): the first term has no matching guard for an output of
    # exactly 0 -- confirm whether that can occur.
    objective += -gpu.sum((1.0 - targets) * (gpu.log(1.000001 - outputs)))
    objective /= numExamples
    self.objective = objective
def mlpSoftmax_costfunc(x, *args):
    """Cost of a two-hidden-layer logistic network with a softmax output.

    Args:
        x: Flat parameter vector: ``l1Size x (inputSize + 1)`` weights,
            then ``l2Size x (l1Size + 1)`` weights (bias in column 0 of
            both), then the ``numClasses x l2Size`` softmax weights.
        *args: ``(numClasses, inputSize, l1Size, l2Size, lambda_softmax,
            lambda_hidden, inputs, labels, groundTruth)``.  ``inputs``
            holds one case per column; ``labels`` is not used here.

    Returns:
        ``-sum(groundTruth * log(predictions)) / numCases`` plus the
        weight penalties of the hidden layers (``lambda_hidden``, bias
        columns excluded) and of the softmax layer (``lambda_softmax``).
        The value is also printed.
    """
    numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]],
                (numClasses, l2Size)))

    # Row of ones prepended so column 0 of each hidden theta is the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()

    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    # Subtract the per-column maximum before exponentiating.
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    temp = groundTruth * gpu.log(predictions)

    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2

    cost = -1 * gpu.sum(temp) / numCases + 0.5 * lambda_hidden * (
        gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_L2)
    ) + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax)
    # Single pre-formatted argument: identical text under the Python 2 print
    # statement and the Python 3 print function.
    print('Multilayer Softmax Cost: %s' % (cost,))

    # Drop the device arrays before asking gnumpy to free its reuse cache.
    del inputs
    del theta_L1
    del theta_L2
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
def pt_score(self, params, inpts, **kwargs):
    """Score the contractive auto-encoder on ``inpts``.

    ``params`` is a flat vector: ``self.m_end`` weights (reshaped to
    ``self.shape``), then ``self.shape[1]`` hidden biases, with the last
    ``self.shape[0]`` entries being the visible biases.

    Args:
        params: Flat parameter vector.
        inpts: Input batch.
        **kwargs: Accepted and ignored.

    Returns:
        ``np.array([score, penalty])`` where ``penalty`` is the
        ``self.cae``-weighted contractive term that was passed to
        ``self.score`` as ``addon``.
    """
    weights = params[: self.m_end].reshape(self.shape)
    hid_bias = params[self.m_end : self.m_end + self.shape[1]]
    vis_bias = params[-self.shape[0] :]

    hddn = logistic(gpu.dot(inpts, weights) + hid_bias)
    Z = gdot(hddn, weights.T) + vis_bias

    mean_slope_sq = gpu.mean(Dsigmoid(hddn) ** 2, axis=0)
    col_energy = gpu.sum(weights ** 2, axis=0)
    cae = gpu.sum(mean_slope_sq * col_energy)
    cae *= self.cae

    sc = self.score(Z, inpts, addon=cae)
    return np.array([sc, cae])
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """Predict class probabilities for ``x`` with a supervised DBN.

    Uses the exact method mentioned in section 6.2 of Hinton, Osindero,
    Teh 2006.  The free energy formula is taken from
    http://deeplearning.net/tutorial/rbm.html

    Args:
        ws_vh: Per-layer weight matrices; the last one is the top RBM.
        ws_v: Per-layer visible biases; only the last entry is read.
        ws_h: Per-layer hidden biases.
        x: Input data (NxD matrix).

    Returns:
        An ``N x K`` array of class probabilities, where ``K`` is the
        number of label units of the top-level RBM.
    """
    n_rbms = len(ws_vh)
    n_cases = x.shape[0]

    # Deterministic bottom-up pass: activations are propagated, not
    # stochastic samples, up to the visible layer of the top RBM.
    top_in = x.T
    for depth in range(n_rbms - 1):
        pre_act = gnp.dot(ws_vh[depth].T, top_in) + ws_h[depth]
        top_in = gnp.logistic(pre_act)

    # The top RBM's visible layer is [label units | penultimate hiddens].
    n_top_vis = ws_vh[-1].shape[0]
    n_penultimate = top_in.shape[0]
    n_labels = n_top_vis - n_penultimate

    one_hot = gnp.zeros((n_labels, n_cases))
    # Holds the NEGATIVE free energy of each assumed label.
    neg_free_energy = gnp.zeros((n_cases, n_labels))
    for label in range(n_labels):
        # Clamp the assumed label for every case.
        one_hot[label, :] = 1.0
        v = gnp.concatenate((one_hot, top_in))
        bias_term = gnp.dot(ws_v[-1].T, v)
        hid_in = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        hid_term = gnp.sum(gnp.log(gnp.exp(hid_in) + 1.0), axis=0)
        neg_free_energy[:, label] = bias_term + hid_term
        # Clear the label units for the next iteration.
        one_hot[:, :] = 0.0

    # Subtract the row maximum before exponentiating (sum-exp trick).
    row_max = gnp.max(neg_free_energy, axis=1)[:, gnp.newaxis]
    unnormalized = gnp.exp(neg_free_energy - row_max)
    pred_y = unnormalized / gnp.sum(unnormalized, axis=1)[:, gnp.newaxis]
    return pred_y
def calculate_dt(v, delta_v, N_bodies, alpha):
    """Return a step size derived from the body with the largest ``delta_v``.

    For each of the first ``N_bodies`` rows of ``delta_v`` the sum of
    squared components is computed; the row with the largest sum selects
    the body.  The result is ``alpha * |v_k| / a_max`` where ``|v_k|`` is
    that body's speed and ``a_max`` is the largest sum.

    NOTE(review): ``a_max`` is a sum of squares (no square root is
    taken), so the denominator is a squared magnitude -- confirm that is
    intended.

    Args:
        v: Velocities, one row per body.
        delta_v: Velocity rates, one row per body.
        N_bodies: Number of rows to examine.
        alpha: Scale factor applied to the result.

    Raises:
        ValueError: If no row has a positive sum of squares (including
            ``N_bodies == 0``).  The original code failed with an
            UnboundLocalError in that situation.
    """
    # Convert once; the original re-converted delta_v on every iteration.
    delta_v = gpu.garray(delta_v)
    v = gpu.garray(v)

    a_max = 0.
    a_max_index = None
    for i in range(N_bodies):
        a = gpu.sum(delta_v[i, :]**2)
        if a > a_max:
            a_max = a
            a_max_index = i

    if a_max_index is None:
        raise ValueError(
            "calculate_dt: no body has a non-zero delta_v; "
            "the step size is undefined")

    v_mag = gpu.sqrt(gpu.sum(v[a_max_index, :]**2))
    return alpha * v_mag / a_max
def calculate_dt(v, delta_v, N_bodies, alpha):
    """Compute a step size from the body with the largest velocity rate.

    The sum of squared components of each of the first ``N_bodies`` rows
    of ``delta_v`` is compared; the largest one (``a_max``) picks body
    ``k`` and the function returns ``alpha * |v_k| / a_max``.

    NOTE(review): ``a_max`` is the squared magnitude of the rate, not
    the magnitude itself -- confirm that is intended.

    Args:
        v: Velocities, one row per body.
        delta_v: Velocity rates, one row per body.
        N_bodies: Number of rows to examine.
        alpha: Scale factor applied to the result.

    Raises:
        ValueError: If every examined row of ``delta_v`` has a zero sum
            of squares, or ``N_bodies`` is 0.  Previously this surfaced
            as an UnboundLocalError.
    """
    # Both conversions are loop-invariant, so do them up front.
    rates = gpu.garray(delta_v)
    velocities = gpu.garray(v)

    a_max = 0.
    a_max_index = None
    for i in range(N_bodies):
        a = gpu.sum(rates[i, :]**2)
        if a > a_max:
            a_max, a_max_index = a, i

    if a_max_index is None:
        raise ValueError(
            "calculate_dt: no body has a non-zero delta_v; "
            "the step size is undefined")

    v_mag = gpu.sqrt(gpu.sum(velocities[a_max_index, :]**2))
    return alpha * v_mag / a_max
def ssd(z, targets, weight=0.5, predict=False, error=False, addon=0):
    """Weighted sum of squared differences between ``z`` and ``targets``.

    Args:
        z: Model output, unpacked as a 2-D ``(n, m)`` shape unless
            ``predict`` is true.
        targets: Values to compare against.
        weight: Multiplier on the squared error.
        predict: When true, return ``z`` unchanged.
        error: When true, also return the first derivative
            ``2 * weight * (z - targets) / n``.
        addon: Extra term added to the cost.

    Returns:
        ``z`` if ``predict``; ``(cost, derivative)`` if ``error``;
        otherwise the cost ``weight * sum(err**2) / n + addon``.
    """
    if predict:
        return z

    num_rows, _ = z.shape
    residual = z - targets
    cost = weight * gpu.sum(residual**2) / num_rows + addon
    if not error:
        # only the reconstruction error
        return cost
    # rec. error + first deriv
    return cost, 2. * weight * residual / num_rows
def forward(self):
    """Perform a forward pass to calculate the activation (objective).

    Computes the cross-entropy between the target port and the output
    port, divides it by the number of examples (``shape[0]`` of the
    output) and stores the result in ``self.objective``.

    Both ports are read exactly once; the original queried them again
    for every term of the sum.
    """
    predicted = self.output_port.getOutput()
    expected = self.target_port.getOutput()
    numExamples = predicted.shape[0]

    positive_term = -gpu.sum(
        gpu.garray(expected) * gpu.log(gpu.garray(predicted)))
    # The 1.000001 offset keeps the log argument positive when a
    # prediction is exactly 1.
    # NOTE(review): there is no equivalent guard for a prediction of
    # exactly 0 in the first term -- confirm whether that can occur.
    negative_term = -gpu.sum(
        (1.0 - expected) * (gpu.log(1.000001 - predicted)))

    total = positive_term
    total += negative_term
    total /= numExamples
    self.objective = total
def fine_tuning_cost_gpu(x, *args):
    """Reconstruction cost of a six-layer logistic auto-encoder.

    Args:
        x: Flat parameter vector holding six weight matrices back to
            back, each of shape ``(fan_out, fan_in + 1)`` with the bias
            in column 0.  Layer widths run ``inputSize -> l1Size ->
            l2Size -> l3Size -> l4Size -> l5Size -> inputSize``; the last
            matrix takes whatever remains of ``x``.
        *args: ``(inputSize, l1Size, l2Size, l3Size, l4Size, l5Size,
            lambda_val, inputs)``.  ``inputs`` holds one case per column.

    Returns:
        ``sum((outputs - inputs)**2) / (2 * nData)`` plus
        ``0.5 * lambda_val`` times the squared non-bias weights of the
        last three layers only.  The value is also printed.
    """
    inputSize, l1Size, l2Size, l3Size, l4Size, l5Size, lambda_val, inputs = args

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)

    # Slice the first five matrices by walking a running offset.
    widths = [inputSize, l1Size, l2Size, l3Size, l4Size, l5Size]
    weights = []
    start = 0
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        stop = start + fan_out * (fan_in + 1)
        weights.append(x[start:stop].reshape((fan_out, fan_in + 1)))
        start = stop
    # The output layer uses everything that is left.
    weights.append(
        x[start:shape(x)[0]].reshape((inputSize, l5Size + 1)))

    nData = shape(inputs)[1]
    # A row of ones is prepended before every layer to feed the bias column.
    activation = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    for layer_weights in weights[:-1]:
        activation = gpu.dot(layer_weights, activation).logistic()
        activation = gpu.concatenate(
            (gpu.ones((1, nData)), activation), axis=0)
    outputs = gpu.dot(weights[-1], activation).logistic()

    # Only layers 4-6 are penalised; bias columns are excluded.
    weights4, weights5, weights6 = weights[3], weights[4], weights[5]
    regularized_penalty4 = weights4[:, 1:shape(weights4)[1]] ** 2
    regularized_penalty5 = weights5[:, 1:shape(weights5)[1]] ** 2
    regularized_penalty6 = weights6[:, 1:shape(weights6)[1]] ** 2

    output_target_diff = (outputs - inputs)**2
    cost = gpu.sum(output_target_diff) / (2 * nData) + \
        0.5 * lambda_val * (gpu.sum(regularized_penalty4) +
                            gpu.sum(regularized_penalty5) +
                            gpu.sum(regularized_penalty6))
    # Single pre-formatted argument: identical text under the Python 2 print
    # statement and the Python 3 print function.
    print('Fine Tuning Cost:  %s' % (cost,))
    return cost
def costfunc_gpu(x, *args):
    """Cost of a sparse denoising auto-encoder with a linear decoder.

    The hidden layer is logistic, the output layer is linear, and the
    reconstruction is compared against ``noNoiseData`` rather than the
    (possibly corrupted) ``inputs``.

    Args:
        x: Flat parameter vector: ``num_hidden x (num_input + 1)`` hidden
            weights followed by ``num_output x (num_hidden + 1)`` output
            weights (bias in column 0 of each).
        *args: ``(num_input, num_hidden, num_output, inputs, noNoiseData,
            lambda_val, sparsityParam, beta)``.  ``inputs`` holds one
            case per column.

    Returns:
        Reconstruction error ``/ (2 * nData)`` plus ``0.5 * lambda_val``
        times the squared non-bias weights plus ``beta`` times the KL
        sparsity term.  The value is also printed.
    """
    num_input, num_hidden, num_output, inputs, noNoiseData, lambda_val, sparsityParam, beta = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    noNoiseData = gpu.garray(noNoiseData)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    # Row of ones prepended so column 0 of each weight matrix is the bias.
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # Bias columns are excluded from the weight penalty.
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - noNoiseData) * (output - noNoiseData)
    KL = gpu.sum(sparsityParam * gpu.log(sparsityParam / p_avg) +
                 (1 - sparsityParam) * gpu.log((1 - sparsityParam) / (1 - p_avg)))
    cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * (
        gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)) + beta * KL
    # Single pre-formatted argument: identical text under the Python 2 print
    # statement and the Python 3 print function.
    print('GPU Linear Denoising Decoder Cost:  %s' % (cost,))

    # Drop the device arrays before asking gnumpy to free its reuse cache.
    del x
    del inputs
    del noNoiseData
    del data
    del hidden_sum
    del hidden_activation
    del p_avg
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
def pt_score(self, params, inpts, **kwargs):
    """Evaluate the contractive auto-encoder objective on ``inpts``.

    The flat ``params`` vector is read as ``self.m_end`` weights
    (reshaped to ``self.shape``), ``self.shape[1]`` hidden biases, and a
    trailing block of ``self.shape[0]`` visible biases.

    Args:
        params: Flat parameter vector.
        inpts: Input batch.
        **kwargs: Accepted and ignored.

    Returns:
        A two-element numpy array ``[score, penalty]``; ``penalty`` is
        the contractive term scaled by ``self.cae`` and is also what
        ``self.score`` receives as ``addon``.
    """
    w = params[:self.m_end].reshape(self.shape)

    # Encode, then decode with the transposed weights.
    encoded = gpu.dot(inpts, w) + \
        params[self.m_end:self.m_end + self.shape[1]]
    hddn = logistic(encoded)
    Z = gdot(hddn, w.T) + params[-self.shape[0]:]

    # Contractive penalty.
    squared_slope = Dsigmoid(hddn)**2
    cae = gpu.sum(gpu.mean(squared_slope, axis=0) * gpu.sum(w**2, axis=0))
    cae *= self.cae

    return np.array([self.score(Z, inpts, addon=cae), cae])
def norm_trans(X, mode='ff'):
    """Compute feedforward and backprop for unit-normalization.

    Args:
        X: For ``mode='ff'`` the array whose rows are normalized.  For
            ``mode='bp'`` a mapping with keys ``'X'`` (the feedforward
            input), ``'A'`` (read as the feedforward output) and
            ``'dLdA'`` (the gradient w.r.t. that output).
        mode: ``'ff'`` for the forward pass, ``'bp'`` for the backward
            pass.

    Returns:
        The row-normalized array (``'ff'``) or the gradient w.r.t. the
        feedforward input (``'bp'``).

    Raises:
        ValueError: If ``mode`` is neither ``'ff'`` nor ``'bp'``.  The
            original fell through to ``return F`` with ``F`` unbound.
    """
    if mode not in ('ff', 'bp'):
        raise ValueError(
            "norm_trans: mode must be 'ff' or 'bp', got %r" % (mode,))

    # Small constant under the square root so an all-zero row does not
    # divide by zero.
    EPS = 0.00000001
    if mode == 'ff':
        N = gp.sqrt(gp.sum(X**2.0, axis=1) + EPS)
        N = N[:, gp.newaxis]
        F = X / N
    else:
        N = gp.sqrt(gp.sum(X['X']**2.0, axis=1) + EPS)
        N = N[:, gp.newaxis]
        V = X['dLdA'] * X['X']
        V = gp.sum(V, axis=1)
        V = V[:, gp.newaxis]
        F = (X['dLdA'] / N) - (X['A'] * (V / (N**2.0)))
    return F
def sig_ssd(z, targets, weight=0.5, predict=False, error=False, addon=0):
    """Sigmoid SSD: squared error between ``logistic(z)`` and ``targets``.

    Args:
        z: Pre-activation values; squashed with ``gpu.logistic`` first.
        targets: Values to compare the squashed output against.
        weight: Multiplier on the squared error.
        predict: When true, return the squashed output and nothing else.
        error: When true, also return ``2 * weight * err / n``.  That
            term is taken w.r.t. the squashed output; the slope of the
            sigmoid is not applied here.
        addon: Extra term added to the cost.

    Returns:
        The squashed output if ``predict``; ``(cost, derivative)`` if
        ``error``; otherwise the cost ``weight * sum(err**2) / n + addon``.
    """
    squashed = gpu.logistic(z)
    if predict:
        return squashed

    num_rows, _ = squashed.shape
    residual = squashed - targets
    cost = weight * gpu.sum(residual**2) / num_rows + addon
    if not error:
        # only the reconstruction error
        return cost
    # rec. error + first deriv
    return cost, 2. * weight * residual / num_rows
def norm_trans(X, mode='ff'):
    """Compute feedforward and backprop for unit-normalization.

    Args:
        X: In ``'ff'`` mode, the array whose rows are scaled to unit
            norm.  In ``'bp'`` mode, a mapping with the keys ``'X'``
            (feedforward input), ``'A'`` (read as the feedforward
            output) and ``'dLdA'`` (gradient w.r.t. that output).
        mode: Either ``'ff'`` (forward) or ``'bp'`` (backward).

    Returns:
        Forward: ``X`` with every row divided by its norm.  Backward:
        the gradient w.r.t. the feedforward input.

    Raises:
        ValueError: For any other ``mode``.  Previously the function
            reached ``return F`` with ``F`` never assigned.
    """
    # EPS sits under the square root so a zero row cannot divide by zero.
    EPS = 0.00000001

    if mode == 'ff':
        row_norm = gp.sqrt(gp.sum(X**2.0, axis=1) + EPS)
        row_norm = row_norm[:, gp.newaxis]
        return X / row_norm

    if mode == 'bp':
        row_norm = gp.sqrt(gp.sum(X['X']**2.0, axis=1) + EPS)
        row_norm = row_norm[:, gp.newaxis]
        # Row-wise inner product of the incoming gradient with the input.
        V = X['dLdA'] * X['X']
        V = gp.sum(V, axis=1)
        V = V[:, gp.newaxis]
        return (X['dLdA'] / row_norm) - (X['A'] * (V / (row_norm**2.0)))

    raise ValueError(
        "norm_trans: mode must be 'ff' or 'bp', got %r" % (mode,))
def pt_grad(self, params, inpts, **kwargs):
    """Gradient of the sparse auto-encoder objective (untied weights).

    ``params`` is a flat vector laid out as
    ``[encoder weights (m_end) | hidden bias (m_end:size) | decoder
    weights (size:-shape[0]) | visible bias (last shape[0])]``.

    ``self.rho_hat_grad`` keeps a running average of the mean hidden
    activation (0.9 old + 0.1 new) and is used as the sparsity estimate.

    Fixes over the previous version:
      * the first-call check uses ``is None``; ``== None`` on an
        array-valued attribute compares element-wise,
      * the error is back-propagated through the DECODER weights
        (transposed), matching the forward pass; the encoder weights
        were used before, which is only correct for tied weights.

    Args:
        params: Flat parameter vector.
        inpts: Input batch, unpacked as a 2-D ``(m, _)`` shape.
        **kwargs: Accepted and ignored.

    Returns:
        Gradient vector shaped like ``params``.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    n_vis = self.shape[0]

    enc_w = params[:self.m_end].reshape(self.shape)
    dec_w = params[self.size:-n_vis].reshape(self.Tshape)

    hddn = logistic(gdot(inpts, enc_w) + params[self.m_end:self.size])
    Z = gdot(hddn, dec_w) + params[-n_vis:]

    # Running average of the mean hidden activation.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    # Decoder weights and visible bias.
    g[self.size:-n_vis] = gdot(hddn.T, delta).ravel()
    g[-n_vis:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    dsparse_dha = -rho / rho_hat + (1 - rho) / (1 - rho_hat)
    # Z = hddn . dec_w, so the error reaches the hidden layer via dec_w.T.
    dsc_dha = diff * (gdot(delta, dec_w.T) + self.beta * dsparse_dha / m)

    # Encoder weights and hidden bias.
    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def correlation_fraction(g, s, nvis, nhid):
    """Fraction of the squared update norm not explained by first-order terms.

    ``g`` and ``s`` are flat vectors read as ``[nvis entries | nhid
    entries | ...]``; the remainder of ``g`` is reshaped to
    ``(nvis, nhid)``.  The first-order explanation of that matrix is
    ``outer(da, expect_hid) + outer(expect_vis, db)``.

    The unused squared norm of ``g`` that the previous version computed
    has been dropped.

    Args:
        g: Flat vector holding ``da``, ``db`` and the ``nvis x nhid``
            matrix ``dW``.
        s: Flat vector whose first ``nvis + nhid`` entries are read as
            the visible and hidden expectations.
        nvis: Number of visible units.
        nhid: Number of hidden units.

    Returns:
        ``dcorr_norm / (dcorr_norm + first_order_norm)``, both being sums
        of squares.
    """
    with misc.gnumpy_conversion_check('allow'):
        expect_vis = s[:nvis]
        expect_hid = s[nvis:nvis + nhid]

        da = g[:nvis]
        db = g[nvis:nvis + nhid]
        dW = g[nvis + nhid:].reshape((nvis, nhid))

        first_order_expl = (gnp.outer(da, expect_hid) +
                            gnp.outer(expect_vis, db))
        first_order_norm = (gnp.sum(da**2) + gnp.sum(db**2) +
                            gnp.sum(first_order_expl**2))

        # What is left of dW once the first-order part is removed.
        dcorr = dW - first_order_expl
        dcorr_norm = gnp.sum(dcorr**2)

        return dcorr_norm / (dcorr_norm + first_order_norm)
def softmax(self, x):
    """Return the row-wise softmax of ``x``.

    The maximum of each row is subtracted before exponentiating; the
    result is unchanged by that shift.

    Args:
        x: 2-D array; the softmax runs along axis 1.

    Returns:
        An array shaped like ``x`` whose rows sum to one.
    """
    row_peak = gp.max(x, axis=1)
    shifted = x - row_peak[:, gp.newaxis]
    exponentials = gp.exp(shifted)
    row_total = gp.sum(exponentials, 1)
    return exponentials / row_total[:, gp.newaxis]
def pt_grad(self, params, inpts, **kwargs):
    """Gradient of the sparse auto-encoder objective (tied weights).

    ``params`` is a flat vector laid out as
    ``[weights (m_end) | hidden bias (shape[1]) | visible bias (last
    shape[0])]``; decoding uses the transposed weight matrix.

    ``self.rho_hat_grad`` keeps a running average of the mean hidden
    activation (0.9 old + 0.1 new) and is used as the sparsity estimate.
    The first-call check now uses ``is None``; ``== None`` on an
    array-valued attribute compares element-wise.

    Args:
        params: Flat parameter vector.
        inpts: Input batch, unpacked as a 2-D ``(m, _)`` shape.
        **kwargs: Accepted and ignored.

    Returns:
        Gradient vector shaped like ``params``.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    n_vis = self.shape[0]

    w = params[:self.m_end].reshape(self.shape)
    hddn = logistic(
        gpu.dot(inpts, w) + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, w.T) + params[-n_vis:]

    # Running average of the mean hidden activation.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    # Decoder contribution to the shared weights, and the visible bias.
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-n_vis:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    dsparse_dha = -rho / rho_hat + (1 - rho) / (1 - rho_hat)
    dsc_dha = diff * (gdot(delta, w) + self.beta * dsparse_dha / m)

    # Encoder contribution to the shared weights, and the hidden bias.
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-n_vis] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def loss_mclr(Yh, Y):
    """Compute multinomial logistic regression loss for Yh, w.r.t. Y.

    Values in Yh should probably be network outputs, and each row in Y
    must be a +1/-1 indicator vector for the target class of a row in Yh.

    The softmax is evaluated on row-shifted scores and the log-probability
    is formed as ``shifted - log(sum(exp(shifted)))``.  Both are
    mathematically equal to the unshifted forms but do not overflow for
    large scores, and avoid ``log(0) * 0`` producing NaN.

    Returns:
        A dict with ``'L'``, the mean negative log-probability of the
        target classes, and ``'dL'``, its gradient w.r.t. ``Yh``.
    """
    obs_count = float(Y.shape[0])
    # Get boolean mask for each observation's target class
    cl_mask = (Y > 0.0)

    # Compute softmax distribution transform of Yh, shifted by the row max
    row_max = gp.max(Yh, axis=1)
    shifted = Yh - row_max[:, gp.newaxis]
    exp_shifted = gp.exp(shifted)
    sm_sum = gp.sum(exp_shifted, axis=1)
    P = exp_shifted / sm_sum[:, gp.newaxis]

    dL = (P - cl_mask) / obs_count

    log_sm_sum = gp.log(sm_sum)
    logP = (shifted - log_sm_sum[:, gp.newaxis]) * cl_mask
    L = -gp.sum(logP) / obs_count
    return {'L': L, 'dL': dL}
def softmax(self, x):
    """Row-wise softmax of ``x``.

    Each row has its maximum subtracted before the exponential is taken,
    which leaves the result unchanged.

    Args:
        x: 2-D array; normalization runs along axis 1.

    Returns:
        Array of the same shape as ``x`` with rows summing to one.
    """
    # Shift by the per-row maximum.
    centered = x - gp.max(x, axis=1)[:, gp.newaxis]
    unnormalized = gp.exp(centered)
    # Normalize every row by its total.
    totals = gp.sum(unnormalized, 1)
    z = unnormalized / totals[:, gp.newaxis]
    return z
def strict_flip_sample(self, vis_start, iterations, beta=1):
    """Greedy bit-flip search on the free energy.

    In each iteration one randomly chosen visible unit (the same column
    for every row) is flipped; for each row the flip is kept when the
    resulting free energy is not higher than the current one.

    Fixes over the previous version: the starting free energy is now
    evaluated with the same ``beta`` as the proposals (it used the
    default before, so the first comparisons mixed two settings), and the
    flip counter that was accumulated but never used has been removed.

    Args:
        vis_start: Starting visible states, one sample per row.  Not
            modified; a copy is used.
        iterations: Number of flip proposals.
        beta: Forwarded to ``self.free_energy``.

    Returns:
        The visible states after all proposals.
    """
    vis = vis_start.copy()
    fes = self.free_energy(vis, beta=beta)

    for _ in range(iterations):
        # flip a bit at random
        f = np.random.randint(0, vis.shape[1])
        vis_prop = vis.copy()
        vis_prop[:, f] = 1 - vis[:, f]

        # calculate new free energy and accept the change if it is not higher
        fes_prop = self.free_energy(vis_prop, beta=beta)
        acc_prop = fes_prop <= fes

        # compose new state: accepted rows take the proposal
        acc_prop_t = gp.tile(acc_prop, (vis.shape[1], 1)).T
        vis = acc_prop_t * vis_prop + (1 - acc_prop_t) * vis
        fes = acc_prop * fes_prop + (1 - acc_prop) * fes

    return vis
def CDStep(self, inputBatch, layer, learnRate, momentum, L2Cost = 0):
    """Run one CD-1 update on the RBM at ``layer``.

    layer=0 will train the first RBM directly on the input.

    Args:
        inputBatch: Minibatch; converted to a ``gnp.garray`` if it is not
            one already.
        layer: Index of the RBM to train.
        learnRate: Learning rate; divided by the minibatch size when the
            parameters are updated.
        momentum: Weight given to the previous update directions.
        L2Cost: When positive, the weights are shrunk before the update.

    Returns:
        The summed squared difference between the layer's input and the
        reconstruction returned by ``CD1``.
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    mbsz = inputBatch.shape[0]
    vis = self.fprop(inputBatch, layer)

    # Gaussian visible units only for a real-valued first layer.
    if layer == 0 and self.realValuedVis:
        visType = RBMGaussian()
    else:
        visType = self.RBMHidUnitType

    visHidStats, hidBiasStats, visBiasStats, negVis = CD1(
        vis, self.weights[layer], self.genBiases[layer],
        self.biases[layer], visType, self.RBMHidUnitType)

    factor = 1 if self.nestCompare else 1 - momentum
    self.dW = momentum*self.dW + factor*visHidStats
    self.dvb = momentum*self.dvb + factor*visBiasStats
    self.dhb = momentum*self.dhb + factor*hidBiasStats

    if L2Cost > 0:
        self.weights[layer] *= 1 - L2Cost*learnRate*factor

    rate = learnRate / mbsz
    self.weights[layer] += rate * self.dW
    self.genBiases[layer] += rate * self.dvb
    self.biases[layer] += rate * self.dhb

    # Squared error is reported even for binary visible unit RBMs.
    return gnp.sum((vis - negVis)**2)
def CDStep(self, inputBatch, layer, learnRate, momentum, L2Cost=0):
    """Perform a single CD-1 training step for the RBM at ``layer``.

    layer=0 will train the first RBM directly on the input.

    Args:
        inputBatch: Minibatch; wrapped in ``gnp.garray`` when needed.
        layer: Index of the RBM being trained.
        learnRate: Learning rate (divided by the minibatch size for the
            parameter updates).
        momentum: Weight of the previous update directions.
        L2Cost: Weight-shrinkage strength; applied only when positive.

    Returns:
        Summed squared difference between the layer's input and the
        reconstruction produced by ``CD1``.
    """
    batch = inputBatch
    if not isinstance(batch, gnp.garray):
        batch = gnp.garray(batch)
    mbsz = batch.shape[0]
    vis = self.fprop(batch, layer)

    # A real-valued first layer gets Gaussian visible units.
    gaussian_visible = layer == 0 and self.realValuedVis
    visType = RBMGaussian() if gaussian_visible else self.RBMHidUnitType

    stats = CD1(vis, self.weights[layer], self.genBiases[layer],
                self.biases[layer], visType, self.RBMHidUnitType)
    visHidStats, hidBiasStats, visBiasStats, negVis = stats

    if self.nestCompare:
        factor = 1
    else:
        factor = 1 - momentum

    # Blend the fresh statistics into the running update directions.
    self.dW = momentum * self.dW + factor * visHidStats
    self.dvb = momentum * self.dvb + factor * visBiasStats
    self.dhb = momentum * self.dhb + factor * hidBiasStats

    if L2Cost > 0:
        self.weights[layer] *= 1 - L2Cost * learnRate * factor

    per_case_rate = learnRate / mbsz
    self.weights[layer] += per_case_rate * self.dW
    self.genBiases[layer] += per_case_rate * self.dvb
    self.biases[layer] += per_case_rate * self.dhb

    # Squared error is computed even for binary visible unit RBMs.
    reconstruction_error = gnp.sum((vis - negVis)**2)
    return reconstruction_error
def backprop(self, X, y_target):
    """Compute weight and bias gradients for one batch.

    A forward pass applies a random dropout mask to the input of every
    layer (keeping entries where ``g.rand(...) >= p``) and records those
    masked inputs.  The backward pass starts from
    ``self.gradient[-1](output, y_target)`` and walks the layers in
    reverse.

    Args:
        X: Input batch; ``len(X)`` is used as the batch size.
        y_target: Targets handed to the last gradient function.

    Returns:
        A list with one ``[gradW, gradB]`` pair per layer, in layer
        order, each divided by the batch size.
    """
    n_layers = len(self.weights)

    # forward
    layer_inputs = []
    signal = X
    for idx in range(n_layers):
        drop_p = self.dropout_probability[idx]
        signal = signal * (g.rand(signal.shape) >= drop_p)
        layer_inputs.append(signal)
        w, b = self.weights[idx]
        signal = self.activation[idx](g.dot(signal, w) + b)

    # backward: deltas are collected from the last layer to the first
    deltas = [self.gradient[-1](signal, y_target)]
    for idx in reversed(range(1, n_layers)):
        w, b = self.weights[idx]
        upstream = deltas[-1]
        deltas.append(
            g.dot(upstream, w.T) * self.gradient[idx - 1](layer_inputs[idx]))
    # Flip so that deltas[i] belongs to layer i.
    deltas.reverse()

    # get gradient
    batch_size = len(X)
    resultGradient = []
    for idx in range(n_layers):
        gradW = g.dot(layer_inputs[idx].T, deltas[idx]) / batch_size
        assert(gradW.shape == self.weights[idx][0].shape)
        gradB = g.sum(deltas[idx], axis=0) / batch_size
        assert(gradB.shape == self.weights[idx][1].shape)
        resultGradient.append([gradW, gradB])
    return resultGradient
def loss_mclr(Yh, Y):
    """Compute multinomial logistic regression loss for Yh, w.r.t. Y.

    Values in Yh should probably be network outputs, and each row in Y must
    be a +1/-1 indicator vector for the target class of a row in Yh.

    Returns a dict with:
      'L'  -- the mean (over rows) negative log-probability of the target
              class under the row-wise softmax of Yh.
      'dL' -- the gradient of L with respect to Yh.
    """
    obs_count = float(Y.shape[0])
    # Get boolean mask for each observation's target class
    cl_mask = (Y > 0.0)
    # Softmax is unchanged by subtracting a per-row constant. Removing the
    # row maximum keeps exp() from overflowing to inf, which would otherwise
    # turn the normalisation into inf/inf = NaN for large outputs.
    Yh_shift = Yh - gp.max(Yh, axis=1)[:, gp.newaxis]
    exp_shift = gp.exp(Yh_shift)
    sm_sum = gp.sum(exp_shift, axis=1)
    P = exp_shift / sm_sum[:, gp.newaxis]
    dL = (P - cl_mask) / obs_count
    # log-softmax taken directly from the shifted scores: a probability that
    # underflows to zero no longer produces log(0) * 0 = NaN.
    logP = (Yh_shift - gp.log(sm_sum)[:, gp.newaxis]) * cl_mask
    L = -gp.sum(logP) / obs_count
    return {'L': L, 'dL': dL}
def forward(self):
    """
    Perform a forward pass to calculate the activation (objective).

    Stores in self.objective half the summed squared difference between the
    output port and the target port, divided by the number of examples
    (first dimension of the output).
    """
    example_count = self.output_port.getOutput().shape[0]
    residual = self.output_port.getOutput() - self.target_port.getOutput()
    self.objective = 0.5 * gpu.sum(residual ** 2) / example_count
def mlpSoftmax_costfunc(x, *args):
    """
    Cost of a two-hidden-layer logistic MLP with a softmax output layer.

    x    -- flat parameter vector laid out as
            [theta_L1 (l1Size x (inputSize + 1)),
             theta_L2 (l2Size x (l1Size + 1)),
             theta_softmax (numClasses x l2Size)].
    args -- (numClasses, inputSize, l1Size, l2Size, lambda_softmax,
             lambda_hidden, inputs, labels, groundTruth). `inputs` holds one
             case per column; `labels` is unpacked but not used here.

    Returns the mean cross-entropy plus L2 penalties: lambda_hidden on the
    non-bias columns of both hidden layers and lambda_softmax on the whole
    softmax matrix. The cost is also printed.
    """
    (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden,
     inputs, labels, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]],
                (numClasses, l2Size)))

    # A row of ones is prepended so column 0 of each hidden theta is a bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()

    # Column-wise softmax, shifted by the column maximum before exp().
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    temp = groundTruth * gpu.log(predictions)

    # Bias columns (index 0) are excluded from the hidden-layer penalty.
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2

    cost = (-1 * gpu.sum(temp) / numCases
            + 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1)
                                     + gpu.sum(regularized_penalty_L2))
            + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax))
    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('Multilayer Softmax Cost: %s' % (cost,))

    # Drop every reference to the temporaries before asking gnumpy to free
    # its reuse cache (presumably so their device memory can be reclaimed).
    del inputs
    del theta_L1
    del theta_L2
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
def forward(self):
    """
    Perform a forward step - activate the net input using a row-wise softmax.

    The output is first set to the exponentiated net input and then replaced
    by that value divided by its per-row sum, so every row of the final
    output sums to one.
    """
    net_input = self.input.getNetInput()
    # Softmax is invariant to subtracting a per-row constant. Removing the
    # row maximum keeps exp() from overflowing to inf (and the division
    # below from yielding NaN) when net inputs are large.
    shifted = net_input - net_input.max(axis=1)[:, gpu.newaxis]
    self.output.setOutput(gpu.exp(shifted))
    exponentials = self.output.getOutput()
    # newaxis turns the row sums into a column so they broadcast per row.
    row_sums = gpu.sum(exponentials, 1)[:, gpu.newaxis]
    self.output.setOutput(exponentials / row_sums)
def energy(self, vis, hid):
    """
    Return, per row, the sum of the visible bias term, the hidden bias term
    and the visible-weights-hidden interaction term.

    `hid` must be two-dimensional. self.weights is used as
    (visible, hidden): hid is multiplied by its transpose.
    """
    assert hid.ndim == 2
    visible_term = gnp.dot(vis, self.vbias)
    hidden_term = gnp.dot(hid, self.hbias)
    # Project the hiddens back to visible space, then pair with vis.
    interaction_term = gnp.sum(vis * gnp.dot(hid, self.weights.T), 1)
    return visible_term + hidden_term + interaction_term
def update(self):
    """
    Apply one in-place parameter step to self.w and self.b.

    Both are first scaled by self.l2reg, then reduced by the gradient
    (input transposed times self.d for w, column sums of self.d for b)
    multiplied by self.learn. With dropout enabled the input is masked by
    self.r first.
    """
    layer_input = self.x * self.r if self.dropout > 0 else self.x

    self.w *= self.l2reg
    self.w -= gpu.dot(layer_input.T, self.d) * self.learn  # / self.q

    self.b *= self.l2reg
    self.b -= gpu.sum(self.d, 0) * self.learn
def clip_params(self, max_norm=10.0):
    """Bound L2 (row-wise) norm of W by max_norm.

    Rows of self.params['W'] whose norm exceeds max_norm are rescaled down
    to it; all other rows are multiplied by exactly 1. The result is stored
    back into self.params['W'].
    """
    W = self.params['W']
    row_norms = gp.sqrt(gp.sum(W**2.0, axis=1) + 1e-5)
    scales = max_norm / row_norms
    # with gnumpy, this already comes as float32
    needs_shrink = (scales < 1.0)
    # Keep the scale where shrinking is needed, use 1.0 everywhere else.
    scales = (scales * needs_shrink) + (1.0 - needs_shrink)
    self.params['W'] = W * scales[:, gp.newaxis]
    return
def rmssd(z, targets, predict=False, error=False, addon=0):
    """
    Root mean sum of squares.

    With predict=True, z is returned untouched. Otherwise the per-row
    Euclidean distance between z and targets (with 1e-8 added under the
    square root) is averaged over rows and `addon` is added. With
    error=True the first derivative with respect to z is returned as well.
    """
    if predict:
        return z

    n, _ = z.shape
    err = z - targets
    row_dist = gpu.sqrt(gpu.sum(err**2, axis=1) + 1e-8)
    loss = gpu.sum(row_dist) / n + addon

    if not error:
        # only the reconstruction error
        return loss
    # rec. error + first deriv
    return loss, err / (n * row_dist[:, gpu.newaxis])
def phi(self):
    """
    Compute phi = p(w|z).

    Adds self.beta to self.nzw and normalises every row so that it sums to
    one. Returns the normalised array; self.nzw itself is not modified.
    """
    # `self.nzw + self.beta` creates a fresh array, so the in-place division
    # below does not touch self.nzw.
    num = self.nzw + self.beta
    num /= gpu.sum(num, axis=1)[:, np.newaxis]
    return num
def forward(self):
    """
    Perform a forward pass to calculate the activation (objective).

    self.objective becomes 0.5 * sum((output - target)^2) divided by the
    number of examples, taken from the first dimension of the output.
    """
    n_examples = self.output_port.getOutput().shape[0]
    diff = self.output_port.getOutput() - self.target_port.getOutput()
    squared_error = gpu.sum(diff ** 2)
    self.objective = 0.5 * squared_error / n_examples
def mlpSoftmax1Layer_grad(x, *args):
    """
    Gradient of the cost of a one-hidden-layer ReLU MLP with softmax output.

    x    -- flat parameter vector laid out as
            [theta_L1 (l1Size x (inputSize + 1)),
             theta_softmax (numClasses x l1Size)].
    args -- (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
             inputs, groundTruth). `inputs` holds one case per column.

    Returns a flat numpy vector: the theta_L1 gradient followed by the
    theta_softmax gradient, in the same layout as x.
    """
    (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
     inputs, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_softmax = numClasses * l1Size

    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size)))
    theta_L1_grad = gpu.zeros(shape(theta_L1))

    # A row of ones is prepended so column 0 of theta_L1 acts as the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    # ReLU: the 0/1 mask is both the gate and the derivative.
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1

    # Column-wise softmax, shifted by the column maximum before exp().
    hidden_sum_softmax_imd = gpu.dot(theta_softmax, hidden_activation_L1)
    hidden_sum_softmax = hidden_sum_softmax_imd - hidden_sum_softmax_imd.max(
        axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    softmax_imd = groundTruth - predictions

    # Transposes use the array's own .T instead of converting to numpy,
    # transposing on the host and uploading the result again.
    theta_softmax_grad = -1 * gpu.dot(
        softmax_imd, hidden_activation_L1.T
    ) / numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd
    delta_L1_imd = gpu.dot(theta_softmax.T, deltaOut)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1 = gpu.dot(delta_L1_imd2, inputs.T)
    theta_L1_grad += delta_L1
    theta_L1_grad = theta_L1_grad / numCases
    # Weight decay is applied to every column except the bias column 0.
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = (
        theta_L1_grad[:, 1:shape(theta_L1_grad)[1]]
        + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden)

    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(),
                                 num_weights_softmax)

    # Drop references to the temporaries before asking gnumpy to free its
    # reuse cache (presumably so their device memory can be reclaimed).
    del inputs
    del theta_L1
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_softmax
    del predictions
    del softmax_imd
    del deltaOut
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_softmax_grad))
def getErrorLoss(self, a0, a2, factor=1):
    """
    Compute the error / reconstruction error.

    a2: reconstruction
    a0: input, one row per case

    Returns factor * 0.5 * sum((a2 - a0)^2) divided by the number of rows
    of a0.
    """
    case_count = a0.shape[0]
    residual = a2 - a0
    return factor * 0.5 * gp.sum(residual ** 2) / case_count
def update(self):
    """
    Decay and step self.w and self.b in place.

    Each parameter is multiplied by self.l2reg and then has its gradient,
    scaled by self.learn, subtracted. When self.dropout is positive the
    input self.x is multiplied by the mask self.r before the weight
    gradient is formed.
    """
    if self.dropout > 0:
        effective_x = self.x * self.r
    else:
        effective_x = self.x
    weight_grad = gpu.dot(effective_x.T, self.d)
    bias_grad = gpu.sum(self.d, 0)

    self.w *= self.l2reg
    self.w -= weight_grad * self.learn  # / self.q
    self.b *= self.l2reg
    self.b -= bias_grad * self.learn
def safe_softmax(self, Y):
    """Compute a reasonably (numerically) safe softmax.

    Each row of Y is shifted by its own maximum before exponentiation and
    then normalised to sum to one.
    """
    row_max = gp.max(Y, axis=1)[:, gp.newaxis]
    exps = gp.exp(Y - row_max)
    row_totals = gp.sum(exps, axis=1)[:, gp.newaxis]
    return exps / row_totals
def correlation_fraction(g, s, nvis, nhid):
    """
    Return the share of squared norm left in the weight block of `g` after
    removing its first-order part.

    `g` and `s` are flat vectors laid out as [visible (nvis), hidden (nhid),
    weights (nvis * nhid)]; only the first two segments of `s` are read.
    The first-order part is outer(da, s_hid) + outer(s_vis, db). The result
    is dcorr_norm / (dcorr_norm + first_order_norm).
    """
    with misc.gnumpy_conversion_check('allow'):
        n_units = nvis + nhid

        expect_vis, expect_hid = s[:nvis], s[nvis:n_units]
        da, db = g[:nvis], g[nvis:n_units]
        dW = g[n_units:].reshape((nvis, nhid))

        first_order_expl = (gnp.outer(da, expect_hid)
                            + gnp.outer(expect_vis, db))
        first_order_norm = (gnp.sum(da**2) + gnp.sum(db**2)
                            + gnp.sum(first_order_expl**2))

        dcorr_norm = gnp.sum((dW - first_order_expl)**2)
        # Only needed by the alternative debugging return below.
        g_norm = gnp.sum(g**2)
        #return first_order_norm, dcorr_norm, g_norm
        return dcorr_norm / (dcorr_norm + first_order_norm)
def getErrorLoss(self, a0, a2, factor=1):
    """
    Compute the error / reconstruction error.

    a2: reconstruction
    a0: input, one row per case

    The summed squared difference is halved, scaled by `factor` and divided
    by the number of rows in a0.
    """
    squared_diff = (a2 - a0) ** 2
    loss = factor * 0.5 * gp.sum(squared_diff)
    return loss / a0.shape[0]
def costAndGrad(self, data, labels):
    """
    Forward-propagate `data`, compute the softmax cross-entropy cost against
    `labels`, and (when self.train is set) backpropagate.

    Returns (cost, None) when not training, otherwise (cost, self.grad)
    where self.grad[i] holds the [weight, bias] gradients of layer i scaled
    by 1 / self.mbSize. Columns are treated as examples: the softmax
    normalises over axis 0 and labels index rows.
    """
    # forward prop: layers up to len(self.layerSizes) get the activation,
    # any layer beyond that stays linear.
    self.hActs[0] = data
    for layer, (w, b) in enumerate(self.stack, 1):
        pre_activation = w.dot(self.hActs[layer - 1]) + b
        if layer <= len(self.layerSizes):
            self.hActs[layer] = self.activation(pre_activation)
        else:
            self.hActs[layer] = pre_activation

    # softmax over axis 0, shifted by the maximum before exp()
    top = self.hActs[-1]
    probs = gp.exp(top - gp.max(top, axis=0))
    probs = probs / gp.sum(probs, axis=0)
    # raise anything below 1e-8 up to 1e-8 so the log stays finite
    probs += (probs < 1e-8) * (1e-8 - probs)

    one_hot = np.zeros(probs.shape)
    one_hot[labels, range(self.mbSize)] = 1
    one_hot = gp.garray(one_hot)

    cost = -(1. / self.mbSize) * gp.sum(one_hot * gp.log(probs))
    if not self.train:
        return cost, None

    # back prop, walking the stack from the last layer towards the second
    self.deltas[-1] = probs - one_hot
    last_hidden = len(self.layerSizes) - 1
    for offset, (w, b) in enumerate(reversed(self.stack[1:])):
        i = last_hidden - offset
        act_grad = self.activation(self.hActs[i + 1], True)
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * act_grad

    # compute gradients
    scale = 1. / self.mbSize
    for i in range(len(self.grad)):
        delta = self.deltas[i]
        self.grad[i][0] = scale * delta.dot(self.hActs[i].T)
        self.grad[i][1] = scale * gp.sum(delta, axis=1).reshape(-1, 1)

    return cost, self.grad
def forward(self):
    """
    Perform a forward step - activate the net input using a row-wise softmax.

    The output is first set to the exponentiated net input and then replaced
    by that value divided by its per-row sum, so every row of the final
    output sums to one.
    """
    net_input = self.input.getNetInput()
    # Subtracting the per-row maximum leaves the softmax unchanged but keeps
    # exp() finite for large net inputs; without it exp() can overflow to
    # inf and the normalisation below produces NaN.
    row_max = net_input.max(axis=1)[:, gpu.newaxis]
    self.output.setOutput(gpu.exp(net_input - row_max))
    unnormalised = self.output.getOutput()
    # newaxis makes the row sums a column so the division broadcasts per row.
    self.output.setOutput(
        unnormalised / gpu.sum(unnormalised, 1)[:, gpu.newaxis])
def reg_loss(self, Ws=None):
    """Compute basic L2 loss and gradient on weights in Ws.

    Ws -- sequence of weight arrays, one per layer. When omitted (or empty)
          the weights returned by self.layer_weights() are used.

    Returns a dict with:
      'L'     -- self.lam_l2 times the summed squares of the first
                 self.layer_count arrays in Ws.
      'dLdWs' -- list with the matching gradients, 2 * self.lam_l2 * W.
    """
    # None replaces the former mutable `[]` default; an explicitly passed
    # empty sequence still falls back to the layer weights as before.
    if Ws is None or len(Ws) == 0:
        Ws = self.layer_weights()
    L = 0.0
    dLdWs = []
    for i in range(self.layer_count):
        L = L + (self.lam_l2 * gp.sum(Ws[i]**2.0))
        dLdWs.append((2.0 * self.lam_l2) * Ws[i])
    return {'L': L, 'dLdWs': dLdWs}
def dev_loss(A, dev_type=1, use_shepherd=0):
    """DEV regularizer.

    A is a list of activation arrays. Each is passed through the transform
    chosen by dev_type (1: norm_trans, 2: tanh_trans, 3: line_trans), and
    the loss is the summed squared deviation of the transformed arrays from
    a reference, divided by rows * repetitions * columns. The reference is
    their mean, or the first transformed array when use_shepherd == 1.

    Returns {'L': loss, 'dLdA': list of gradients w.r.t. each entry of A}.
    Raises Exception for an unknown dev_type.
    """
    def apply_transform(arg, mode):
        # Single dispatch point for the forward ('ff') and backward ('bp')
        # calls of the selected transform.
        if (dev_type == 1):
            return norm_trans(arg, mode)
        if (dev_type == 2):
            return tanh_trans(arg, mode)
        if (dev_type == 3):
            return line_trans(arg, mode)
        raise Exception('Unknown DEV types.')

    b_reps = len(A)
    b_obs = A[0].shape[0]
    At = [apply_transform(A[i], 'ff') for i in range(b_reps)]

    N = float(A[0].shape[1])
    n = float(b_reps)
    m = float(b_obs * b_reps * N)
    shepherd = not (use_shepherd != 1)

    # Reference activations: first member (shepherd) or the ensemble mean.
    if shepherd:
        Am = At[0]
    else:
        Am = gp.zeros(At[0].shape)
        for at in At:
            Am = Am + at
        Am = Am / float(b_reps)

    # Deviation of every member from the reference, and the loss itself.
    Ad = [(at - Am) for at in At]
    L = sum([gp.sum(ad**2.0) for ad in Ad]) / m

    if shepherd:
        # Followers are pulled towards the shepherd; the shepherd receives
        # the negated sum of the follower gradients.
        followers = [(2.0 / m) * ad for ad in Ad[1:]]
        lead = gp.zeros(Ad[0].shape)
        for follower_grad in followers:
            lead = lead - follower_grad
        dLdA = [lead] + followers
    else:
        Add = gp.zeros(At[0].shape)
        for ad in Ad:
            Add = Add + ad
        dLdA = [-(2.0/m) * ((((1.0/n) - 1.0) * ad) + ((1.0/n) * (Add - ad)))
                for ad in Ad]

    # Backpropagate gradient on variance through the desired transform.
    dLdA = [apply_transform({'X': A[i], 'A': At[i], 'dLdA': dLdA[i]}, 'bp')
            for i in range(b_reps)]
    return {'L': L, 'dLdA': dLdA}
def costfunc_gpu_ReLU(x, *args):
    """
    Reconstruction cost of a ReLU-hidden, linear-output autoencoder.

    x    -- flat parameter vector laid out as
            [weights1 (num_hidden x (num_input + 1)),
             weights2 (num_output x (num_hidden + 1))].
    args -- (num_input, num_hidden, num_output, inputs, lambda_val,
             sparsityParam, beta). `inputs` holds one case per column;
             sparsityParam and beta are unpacked but not used here.

    Returns the summed squared reconstruction error divided by twice the
    number of cases, plus 0.5 * lambda_val times the summed squares of the
    non-bias weights. The cost is also printed.
    """
    (num_input, num_hidden, num_output, inputs, lambda_val,
     sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))

    nData = shape(inputs)[1]
    # A row of ones is prepended so column 0 of each matrix is the bias.
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    # ReLU via a 0/1 mask of the positive pre-activations.
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # Bias columns (index 0) are excluded from the weight penalty.
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    # The residual is formed once and squared, rather than subtracting twice.
    residual = output - inputs
    output_target_diff = residual * residual
    cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * (
        gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2))
    # Single pre-formatted argument: prints the same text (including the
    # double space) under the Python 2 print statement and the Python 3
    # print function.
    print('GPU ReLU Linear Decoder Cost:  %s' % (cost,))

    # Drop references to the temporaries before asking gnumpy to free its
    # reuse cache (presumably so their device memory can be reclaimed).
    del x
    del inputs
    del data
    del hidden_sum
    del hidden_activation
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del residual
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
def bprop(self, data, targs):
    """
    Forward `data` through the sigmoid layers, accumulate gradients into
    self.biasGrads / self.WGrads, and report the loss.

    Returns (cost, n_err): cost is self.loss_fn on the final activations,
    n_err counts rows whose argmax differs from the argmax of `targs`.
    """
    n_layers = len(self.weights)

    # Forward pass. Index 0 of both lists holds the raw input so that
    # acts[l] / zs[l] are the input side of layer l.
    acts = [data]
    zs = [data]
    for l in range(n_layers):
        z = self.biases[l] + gnp.dot(acts[-1], self.weights[l])
        zs.append(z)
        acts.append(sigmoid(z))

    output = acts[-1]
    delta_l = self.loss_fn_grad(output, targs) * sigmoid_prime(zs[-1])

    # Backward pass: gradients are added to the existing accumulators.
    for l in range(n_layers - 1, -1, -1):
        self.biasGrads[l] += gnp.sum(delta_l, axis=0)
        self.WGrads[l] += gnp.dot(acts[l].T, delta_l)
        if l > 0:
            delta_l = gnp.dot(delta_l, self.weights[l].T) * sigmoid_prime(
                zs[l])

    cost = self.loss_fn(output, targs)
    n_err = gnp.sum(output.argmax(axis=1) != targs.argmax(axis=1))
    return (cost, n_err)