def linear_decoder_run_gpu(data, numInput, numHidden):
    """Fit two weight matrices with L-BFGS and return the first one.

    Both matrices are drawn uniformly from [-r, r), flattened into one
    vector and handed to ``optimize.fmin_l_bfgs_b`` (at most 400
    iterations) together with ``costfunc_gpu`` / ``grad_costfunc_gpu``.
    The optimised vector is split again; the first matrix, shaped
    ``(numHidden, numInput + 1)``, is saved to 'learntFeaturesGPU.mat'
    under the key 'learntFeatures' and returned.

    data      -- forwarded unchanged to the cost and gradient functions
    numInput  -- number of input units (also used as the output size)
    numHidden -- number of hidden units
    """
    print("Starting Feature Abstraction...")

    # Fixed settings forwarded to the cost/gradient functions.
    lambda_val = 3e-3
    sparsityParam = 0.035
    beta = 5

    n_in = numInput
    n_hid = numHidden
    n_out = numInput

    # Lengths of the two flattened matrices (each has one extra column).
    n_w1 = (n_in + 1) * n_hid
    n_w2 = (n_hid + 1) * n_out

    r = gpu.sqrt(6) / gpu.sqrt(n_hid + n_in + 1)
    w1 = gpu.rand(n_hid, n_in + 1) * 2 * r - r
    w2 = gpu.rand(n_out, n_hid + 1) * 2 * r - r
    theta0 = hstack((w1.reshape(n_w1).as_numpy_array(),
                     w2.reshape(n_w2).as_numpy_array()))

    cost_args = (n_in, n_hid, n_out, data, lambda_val, sparsityParam, beta)
    result = optimize.fmin_l_bfgs_b(costfunc_gpu, theta0,
                                    fprime=grad_costfunc_gpu,
                                    args=cost_args, maxiter=400)
    theta = result[0]

    learnt_w1 = reshape(theta[:n_w1], (n_hid, n_in + 1))
    # The second matrix is rebuilt as in the original code but not used.
    learnt_w2 = reshape(theta[n_w1:], (n_out, n_hid + 1))

    scipy.io.savemat('learntFeaturesGPU.mat',
                     mdict={'learntFeatures': learnt_w1})
    return learnt_w1
def test_random_feature_mmd_loss_approximation(sigma=None, scale_weight=None, n_features=3):
    """Compare the random-feature MMD loss with the multiscale-pair MMD loss.

    Random targets and predictions are generated, both losses are loaded
    with the same target, and loss value and gradient of the two are
    compared with ``test_vec_pair`` (error threshold 1e-2).

    sigma        -- kernel scales; defaults to [1, 10]
    scale_weight -- per-scale weights; defaults to [0.5, 1]
    n_features   -- number of random features for the approximate loss

    Returns the combined result of the two ``test_vec_pair`` checks.
    """
    # Build the defaults per call: list literals in the signature would be
    # shared (and possibly mutated) across calls.
    if sigma is None:
        sigma = [1, 10]
    if scale_weight is None:
        scale_weight = [0.5, 1]

    print('Testing random feature MMD loss approximation error')

    n_dims = 2
    n_target = 5
    n_pred = 5

    target = gnp.rand(n_target, n_dims)
    pred = gnp.rand(n_pred, n_dims)

    rand_mmd = ls.get_loss_from_type_name(
        ls.LOSS_NAME_RANDOM_FEATURE_MMDGEN,
        sigma=sigma, scale_weight=scale_weight, n_features=n_features)
    rand_mmd.load_target(target)
    print(rand_mmd)

    mmd = ls.get_loss_from_type_name(
        ls.LOSS_NAME_MMDGEN_MULTISCALE_PAIR,
        sigma=sigma, scale_weight=scale_weight)
    mmd.load_target(target)

    rand_loss, rand_grad = rand_mmd.compute_loss_and_grad(pred, compute_grad=True)
    true_loss, true_grad = mmd.compute_loss_and_grad(pred, compute_grad=True)

    # Both comparisons always run so that both reports are printed.
    grad_ok = test_vec_pair(rand_grad.asarray().ravel(), 'Approximate Gradient',
                            true_grad.asarray().ravel(), ' True Gradient',
                            error_thres=1e-2)
    loss_ok = test_vec_pair(np.array([rand_loss]), 'Approximate Loss',
                            np.array([true_loss]), ' True Loss',
                            error_thres=1e-2)
    print('')
    return loss_ok and grad_ok
def fpropDropout(self, inputBatch, weightsToStopBefore = None ):
    """
    Perform a (possibly partial) forward pass through the network with
    dropout masks applied to the input and to every hidden activation.

    Updates self.state which, on a full forward pass, holds the masked
    input followed by each hidden layer's masked activation and finally
    the net input incident on the output layer.

    For a full forward pass (weightsToStopBefore is None or at least
    len(self.weights)) the output unit activations are stored in
    self.acts and returned. In a partial forward pass the entry
    self.state[weightsToStopBefore] is returned instead; e.g. to get the
    first set of hidden activations, set weightsToStopBefore to 1.
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    if weightsToStopBefore is None:
        weightsToStopBefore = len(self.weights)

    # self.state holds everything before the output nonlinearity,
    # including the net input to the output units.
    keep = (gnp.rand(*inputBatch.shape) > self.dropouts[0])
    self.state = [inputBatch * keep]

    for i in range(min(len(self.weights) - 1, weightsToStopBefore)):
        # Rescale the surviving units of the previous layer.
        dropoutMultiplier = 1.0/(1.0-self.dropouts[i])
        netInput = gnp.dot(dropoutMultiplier*self.state[-1], self.weights[i]) + self.biases[i]
        curActs = self.hidActFuncts[i].activation(netInput)
        keep = (gnp.rand(*curActs.shape) > self.dropouts[i+1])
        self.state.append(curActs * keep)

    if weightsToStopBefore >= len(self.weights):
        dropoutMultiplier = 1.0/(1.0-self.dropouts[-1])
        self.state.append(gnp.dot(dropoutMultiplier*self.state[-1], self.weights[-1]) + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts

    # We didn't reach the output units.
    return self.state[weightsToStopBefore]
def get_drop_masks(self, mask_count, in_drop=0, hd_drop=0):
    """Return one dropout mask per layer, each with mask_count rows.

    A per-row 'undrop' flag (drawn against self.drop_undrop) forces the
    whole row of every layer's mask to be kept. Layer 0 uses
    self.drop_input when in_drop == 1; otherwise a layer uses
    self.drop_hidden when hd_drop == 1, and no dropping in all other
    cases. Each mask row is divided by its own mean.

    NOTE(review): with in_drop != 1 and hd_drop == 1, layer 0 also gets
    self.drop_hidden -- confirm this is intended.
    """
    # One flag per mask row; a set flag disables dropping for that row.
    undrop = (gp.rand(mask_count, 1) < self.drop_undrop)

    masks = []
    for idx in range(self.layer_count):
        if (idx == 0) and (in_drop == 1):
            rate = self.drop_input
        elif hd_drop == 1:
            rate = self.drop_hidden
        else:
            rate = 0.0

        width = self.layers[idx].dim_input
        bits = (gp.rand(mask_count, width) > rate)
        # Sum-and-threshold acts as an element-wise 'or' with the flags.
        kept = ((bits + undrop) > 0.1)
        # Give every row unit mean.
        row_scale = (1.0 / gp.mean(kept, axis=1))[:, gp.newaxis]
        masks.append(kept * row_scale)
    return masks
def test_batch_normalization_layer():
    """Check the batch-normalization layer's backprop gradient.

    The gradient obtained through ``backward_prop`` is compared against a
    finite-difference gradient of the squared loss with respect to the
    layer's parameter vector. Returns the result of ``test_vec_pair``.
    """
    print('Testing Batch Normalization layer')

    n_cases, in_dim = 5, 3
    x = gnp.randn(n_cases, in_dim) * 2 + 3
    t = gnp.randn(n_cases, in_dim) * 2

    loss = ls.get_loss_from_type_name(ls.LOSS_NAME_SQUARED)
    loss.load_target(t)

    bn_layer = ly.BatchNormalizationLayer(in_dim)
    bn_layer.params.gamma = gnp.rand(in_dim)
    bn_layer.params.beta = gnp.rand(in_dim)
    w_0 = bn_layer.params.get_param_vec()

    def loss_at(w):
        # Loss value as a function of the flattened layer parameters.
        bn_layer.params.set_param_from_vec(w)
        out = bn_layer.forward_prop(x, is_test=False)
        return loss.compute_not_weighted_loss_and_grad(out)[0]

    # Analytic gradient via one forward/backward pass.
    y = bn_layer.forward_prop(x, is_test=False)
    unused_loss, loss_grad = loss.compute_not_weighted_loss_and_grad(y, True)
    bn_layer.backward_prop(loss_grad)
    backprop_grad = bn_layer.params.get_grad_vec()

    # Numerical gradient.
    fdiff_grad = finite_difference_gradient(loss_at, w_0)

    test_passed = test_vec_pair(fdiff_grad, 'Finite Difference Gradient',
                                backprop_grad, ' Backpropagation Gradient',
                                eps=_BN_GRAD_CHECK_EPS, use_rel_err=True)
    print('')
    return test_passed
def mlpSoftmax_test(): numClasses = 10 inputSize = 28 * 28 l1Size = 100 l2Size = 20 lambda_softmax = 1e-4 lambda_hidden = 8e-5 print "Loading data..." inputs, labels, testData, testLabels = obtain_data() print shape(labels) print "Done." numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) num_weights_softmax = numClasses * l2Size r = gpu.sqrt(6)/gpu.sqrt(inputSize+l1Size+l2Size+1) theta_L1 = (gpu.rand(l1Size, inputSize+1))*2*r-r theta_L2 = (gpu.rand(l2Size, l1Size+1))*2*r-r theta_softmax = (gpu.rand(numClasses, l2Size))*2*r-r groundTruth = ground_Truth(labels,numCases) #theta_L1 = reshape(theta_L1, num_weights_L1) theta_L1 = theta_L1.reshape(num_weights_L1) #theta_L2 = reshape(theta_L2, num_weights_L2) theta_L2 = theta_L2.reshape(num_weights_L2) #theta_softmax = reshape(theta_softmax, num_weights_softmax) theta_softmax = theta_softmax.reshape(num_weights_softmax) theta = hstack((theta_L1.as_numpy_array(), theta_L2.as_numpy_array(), theta_softmax.as_numpy_array())) args = (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth) print "Starting Network Training..." opttheta = optimize.fmin_l_bfgs_b(mlpSoftmax_costfunc, theta, fprime=mlpSoftmax_grad, args=args, maxiter=400) theta = opttheta[0] print "Training finished." scipy.io.savemat('mlpSoftmax.mat', mdict={'theta': theta}) print "Now testing prediction accuracy..." 
theta_L1 = reshape(theta[0:num_weights_L1], (l1Size, inputSize + 1)) theta_L2 = reshape(theta[num_weights_L1:num_weights_L2+num_weights_L1], (l2Size, l1Size + 1)) theta_softmax = reshape(theta[num_weights_L2+num_weights_L1:shape(theta)[0]], (numClasses, l2Size)) numCasesPred = shape(testData)[1] testData = concatenate((ones((1,numCasesPred)), testData), axis = 0) hidden_sum_L1 = dot(theta_L1, testData) hidden_activation_L1 = 1/(1 + exp(-hidden_sum_L1)) hidden_activation_L1 = concatenate((ones((1,numCasesPred)), hidden_activation_L1), axis=0) hidden_sum_L2 = dot(theta_L2, hidden_activation_L1) hidden_activation_L2 = 1/(1 + exp(-hidden_sum_L2)) hidden_sum_softmax = dot(theta_softmax, hidden_activation_L2) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis = 0) predictions = exp(hidden_sum_softmax) predictions = predictions / predictions.sum(axis = 0) pred = predictions.argmax(axis=0) + 1 testLabels = squeeze(testLabels) accuracy = mean(pred == testLabels) * 100 print "Accuracy: ", accuracy, "%" return pred, testLabels
def multilayer_feature_learning(data, inputSize, l1Size, l2Size, sparsityParam, lambda_val, beta): print "Now starting feature abstraction..." num_input = inputSize num_hidden_L1 = l1Size num_hidden_L2 = l2Size num_output_L1 = inputSize num_output_L2 = num_hidden_L1 sparsityParam = sparsityParam lambda_val = lambda_val beta = beta inputs = gpu.garray(data) r = gpu.sqrt(6)/gpu.sqrt(num_hidden_L1+num_input+1) weights1_L1 = (gpu.rand(num_hidden_L1,num_input+1))*2*r-r weights2_L1 = (gpu.rand(num_output_L1,num_hidden_L1+1))*2*r-r num_weights1_L1 = (num_input+1)*num_hidden_L1 num_weights2_L1 = (num_hidden_L1+1)*num_output_L1 weights1_L1 = weights1_L1.reshape(num_weights1_L1) weights2_L1 = weights2_L1.reshape(num_weights2_L1) weights_L1 = hstack((weights1_L1.as_numpy_array(),weights2_L1.as_numpy_array())) print "Level 1 Abstraction Starting...." weights_L1 = linear_decoder_run_ReLU(data, weights_L1, num_input, num_hidden_L1) weights1_L1 = weights_L1[0:num_weights1_L1].reshape((num_hidden_L1,num_input+1)) weights2_L1 = weights_L1[num_weights1_L1:shape(weights_L1)[0]].reshape((num_output_L1,num_hidden_L1+1)) scipy.io.savemat('HiggsBosonLevel1.mat', mdict={'learntFeaturesL1_1': weights1_L1, 'learntFeaturesL1_2': weights2_L1}) L1_activation = feedforward(weights1_L1, inputs) del weights_L1 del weights1_L1 del weights2_L1 gpu.free_reuse_cache() v = gpu.sqrt(6)/gpu.sqrt(num_hidden_L2+num_hidden_L1+1) weights1_L2 = (gpu.rand(num_hidden_L2,num_hidden_L1+1))*2*v-v weights2_L2 = (gpu.rand(num_output_L2,num_hidden_L2+1))*2*v-v num_weights1_L2 = (num_hidden_L1+1)*num_hidden_L2 num_weights2_L2 = (num_hidden_L2+1)*num_output_L2 weights1_L2 = weights1_L2.reshape(num_weights1_L2) weights2_L2 = weights2_L2.reshape(num_weights2_L2) weights_L2 = hstack((weights1_L2.as_numpy_array(),weights2_L2.as_numpy_array())) print "Level 2 Abstraction Starting...." 
weights_L2 = linear_decoder_run_ReLU(L1_activation, weights_L2, num_hidden_L1, num_hidden_L2) weights1_L2 = weights_L2[0:num_weights1_L2].reshape((num_hidden_L2,num_hidden_L1+1)) weights2_L2 = weights_L2[num_weights1_L2:shape(weights_L2)[0]].reshape((num_output_L2,num_hidden_L2+1)) scipy.io.savemat('HiggsBosonLevel2.mat', mdict={'learntFeaturesL2_1': weights1_L2,'learntFeaturesL2_2': weights2_L2}) L2_activation = feedforward(weights1_L2, L1_activation) del weights_L2 del weights1_L2 del weights2_L2 gpu.free_reuse_cache() gpu.free_reuse_cache() print "Abstraction completed." return L2_activation
def checkGradientGPU(): num_input = 8*8*3 num_hidden = 10 num_output = num_input lambda_val = 0.003 sparsityParam = 0.035 beta = 5 data = scipy.io.loadmat('stlSampledPatches.mat') patches = data['patches'] inputs = patches[:,0:10] r = gpu.sqrt(6)/gpu.sqrt(num_hidden+num_input+1) weights1 = (gpu.rand(num_hidden,num_input+1))*2*r-r weights2 = (gpu.rand(num_output,num_hidden+1))*2*r-r num_weights1 = (num_input+1)*num_hidden num_weights2 = (num_hidden+1)*num_output weights1 = weights1.reshape(num_weights1) weights2 = weights2.reshape(num_weights2) weights = hstack((weights1.as_numpy_array(),weights2.as_numpy_array())) args = (num_input,num_hidden,num_output,inputs,lambda_val,sparsityParam,beta) numgrad = zeros(size(weights)) numgrad2 = zeros(size(weights)) perturb = zeros(size(weights)) e = 1e-4 for p in range(size(weights)): perturb[p] = e; minus_weights = weights - perturb plus_weights = weights + perturb loss1 = costfunc_gpuTRY(minus_weights, *args) lossc1 = costfunc(minus_weights, *args) loss2 = costfunc_gpuTRY(plus_weights, *args) lossc2 = costfunc(plus_weights, *args) numgrad[p] = (loss2 - loss1) / (2*e) numgrad2[p] = (lossc2 - lossc1) / (2*e) perturb[p] = 0 grad = grad_costfunc_gpu(weights, *args) grad2 = grad_costfunc(weights, *args) diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad) diff2 = linalg.norm(numgrad2-grad2)/linalg.norm(numgrad2+grad2) diff3 = linalg.norm(numgrad-grad2)/linalg.norm(numgrad+grad2) diff4 = linalg.norm(numgrad2-grad)/linalg.norm(numgrad2+grad) diffnum = linalg.norm(numgrad2-numgrad)/linalg.norm(numgrad2+numgrad) diffgrad = linalg.norm(grad2-grad)/linalg.norm(grad2+grad) print "pure GPU difference:",diff print "pure CPU difference:",diff2 print "GPU cost, CPU grad:",diff3 print "CPU cost, GPU grad:",diff4 print "CPU cost and GPU cost difference:",diffnum print "CPU grad and GPU grad difference:",diffgrad return "OK"
def random_like(x):
    """Return random numbers from [0, 1) with the same shape as `x`.

    A numpy array is produced for numpy input (``np.random.random``);
    any other input is served by ``gp.rand``.
    """
    if isinstance(x, np.ndarray):
        return np.random.random(x.shape)
    return gp.rand(x.shape)
def pt_init(self, H=bernoulli, V=bernoulli, init_var=1e-2, init_bias=0., rho=0.5, lmbd=0., l2=0., **kwargs):
    """Create the pre-training parameter vector and configure the model.

    The vector has self.m_end + self.shape[1] + self.shape[0] entries.
    The first self.m_end entries are random: uniform in
    [-init_heur, init_heur) when init_var is None, otherwise
    init_var * randn. All remaining entries are set to init_bias.

    Also stores H, V, l2, rho, lmbd on self, sets self.activ from
    match_table[H], wires pt_score/pt_grad to self.reconstruction and
    self.grad_cd1, and resets self.rho_hat to None.

    Returns the parameter vector.
    """
    n_w = self.m_end
    pt_params = gzeros(n_w + self.shape[1] + self.shape[0])

    if init_var is not None:
        pt_params[:n_w] = init_var * gpu.randn(n_w)
    else:
        init_heur = 4*np.sqrt(6./(self.shape[0]+self.shape[1]))
        # Map uniform [0, 1) draws onto [-init_heur, init_heur).
        pt_params[:n_w] = gpu.rand(n_w)
        pt_params[:n_w] *= 2
        pt_params[:n_w] -= 1
        pt_params[:n_w] *= init_heur

    pt_params[n_w:] = init_bias

    self.H, self.V = H, V
    self.activ = match_table[H]

    self.pt_score = self.reconstruction
    self.pt_grad = self.grad_cd1

    self.l2 = l2
    self.rho = rho
    self.lmbd = lmbd
    self.rho_hat = None

    return pt_params
def getCorruptedInput(self, input):
    """Return `input` with randomly chosen entries zeroed.

    When self.corrputionLevel is not positive the input is returned
    untouched. Otherwise a (self.batchsize, self.vDim) keep-mask is drawn
    and multiplied with the input.
    """
    if not self.corrputionLevel > 0:
        return input
    keep = gp.rand(self.batchsize, self.vDim) > self.corrputionLevel
    return keep * input
def rbm_sample(w_vh, w_v, w_h, x, k=1, clamped=None):
    """
    Run k steps of Gibbs sampling in an RBM.

    w_vh: Weights between visible and hidden units (matrix of size DxH)
    w_v: Visible unit biases (column vector of size Dx1)
    w_h: Hidden unit biases (column vector of size Hx1)
    x: Input (column vector of size DxN)
    k: Number of Gibbs steps. Default is 1.
    clamped: If not None, a two-tuple (start, end) of row indices of x
        that are reset to their original values after every step.

    Returns the hidden and visible logistic activations of the last step
    (matrices of size HxN, DxN). Only the hidden units are binarised
    while sampling; the returned values are the activations themselves.
    """
    use_clamp = clamped is not None
    if use_clamp:
        # Remember the rows that must stay constant.
        cx = x[clamped[0] : clamped[1], :]

    v = x
    for _ in range(k):
        # Hidden units: activation, then a binary sample.
        h = gnp.logistic(gnp.dot(w_vh.T, v) + w_h)
        hs = h > gnp.rand(h.shape[0], h.shape[1])
        # Visible units: activation driven by the hidden sample.
        v = gnp.logistic(gnp.dot(w_vh, hs) + w_v)
        if use_clamp:
            v[clamped[0] : clamped[1], :] = cx
    return h, v
def sample_vis_3d(self, n_chains, n_steps, gibbs_steps_between_samples, sample_probabilities=False, init_vis=None, beta=1, betas=None):
    """Collect visible-unit samples from n_chains parallel chains.

    One sample per chain is recorded at each of n_steps steps. Between
    recordings the chains advance either by
    ``gibbs_sample(vis, gibbs_steps_between_samples, beta=beta)`` or,
    when `betas` is given, by ``annealed_gibbs_sample(vis, betas)`` (in
    which case gibbs_steps_between_samples must be None).

    Chains start from init_vis if provided (first dimension must equal
    n_chains), otherwise from random binary states. With
    sample_probabilities the recorded value is p_vis instead of vis.

    Returns an array of shape (n_steps, n_chains, self.n_vis).
    """
    samples = gp.zeros((n_steps, n_chains, self.n_vis))

    if init_vis is not None:
        assert init_vis.shape[0] == n_chains
        vis = init_vis
    else:
        vis = gp.rand((n_chains, self.n_vis)) < 0.5

    for step in range(n_steps):
        if betas is not None:
            assert gibbs_steps_between_samples is None
            vis, p_vis = self.annealed_gibbs_sample(vis, betas)
        else:
            vis, p_vis = self.gibbs_sample(vis, gibbs_steps_between_samples,
                                           beta=beta)
        samples[step, :, :] = p_vis if sample_probabilities else vis
    return samples
def backprop(self, X, y_target) :
    """Return per-layer [gradW, gradB] for one batch.

    Forward pass: before each layer the current activations are
    multiplied by a random keep-mask (drawn against
    self.dropout_probability[i]) and the masked values are recorded.
    Backward pass: the output gradient comes from self.gradient[-1] and
    is propagated down through the transposed weights. Gradients are
    divided by len(X).
    """
    n_layers = len(self.weights)

    # forward
    layer_inputs = []
    out = X
    for i in range(n_layers):
        keep = (g.rand(out.shape) >= self.dropout_probability[i])
        out = out * keep
        del keep
        layer_inputs.append(out)
        w, b = self.weights[i]
        out = self.activation[i](g.dot(out, w) + b)

    # backward: deltas are collected from the last layer down to the first
    delta = self.gradient[-1](out, y_target)
    deltas = [delta]
    for i in range(n_layers - 1, 0, -1):
        w, b = self.weights[i]
        delta = g.dot(delta, w.T) * self.gradient[i-1](layer_inputs[i])
        deltas.append(delta)

    # get gradient
    resultGradient = []
    for i in range(n_layers):
        layer_delta = deltas[-(i+1)]
        gradW = g.dot(layer_inputs[i].T, layer_delta) / len(X)
        assert(gradW.shape == self.weights[i][0].shape)
        gradB = g.sum(layer_delta, axis=0) / len(X)
        assert(gradB.shape == self.weights[i][1].shape)
        resultGradient.append([gradW, gradB])
    del deltas
    return resultGradient
def bernoulli(data, wm, bias, sampling=False):
    """Return the logistic of ``data . wm + bias`` and an optional sample.

    With sampling=True the second element is the comparison of the
    logistic output against uniform random numbers; otherwise it is None.
    """
    suff = (gpu.dot(data, wm) + bias).logistic()
    sample = (suff > gpu.rand(suff.shape)) if sampling else None
    return suff, sample
def initParams(self):
    """Allocate weights, biases and work buffers for every layer.

    Each weight matrix (m x n for a layer mapping n units to m units) is
    uniform in [-s, s) with s = sqrt(6)/sqrt(n + m); biases are zero
    column vectors. Activation buffers are allocated for every layer
    size, and delta/gradient buffers only when self.train is set.
    """
    sizes = [self.inputDim]+self.layerSizes+[self.outputDim]

    stack = []
    for n, m in zip(sizes[:-1], sizes[1:]):
        s = gp.sqrt(6)/gp.sqrt(n+m)
        stack.append([gp.rand(m, n)*2*s-s, gp.zeros((m, 1))])
    self.stack = stack

    self.hActs = [gp.empty((width, self.mbSize)) for width in sizes]

    if self.train:
        self.deltas = [gp.empty((width, self.mbSize)) for width in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)]
                     for w, b in self.stack]
def initParams(self):
    """Allocate layer parameters and add a time-dependent perturbation.

    Weights are uniform in [-s, s) with s = sqrt(6)/sqrt(n + m), biases
    zero. Afterwards self.seed (derived from the clock) random numbers
    are drawn and discarded, and 0.01 * randn noise is added to every
    weight matrix and bias vector.
    """
    import time
    # Crude stand-in for seeding: the number of draws to discard below.
    self.seed = int(time.time()) % 100000

    sizes = [self.inputDim]+self.layerSizes+[self.outputDim]

    stack = []
    for n, m in zip(sizes[:-1], sizes[1:]):
        s = gp.sqrt(6)/gp.sqrt(n+m)
        stack.append([gp.rand(m, n)*2*s-s, gp.zeros((m, 1))])
    self.stack = stack

    self.hActs = [gp.empty((width, self.mbSize)) for width in sizes]

    if self.train:
        self.deltas = [gp.empty((width, self.mbSize)) for width in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)]
                     for w, b in self.stack]

    # Advance the generator by a clock-dependent number of draws.
    for _ in range(self.seed):
        gp.rand()

    self.stack = [[w+.01 * gp.randn(w.shape), b+.01 * gp.randn(b.shape)]
                  for w, b in self.stack]
def initParams(self):
    """
    Initialize parameters: weights uniform in [-s, s) with
    s = sqrt(6)/sqrt(fanin+fanout), biases zero.

    When self.temporalLayer > 0 an extra square weight matrix for that
    layer is appended to the end of the stack. Gradient buffers are
    allocated for every stack entry when self.train is set.
    """
    sizes = [self.inputDim]+self.layerSizes+[self.outputDim]

    stack = []
    for n, m in zip(sizes[:-1], sizes[1:]):
        s = gp.sqrt(6)/gp.sqrt(n+m)
        stack.append([gp.rand(m, n)*2*s-s, gp.zeros((m, 1))])
    self.stack = stack

    if self.temporalLayer > 0:
        rs = sizes[self.temporalLayer]
        s = gp.sqrt(6)/ rs
        # temporal layer stored at end of stack
        temporal = [gp.rand(rs, rs) * 2 * s - s, gp.zeros((2, 1))]
        self.stack.append(temporal)

    if self.train:
        # NOTE: a temporal layer, if used, is already in the stack at
        # this point and therefore gets gradient buffers as well.
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)]
                     for w, b in self.stack]
def input_to_hidden(self, set_name = 'train'):
    """Compute the net input of the first hidden layer for self.batch.

    For set_name == 'train' the batch, first weight matrix and first
    bias are recorded in self.results['activations'] and a random
    dropout mask (threshold self.dropout[0]) is applied to the batch
    before the product. For any other set_name the batch is used as is.
    The result is stored in self.results['current'].

    self.timer_logger is called before and after with the same label.

    NOTE(review): the non-train branch does not rescale the input by
    (1 - self.dropout[0]) -- confirm that this is intended.
    """
    # The label names the data set being processed. The earlier code
    # formatted the builtin `type` here instead of the argument.
    label = 'input_to_hidden {0}'.format(set_name)
    self.timer_logger(label, time.time())

    self.results['activations'] = []
    if set_name == 'train':
        self.results['activations'].append([self.batch, self.w[0], self.b[0]])
        keep = gpu.rand(self.current_batch_size, self.X.shape[1]) > self.dropout[0]
        dropped_out = self.batch * keep
        self.results['current'] = gpu.dot(dropped_out, self.w[0]) + self.b[0]
    else:
        self.results['current'] = gpu.dot(self.batch, self.w[0]) + self.b[0]

    self.timer_logger(label, time.time())
def feed_forward(self, input_batch):
    """Run a forward pass with dropout and return the output activations.

    self.state collects the masked input, every masked hidden
    activation and finally the net input of the output layer; the
    output activations are stored in self.acts.
    """
    if not isinstance(input_batch, gnp.garray):
        input_batch = gnp.garray(input_batch)

    weights_to_stop = len(self.weights)

    input_keep = gnp.rand(*input_batch.shape) > self.dropouts[0]
    self.state = [input_batch * input_keep]

    for i in range(min(len(self.weights) -1, weights_to_stop)):
        # Rescale the surviving units of the previous layer.
        do_factor = 1.0 / (1.0-self.dropouts[i])
        linear_outputs = gnp.dot(self.state[-1]*do_factor, self.weights[i]) + self.biases[i]
        act_outputs = self.hidden_functions[i].activate(linear_outputs)
        hidden_keep = gnp.rand(*act_outputs.shape) > self.dropouts[i+1]
        self.state.append(act_outputs*hidden_keep)

    if weights_to_stop < len(self.weights):
        # Partial pass: hand back the requested intermediate state.
        return self.state[weights_to_stop]

    do_factor = 1.0 / (1.0-self.dropouts[-1])
    self.state.append(gnp.dot(self.state[-1]*do_factor, self.weights[-1]) + self.biases[-1])
    self.acts = self.output_function.activate(self.state[-1])
    return self.acts
def sample_binomial(p):
    """Sample elementwise: an entry is set where a uniform draw is below p.

    The draws come from ``myrand`` when the module flag use_debug_rng is
    set, otherwise from ``gp``.
    """
    rng = myrand if use_debug_rng else gp
    return rng.rand(p.shape) < p
def pt_init(self, score=None, init_var=1e-2, init_bias=0., **kwargs):
    """Create the pre-training parameter vector and store `score`.

    The vector has self.size + self.m_end + self.shape[0] entries. Two
    blocks of self.m_end entries each -- [:m_end] and
    [size:-shape[0]] -- are random: uniform in [-init_heur, init_heur)
    when init_var is None, otherwise init_var * randn. The remaining
    entries, [m_end:size] and [-shape[0]:], are set to init_bias.

    Returns the parameter vector.
    """
    n_w = self.m_end
    pt_params = gzeros(self.size + n_w + self.shape[0])

    if init_var is not None:
        pt_params[:self.m_end] = init_var * gpu.randn(n_w)
        pt_params[self.size:-self.shape[0]] = init_var * gpu.randn(n_w)
    else:
        init_heur = 4*np.sqrt(6./(self.shape[0]+self.shape[1]))
        # First random block: uniform draws mapped onto [-h, h).
        pt_params[:self.m_end] = gpu.rand(n_w)
        pt_params[:self.m_end] *= 2
        pt_params[:self.m_end] -= 1
        pt_params[:self.m_end] *= init_heur
        # Second random block, same scheme.
        pt_params[self.size:-self.shape[0]] = gpu.rand(n_w)
        pt_params[self.size:-self.shape[0]] *= 2
        pt_params[self.size:-self.shape[0]] -= 1
        pt_params[self.size:-self.shape[0]] *= init_heur

    pt_params[self.m_end:self.size] = init_bias
    pt_params[-self.shape[0]:] = init_bias

    self.score = score
    return pt_params
def test_nonlin_invert(nonlin):
    """Check that nonlin.invert_output undoes nonlin.forward_prop.

    A random 3x4 input is pushed through the nonlinearity and inverted;
    the input and the inferred input are compared with test_vec_pair,
    whose result is returned.
    """
    print('Testing inverting nonlinearity <%s>' % nonlin.get_name())

    n_rows, n_cols = 3, 4
    x = gnp.rand(n_rows, n_cols)
    recovered = nonlin.invert_output(nonlin.forward_prop(x))

    test_passed = test_vec_pair(x.asarray().ravel(), '%15s' % 'Input',
                                recovered.asarray().ravel(),
                                '%15s' % 'Inferred Input')
    print('')
    return test_passed
def hidden_to_output(self, set_name = 'train'):
    """Propagate self.results['current'] through the remaining layers.

    Every (weight, bias) pair except the first is applied in turn. For
    set_name == 'train' the activation of the current value is pushed to
    the front of self.results['activations'] together with the weight,
    and a random dropout mask (threshold self.dropout[1]) is applied
    before the product. Otherwise the activation is scaled by
    (1 - self.dropout[1]) instead of being masked.

    self.timer_logger is called before and after with the same label.
    """
    # The label names the data set being processed. The earlier code
    # formatted the builtin `type` here instead of the argument.
    label = 'hidden_to_output {0}'.format(set_name)
    self.timer_logger(label, time.time())

    for layer_idx, (weight, bias) in enumerate(zip(self.w, self.b)):
        if layer_idx == 0:
            # ignore the first weight that goes from inputs to first hidden layer
            continue
        if set_name == 'train':
            hidden = self.activation(self.results['current'])
            self.results['activations'].insert(0, [hidden, weight])
            # dropout; the same threshold is used for every hidden layer
            keep = gpu.rand(hidden.shape[0], hidden.shape[1]) > self.dropout[1]
            self.results['current'] = gpu.dot(hidden * keep, weight) + bias
        else:
            scaled = self.activation(self.results['current']) * (1 - self.dropout[1])
            self.results['current'] = gpu.dot(scaled, weight) + bias

    self.timer_logger(label, time.time())
def sample_vis(self, n_chains, n_steps, gibbs_steps_between_samples, sample_probabilities=False):
    """Collect visible-unit samples from n_chains parallel Gibbs chains.

    The chains start from random binary states. At each of n_steps
    steps they advance by gibbs_steps_between_samples Gibbs steps and
    one sample per chain is recorded (p_vis when sample_probabilities is
    set, otherwise vis). Progress is written to stderr.

    Returns an array of shape (n_chains * n_steps, self.n_vis); the rows
    of one step are stored contiguously.
    """
    samples = gp.zeros((n_chains * n_steps, self.n_vis))
    vis = gp.rand((n_chains, self.n_vis)) < 0.5

    for step in range(n_steps):
        print >>stderr, "%d / %d \r" % (step, n_steps),
        vis, p_vis = self.gibbs_sample(vis, gibbs_steps_between_samples)
        first_row = step*n_chains
        last_row = (step+1)*n_chains
        samples[first_row : last_row, :] = p_vis if sample_probabilities else vis
    return samples
def dbn_sample(ws_vh, ws_v, ws_h, x, y=None, k=1):
    """
    Sample from DBN

    ws_vh, ws_v, ws_h: Lists of layer weights for DBN
    x: Initial sample. This is the input to DBN. (1xD vector)
    y: Class label for the sample (1-of-K coded, row vector). When given,
        the label units of the top RBM are clamped to it.
    k: Number of Gibbs steps

    Returns a sample from DBN (1xD vector)
    """
    n_layers = len(ws_vh)

    # forward (bottom-up) pass: binary samples up to the visible layer of
    # the top-level RBM
    h_prev = x.T
    for l in range(n_layers - 1):
        act = gnp.logistic(gnp.dot(ws_vh[l].T, h_prev) + ws_h[l])
        h_prev = act > gnp.rand(act.shape[0], act.shape[1])

    top_vh, top_v, top_h = ws_vh[-1], ws_v[-1], ws_h[-1]
    if y is not None:
        K = y.shape[1]          # number of classes
        H = top_vh.shape[0]
        # label units stacked on top of the propagated sample, labels clamped
        v = gnp.concatenate((y.T, h_prev))
        h, v = rbm_sample(top_vh, top_v, top_h, v, k, clamped=(0, K))
        # strip the label rows
        v = v[K:H, :]
    else:
        # unsupervised: sample from the top RBM without clamping
        h, v = rbm_sample(top_vh, top_v, top_h, h_prev, k)

    # backward (top-down) pass: propagate the sample back to the input
    for l in reversed(range(n_layers - 1)):
        v = gnp.logistic(gnp.dot(ws_vh[l], v) + ws_v[l])
    return v.T
def test(shape=(3,4,5)):
    """
    Make sure that the gnumpy conversion is exact.

    A random garray is converted to a CudaNdarray, incremented by a
    compiled theano function, and converted back; both the garray and
    the numpy view of the result must equal A + 1 exactly.
    """
    from numpy import array

    to_gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = to_gpu(theano.tensor.ftensor3('U'))
    add_one = theano.function([U], to_gpu(U+1))

    A = gnumpy.rand(*shape)
    B_cnd = add_one(garray_to_cudandarray(A))
    B = cudandarray_to_garray(B_cnd)

    expected = (A+1).asarray()
    via_garray = B.asarray()
    via_numpy = array(B_cnd)

    assert abs(expected-via_garray).max() == 0
    assert abs(expected-via_numpy).max() == 0
def forward_prop(self, X, add_noise=False, compute_loss=False):
    """
    Map the input data matrix X to this layer's output.

    When add_noise is set and the dropout rate is positive, a random
    dropout mask is applied to X first. The linear activation is then
    either biased and passed to the nonlinearity, or (with batch
    normalization) passed through the BN layer before the nonlinearity.

    When compute_loss is set and a loss is attached, loss value and
    gradient are computed on the output if loss_after_nonlin is set;
    otherwise on the input of the nonlinearity (the activation, or the
    BN output when batch normalization is used).

    Returns self.output.
    """
    if self.params.dropout > 0 and add_noise:
        self.dropout_mask = gnp.rand(X.shape[0], X.shape[1]) > self.params.dropout
        self.inputs = X * self.dropout_mask
    else:
        self.inputs = X
    self.noise_added = add_noise

    if self.use_batch_normalization:
        self.activation = self.inputs.dot(self.params.W)
        self.bn_output = self.bn_layer.forward_prop(self.activation)
        self.output = self.nonlin.forward_prop(self.bn_output)
    else:
        self.activation = self.inputs.dot(self.params.W) + self.params.b
        self.output = self.nonlin.forward_prop(self.activation)

        if self.sparsity_weight > 0:
            # Running average of the mean output per unit ...
            self._sparsity_current = self._sparsity_smoothing * self.output.mean(axis=0) \
                    + (1 - self._sparsity_smoothing) * self._sparsity_current
            # ... and the resulting sparsity penalty.
            self._sparsity_objective = (- self.sparsity * gnp.log(self._sparsity_current + 1e-20) \
                    - (1 - self.sparsity) * gnp.log(1 - self._sparsity_current + 1e-20)).sum() * self.sparsity_weight

    if compute_loss and self.loss is not None:
        if self.loss_after_nonlin:
            loss_input = self.output
        elif self.use_batch_normalization:
            loss_input = self.bn_output
        else:
            loss_input = self.activation
        self.loss_value, self.loss_grad = self.loss.compute_loss_and_grad(
                loss_input, compute_grad=True)
        self.loss_computed = True

    return self.output
def test(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from garray to
    CudaNdarray back to garray.
    """
    to_gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = to_gpu(theano.tensor.ftensor3('U'))
    add_one = theano.function([U], to_gpu(U + 1))

    A = gnumpy.rand(*shape)
    A_cnd = garray_to_cudandarray(A)
    # dtype is always float32 and garray has no strides, so only the
    # shape is compared.
    assert A_cnd.shape == A.shape

    B_cnd = add_one(A_cnd)
    B = cudandarray_to_garray(B_cnd)
    assert A_cnd.shape == A.shape

    expected = (A + 1).asarray()
    via_garray = B.asarray()
    via_numpy = np.array(B_cnd)

    assert (expected == via_garray).all()
    assert (expected == via_numpy).all()
def feedforward(self, X, return_on_gpu=False):
    """Apply this layer's dropout mask and additive noise to X.

    The mask is self.drop_scale times a random keep-mask when
    self.drop_rate exceeds 1e-4, otherwise all ones; it is also stored
    as self.dYdX. When self.fuzz_scale exceeds 1e-4, Gaussian noise
    scaled by fuzz_scale / drop_scale is added to X before masking.

    Returns self.Y, converted to a numpy array unless return_on_gpu.
    """
    # Cleanup debris from any previous feedforward
    self._cleanup()
    # Record the passed input as a garray
    self.X = gp.garray(X)

    dims = (self.X.shape[0], self.X.shape[1])

    if (self.drop_rate > 1e-4):
        drop_mask = self.drop_scale * (gp.rand(dims) > self.drop_rate)
    else:
        drop_mask = gp.ones(dims)
    self.dYdX = drop_mask

    if (self.fuzz_scale > 1e-4):
        fuzz_bump = (self.fuzz_scale / self.drop_scale) * gp.randn(dims)
        self.Y = drop_mask * (self.X + fuzz_bump)
    else:
        self.Y = drop_mask * self.X

    if not return_on_gpu:
        self.Y = gp.as_numpy_array(self.Y)
    return self.Y
def train(self):
    """Run the training loop for config.num_epochs epochs.

    Each epoch shuffles the training set, optionally zeroes random input
    entries (config.input_noise), and runs forward pass, loss and
    backprop over all minibatches. Every config.epoch_to_display epochs
    the scores from display_training_info are recorded and the best
    network so far is kept; every config.epoch_to_save epochs a snapshot
    is written to config.output_dir.

    Returns (best_acc, best_test_acc) when config.is_test is set,
    otherwise best_acc alone.
    """
    config = self.config
    # convert t into a matrix in 1-of-K representation if it is a vector
    t = self.train_data.T
    T_matrix = self.output.act_type.label_vec_to_mat(t, self.train_data.K)

    # Per-update settings handed to every layer's backprop.
    layer_config = LayerConfig()
    layer_config.learn_rate = config.learn_rate
    layer_config.momentum = config.init_momentum
    layer_config.weight_decay = config.weight_decay

    # nnstore is used for periodic snapshots, best_net for the best model.
    nnstore = NNStore()
    nnstore.init_from_net(self)

    best_net = NNStore()
    best_net.init_from_net(self)

    # Scores before any training (reported as epoch -1).
    train_acc, val_acc, test_acc = self.display_training_info(
            -1, self._compute_loss(
                self.train_data.X, T_matrix, config.minibatch_size), 0)
    # One row per displayed epoch; columns: epoch, train, val, test.
    # NOTE(review): the row count and the row index further down rely on
    # `/` producing an integer (Python 2 integer division) -- confirm.
    acc_rec = np.zeros((config.num_epochs / config.epoch_to_display + 1, 4))
    acc_rec[0, 0] = 0
    acc_rec[0, 1] = train_acc
    if config.is_val:
        acc_rec[0, 2] = val_acc
    if config.is_test:
        acc_rec[0, 3] = test_acc

    t_start = time.time()

    best_acc = val_acc
    if self.config.is_test:
        best_test_acc = test_acc
    best_epoch = -1

    for epoch in range(0, config.num_epochs):
        gnp.free_reuse_cache()
        # decrease learning rate over time
        layer_config.learn_rate = config.learn_rate / \
                (epoch / config.lr_drop_rate + 1)

        # TODO [dirty] special for Lnsvm
        if isinstance(self.output.act_type, act.LnsvmVariantOutput):
            #self.output.act_type.n = 3.0 - (3.0 - 0.5) / 50 * epoch
            self.output.act_type.n = 0.5
            if self.output.act_type.n < 0.5:
                self.output.act_type.n = 0.5
            if (epoch + 1) % config.epoch_to_display == 0:
                print 'n %.4f' % self.output.act_type.n,

        # Switch to the final momentum from config.switch_epoch onwards.
        if epoch >= config.switch_epoch:
            layer_config.momentum = config.final_momentum

        # shuffle the dataset
        idx = np.random.permutation(self.num_total_cases)
        #idx = np.arange(self.num_total_cases)
        train_X = self.train_data.X[idx]
        train_T = T_matrix[idx]

        # Zero out random input entries for this epoch.
        if config.input_noise > 0:
            train_X = train_X * (gnp.rand(train_X.shape) > config.input_noise)
            # train_X = train_X + gnp.randn(train_X.shape) * config.input_noise

        loss = 0

        for batch in range(0, self.num_minibatches):
            i_start = batch * config.minibatch_size
            # The last minibatch takes all remaining cases.
            if not batch == self.num_minibatches - 1:
                i_end = i_start + config.minibatch_size
            else:
                i_end = self.num_total_cases

            X = train_X[i_start:i_end]
            T = train_T[i_start:i_end]

            # forward pass
            self._forward(X)

            # compute loss
            loss += self.output.loss(T)

            # Debugging hook: drops into ipdb when the output contains NaN.
            if self.output.Y.isnan().any():
                import ipdb
                ipdb.set_trace()
                print 'batch #%d <-- nan' % batch

            # backprop
            dLdXabove = self.output.backprop(layer_config)
            for i in range(self.num_layers-1, -1, -1):
                dLdXabove = self.layer[i].backprop(dLdXabove, layer_config)

        # statistics
        avg_loss = 1.0 * loss / self.num_total_cases

        if (epoch + 1) % config.epoch_to_display == 0:
            train_acc, val_acc, test_acc = self.display_training_info(
                    epoch, avg_loss, time.time() - t_start)

            # Fall back to the training score when there is no validation score.
            if val_acc == None:
                val_acc = train_acc

            # Lower is better when a task loss is shown, higher otherwise.
            if (config.show_task_loss and val_acc < best_acc) or \
                    (not config.show_task_loss and val_acc > best_acc):
                best_acc = val_acc
                best_net.update_from_net(self)
                if config.is_test:
                    best_test_acc = test_acc
                best_epoch = epoch
            t_start = time.time()
            acc_rec[(epoch + 1) / config.epoch_to_display, 0] = epoch + 1
            acc_rec[(epoch + 1) / config.epoch_to_display, 1] = train_acc
            if config.is_val:
                acc_rec[(epoch + 1) / config.epoch_to_display, 2] = val_acc
            if config.is_test:
                acc_rec[(epoch + 1) / config.epoch_to_display, 3] = test_acc

        # Periodic snapshot of the current network.
        if (epoch + 1) % config.epoch_to_save == 0:
            nnstore.update_from_net(self)
            nnstore.write(config.output_dir + '/m' + str(epoch + 1) + '.pdata')

    print '----------------------------------------------------------------'

    if config.show_task_loss:
        s = 'loss'
    else:
        s = 'acc'

    if config.is_val:
        print 'Best val_%s %.4f' % (s, best_acc),
    else:
        print 'Best train_%s %.4f' % (s, best_acc),

    if config.is_test:
        print '--> test_%s %.4f' % (s, best_test_acc),
    print 'at epoch %d' % (best_epoch + 1)

    if config.is_output:
        # NOTE(review): the file is opened in text mode ('w') although
        # pickle protocol -1 writes binary data, and it is not closed if
        # pickle.dump raises -- consider 'wb' and a with-block.
        f = open('%s/acc_rec.pdata' % config.output_dir, 'w')
        pickle.dump(acc_rec, f, -1)
        f.close()

        self.write_config('%s/cfg.txt' % config.output_dir)

        # save the best net
        fname = config.output_dir + '/best_net.pdata'
        print 'Saving the best model to ' + fname
        best_net.write(fname)

    if config.is_test:
        return (best_acc, best_test_acc)
    else:
        return (best_acc)
def threshold_mask_soft(x, k, dropout=None):
    """Mask the entries of x that exceed k times their row's std.

    x       -- 2-D array; the standard deviation is taken along axis 1
    k       -- multiplier applied to each row's standard deviation
    dropout -- if given, the mask is additionally multiplied by a random
               mask ``rand > (1 - dropout)``

    Returns the mask with the same shape as x.
    """
    # Per-row threshold, broadcast to the full shape of x by a product
    # with a row of ones.
    b = k * gp.std(x, axis=1)[:, gp.newaxis]
    std_matrix = gp.dot(b, gp.ones((1, x.shape[1])))
    above = (x > std_matrix)
    # Identity check: None is the "no dropout" sentinel.
    if dropout is None:
        return above
    return above * (gp.rand(x.shape) > (1 - dropout))
def mask(x, dropout=1):
    """Return a random mask shaped like x: ``rand > (1 - dropout)``."""
    draws = gp.rand(x.shape)
    threshold = 1 - dropout
    return (draws > threshold)
def rand(shape, dtype):
    """Return ``gp.rand`` output for the dimensions listed in ``shape``.

    Args:
        shape: Iterable of dimensions; unpacked into positional arguments.
        dtype: Accepted for interface compatibility; not used here.
    """
    sample = gp.rand(*shape)
    return sample
""" This code can only work if gnumpy and theano are initialized on the same gpu as theano. """ from six.moves import reduce try: import gnumpy import cudamat gnumpy_available = True ___const_garray = gnumpy.rand(1) import theano.sandbox.cuda as cuda if cuda.cuda_available == False: raise ImportError('Optional theano package cuda disabled') def cudandarray_to_garray(x, copyif=False): """ take a CudaNdarray and return a gnumpy.garray object. :type x: CudaNdarray :param x: The array to transform to gnumpy.garray. :type copyif: bool :param copyif: If False, raise an error if x is not c contiguous. If it is c contiguous, we return a GPUArray that share the same memory region as x. If True, copy x if it is no c contiguous, so the return won't shape the same memory region. If c contiguous, the return will share the same memory region. We need to do this as GPUArray don't fully support strided memory.
def rand_binary(shape, dtype):
    """Return a random array of the given shape thresholded at 0.5.

    Args:
        shape: Iterable of dimensions; unpacked into ``gp.rand``.
        dtype: Accepted for interface compatibility; not used here.

    Returns:
        The elementwise comparison ``gp.rand(*shape) > .5``.
    """
    draws = gp.rand(*shape)
    return draws > .5
def rand(*shape):
    """Forward the positional dimensions in ``shape`` to ``gp.rand``."""
    sample = gp.rand(*shape)
    return sample
def _train_abstraction_level(level, inputs, num_in, num_hidden, lambda_val,
                             sparsityParam, beta, maxiter):
    """Train one level and return its hidden activation for ``inputs``.

    Two weight matrices of shape ``(num_hidden, num_in + 1)`` and
    ``(num_in, num_hidden + 1)`` are randomly initialised, flattened into a
    single vector and optimised with ``optimize.fmin_l_bfgs_b`` using
    ``costfunc_gpu`` / ``grad_costfunc_gpu``. Both learnt matrices are saved
    to ``MINSTLevel<level>.mat`` and the result of
    ``feedforward(weights1, inputs)`` is returned.
    """
    # The output layer of each level has the same size as its input.
    num_out = num_in

    # Initial weights are rand * 2r - r with r = sqrt(6 / (fan sum + 1)).
    # NOTE(review): this is [-r, r) if gpu.rand is uniform on [0, 1) -- confirm.
    r = gpu.sqrt(6) / gpu.sqrt(num_hidden + num_in + 1)
    weights1 = gpu.rand(num_hidden, num_in + 1) * 2 * r - r
    weights2 = gpu.rand(num_out, num_hidden + 1) * 2 * r - r

    num_weights1 = (num_in + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_out

    # The optimiser works on one flat host-side vector.
    theta0 = hstack((weights1.reshape(num_weights1).as_numpy_array(),
                     weights2.reshape(num_weights2).as_numpy_array()))

    print("Level %d Abstraction Starting...." % level)
    args = (num_in, num_hidden, num_out, inputs,
            lambda_val, sparsityParam, beta)
    opttheta = optimize.fmin_l_bfgs_b(costfunc_gpu, theta0,
                                      fprime=grad_costfunc_gpu,
                                      args=args, maxiter=maxiter)

    # Move the optimum back to the GPU and split it into the two matrices.
    theta = gpu.garray(opttheta[0])
    weights1 = theta[0:num_weights1].reshape((num_hidden, num_in + 1))
    weights2 = theta[num_weights1:shape(theta)[0]].reshape(
        (num_out, num_hidden + 1))

    scipy.io.savemat('MINSTLevel%d.mat' % level, mdict={
        'learntFeaturesL%d_1' % level: weights1.as_numpy_array(),
        'learntFeaturesL%d_2' % level: weights2.as_numpy_array()
    })

    activation = feedforward(weights1, inputs)

    # Drop the weight references before asking gnumpy to release its cache.
    del theta
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return activation


def multilayer_feature_learning(data, inputSize, l1Size, l2Size, l3Size,
                                sparsityParam, lambda_val, beta, maxiter=400):
    """Learn three stacked levels of features and return the top activation.

    Each level is trained on the hidden activation of the previous one
    (level 1 is trained on ``data``), using L-BFGS-B on ``costfunc_gpu``.

    Args:
        data: Input data; converted with ``gpu.garray``.
        inputSize: Input dimensionality of level 1.
        l1Size, l2Size, l3Size: Hidden sizes of levels 1, 2 and 3.
        sparsityParam, lambda_val, beta: Passed unchanged to the cost and
            gradient functions for every level.
        maxiter: Iteration limit handed to ``optimize.fmin_l_bfgs_b`` for
            each level (default 400).

    Returns:
        The level-3 activation, i.e. the value returned by ``feedforward``
        for the third level.

    Side effects:
        Prints progress messages and writes ``MINSTLevel1.mat``,
        ``MINSTLevel2.mat`` and ``MINSTLevel3.mat`` (relative paths).
    """
    print("Now starting feature abstraction...")

    activation = gpu.garray(data)
    num_in = inputSize
    for level, num_hidden in enumerate((l1Size, l2Size, l3Size), 1):
        activation = _train_abstraction_level(
            level, activation, num_in, num_hidden,
            lambda_val, sparsityParam, beta, maxiter)
        # The hidden size of this level is the input size of the next one.
        num_in = num_hidden

    print("Abstraction completed.")
    return activation
self.hActs[i].T) self.grad[i][1] = (1. / self.mbSize) * gp.sum( self.deltas[i], axis=1).reshape(-1, 1) return cost, self.grad def updateParams(self, scale, update): self.stack = [[ws[0] + scale * wsDelta[0], ws[1] + scale * wsDelta[1]] for ws, wsDelta in zip(self.stack, update)] if __name__ == '__main__': inputDim = 5 outputDim = 10 layerSizes = [100, 100, 300] mbSize = 5 # fake data data = gp.rand(inputDim, mbSize) import random labels = [random.randint(0, 9)] * mbSize # make nnet nn = NNet(inputDim, outputDim, layerSizes, mbSize, train=True) nn.initParams() # run cost, grad = nn.costAndGrad(data, labels) print cost
def sampleStates(self, acts):
    """Return the elementwise comparison of random draws against ``acts``.

    One ``gnp.rand`` value is drawn per entry of ``acts`` and the result is
    ``draw <= acts``. No instance state is read.
    NOTE(review): presumably ``acts`` holds probabilities in [0, 1] and this
    yields binary samples -- confirm against callers.
    """
    draws = gnp.rand(*acts.shape)
    return draws <= acts
def sample_units(inputs):
    """Compare random draws against ``sigmoid(inputs)`` elementwise.

    Returns ``gnp.rand(inputs.shape) < sigmoid(inputs)``.
    NOTE(review): presumably a stochastic binary sample with activation
    probability ``sigmoid(inputs)`` -- confirm ``gnp.rand`` is uniform on
    [0, 1).
    """
    draws = gnp.rand(inputs.shape)
    probabilities = sigmoid(inputs)
    return draws < probabilities