def fpropDropout(self, inputBatch, weightsToStopBefore=None):
    """
    Perform a (possibly partial) forward pass through the network with
    dropout applied.

    The input and every hidden activation are multiplied by a random
    binary mask (an entry is kept where a uniform sample exceeds the
    matching entry of self.dropouts), and the input to weight matrix i
    is scaled by 1 / (1 - self.dropouts[i]) before the multiply.

    Updates self.state which, on a full forward pass, holds the masked
    input followed by each masked hidden layer's activation and finally
    the net input incident on the output layer.

    inputBatch: batch of inputs; converted to a gnp.garray if it is not
        one already.
    weightsToStopBefore: index of the first weight matrix NOT to apply.
        None (the default) means a full pass.

    Returns the output unit activations (also stored in self.acts) for a
    full pass, otherwise self.state[weightsToStopBefore].
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights
    # self.state holds everything before the output nonlinearity,
    # including the net input to the output units
    sample = (gnp.rand(*inputBatch.shape) > self.dropouts[0])
    self.state = [inputBatch * sample]
    for i in range(min(numWeights - 1, weightsToStopBefore)):
        dropoutMultiplier = 1.0 / (1.0 - self.dropouts[i])
        curActs = self.hidActFuncts[i].activation(
            gnp.dot(dropoutMultiplier * self.state[-1], self.weights[i])
            + self.biases[i])
        sample = (gnp.rand(*curActs.shape) > self.dropouts[i + 1])
        self.state.append(curActs * sample)
    if weightsToStopBefore >= numWeights:
        dropoutMultiplier = 1.0 / (1.0 - self.dropouts[-1])
        self.state.append(
            gnp.dot(dropoutMultiplier * self.state[-1], self.weights[-1])
            + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts
    # We didn't reach the output units.
    # To return the first set of hidden activations, we would set
    # weightsToStopBefore to 1.
    return self.state[weightsToStopBefore]
def R_forward_pass(self, state, R):
    """
    Apply the R-operator on RNN.

    R is an RNN object which represents the vector we multiply by.
    Note that it needs to know the RNN's state, so that it doesn't
    have to unnecessarily recompute the state.

    state: (V, H, OX) triple; only V and H are read here. V is
        left-padded with None when its first entry is not None, so time
        steps are indexed from 1.

    Returns (R_HX, R_OX[1:]): R_HX keeps its leading None placeholder at
    index 0, R_OX is returned without it.
    """
    V, H, OX = state
    if V[0] is not None:
        V = [None] + V
    assert V[0] is None
    T = len(V) - 1
    batch_size = len(V[1])
    R_OX, R_HX = [[None] * (T + 1) for _ in range(2)]
    R_H_t = g.tile(R.h_init, (batch_size, 1))
    for t in range(1, T + 1):
        R_H_1t = R_H_t
        R_HX[t] = (g.dot(R_H_1t, self.W_hh)
                   + g.dot(H[t - 1], R.W_hh)
                   + g.dot(V[t], R.W_vh))
        R_H_t = self.hid_nonlin.grad_y(H[t]) * R_HX[t]
        R_OX[t] = g.dot(H[t], R.W_ho) + g.dot(R_H_t, self.W_ho)
    # R_HX is returned as well (for the structured reg).
    return (R_HX, R_OX[1:])
def backprop(self):
    """
    Compute weight and bias gradients from the current output.

    Reads self.results['current'] and self.results['activations'] and
    writes self.results['grads'], self.results['bias_grads'] and
    self.results['error']. Each gradient is inserted at the front of its
    list. self.timer_logger is called once on entry and once on exit.
    """
    self.timer_logger('backprop', time.time())
    results = self.results
    results['grads'] = []
    results['bias_grads'] = []
    if self.problem == 'classification':
        # assumes softmax + cross entropy so that both gradients cancel
        # out to give: error = y-t
        targets = gpu.garray(self.util.create_t_dataset(self.batch_y))
    else:
        # assumes linear unit + squared error cost function so that both
        # gradients cancel out to give: error = y-t
        targets = gpu.garray(self.batch_y)
    results['error'] = results['current'] - targets
    for pair in results['activations']:
        activation = pair[0]
        weight = pair[1]
        gradient = self.activation_gradient(activation)
        err = results['error']
        results['grads'].insert(0, gpu.dot(activation.T, err))
        # a row of ones times the error sums the error over the batch
        ones_row = gpu.ones((1, err.shape[0]))
        results['bias_grads'].insert(0, gpu.dot(ones_row, err))
        results['error'] = gpu.dot(err, weight.T) * gradient
    self.timer_logger('backprop', time.time())
def forward_pass(self, batch, O=None):
    """
    Run the recurrent network forward over a sequence.

    batch: either the input sequence V itself, or a tuple (V, O) or
        (V, O, M) whose members must have equal length. Only V is used
        by the computation below. Tuple subclasses are accepted.
    O: overwritten when batch is a tuple; otherwise unused here.

    V is left-padded with None when its first entry is not None, so time
    steps are indexed from 1.

    Returns (V[1:], A, B, H, OX[1:]). A, B and H keep index 0 (None for
    A and B, the tiled self.h_init for H).
    """
    if isinstance(batch, tuple) and len(batch) == 2:
        V, O = batch
        assert len(V) == len(O)
    elif isinstance(batch, tuple) and len(batch) == 3:
        V, O, M = batch
        assert len(V) == len(O) == len(M)
    else:
        V = batch
    if V[0] is not None:
        V = [None] + V
    T = len(V) - 1
    batch_size = len(V[1])
    A, B, H, OX = [[None] * (T + 1) for _ in range(4)]
    H[0] = g.tile(self.h_init, (batch_size, 1))
    for t in range(1, T + 1):
        B[t] = g.dot(V[t], self.W_vf).tanh()
        A[t] = g.dot(H[t - 1], self.W_hf)
        C_t = g.dot(V[t], self.W_vh)  # + hh stuff
        # factor activations: previous-hidden term times the biased
        # input term
        AB = A[t] * (B[t] + self.f_bias)
        HX_t = g.dot(AB, self.W_fh) + C_t
        H[t] = self.hid_nonlin(HX_t)
        OX[t] = g.dot(H[t], self.W_ho)
    return (V[1:], A, B, H, OX[1:])
def costfunc_gpu_ReLU(x, *args): num_input,num_hidden,num_output,inputs,lambda_val,sparsityParam,beta = args num_weights1 = (num_input+1)*num_hidden x = gpu.garray(x) inputs = gpu.garray(inputs) #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1))) weights1 = x[0:num_weights1].reshape((num_hidden,num_input+1)) #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1))) weights2 = x[num_weights1:shape(x)[0]].reshape((num_output,num_hidden+1)) nData = shape(inputs)[1] data = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0) hidden_sum = gpu.dot(weights1, data) hidden_activation = gpu.log(1+hidden_sum.exp()) p_avg = gpu.sum(hidden_activation,axis=1)/nData hidden_activation = gpu.concatenate((gpu.ones((1,nData)), hidden_activation), axis = 0) output = gpu.dot(weights2, hidden_activation) regularized_penalty1 = weights1[:,1:shape(weights1)[1]] regularized_penalty2 = weights2[:,1:shape(weights2)[1]] regularized_penalty1 = regularized_penalty1 * regularized_penalty1 regularized_penalty2 = regularized_penalty2 * regularized_penalty2 output_target_diff = (output - inputs)*(output - inputs) KL = gpu.sum(sparsityParam*gpu.log(sparsityParam/p_avg) + (1-sparsityParam)*gpu.log((1-sparsityParam)/(1-p_avg))) cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)) + beta*KL print 'ReLU Linear Decoder Cost: ', cost return cost
def dbn_forward_pass(ws_vh, ws_v, ws_h, x, y=None):
    """
    Deep belief net forward pass.

    x: input data (N x D matrix)
    y: Class label (1-of-K coded, N x K matrix). If not None, it is
        concatenated to the input of the top layer RBM when calculating
        the output of the DBN.
    ws_vh: list of layer weights (L x D x H)
    ws_v: list of layer input biases (L x D x 1); not used by this
        function
    ws_h: list of layer output biases (L x H x 1)

    Returns activations (continuous) and outputs (0-1,
    sigmoid(activations)) of the top layer, both transposed back to one
    row per example.
    """
    top = len(ws_vh) - 1
    layer_in = x.T

    # bottom-up pass through every layer below the top one
    for idx in range(top):
        layer_in = gnp.logistic(gnp.dot(ws_vh[idx].T, layer_in) + ws_h[idx])

    # if supervised, concatenate class labels to input to top layer RBM
    if y is not None:
        layer_in = gnp.concatenate((y.T, layer_in))

    top_act = gnp.dot(ws_vh[-1].T, layer_in) + ws_h[-1]
    top_out = gnp.logistic(top_act)
    return top_act.T, top_out.T
def exact_fisher_information_biases(rbm, batch_units=10, show_progress=False):
    """
    Accumulate a (nvis + nhid) square matrix over the (hid, p) pairs
    yielded by iter_configurations.

    For every yielded pair, a feature matrix is built whose first nvis
    columns are logistic(rbm.vis_inputs(hid)) and whose remaining
    columns are hid. The p-weighted first moment, the p-weighted second
    moment and a diagonal term from feats * (1 - feats) are accumulated;
    the outer product of the first moment is subtracted at the end.

    Returns the resulting matrix G.
    """
    rows = 2 ** batch_units
    nvis = rbm.nvis
    nhid = rbm.nhid
    num_params = nvis + nhid

    first_moment = gnp.zeros(num_params)
    G = gnp.zeros((num_params, num_params))

    configs = iter_configurations(rbm, batch_units=batch_units,
                                  show_progress=show_progress)
    for hid, p in configs:
        feats = gnp.zeros((rows, num_params))
        feats[:, :nvis] = gnp.logistic(rbm.vis_inputs(hid))
        feats[:, nvis:] = hid

        first_moment += gnp.dot(p, feats)
        G += gnp.dot(feats.T * p, feats)

        diag_term = gnp.dot(p, feats * (1. - feats))
        G += np.diag(diag_term.as_numpy_array())

    G -= first_moment[:, nax] * first_moment[nax, :]
    return G
def forward(self, X, test=False):
    """
    Feed-forward pass through the model.

    X: ('batchsize' x 'context') matrix of word indices
    test: not used by this method.

    Returns (words, acts, preds):
        words: garray (batchsize, K, context); words[b, :, c] is the
            column of R selected by the c-th index of example b.
        acts: garray (batchsize, K + 1); the hidden layer with a column
            of ones appended.
        preds: numpy array; the row-wise softmax with a column of ones
            appended.
    """
    batchsize = X.shape[0]
    R = self.R
    C = self.C
    bw = self.bw

    # Obtain word features. One gather plus a reshape/transpose replaces
    # the per-example Python loop; filling a preallocated array keeps
    # the same dtype and memory layout the loop produced.
    gathered = R.as_numpy_array()[:, X.flatten()]
    words = np.zeros((batchsize, self.K, self.context))
    words[:] = gathered.T.reshape(
        (batchsize, self.context, self.K)).transpose(0, 2, 1)
    words = gpu.garray(words)

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

    # Compute softmax (the row maximum is subtracted before exp)
    preds = gpu.dot(acts, gpu.concatenate((R, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def backward(self, Y, preds, acts, words, X):
    """
    Backward pass through the network.

    Stores the gradients on the instance as self.dR, self.db and
    self.dC; nothing is returned. The last column of preds is dropped
    before Y is subtracted.
    """
    batchsize = preds.shape[0]

    # Output error, averaged over the batch
    err = gpu.garray(preds[:, :-1] - Y) / batchsize

    # Part of df/dR, and df/db from the last row
    out_grad = gpu.dot(acts.T, err)
    db = out_grad[-1, :]
    dR = (out_grad[:-1, :] + self.gamma_r * self.R).as_numpy_array()

    # df/dC and the word-input contribution to df/dR
    err = gpu.dot(err, self.R.T)
    dC = gpu.zeros(np.shape(self.C))
    num_rows = X.shape[0]
    for i in range(self.context):
        C_i = self.C[i, :, :]
        dC[i, :, :] = gpu.dot(words[:, :, i].T, err) + self.gamma_c * C_i
        word_grad = gpu.dot(err, C_i.T).as_numpy_array().T
        # scatter-add one example at a time: the same word index may
        # appear in several rows
        for j in range(num_rows):
            col = X[j, i]
            dR[:, col] = dR[:, col] + word_grad[:, j]

    self.dR = gpu.garray(dR)
    self.db = db
    self.dC = dC
def backprop(self, X, y_target):
    """
    Run a dropout forward pass followed by backpropagation.

    Each layer's input is masked (an entry is kept where a uniform
    sample is >= self.dropout_probability[i]) before it is stored and
    multiplied by the layer's weights.

    Returns a list with one [gradW, gradB] pair per layer, in layer
    order, each divided by len(X).
    """
    num_layers = len(self.weights)

    # forward
    layer_inputs = []
    out = X
    for i in range(num_layers):
        keep = (g.rand(out.shape) >= self.dropout_probability[i])
        out = out * keep
        del keep
        layer_inputs.append(out)
        w, b = self.weights[i]
        out = self.activation[i](g.dot(out, w) + b)

    # backward: deltas are collected from the output layer down, then
    # flipped so deltas[i] belongs to layer i
    delta = self.gradient[-1](out, y_target)
    deltas = [delta]
    for i in range(num_layers - 1, 0, -1):
        w, _ = self.weights[i]
        delta = g.dot(delta, w.T) * self.gradient[i - 1](layer_inputs[i])
        deltas.append(delta)
    deltas.reverse()

    # get gradient
    batch = len(X)
    resultGradient = []
    for i in range(num_layers):
        gradW = g.dot(layer_inputs[i].T, deltas[i]) / batch
        assert(gradW.shape == self.weights[i][0].shape)
        gradB = g.sum(deltas[i], axis=0) / batch
        assert(gradB.shape == self.weights[i][1].shape)
        resultGradient.append([gradW, gradB])
    del deltas

    return resultGradient
def bprop(self, outputErrSignal, MLerr, fpropState=None):
    """
    Perform a backward pass through the network for two error signals.

    outputErrSignal should be self.outputActFunct.dErrordNetInput(...).
    fpropState is accepted but not read by this method; the pass uses
    self.actsML, self.actsMLpvt, self.stateML and self.pivt.

    Returns (ml_sense, pivt_sense), two lists with one entry per weight
    matrix.
    """
    depth = len(self.weights)

    # Manifold learning
    ml_sense = [None] * depth
    pivt_sense = [None] * depth

    ml_sense[-1] = MLerr * self.actsML * (1 - self.actsML)
    pivt_sense[-1] = (outputErrSignal
                      - MLerr * self.actsMLpvt * (1 - self.actsMLpvt))

    for i in range(depth - 2, -1, -1):
        upper_T = self.weights[i + 1].T
        act_fn = self.hidActFuncts[i]
        ml_sense[i] = (gnp.dot(ml_sense[i + 1], upper_T)
                       * act_fn.dEdNetInput(self.stateML[i + 1]))
        pivt_sense[i] = (gnp.dot(pivt_sense[i + 1], upper_T)
                         * act_fn.dEdNetInput(self.pivt[i + 1]))

    return ml_sense, pivt_sense
def backward(self, dEdY):
    """
    Backward pass of the sliding-window layer.

    dEdY: array (N, T - S + 1, F), the gradient for each of the
        T - S + 1 window positions (S = self.windowSize).

    Stores the weight gradient in self.dEdW (computed with gnumpy when
    self.gpu is set, with numpy otherwise).

    Returns dEdX with the shape and dtype of self.X. It is all zeros
    when self.outputdEdX is false; otherwise entry t is the sum, over
    every window that covers time step t, of that window's gradient at
    the matching offset.
    """
    N = dEdY.shape[0]
    S = self.windowSize
    P = dEdY.shape[1]  # number of window positions, T - S + 1
    T = P + S - 1
    F = dEdY.shape[2]
    D = self.X.shape[2]
    dEdY = dEdY.reshape(N * P, F)
    dEdX = np.zeros(self.X.shape, self.X.dtype)

    if self.gpu:
        gdEdY = gpu.as_garray(dEdY.astype('float32'))
        self.dEdW = gpu.dot(self.Z.transpose(), gdEdY)
    else:
        self.dEdW = np.dot(self.Z.transpose(), dEdY)

    if self.outputdEdX:
        if self.gpu:
            gdEdZ = gpu.dot(gdEdY, self.W.transpose())
            dEdZ = gpu.as_numpy_array(gdEdZ)
        else:
            dEdZ = np.dot(dEdY, self.W.transpose())
        dEdZ = dEdZ.reshape(N, P, S, D)
        for t in range(T):
            # Windows covering step t start at positions t-S+1 .. t,
            # clamped to the positions that exist (0 .. P-1). Clamping
            # both ends also handles sequences shorter than 2*S - 1,
            # where the head and tail regions overlap.
            first = max(0, t - S + 1)
            last = min(t, P - 1)
            positions = list(range(first, last + 1))
            offsets = [t - p for p in positions]
            dEdX[:, t, :] = np.sum(dEdZ[:, positions, offsets, :], axis=1)

    return dEdX
def backward_pass(self, state, dOX, compute_grad2=False):
    """
    Backpropagate dOX through the layers and accumulate gradients.

    state: per-layer activations; state[i] is the input to layer i and
        state[i + 1] its output.
    dOX: gradient with respect to the last layer's total input.
    compute_grad2: when true, also accumulate the sums of squared
        per-example terms into a second parameter object.

    Returns (grad, grad2); grad2 is None unless compute_grad2 is set.
    """
    grad = self.unpack(self.pack() * 0)
    grad2 = self.unpack(self.pack() * 0) if compute_grad2 else None

    upstream = dOX
    for i in range(len(self.sizes) - 2, -1, -1):
        delta = self.nonlins[i].grad_y(state[i + 1]) * upstream
        layer_in = state[i]
        # state[i + 1] = self.hid_nonlin(g.dot(X, self.W[i]) + self.b[i])

        grad.b[i] += delta.sum(0)
        grad.W[i] += g.dot(layer_in.T, delta)

        if compute_grad2:
            delta_sq = delta * delta
            grad2.b[i] += delta_sq.sum(0)
            grad2.W[i] += g.dot((layer_in * layer_in).T, delta_sq)

        # backprop the gradient; skipped for the first layer, whose
        # multiplication is typically the costliest and is not needed
        if i > 0:
            upstream = g.dot(delta, self.W[i].T)

    return grad, grad2
def rbm_sample(w_vh, w_v, w_h, x, k=1, clamped=None):
    """
    Sample from RBM with k steps of Gibbs sampling.

    w_vh: Weights between visible and hidden units (matrix of size DxH)
    w_v: Visible unit biases (column vector of size Dx1)
    w_h: Hidden unit biases (column vector of size Hx1)
    x: Input (column vector of size DxN)
    k: Number of Gibbs steps. Default is 1. Must be at least 1.
    clamped: If not None, keeps the given elements of x clamped
        (constant) while sampling. clamped is a two-tuple that gives the
        start and end indices of clamped elements.

    Returns hidden unit and visible unit activations (matrices of size
    HxN, DxN) from the last step.

    Raises ValueError if k is smaller than 1, since no activations would
    be computed.
    """
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))

    if clamped is not None:
        cx = x[clamped[0]:clamped[1], :]

    v = x
    for i in range(k):
        # sample hiddens: binarise against uniform noise
        ah = gnp.dot(w_vh.T, v) + w_h
        h = gnp.logistic(ah)
        hs = h > gnp.rand(h.shape[0], h.shape[1])

        # sample visibles (kept as probabilities)
        av = gnp.dot(w_vh, hs) + w_v
        v = gnp.logistic(av)

        if clamped is not None:
            v[clamped[0]:clamped[1], :] = cx

    return h, v
def fprop(self, inputBatch, weightsToStopBefore=None):
    """
    Perform a (possibly partial) forward pass through the network.

    Updates self.state which, on a full forward pass, holds the input
    followed by each hidden layer's activation and finally the net input
    incident on the output layer.

    inputBatch: batch of inputs; converted to a gnp.garray if it is not
        one already.
    weightsToStopBefore: index of the first weight matrix NOT to apply.
        None (the default) means a full pass.

    Returns the output unit activations (also stored in self.acts) for a
    full pass, otherwise self.state[weightsToStopBefore].
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights
    # self.state holds everything before the output nonlinearity,
    # including the net input to the output units
    self.state = [inputBatch]
    for i in range(min(numWeights - 1, weightsToStopBefore)):
        curActs = self.hidActFuncts[i].activation(
            gnp.dot(self.state[-1], self.weights[i]) + self.biases[i])
        self.state.append(curActs)
    if weightsToStopBefore >= numWeights:
        self.state.append(
            gnp.dot(self.state[-1], self.weights[-1]) + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts
    # We didn't reach the output units.
    # To return the first set of hidden activations, we would set
    # weightsToStopBefore to 1.
    return self.state[weightsToStopBefore]
def forward(self, X, test=False):
    """
    Feed-forward pass through the model.

    X: ('batchsize' x 'context') matrix of word indices
    test: not used by this method.

    Returns (words, acts, preds): the gathered word features as a garray
    of shape (batchsize, K, context), the hidden layer with a column of
    ones appended, and (as a numpy array) the row-wise softmax with a
    column of ones appended.
    """
    batchsize = X.shape[0]
    R = self.R
    C = self.C
    bw = self.bw

    # Obtain word features.
    # Selecting columns of R gives a (K, batchsize * context) array.
    # Its transpose, reshaped in row-major order to
    # (batchsize, context, K) and with the last two axes swapped, is the
    # same array the old column-major flatten plus per-example reshape
    # loop built, without looping in Python.
    selected = R.as_numpy_array()[:, X.flatten()]
    per_example = selected.T.reshape((batchsize, self.context, self.K))
    words = np.zeros((batchsize, self.K, self.context))
    words[:] = per_example.transpose(0, 2, 1)
    words = gpu.garray(words)

    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        # matrix product of the i-th context slice with its C matrix
        acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

    # Compute softmax
    preds = gpu.dot(acts, gpu.concatenate((R, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

    return (words, acts, preds.as_numpy_array())
def backward(self, Y, preds, acts, words, X):
    """
    Backward pass through the network.

    Writes the gradients to self.dR, self.db and self.dC and returns
    nothing. The last column of preds is ignored.
    """
    batchsize = preds.shape[0]

    # Compute part of df/dR
    Ix = gpu.garray(preds[:, :-1] - Y) / batchsize
    delta = gpu.dot(acts.T, Ix)
    db = delta[-1, :]
    dR = delta[:-1, :] + self.gamma_r * self.R
    dR = dR.as_numpy_array()

    # Compute df/dC and word inputs for df/dR
    Ix = gpu.dot(Ix, self.R.T)
    dC = gpu.zeros(np.shape(self.C))
    for i in range(self.context):
        context_C = self.C[i, :, :]
        dC[i, :, :] = gpu.dot(words[:, :, i].T, Ix) + self.gamma_c * context_C
        back = gpu.dot(Ix, context_C.T).as_numpy_array()
        # accumulate example by example so repeated word indices add up
        for j, word in enumerate(X[:, i]):
            dR[:, word] = dR[:, word] + back.T[:, j]

    self.dR = gpu.garray(dR)
    self.db = db
    self.dC = dC
def get_all_dists(self, query_id):
    """
    Return a score for every indexed item against item query_id.

    For the 'angular' metric the score is the negated dot product. For
    'euclidean' it is self.lengths minus twice the dot product; no
    query-dependent constant is added.

    Returns a numpy array.

    Raises ValueError if self._metric is neither 'angular' nor
    'euclidean'.
    """
    v = self.index[query_id]  # normalized
    metric = self._metric
    if metric == 'angular':
        dists = -gnumpy.dot(self.index, v)
    elif metric == 'euclidean':
        dists = self.lengths - 2 * gnumpy.dot(self.index, v)
    else:
        raise ValueError('unsupported metric: %r' % (metric,))
    return dists.as_numpy_array()
def get_output(self, input):
    """
    Advance the recurrent unit by one step and return its output.

    The previous input and hidden state (whichever are not None) are
    collected and, if there are any, appended to self.past_data, which
    is trimmed from the front once it holds more than self.trun + 1
    entries. self.input, self.hidden and self.output are overwritten.
    """
    previous = [item for item in (self.input, self.hidden)
                if item is not None]

    # set input
    self.input = input

    # set hidden, starting from zeros on the first call
    if self.hidden is None:
        self.hidden = np.zeros(self.n_hidden)
    from_input = gnp.dot(self.u.T, self.input)
    from_hidden = gnp.dot(self.w.T, self.hidden)
    self.hidden = self.a_hidden(from_input + from_hidden)

    # set output
    self.output = self.a_output(gnp.dot(self.v.T, self.hidden))

    # append past_data (store (self.trun + 1) past's)
    if previous:
        self.past_data.append(previous)
        if len(self.past_data) > (self.trun + 1):
            self.past_data.pop(0)

    return self.output
def _cd_update_terms(self, vis, model_vis, model_p_vis):
    """Returns (weights update, visible bias update, hidden bias update)
    given visible states from the data vis, visible states sampled from
    the model model_vis and the probability of the visible units being
    active from the model.

    Each term is the data statistic minus the model statistic, divided
    by the number of rows in vis. model_p_vis is accepted but not used:
    the model statistics are taken from the sampled states model_vis.
    """
    # Evaluate the hidden probabilities once per input and reuse them
    # for both the weight and the hidden-bias terms.
    p_hid_data = self.p_hid_given_vis(vis)
    p_hid_model = self.p_hid_given_vis(model_vis)

    dweights = gp.dot(vis.T, p_hid_data) - gp.dot(model_vis.T, p_hid_model)
    dbias_vis = gp.sum(vis, axis=0) - gp.sum(model_vis, axis=0)
    dbias_hid = gp.sum(p_hid_data, axis=0) - gp.sum(p_hid_model, axis=0)

    n_samples = vis.shape[0]
    return (dweights / n_samples,
            dbias_vis / n_samples,
            dbias_hid / n_samples)
def mlpSingleOutput1Layer_costfunc(x, *args): inputSize, l1Size, lambda_hidden, inputs, targets = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) inputs = gpu.garray(inputs) targets = gpu.garray(targets) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) theta_output = gpu.garray(reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size+1))) inputs = gpu.concatenate((gpu.ones((1,numCases)), inputs), axis = 0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) hidden_activation_L1 = hidden_sum_L1.logistic() hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis = 0) #hidden_activation_L1 = hidden_activation_L1 * dropout_prob hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1) outputs = hidden_sum_output.logistic() output_target_diff = (outputs - targets)**2 regularized_penalty_output = theta_output[:,1:shape(theta_output)[1]] regularized_penalty_output = regularized_penalty_output * regularized_penalty_output regularized_penalty_L1 = theta_L1[:,1:shape(theta_L1)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 cost = gpu.sum(output_target_diff)/(2*numCases) + 0.5 * lambda_hidden*(gpu.sum(regularized_penalty_L1)+gpu.sum(regularized_penalty_output)) print 'Multilayer Preceptron Cost:', cost del inputs del theta_L1 del hidden_sum_L1 del hidden_activation_L1 del regularized_penalty_output del regularized_penalty_L1 gpu.free_reuse_cache() return cost
def fpropDropout(self, inputBatch, weightsToStopBefore=None):
    """
    Perform a (possibly partial) forward pass through the network with
    dropout applied.

    The input and every hidden activation are multiplied by a random
    binary mask (an entry is kept where a uniform sample exceeds the
    matching entry of self.dropouts), and the input to weight matrix i
    is scaled by 1 / (1 - self.dropouts[i]) before the multiply.

    Updates self.state which, on a full forward pass, holds the masked
    input followed by each masked hidden layer's activation and finally
    the net input incident on the output layer.

    weightsToStopBefore: index of the first weight matrix NOT to apply.
        None (the default) means a full pass.

    Returns the output unit activations (also stored in self.acts) for a
    full pass, otherwise self.state[weightsToStopBefore]. Earlier
    versions of this method fell off the end and returned None in both
    cases.
    """
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights
    # self.state holds everything before the output nonlinearity,
    # including the net input to the output units
    self.state = [
        inputBatch * (gnp.rand(*inputBatch.shape) > self.dropouts[0])
    ]
    for i in range(min(numWeights - 1, weightsToStopBefore)):
        dropoutMultiplier = 1.0 / (1.0 - self.dropouts[i])
        curActs = self.hidActFuncts[i].activation(
            gnp.dot(dropoutMultiplier * self.state[-1], self.weights[i])
            + self.biases[i])
        self.state.append(
            curActs * (gnp.rand(*curActs.shape) > self.dropouts[i + 1]))
    if weightsToStopBefore >= numWeights:
        dropoutMultiplier = 1.0 / (1.0 - self.dropouts[-1])
        self.state.append(
            gnp.dot(dropoutMultiplier * self.state[-1], self.weights[-1])
            + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts
    # We didn't reach the output units.
    return self.state[weightsToStopBefore]
def fprop_xf(self, inputBatch, weightsToStopBefore=None):
    """
    Only used during feature dumping after the network has been trained.

    Perform a (possibly partial) forward pass through the network.
    Updates self.state which, on a full forward pass, holds the input
    followed by each hidden layer's activation and finally the net input
    incident on the output layer. Note that state does NOT contain the
    activation of the output layer.

    weightsToStopBefore: index of the first weight matrix NOT to apply.
        None (the default) means a full pass.

    Returns the output unit activations (also stored in self.acts) for a
    full pass, otherwise self.state[weightsToStopBefore].
    """
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights
    # self.state holds everything before the output nonlinearity,
    # including the net input to the output units
    self.state = [inputBatch]
    for i in range(min(numWeights - 1, weightsToStopBefore)):
        curActs = self.hidActFuncts[i].activation(
            gnp.dot(self.state[-1], self.weights[i]) + self.biases[i])
        self.state.append(curActs)
    if weightsToStopBefore >= numWeights:
        self.state.append(
            gnp.dot(self.state[-1], self.weights[-1]) + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts
    # We didn't reach the output units.
    # To return the first set of hidden activations, we would set
    # weightsToStopBefore to 1.
    return self.state[weightsToStopBefore]
def update(self):
    """
    Apply one in-place update to self.w and self.b.

    Both are first multiplied by self.l2reg and then reduced by the
    gradient times self.learn. When self.dropout is positive the layer
    input self.x is multiplied by self.r before the weight gradient is
    formed.
    """
    if self.dropout > 0:
        layer_input = self.x * self.r
    else:
        layer_input = self.x

    self.w *= self.l2reg
    self.w -= gpu.dot(layer_input.T, self.d) * self.learn  # / self.q

    self.b *= self.l2reg
    self.b -= gpu.sum(self.d, 0) * self.learn
def energy(self, vis, hid):
    """
    Return, per row, the sum of the visible bias term, the hidden bias
    term and the visible-hidden interaction term.

    hid must be two-dimensional.
    """
    assert hid.ndim == 2
    # The interaction term is vis * (hid . W^T) summed over visible
    # units, which avoids building a 3-D broadcast product.
    vis_term = gnp.dot(vis, self.vbias)
    hid_term = gnp.dot(hid, self.hbias)
    pair_term = gnp.sum(vis * gnp.dot(hid, self.weights.T), 1)
    return vis_term + hid_term + pair_term
def fobos_nn(self, w):
    """
    Soft-threshold the singular values of w by self.tau * self.lr.

    Returns (w_new, s): the matrix rebuilt from the thresholded singular
    values (as a numpy array) and the original, un-thresholded singular
    values.
    """
    threshold = self.tau * self.lr
    u, s, vt = linalg.svd(w, full_matrices=0, compute_uv=1)
    shrunk = np.diag(np.maximum(s - threshold, 0))
    # the two matrix products are done with gnumpy
    right = gnp.dot(gnp.garray(shrunk), gnp.garray(vt))
    rebuilt = gnp.dot(gnp.garray(u), right)
    return rebuilt.as_numpy_array(), s
def fobos_nn(self, w):
    """
    Soft-threshold the singular values of w by self.tau * self.lr, using
    randomized_svd with w.shape[0] components.

    Returns (w_new, s): the matrix rebuilt from the thresholded singular
    values (as a numpy array) and the original, un-thresholded singular
    values.
    """
    nu = self.tau * self.lr
    u, s, vt = randomized_svd(w, w.shape[0])
    sdash = np.maximum(s - nu, 0)
    sdashtemp = np.diag(sdash)
    # Embed the diagonal in a zero matrix shaped like u.
    # The builtin float is the dtype the removed np.float alias named.
    sdashzeros = np.zeros(u.shape, dtype=float)
    sdashzeros[:sdashtemp.shape[0], :sdashtemp.shape[1]] = sdashtemp
    rebuilt = gnp.dot(gnp.garray(u),
                      gnp.dot(gnp.garray(sdashzeros), gnp.garray(vt)))
    return rebuilt.as_numpy_array(), s
def run_gnumpy(a, b):
    """
    Time three gnumpy dot products.

    Computes dot(a, b) divided by dot(a, a) * dot(b, b), prints the
    result, and returns the elapsed time reported by time() for the
    computation (the print is not timed).
    """
    started = time()
    self_products = gnumpy.dot(a, a) * gnumpy.dot(b, b)
    res = gnumpy.dot(a, b) / self_products
    finished = time()
    print(res)
    return finished - started
def grad_costfunc_gpu_ReLU(x, *args):
    """
    Gradient for a one-hidden-layer ReLU autoencoder with a linear
    decoder.

    x: flat parameter vector; the first (num_input+1)*num_hidden entries
        are the hidden weights (bias in column 0), the rest the output
        weights.
    args: (num_input, num_hidden, num_output, inputs, lambda_val,
        sparsityParam, beta). inputs holds one example per column.
        sparsityParam and beta are unpacked but not used here.

    Returns a flat numpy array: the hidden-weight gradient followed by
    the output-weight gradient, each averaged over the examples with
    weight decay added to the non-bias columns.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]

    # forward pass; a row of ones feeds the bias column of each layer
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    hidden_derivative = relu_mask_hidden1
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    hidden_derivative = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_derivative), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    weights1_grad = gpu.zeros(shape(weights1))
    weights2_grad = gpu.zeros(shape(weights2))

    # backward pass; transposes stay on the device via garray.T instead
    # of a round trip through numpy
    p = outputs - inputs
    weights2_grad += gpu.dot(p, hidden_activation.T)
    q_temp = gpu.dot(weights2.T, p)
    q = q_temp * hidden_derivative
    delta2 = gpu.dot(q, data.T)
    # row 0 of delta2 belongs to the prepended bias unit, not a weight
    weights1_grad += delta2[1:shape(delta2)[0], :]

    weights1_grad = weights1_grad / nData
    weights2_grad = weights2_grad / nData

    # weight decay skips the bias column (column 0)
    weights1_grad[:, 1:shape(weights1_grad)[1]] = (
        weights1_grad[:, 1:shape(weights1_grad)[1]]
        + weights1[:, 1:shape(weights1)[1]] * lambda_val)
    weights2_grad[:, 1:shape(weights2_grad)[1]] = (
        weights2_grad[:, 1:shape(weights2_grad)[1]]
        + weights2[:, 1:shape(weights2)[1]] * lambda_val)

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)

    # drop references so the reuse cache can actually be freed
    del x
    del inputs
    del data
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
def mlpSoftmax1Layer_grad(x, *args):
    """
    Gradient for a one-hidden-layer ReLU network with a softmax output.

    x: flat parameter vector; the first l1Size*(inputSize+1) entries are
        the hidden weights (bias in column 0), the remaining
        numClasses*l1Size the softmax weights (no bias).
    args: (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
        inputs, groundTruth); inputs holds one example per column.

    Returns a flat numpy array: the hidden-weight gradient followed by
    the softmax-weight gradient.
    """
    (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden,
     inputs, groundTruth) = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_softmax = numClasses * l1Size
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(
        reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size)))
    theta_L1_grad = gpu.zeros(shape(theta_L1))

    # forward pass; a row of ones feeds the hidden bias column
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1
    hidden_sum_softmax_imd = gpu.dot(theta_softmax, hidden_activation_L1)
    # subtract the per-column maximum before exponentiating
    hidden_sum_softmax = (hidden_sum_softmax_imd
                          - hidden_sum_softmax_imd.max(axis=0))
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)

    # backward pass; transposes stay on the device via garray.T instead
    # of a round trip through numpy
    softmax_imd = groundTruth - predictions
    theta_softmax_grad = (
        -1 * gpu.dot(softmax_imd, hidden_activation_L1.T) / numCases
        + lambda_softmax * theta_softmax)
    deltaOut = -softmax_imd
    delta_L1_imd = gpu.dot(theta_softmax.T, deltaOut)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1 = gpu.dot(delta_L1_imd2, inputs.T)
    theta_L1_grad += delta_L1
    theta_L1_grad = theta_L1_grad / numCases
    # weight decay skips the bias column (column 0)
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = (
        theta_L1_grad[:, 1:shape(theta_L1_grad)[1]]
        + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden)

    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_softmax_grad = reshape(
        theta_softmax_grad.as_numpy_array(), num_weights_softmax)

    # drop references so the reuse cache can actually be freed
    del inputs
    del theta_L1
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_softmax
    del predictions
    del softmax_imd
    del deltaOut
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_softmax_grad))
def update(self):
    """
    Decay and update the layer's parameters in place.

    self.w and self.b are scaled by self.l2reg, then the gradient times
    self.learn is subtracted. With self.dropout > 0 the weight gradient
    uses self.x * self.r in place of self.x.
    """
    step = self.learn
    decay = self.l2reg

    self.w *= decay
    source = (self.x * self.r) if self.dropout > 0 else self.x
    self.w -= gpu.dot(source.T, self.d) * step  # / self.q

    self.b *= decay
    self.b -= gpu.sum(self.d, 0) * step
def input_to_hidden(self, set_name='train'):
    """
    Compute the first layer's net input for the current batch.

    set_name: 'train' applies a random dropout mask to the batch (an
        entry is kept where a uniform sample exceeds self.dropout[0])
        and records [batch, w[0], b[0]] in self.results['activations'];
        any other value uses the batch as is.

    Resets self.results['activations'] and writes the net input to
    self.results['current']. self.timer_logger is called on entry and on
    exit with a label that includes set_name.
    """
    # The label used to format the builtin `type` rather than the set
    # name, so every call was logged under the same meaningless key.
    timer_key = 'input_to_hidden {0}'.format(set_name)
    self.timer_logger(timer_key, time.time())
    self.results['activations'] = []
    if set_name == 'train':
        self.results['activations'].append(
            [self.batch, self.w[0], self.b[0]])
        keep = (gpu.rand(self.current_batch_size, self.X.shape[1])
                > self.dropout[0])
        dropped_out = self.batch * keep
        self.results['current'] = gpu.dot(dropped_out, self.w[0]) + self.b[0]
    else:
        self.results['current'] = gpu.dot(self.batch, self.w[0]) + self.b[0]
    self.timer_logger(timer_key, time.time())
def mlpSoftmax_costfunc(x, *args): numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) #x = gpu.garray(x) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) #print numClasses, l2Size theta_L2 = gpu.garray( reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1], (l2Size, l1Size + 1))) #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1)) theta_softmax = gpu.garray( reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]], (numClasses, l2Size))) #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size)) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) hidden_activation_L1 = hidden_sum_L1.logistic() hidden_activation_L1 = gpu.concatenate((gpu.ones( (1, numCases)), hidden_activation_L1), axis=0) hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1) hidden_activation_L2 = hidden_sum_L2.logistic() hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions, axis=0) temp = groundTruth * gpu.log(predictions) regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2 cost = -1 * gpu.sum(temp) / numCases + 0.5 * lambda_hidden * ( gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_L2) ) + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax) print 'Multilayer Softmax Cost:', cost del 
inputs del theta_L1 del theta_L2 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_L2 del hidden_activation_L2 del hidden_sum_softmax del predictions del temp del regularized_penalty_L1 del regularized_penalty_L2 gpu.free_reuse_cache() return cost
def feedforward(self, train_set_x):
    """
    Forward pass that records every layer's activation.

    self.activations is rebuilt as the input followed by the tanh
    activation of each of the self.n_layers hidden layers;
    self.final_layer_output receives the linear output layer applied to
    the last activation. Nothing is returned.
    """
    self.activations = [train_set_x]
    for layer in range(self.n_layers):
        net_input = (gnp.dot(self.activations[layer], self.W_params[layer])
                     + self.b_params[layer])
        self.activations.append(gnp.tanh(net_input))

    # output layer (linear)
    top = self.n_layers
    self.final_layer_output = (
        gnp.dot(self.activations[top], self.W_params[top])
        + self.b_params[top])
def parameter_prediction(self, test_set_x):
    """
    Run the network forward on test_set_x and return the output of the
    linear final layer as a numpy array.

    The input is converted with gnp.as_garray; each of the self.n_layers
    hidden layers applies tanh.
    """
    hidden = gnp.as_garray(test_set_x)
    for layer in range(self.n_layers):
        hidden = gnp.tanh(
            gnp.dot(hidden, self.W_params[layer]) + self.b_params[layer])

    top = self.n_layers
    prediction = gnp.dot(hidden, self.W_params[top]) + self.b_params[top]
    return prediction.as_numpy_array()
def nn_forward_pass(x, w, b, return_all=True):
    """
    Forward pass for multilayer feed-forward sigmoid neural network

    Hidden units have sigmoid non-linearity. Output is soft-max.

    x: DxN matrix of input data
    w: Weights. List of weight matrices for each layer.
    b: Biases. List of bias vectors for each layer
    return_all: If True, returns hidden unit activations for each layer. If
                False just returns the output layer activations

    Returns a list h where each element is a matrix containing the
    activations for that layer. h[0] is input data x. With
    return_all=False only the output-layer activations are returned.

    The soft-max subtracts each column's maximum before exponentiating, so
    large net inputs no longer overflow to inf/NaN; the result is
    mathematically unchanged.
    """
    # ---- TEMP HACK --------------
    # Mixed mode: if x is a gnumpy array everything runs on the GPU;
    # otherwise weights and biases are pulled to numpy and the pass runs on
    # the CPU (needed when the GPU runs out of memory for validation).
    use_gpu = isinstance(x, gnp.garray)

    layer_count = len(w)
    if return_all:
        hs = [x]  # unit activations for each layer

    h = x
    # all layers except the output layer
    for l in range(layer_count - 1):
        if use_gpu:
            a = gnp.dot(w[l].T, h) + b[l]
            h = gnp.logistic(a)
        else:
            a = np.dot(gnp.as_numpy_array(w[l]).T, h) + gnp.as_numpy_array(b[l])
            h = 1.0 / (1 + np.exp(-a))
        if return_all:
            hs.append(h)

    # output layer: numerically stable soft-max over axis 0
    if use_gpu:
        a = gnp.dot(w[-1].T, h) + b[-1]
        e = gnp.exp(a - a.max(axis=0))
        h = e / gnp.sum(e, axis=0)
    else:
        a = np.dot(gnp.as_numpy_array(w[-1]).T, h) + gnp.as_numpy_array(b[-1])
        e = np.exp(a - a.max(axis=0))
        h = e / np.sum(e, axis=0)

    if return_all:
        hs.append(h)
        return hs
    return h
def backward_pass(self, state, dOX, R_HX=None, mu_times_lambda=0.):
    """
    The backward pass (or the L-op). Given the gradients wrt the output
    units (dOX) and the state (V, H, OX), compute the implied derivative wrt
    the parameters and return it as an object produced by self.unpack.

    V, OX and dOX are padded with a leading None when they do not already
    start with one, so that index t addresses time step t (1..T).

    If R_HX is given, a structural damping term scaled by mu_times_lambda is
    added to the gradient wrt the hidden units' total inputs.
    """
    V, H, OX = state
    if V[0] is not None:
        V = [None] + V
    if OX[0] is not None:
        OX = [None] + OX
    if dOX[0] is not None:
        dOX = [None] + dOX
    assert V[0] is None

    T = len(V) - 1
    damping_on = R_HX is not None

    # A zeroed parameter container with the same layout as the model.
    grad = self.unpack(self.pack() * 0)

    # Gradient flowing back into the hidden state from the following step.
    hid_grad = H[-1] * 0
    for t in range(T, 0, -1):
        hid_grad += g.dot(dOX[t], self.W_ho.T)
        grad.W_ho += g.dot(H[t].T, dOX[t])

        # Backpropagate the nonlinearity: pre_grad is the gradient wrt the
        # total inputs to H[t].
        pre_grad = hid_grad * self.hid_nonlin.grad_y(H[t])

        # The only place structural damping enters the backward pass.
        if damping_on:
            pre_grad += float(mu_times_lambda) * \
                self.struct_damp_nonlin.H_prod(R_HX[t], H[t], 1)

        hid_grad = g.dot(pre_grad, self.W_hh.T)
        grad.W_hh += g.dot(H[t - 1].T, pre_grad)
        grad.W_vh += g.dot(V[t].T, pre_grad)

    # What is left after the loop is the gradient wrt the initial state.
    grad.h_init += hid_grad.sum(0)
    return grad
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Predict the class label of input x from supervised DBN
    Uses the exact method mentioned in section 6.2 of Hinton, Osindero, Teh 2006
    The free energy formula is taken from http://deeplearning.net/tutorial/rbm.html

    ws_vh, ws_v, ws_h: per-layer lists of weights, visible biases and hidden
        biases; only the last entry of ws_v is used.
    x: Input data. (NxD matrix)

    Returns an N x K array of class probabilities, where K is the number of
    top-level visible units left over after the penultimate layer's
    activations (i.e. the label units).
    """
    L = len(ws_vh)
    N = x.shape[0]

    # Deterministic bottom-up pass (activations, not samples) from the input
    # to the visible layer of the top-level RBM.
    h_prev = x.T
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)

    H = ws_vh[-1].shape[0]  # number of visible units top level RBM
    Hx = h_prev.shape[0]  # number of hidden units in the penultimate layer
    K = H - Hx  # number of supervised (label) inputs to the top level RBM

    # For every class, assume it is the correct label and compute the
    # negative free energy of the resulting visible vector.
    y = gnp.zeros((K, N))
    free_energy = gnp.zeros((N, K))  # we actually calculate -free_energy
    for k in range(K):
        y[k, :] = 1.0  # current assumed class label
        v = gnp.concatenate((y, h_prev))
        e_v = gnp.dot(ws_v[-1].T, v)  # bias energy term
        ah = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        # NOTE(review): exp(ah) can overflow for large ah; a stable softplus
        # would avoid that -- left as is to keep results unchanged.
        e_h = gnp.sum(gnp.log(gnp.exp(ah) + 1.0), axis=0)
        free_energy[:, k] = e_v + e_h
        y[k, :] = 0.0  # only row k was set; clear it for the next class

    # Softmax over classes with the row maximum subtracted (sum-exp trick).
    # The shifted exponentials are computed once and reused for both the
    # numerator and the normaliser.
    row_max = gnp.max(free_energy, axis=1)[:, gnp.newaxis]
    unnormalised = gnp.exp(free_energy - row_max)
    pred_y = unnormalised / gnp.sum(unnormalised, axis=1)[:, gnp.newaxis]
    return pred_y
def exact_moments(rbm, batch_units=10, show_progress=False):
    """Accumulate weighted moments over ``iter_configurations(rbm, ...)``.

    Each item yielded by ``iter_configurations`` is a pair ``(hid, p)``.
    For every pair the conditional visible activation
    ``logistic(rbm.vis_inputs(hid))`` is computed and the visible, hidden
    and visible-hidden product sums are updated with weights ``p``
    (presumably the probabilities of the hidden configurations in ``hid`` --
    confirm against ``iter_configurations``).

    Returns ``binary_rbms.Moments(expect_vis, expect_hid, expect_prod)``.
    """
    vis_total = gnp.zeros(rbm.nvis)
    hid_total = gnp.zeros(rbm.nhid)
    prod_total = gnp.zeros((rbm.nvis, rbm.nhid))

    configurations = iter_configurations(
        rbm, batch_units=batch_units, show_progress=show_progress)
    for hid, p in configurations:
        vis_given_hid = gnp.logistic(rbm.vis_inputs(hid))
        vis_total += gnp.dot(p, vis_given_hid)
        hid_total += gnp.dot(p, hid)
        prod_total += gnp.dot(vis_given_hid.T * p, hid)

    return binary_rbms.Moments(vis_total, hid_total, prod_total)
def fprop(self, inputBatch, weightsToStopBefore=None):
    """
    Perform a (possibly partial) forward pass through the network.

    Updates self.state which, on a full forward pass, holds the input
    followed by each hidden layer's activation and finally the net input
    incident on the output layer.

    inputBatch is converted to a gnumpy array if it is not one already.
    weightsToStopBefore is the index of the first weight matrix NOT to
    apply; None (the default) means a full pass.

    On a full pass, self.acts is set to the output activations and
    returned. On a partial pass, self.state[weightsToStopBefore] is
    returned (e.g. weightsToStopBefore=1 gives the first hidden layer's
    activations).
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights

    # self.state holds everything before the output nonlinearity, including
    # the net input to the output units.
    self.state = [inputBatch]
    for i in range(min(numWeights - 1, weightsToStopBefore)):
        netInput = gnp.dot(self.state[-1], self.weights[i]) + self.biases[i]
        self.state.append(self.hidActFuncts[i].activation(netInput))

    if weightsToStopBefore >= numWeights:
        self.state.append(
            gnp.dot(self.state[-1], self.weights[-1]) + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts

    # We didn't reach the output units.
    return self.state[weightsToStopBefore]
def CD1(vis, visToHid, visBias, hidBias, visUnit = Binary(), hidUnit = Binary()):
    """One step of contrastive divergence for a single batch.

    The positive phase activates the hidden units from ``vis`` and samples
    their states; the negative phase reconstructs the visibles from those
    sampled states and re-activates the hiddens from the reconstruction.

    Returns ``(visHidStats, hidBiasStats, visBiasStats, negVis)``: the
    positive-minus-negative statistics for the weights, the hidden biases
    and the visible biases (bias statistics reshaped to match the
    corresponding bias array), plus the reconstruction. Note the hidden-bias
    statistics come before the visible-bias ones.
    """
    # Positive phase.
    hidPos = hidUnit.activate(gnp.dot(vis, visToHid) + hidBias)
    hidSample = hidUnit.sampleStates(hidPos)

    # Negative phase, driven by the sampled hidden states.
    negVis = visUnit.activate(gnp.dot(hidSample, visToHid.T) + visBias)
    hidNeg = hidUnit.activate(gnp.dot(negVis, visToHid) + hidBias)

    weightStats = gnp.dot(vis.T, hidPos) - gnp.dot(negVis.T, hidNeg)

    visShape = visBias.shape
    visStats = (vis.sum(axis=0).reshape(*visShape)
                - negVis.sum(axis=0).reshape(*visShape))

    hidShape = hidBias.shape
    hidStats = (hidPos.sum(axis=0).reshape(*hidShape)
                - hidNeg.sum(axis=0).reshape(*hidShape))

    return weightStats, hidStats, visStats, negVis
def costfunc_gpu(x, *args):
    """Cost of a sparse denoising autoencoder with a linear decoder (GPU).

    ``x`` is the flat parameter vector: first the encoder weights
    ``(num_hidden, num_input + 1)``, then the decoder weights
    ``(num_output, num_hidden + 1)``.  Column 0 of each matrix multiplies a
    row of ones prepended to that layer's input (the bias) and is excluded
    from the weight penalty.

    ``args`` must unpack to ``(num_input, num_hidden, num_output, inputs,
    noNoiseData, lambda_val, sparsityParam, beta)``.  ``inputs`` holds one
    case per column; the reconstruction is compared against
    ``noNoiseData``.

    Returns half the mean squared reconstruction error, plus
    ``0.5 * lambda_val`` times the squared non-bias weights, plus ``beta``
    times the KL sparsity penalty between ``sparsityParam`` and the mean
    hidden activation.  The cost is also printed.
    """
    (num_input, num_hidden, num_output, inputs, noNoiseData, lambda_val,
     sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    noNoiseData = gpu.garray(noNoiseData)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:].reshape((num_output, num_hidden + 1))

    # Forward pass: logistic hidden layer, linear output layer.
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    # Mean activation of each hidden unit, taken before the bias row is added.
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    output = gpu.dot(weights2, hidden_activation)

    # Weight decay skips the bias column.
    regularized_penalty1 = weights1[:, 1:]
    regularized_penalty2 = weights2[:, 1:]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2

    output_target_diff = (output - noNoiseData) * (output - noNoiseData)
    KL = gpu.sum(sparsityParam * gpu.log(sparsityParam / p_avg) +
                 (1 - sparsityParam) * gpu.log((1 - sparsityParam) /
                                               (1 - p_avg)))
    cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * (
        gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)
    ) + beta * KL
    # Single pre-formatted argument: prints the same text (including the
    # double space) whether ``print`` is a statement or a function.
    print('GPU Linear Denoising Decoder Cost:  %s' % (cost,))

    # Drop every reference to GPU temporaries before asking gnumpy to release
    # its reuse cache.
    del x
    del inputs
    del noNoiseData
    del data
    del hidden_sum
    del hidden_activation
    del p_avg
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
def grad_costfunc_gpu(x, *args):
    """Gradient of a sparse autoencoder cost with a linear decoder (GPU).

    ``x`` is the flat parameter vector: first the encoder weights
    ``(num_hidden, num_input + 1)``, then the decoder weights
    ``(num_output, num_hidden + 1)``; column 0 of each is the bias.

    ``args`` must unpack to ``(num_input, num_hidden, num_output, inputs,
    lambda_val, sparsityParam, beta)``.  ``inputs`` holds one case per
    column and is also the reconstruction target here.

    Returns a flat numpy array: the encoder gradient followed by the decoder
    gradient, each averaged over the cases, with ``lambda_val`` weight decay
    added to the non-bias columns.

    Transposes are taken on the GPU with ``.T``; the original copied each
    matrix to the host and back just to transpose it.
    """
    (num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam,
     beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:].reshape((num_output, num_hidden + 1))

    # Forward pass.
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    p_avg = gpu.sum(hidden_activation, axis=1) / nData

    # Derivative of the KL sparsity penalty wrt each hidden unit's mean
    # activation, built on the host; a leading 0 is added for the bias row
    # and the vector is repeated for every case.
    p_avg_host = p_avg.as_numpy_array()
    grad_sparse = (-1 * sparsityParam / p_avg_host
                   + (1 - sparsityParam) / (1 - p_avg_host))
    grad_sparse = append(0, grad_sparse)
    grad_sparse = tile(grad_sparse, (nData, 1))
    grad_sparse = gpu.garray(transpose(grad_sparse))

    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    # Backward pass.
    p = outputs - inputs
    weights2_grad = gpu.dot(p, hidden_activation.T) / nData
    q_temp = gpu.dot(weights2.T, p) + beta * grad_sparse
    q = (q_temp * hidden_activation) * (1 - hidden_activation)
    delta2 = gpu.dot(q, data.T)
    # Row 0 of delta2 belongs to the constant bias unit, not to a weight row.
    weights1_grad = delta2[1:, :] / nData

    # Weight decay on everything except the bias column.
    weights1_grad[:, 1:] = weights1_grad[:, 1:] + weights1[:, 1:] * lambda_val
    weights2_grad[:, 1:] = weights2_grad[:, 1:] + weights2[:, 1:] * lambda_val

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)

    # Drop every reference to GPU temporaries before asking gnumpy to release
    # its reuse cache.
    del x
    del inputs
    del data
    del grad_sparse
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del p_avg
    del outputs
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return hstack((weights1_grad.as_numpy_array(),
                   weights2_grad.as_numpy_array()))
def input_to_hidden(self, set_name='train'):
    """Compute the net input to the first hidden layer for ``self.batch``.

    Resets ``self.results['activations']`` and stores the net input in
    ``self.results['current']``.

    For ``set_name == 'train'`` the batch, first weight matrix and first
    bias are recorded in ``self.results['activations']`` and each input
    element is zeroed with probability ``self.dropout[0]`` before the
    product. For any other set the batch is used as is.

    ``self.timer_logger`` is called with the same label before and after
    the work. The label now contains ``set_name``; it used to format the
    builtin ``type`` by mistake.
    """
    timer_key = 'input_to_hidden {0}'.format(set_name)
    self.timer_logger(timer_key, time.time())

    self.results['activations'] = []
    if set_name == 'train':
        self.results['activations'].append(
            [self.batch, self.w[0], self.b[0]])
        keep_mask = gpu.rand(
            self.current_batch_size, self.X.shape[1]) > self.dropout[0]
        dropped_out = self.batch * keep_mask
        self.results['current'] = gpu.dot(dropped_out, self.w[0]) + self.b[0]
    else:
        # NOTE(review): the input is not scaled by (1 - self.dropout[0])
        # here, whereas hidden_to_output scales by (1 - self.dropout[1]) --
        # confirm this asymmetry is intended.
        self.results['current'] = gpu.dot(self.batch, self.w[0]) + self.b[0]

    self.timer_logger(timer_key, time.time())
def fpropDropout(self, inputBatch, useDropout=False, weightsToStopBefore=None):
    """
    Perform a (possibly partial) forward pass through the network.

    Updates self.state which, on a full forward pass, holds the input
    followed by each hidden layer's activation and finally the net input
    incident on the output layer.

    For a full forward pass, we return the actual output unit activations
    (also stored in self.acts). In a partial forward pass we return
    self.state[weightsToStopBefore].

    If useDropout == True, randomly drop units for each layer: the input is
    masked with rate self.dropouts[0], hidden layer i with rate
    self.dropouts[i + 1], and each layer's (masked) input is rescaled by
    1 / (1 - rate). The masks are kept in self.keptMask.

    If useDropout == False the pass is deterministic. The input is no longer
    masked in this mode (it used to be, which randomly corrupted inference
    inputs); self.keptMask then holds a single all-ones mask for the input.
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights

    # self.state holds everything before the output nonlinearity, including
    # the net input to the output units.
    if useDropout:
        self.keptMask = [gnp.rand(*inputBatch.shape) > self.dropouts[0]]
        self.state = [inputBatch * self.keptMask[0]]
    else:
        # Keep keptMask[0] indexable for callers, but leave the input intact.
        self.keptMask = [gnp.ones(inputBatch.shape)]
        self.state = [inputBatch]

    for i in range(min(numWeights - 1, weightsToStopBefore)):
        if useDropout:
            dropoutMultiplier = 1.0 / (1.0 - self.dropouts[i])
            curActs = self.hidActFuncts[i].activation(
                gnp.dot(dropoutMultiplier * self.state[-1], self.weights[i])
                + self.biases[i])
            self.keptMask.append(
                gnp.rand(*curActs.shape) > self.dropouts[i + 1])
            self.state.append(curActs * self.keptMask[-1])
        else:
            curActs = self.hidActFuncts[i].activation(
                gnp.dot(self.state[-1], self.weights[i]) + self.biases[i])
            self.state.append(curActs)

    if weightsToStopBefore >= numWeights:
        if useDropout:
            dropoutMultiplier = 1.0 / (1.0 - self.dropouts[-1])
            self.state.append(
                gnp.dot(dropoutMultiplier * self.state[-1], self.weights[-1])
                + self.biases[-1])
        else:
            self.state.append(
                gnp.dot(self.state[-1], self.weights[-1]) + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts

    # We didn't reach the output units. To return the first set of hidden
    # activations, we would set weightsToStopBefore to 1.
    return self.state[weightsToStopBefore]
def fpropDropout(self, inputBatch, weightsToStopBefore=None):
    """
    Perform a (possibly partial) forward pass with dropout applied.

    The input is masked with rate self.dropouts[0] and hidden layer i with
    rate self.dropouts[i + 1]; each layer's masked input is rescaled by
    1 / (1 - rate) before the weights are applied.

    Updates self.state which, on a full forward pass, holds the masked input
    followed by each masked hidden activation and finally the net input
    incident on the output layer.

    weightsToStopBefore is the index of the first weight matrix NOT to
    apply; None (the default) means a full pass. On a full pass self.acts is
    set to the output activations and returned; on a partial pass
    self.state[weightsToStopBefore] is returned.
    """
    if not isinstance(inputBatch, gnp.garray):
        inputBatch = gnp.garray(inputBatch)
    numWeights = len(self.weights)
    if weightsToStopBefore is None:
        weightsToStopBefore = numWeights

    inputMask = gnp.rand(*inputBatch.shape) > self.dropouts[0]
    self.state = [inputBatch * inputMask]

    for i in range(min(numWeights - 1, weightsToStopBefore)):
        dropoutMultiplier = 1.0 / (1.0 - self.dropouts[i])
        curActs = self.hidActFuncts[i].activation(
            gnp.dot(dropoutMultiplier * self.state[-1], self.weights[i])
            + self.biases[i])
        hiddenMask = gnp.rand(*curActs.shape) > self.dropouts[i + 1]
        self.state.append(curActs * hiddenMask)

    if weightsToStopBefore >= numWeights:
        dropoutMultiplier = 1.0 / (1.0 - self.dropouts[-1])
        self.state.append(
            gnp.dot(dropoutMultiplier * self.state[-1], self.weights[-1])
            + self.biases[-1])
        self.acts = self.outputActFunct.activation(self.state[-1])
        return self.acts

    # We didn't reach the output units.
    return self.state[weightsToStopBefore]
def hidden_to_output(self, set_name = 'train'):
    """Propagate ``self.results['current']`` through the remaining layers.

    The first weight/bias pair is skipped (it goes from the inputs to the
    first hidden layer). For every later pair the current net input is
    passed through ``self.activation`` and multiplied by the weights, with
    the bias added; the result replaces ``self.results['current']``.

    For ``set_name == 'train'`` the activation and weight matrix are pushed
    onto the front of ``self.results['activations']`` and each activation is
    zeroed with probability ``self.dropout[1]``. For any other set the
    activation is scaled by ``1 - self.dropout[1]`` instead.

    ``self.timer_logger`` is called with the same label before and after
    the work. The label now contains ``set_name``; it used to format the
    builtin ``type`` by mistake.
    """
    timer_key = 'hidden_to_output {0}'.format(set_name)
    self.timer_logger(timer_key, time.time())

    for layer, (weight, bias) in enumerate(zip(self.w, self.b)):
        if layer == 0:
            # ignore the first weight that goes from inputs to first hidden layer
            continue
        if set_name == 'train':
            self.results['activations'].insert(
                0, [self.activation(self.results['current']), weight])
            activation = self.results['activations'][0][0]
            keep_mask = gpu.rand(
                activation.shape[0], activation.shape[1]) > self.dropout[1]  # dropout
            self.results['current'] = gpu.dot(
                activation * keep_mask, weight) + bias
        else:
            self.results['current'] = gpu.dot(
                self.activation(self.results['current'])
                * (1 - self.dropout[1]),
                weight) + bias

    self.timer_logger(timer_key, time.time())
def R_forward_pass(self, state, R):
    """Propagate the R-operator through the layers and return its output.

    ``state`` is the list of layer states from an ordinary forward pass and
    ``R`` supplies ``W`` and ``b`` lists indexed like ``self.W``
    (presumably the direction the R-op multiplies by -- confirm against the
    caller).

    For each layer i the R-value of the next layer's total input is
    ``state[i] . R.W[i] + R_state_i . self.W[i] + R.b[i]``, where
    ``R_state_i`` is the R-value of layer i's output (zero for the input
    layer). All total-input R-values are kept in ``self.R_state_X``; the
    last one is returned.
    """
    n_layers = len(self.sizes)
    r_inputs = [None] * n_layers
    self.R_state_X = r_inputs

    r_inputs[0] = state[0] * 0
    r_output = r_inputs[0]
    for i in range(n_layers - 1):
        nxt = i + 1
        r_inputs[nxt] = g.dot(state[i], R.W[i]) + \
            g.dot(r_output, self.W[i]) + R.b[i]
        r_output = self.nonlins[i].grad_y(state[nxt]) * r_inputs[nxt]

    return r_inputs[-1]
def test_gnumpy(dat, num_epochs):
    """Train a binary RBM with CD-1 on ``dat`` as a gnumpy benchmark.

    ``dat`` is 2 dimensional (cases x visible units, e.g. 60000 x 784 for
    MNIST). Training uses 4096 hidden units, batches of 128, learning rate
    0.1 and momentum 0.9; a trailing partial batch is ignored. After each of
    the ``num_epochs`` epochs the mean squared reconstruction error and the
    elapsed time are printed.

    Returns ``(w_vh, w_v, w_h)``: the weights, visible biases and hidden
    biases.
    """
    import gnumpy as gpu
    import numpy
    import time

    # time.clock was removed in Python 3.8; use perf_counter where it exists
    # and fall back to clock on interpreters that predate it.
    clock = getattr(time, 'perf_counter', None) or time.clock

    # training parameters
    epsilon = 0.1
    momentum = 0.9
    batch_size = 128
    # Floor division keeps this an int under true division as well, so it
    # can be passed to range().
    num_batches = dat.shape[0] // batch_size

    # model parameters
    num_vis = dat.shape[1]
    num_hid = 4096

    # initialize weights
    w_vh = 0.1 * gpu.randn(num_vis, num_hid)
    w_v = gpu.zeros(num_vis)
    w_h = -4. * gpu.ones(num_hid)

    # initialize weight updates
    wu_vh = gpu.zeros((num_vis, num_hid))
    wu_v = gpu.zeros(num_vis)
    wu_h = gpu.zeros(num_hid)

    step = epsilon / batch_size
    for epoch in range(num_epochs):
        err = []
        tic = clock()
        for batch in range(num_batches):
            # positive phase
            v1 = dat[batch * batch_size:(batch + 1) * batch_size]
            h1 = (gpu.dot(v1, w_vh) + w_h).logistic()
            # sample hiddens
            hSampled = h1.rand() < h1

            # negative phase
            v2 = (gpu.dot(hSampled, w_vh.T) + w_v).logistic()
            h2 = (gpu.dot(v2, w_vh) + w_h).logistic()

            # update weights
            wu_vh = wu_vh * momentum + gpu.dot(v1.T, h1) - gpu.dot(v2.T, h2)
            wu_v = wu_v * momentum + v1.sum(0) - v2.sum(0)
            wu_h = wu_h * momentum + h1.sum(0) - h2.sum(0)

            w_vh += wu_vh * step
            w_v += wu_v * step
            w_h += wu_h * step

            # calculate reconstruction error
            err.append((v2 - v1).euclid_norm()**2 / (num_vis * batch_size))
        toc = clock()
        print("Mean squared error: %.4f, takes time: %d"
              % (numpy.mean(err), toc - tic))
    return w_vh, w_v, w_h
def forward(self, X):
    """Apply the layer's filters to every sliding window of ``X``.

    ``X`` is indexed as (example, time step, channel). Every window of
    ``self.windowSize`` consecutive time steps is flattened to a row of
    length ``windowSize * numChannels`` and multiplied by ``self.W``, on the
    GPU when ``self.gpu`` is true and with numpy otherwise.

    Side effects: ``self.X`` is set to the input and ``self.Z`` to the
    matrix of flattened windows (a garray in GPU mode, a numpy array
    otherwise).

    Returns an array of shape (examples, T - windowSize + 1, numFilters).

    Raises ValueError when the input is so short that the window count
    would be negative (previously numpy's "negative dimensions" error).
    """
    self.X = X
    # Num of examples
    N = X.shape[0]
    # Timespan
    T = X.shape[1]
    # Windows size
    S = self.windowSize
    # Channels
    D = self.numChannels
    # Num filters
    F = self.numFilters

    num_windows = T - S + 1
    if num_windows < 0:
        raise ValueError(
            'input has %d time steps, too few for a window of size %d'
            % (T, S))

    # Copy each window into its own slot, then flatten window x channel.
    Z = np.zeros((N, num_windows, S, D), X.dtype)
    for start in range(num_windows):
        Z[:, start, :, :] = X[:, start:start + S, :]
    Z = Z.reshape(N * num_windows, S * D)

    if self.gpu:
        Z = gpu.as_garray(Z.astype('float32'))
        Y = gpu.dot(Z, self.W)
        Y = gpu.as_numpy_array(Y)
    else:
        Y = np.dot(Z, self.W)

    Y = Y.reshape(N, num_windows, F)
    self.Z = Z
    return Y
def pt_grad(self, params, inpts, **kwargs):
    """Gradient of the tied-weight autoencoder score plus a contractive term.

    ``params`` is a flat vector: the first ``self.m_end`` entries are the
    weight matrix (reshaped to ``self.shape``), the next ``self.shape[1]``
    the hidden biases and the last ``self.shape[0]`` the output biases. The
    same weights encode (``inpts . w``) and decode (``hddn . w.T``).

    The contractive penalty is ``self.cae`` times the sum over hidden units
    of mean(Dsigmoid(hddn)^2) * sum(w^2); its value is handed to
    ``self.score`` as ``addon`` and its gradient is added to the weight
    gradient.

    Returns a gradient vector with the same layout as ``params``.
    ``kwargs`` is accepted but not used here.

    The reshaped weights, the activation derivative, its square and its
    column mean, and the column sum of ``w ** 2`` are each computed once and
    reused; the original rebuilt them up to three times.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    n_out = self.shape[0]

    w = params[: self.m_end].reshape(self.shape)
    hddn = logistic(
        gpu.dot(inpts, w) + params[self.m_end : self.m_end + self.shape[1]]
    )
    Z = gdot(hddn, w.T) + params[-n_out:]

    dsig = Dsigmoid(hddn)
    dsig_sq = dsig ** 2
    mean_dsig_sq = gpu.mean(dsig_sq, axis=0)
    w_sq_colsum = gpu.sum(w ** 2, axis=0)

    cae = gpu.sum(mean_dsig_sq * w_sq_colsum)
    cae *= self.cae

    _, delta = self.score(Z, inpts, error=True, addon=cae)

    # Decoder side: weights (tied, hence delta.T) and output biases.
    g[: self.m_end] = gdot(delta.T, hddn).ravel()
    g[-n_out:] = delta.sum(axis=0)

    # Contractive penalty gradient wrt the weights.
    cae_grad = mean_dsig_sq * w
    cae_grad += gdot(inpts.T, (dsig_sq * (1 - 2 * hddn))) / m * w_sq_colsum
    g[: self.m_end] += self.cae * 2 * cae_grad.ravel()

    # Encoder side: backpropagate delta through the hidden non-linearity.
    dsc_dha = dsig * gdot(delta, w)
    g[: self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end : -n_out] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def cov(x):
    """Covariance of the rows of garray ``x``, normalised by ``n - 1``.

    Each row has its mean (taken along axis 1) subtracted on the host, the
    centred matrix is multiplied by its conjugate transpose with
    ``gpu.dot``, and the product is scaled by ``1 / (x.shape[1] - 1)``.

    Returns whatever ``gpu.dot`` returns for the centred matrix, scaled
    (presumably a garray -- confirm against gnumpy).
    """
    row_means = gpu.mean(x, axis=1)[:, None]
    centred = x.as_numpy_array() - row_means.as_numpy_array()
    scatter = gpu.dot(centred, centred.T.conj())
    # Scale by the reciprocal instead of calling __div__ directly: __div__ is
    # a Python 2-only protocol, while scalar multiplication works whichever
    # division protocol the array type implements.
    return scatter * (1.0 / (centred.shape[1] - 1))
def check_rank(self, query_id, same_ids):
    """Count the indexed items strictly closer to the query than its nearest
    same-class item.

    query_id: position of the query vector in self.index.
    same_ids: the list of the ids in the same class as the query.

    For the 'angular' metric the distance is the negated dot product (the
    index is expected to be normalized); for 'euclidean' it is
    self.lengths - 2 * dot, i.e. the squared distance up to a per-query
    constant.

    Returns the count as an int.

    Raises AssertionError for any other metric. This is raised explicitly
    rather than with ``assert`` so it still fires under ``python -O``
    instead of failing later on an unbound name.
    """
    v = self.index[query_id]  # normalized
    if self._metric == 'angular':
        # argmax_a cossim(a, b) = argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b)
        dists = -gnumpy.dot(self.index, v)
    elif self._metric == 'euclidean':
        # argmin_a (a - b)^2 = argmin_a a^2 - 2ab + b^2 = argmin_a a^2 - 2ab
        dists = self.lengths - 2 * gnumpy.dot(self.index, v)
    else:
        # shouldn't get past the constructor!
        raise AssertionError("invalid metric")

    # this rank should start from 1, because of the self-retrieval
    closest_positive_dist = min(dists[i] for i in same_ids)
    rank = gnumpy.sum(dists < closest_positive_dist)
    return int(rank)
def apply_update(self, pos_moments, neg_moments, rbm, weight_decay, lrate):
    """Apply one parameter update to ``rbm`` from positive/negative moments.

    While ``self.count`` is below ``self.params.start_after`` this simply
    delegates to ``rbm.sgd_update``. Afterwards:

    * the bias step is ``lrate.vbias`` times ``self.Lambda`` applied to the
      concatenated visible and hidden moment differences;
    * the weight step is ``lrate.weights / self.sigma_sq`` times the
      residual of the product-moment difference after weight decay and the
      ``self.beta``-weighted visible and hidden differences are removed;
    * the bias steps are then corrected by the residuals weighted with
      ``self.beta``.

    The update is added to ``rbm`` with ``+=``. Requires
    ``lrate.vbias`` and ``lrate.hbias`` to be (numerically) equal.
    """
    assert np.allclose(lrate.vbias, lrate.hbias)

    if self.count < self.params.start_after:
        rbm.sgd_update(pos_moments, neg_moments, lrate)
        return

    vis_gap = pos_moments.expect_vis - neg_moments.expect_vis
    hid_gap = pos_moments.expect_hid - neg_moments.expect_hid
    beta_vis = self.beta[:, :, 0]
    beta_hid = self.beta[:, :, 1]

    # base rates
    ds = gnp.concatenate([vis_gap, hid_gap])
    dbias = lrate.vbias * gnp.dot(self.Lambda, ds.as_numpy_array())
    da = dbias[:rbm.nvis]
    db = dbias[rbm.nvis:]

    decay_term = -weight_decay * rbm.weights
    vis_term = -beta_vis * vis_gap[:, nax]
    hid_term = -beta_hid * hid_gap[nax, :]
    residuals = pos_moments.expect_prod - neg_moments.expect_prod + \
        decay_term + vis_term + hid_term

    lam = 1. / self.sigma_sq
    dw = lrate.weights * lam * residuals
    da -= lrate.weights * (lam * residuals * beta_vis).sum(1)
    db -= lrate.weights * (lam * residuals * beta_hid).sum(0)

    update = binary_rbms.Update(da, db, dw)
    rbm += update
def glog_l_new(self, Wmat):
    """Log-likelihood and accuracy of the pairwise model under ``Wmat``.

    For each sample n the score is
    ``Xi[n][0] . W . Xi[n][2].T - Xi[n][1] . W . Xi[n][2].T`` (computed on
    the GPU, first row taken). A sample counts as correct when
    ``logistic(score) > 0.5`` and ``Y[n] == 1``, or ``logistic(score) <
    0.5`` and ``Y[n] == -1``. The log-likelihood accumulates
    ``log(logistic(Y[n] * score))``.

    Returns ``(ll, accuracy)``. ``accuracy`` is now a true fraction; the
    original ``n_correct / nsamples`` truncated to 0 or 1 under Python 2
    integer division. The shared product ``W . Xi[n][2].T`` and
    ``logistic(score)`` are computed once per sample instead of two and
    three times.
    """
    ll = 0
    n_correct = 0
    gWmat = gnp.garray(np.array(Wmat))
    for n in range(self.nsamples):
        first, second, context = self.Xi[n][0], self.Xi[n][1], self.Xi[n][2]
        projected = gnp.dot(gWmat, gnp.garray(context.T))
        internals = (gnp.dot(gnp.garray(first), projected)
                     - gnp.dot(gnp.garray(second), projected)
                     ).as_numpy_array()[0]

        prob = logistic(internals)
        if prob > 0.5 and self.Y[n] == 1:
            n_correct += 1
        elif prob < 0.5 and self.Y[n] == -1:
            n_correct += 1
        ll += np.log(logistic(self.Y[n] * internals))
    return ll, float(n_correct) / self.nsamples