def create_weight_update_functions(self):
    updates = []
    for i in range(len(self.error_gradients)):
        updates.append((self.weights[i],
                        g(T.sub(self.weights[i],
                                T.mul(T.mul(self.error_gradients[-(i + 1)], self.alpha),
                                      self.batch_size_divisor)))))
        updates.append((self.biases[i],
                        g(T.sub(self.biases[i],
                                T.mul(T.mul(self.errors[-(i + 1)], self.alpha),
                                      self.batch_size_divisor)))))
    self.update_weight_function = function(inputs=[self.idx, self.alpha], updates=updates)
def create_dropout_sample_functions(self, reset=False):
    '''Creates functions of sample vectors which can be indexed with random
    integers to create a pseudo-random sample for dropout. This greatly
    speeds up sampling as no new samples have to be created.
    '''
    if reset:
        self.dropout = self.dropout_init
        print 'Reset dropout to ' + str(self.dropout)
    self.dropout_function = None
    sample_function = None
    if self.dropout > 0:
        if self.dropout_type == Dropout.drop_activation:
            if reset:
                self.bino_sample_vector.set_value(
                    np.matrix(np.float32(
                        np.random.binomial(1, 1 - self.dropout, (10000000, 1)))),
                    borrow=True)
            else:
                self.bino_sample_vector = shared(
                    np.matrix(np.float32(
                        np.random.binomial(1, 1 - self.dropout, (10000000, 1)))),
                    'float32', borrow=True)
            sample_function = lambda rand: g(T.reshape(
                self.bino_sample_vector[rand:rand + (self.batch_size * self.size)],
                (self.batch_size, self.size)))
            sample_function_cv = lambda rand: g(T.reshape(
                self.bino_sample_vector[rand:rand + (4200 * self.size)],
                (4200, self.size)))
            self.dropout_function = sample_function(
                self.rdm.random_integers(low=0, high=self.sample_range))
            self.dropout_function_cv = sample_function_cv(
                self.rdm.random_integers(low=0, high=self.sample_range))
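# A minimal NumPy sketch (illustrative, not part of the class above) of the
# pre-sampled dropout trick from create_dropout_sample_functions: draw one
# large binomial vector once, then take random slices of it as dropout masks
# instead of sampling a fresh mask per batch. All names below are hypothetical.
import numpy as np

pool = np.random.binomial(1, 1 - 0.5, size=10000000).astype(np.float32)

def dropout_mask(batch_size, size, sample_range=100000):
    offset = np.random.randint(0, sample_range)
    return pool[offset:offset + batch_size * size].reshape(batch_size, size)

mask = dropout_mask(128, 1024)  # multiply activations elementwise by this mask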
def create_weight_update_with_momentum_functions(self):
    weight_updates_with_momentum = []
    for i in range(len(self.weights)):
        weight_updates_with_momentum.append(
            (self.weights[i],
             g(T.add(self.weights[i], self.H.L.momentum_weights[i]))))
    self.weight_updates_with_momentum_function = function(
        inputs=[], updates=weight_updates_with_momentum)
def create_weight_decay_updates(self):
    '''Decays the weights exponentially.
    '''
    weight_updates = []
    for i in range(len(self.weights)):
        weight_updates.append((self.weights[i],
                               g(T.mul(self.weights[i], self.alpha))))
    self.decay_weights = function(inputs=[self.alpha], updates=weight_updates)
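# Sketch of what the update above computes: each call multiplies every weight
# by alpha < 1, so after t calls a weight has decayed to w * alpha**t
# (exponential decay). NumPy equivalent, names hypothetical:
import numpy as np

w = np.ones((3, 3), dtype=np.float32)
alpha = np.float32(0.99)
for _ in range(100):
    w *= alpha                  # same update as decay_weights(alpha)
assert np.allclose(w, alpha ** 100, rtol=1e-4)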
def constaint_weight_function(self):
    '''Calculates the max squared element of the weight vector to rescale it.
    '''
    max_values = []
    for w in self.weights:
        max_values.append(g(T.max(T.square(w))))
    self.weight_constaint_function = function([], outputs=max_values)
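# How the max squared element might be used to constrain the weights; the
# rescaling step itself is not shown in this class, so this is an assumed
# usage sketch with a hypothetical limit:
import numpy as np

def constrain(w, limit=15.0):
    max_sq = np.max(np.square(w))        # what weight_constaint_function returns
    if max_sq > limit:
        w *= np.sqrt(limit / max_sq)     # largest squared element becomes == limit
    return w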
def create_momentum_weight_update_functions(self):
    momentum_updates = []
    for i in range(len(self.H.L.momentum_weights)):
        momentum_updates.append(
            (self.H.L.momentum_weights[i],
             g(T.mul(self.batch_size_divisor,
                     T.sub(T.mul(self.M, self.H.L.momentum_weights[i]),
                           T.mul(self.alpha, self.error_gradients[-(i + 1)]))))))
    self.H.L.momentum_update_function = function(
        inputs=[self.idx, self.M, self.alpha], updates=momentum_updates)
def create_nesterov_weight_update_functions(self):
    '''Creates functions for the Nesterov accelerated gradient, which is
    similar to momentum. The difference is that the gradient is calculated
    with the current weights plus the momentum vector; the result is used to
    update both the weight and momentum matrices. This is generally better
    than plain momentum. Also see Sutskever, 2013:
    http://www.cs.toronto.edu/~hinton/absps/momentum.pdf
    '''
    nesterov_updates = []
    for i in range(len(self.nesterov_weights)):
        nesterov_updates.append(
            (self.weights[i],
             g(T.add(self.nesterov_weights[i], self.H.L.momentum_weights[i]))))
    for i in range(len(self.nesterov_weights)):
        nesterov_updates.append(
            (self.nesterov_weights[i],
             g(T.add(self.nesterov_weights[i], self.H.L.momentum_weights[i]))))
    self.nesterov_update_function = function([], updates=nesterov_updates)
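# Plain-Python sketch of the Nesterov scheme the docstring describes, for a
# single parameter and a hypothetical grad() of the loss: the gradient is
# evaluated at the lookahead point w + m, then both w and m are updated.
def nesterov_step(w, m, grad, alpha=0.01, mu=0.9):
    g_at_lookahead = grad(w + m)   # gradient at current weights plus momentum
    m = mu * m - alpha * g_at_lookahead
    w = w + m
    return w, m

# e.g. minimizing f(w) = w**2 with grad(w) = 2*w:
w, m = 5.0, 0.0
for _ in range(100):
    w, m = nesterov_step(w, m, lambda x: 2 * x)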
def create_backprop_gradient_functions(self):
    self.errors = []
    self.error_gradients = []
    error_function = None
    error_gradient = None
    for i in range(len(self.weights)):
        if len(self.errors) == 0:
            # this is the last layer of the net: the error is X - t because of
            # the combination of softmax and cross-entropy cost function
            error_function = g(T.sub(self.feedforward, self.t[self.idx]))
            self.errors.append(error_function)
            error_gradient = g(T.dot(self.z[-2].T, self.errors[i]))
            error_gradient = self.apply_L2_penalties_error_gradients(error_gradient, -1)
            self.error_gradients.append(error_gradient)
        elif (len(self.weights) - 1) == i:
            # this involves the input X instead of z-values as these are the
            # first weights that need to be updated
            self.errors.append(g(T.mul(T.dot(self.errors[-1], self.weights[1].T),
                                       self.layers[1].activation_derivative(self.z[0]))))
            error_gradient = g(T.dot(self.X[self.idx].T, self.errors[-1]))
            #error_gradient = self.apply_L2_penalties_error_gradients(error_gradient, 0)
            self.error_gradients.append(error_gradient)
        else:
            self.errors.append(g(T.mul(T.dot(self.errors[-1], self.weights[-i].T),
                                       self.layers[-(i + 1)].activation_derivative(self.z[-(i + 1)]))))
            error_gradient = g(T.dot(self.z[-(i + 2)].T, self.errors[-1]))
            #error_gradient = self.apply_L2_penalties_error_gradients(error_gradient, -(i+1))
            self.error_gradients.append(error_gradient)
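# Quick numerical check of the claim in the comment above: for softmax output
# p and one-hot target t, the gradient of the cross-entropy loss with respect
# to the pre-softmax activations is exactly p - t. All names are hypothetical.
import numpy as np

a = np.random.randn(5)                       # pre-softmax activations
t = np.eye(5)[2]                             # one-hot target
softmax = lambda x: np.exp(x) / np.sum(np.exp(x))
loss = lambda x: -np.sum(t * np.log(softmax(x)))

eps = 1e-5
numeric = np.array([(loss(a + eps * np.eye(5)[i]) - loss(a - eps * np.eye(5)[i])) / (2 * eps)
                    for i in range(5)])
assert np.allclose(numeric, softmax(a) - t, atol=1e-6)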
def __init__(self, size, activation_function, dropout_type, dropout,
             dropout_decay, batch_size, frequency):
    self.drop_count = 0
    self.size = size
    self.frequency = frequency
    self.dropout = dropout
    self.dropout_init = dropout
    self.dropout_decay = dropout_decay
    self.dropout_type = dropout_type
    self.rdm = RandomStreams(seed=1234)
    self.batch_size = batch_size
    self.sample_range = 100000
    self.create_dropout_sample_functions()
    self.activation_crossvalidation = activation_function
    self.activation_function = self.set_dropout(dropout, activation_function)
    # derivative of the logistic function, expressed in terms of its output
    self.activation_derivative = lambda X: g(T.mul(X, (1.0 - X)))
    self.activation_tracker = self.set_activation_tracker(activation_function)
def apply_L2_penalties_error_gradients(self, error_gradient, weight_index):
    # adds the derivative of the L2 term, i.e. penalty * W, to the gradient
    if self.H.R.use_L2:
        error_gradient = g(T.add(error_gradient,
                                 T.mul(self.H.R.L2_penalty, self.weights[weight_index])))
    return error_gradient
def create_feedforward_chains(self):
    # feedforward with dropout for train data
    self.a = []
    self.z = []
    for i in range(len(self.weights)):
        if i == 0:
            # input units
            self.a.append(g(T.dot(self.layers[i].activation_function(self.X[self.idx]),
                                  self.weights[i])))
            self.z.append(g(self.layers[i + 1].activation_function(
                T.add(self.a[i], self.biases[i]) if self.H.L.use_bias else self.a[i])))
        else:
            self.a.append(g(T.dot(self.z[i - 1], self.weights[i])))
            if len(self.layers) > i:
                self.z.append(g(self.layers[i + 1].activation_function(
                    g(T.add(self.a[i], self.biases[i])) if self.H.L.use_bias else self.a[i])))
    self.feedforward = self.z[-1]
    self.feedforward_function = function(inputs=[self.idx], outputs=self.feedforward)

    # feedforward for validation data with dropout instead of mean net multiplication
    self.a_valid_drop = []
    self.z_valid_drop = []
    for i in range(len(self.weights)):
        if i == 0:
            # input units
            self.a_valid_drop.append(g(T.dot(self.layers[i].activation_cv_dropout(self.X_val),
                                             self.weights[i])))
            self.z_valid_drop.append(g(self.layers[i + 1].activation_cv_dropout(
                T.add(self.a_valid_drop[i], self.biases[i][0, :])
                if self.H.L.use_bias else self.a_valid_drop[i])))
        else:
            self.a_valid_drop.append(g(T.dot(self.z_valid_drop[i - 1], self.weights[i])))
            if len(self.layers) > i:
                self.z_valid_drop.append(g(self.layers[i + 1].activation_cv_dropout(
                    g(T.add(self.a_valid_drop[i], self.biases[i][0, :]))
                    if self.H.L.use_bias else self.a_valid_drop[i])))
    self.feedforward_valid_drop = self.z_valid_drop[-1]
    self.feedforward_valid_drop_function = function(inputs=[], outputs=self.feedforward_valid_drop)

    # feedforward for validation data with mean net multiplication
    a_valid = []
    z_valid = []
    for i in range(len(self.weights)):
        if i == 0:
            # input units
            a_valid.append(g(T.dot(self.X_val, self.weights[i])))
            z_valid.append(g(T.mul(
                self.layers[i + 1].activation_crossvalidation(
                    g(T.add(self.biases[i][0, :], a_valid[i]))
                    if self.H.L.use_bias else a_valid[i]),
                1 - self.layers[i + 1].dropout)))
        else:
            a_valid.append(g(T.dot(z_valid[i - 1], self.weights[i])))
            if len(self.layers) > i:
                z_valid.append(g(self.layers[i + 1].activation_crossvalidation(
                    g(T.add(self.biases[i][0, :], a_valid[i]))
                    if self.H.L.use_bias else a_valid[i])))
    self.feedforward_valid = z_valid[-1]

    # feedforward for training data with mean net multiplication (for train error)
    a_train = []
    z_train = []
    for i in range(len(self.weights)):
        if i == 0:
            # input units
            a_train.append(g(T.dot(self.X[self.idx], self.weights[i])))
            z_train.append(g(T.mul(
                self.layers[i + 1].activation_crossvalidation(
                    g(T.add(self.biases[i], a_train[i]))
                    if self.H.L.use_bias else a_train[i]),
                1 - self.layers[i + 1].dropout)))
        else:
            a_train.append(g(T.dot(z_train[i - 1], self.weights[i])))
            if len(self.layers) > i:
                z_train.append(g(self.layers[i + 1].activation_crossvalidation(
                    g(T.add(self.biases[i], a_train[i]))
                    if self.H.L.use_bias else a_train[i])))
    self.feedforward_train = z_train[-1]

    # feedforward of test data with mean net multiplication
    a_predict = []
    z_predict = []
    for i in range(len(self.weights)):
        if i == 0:
            # input units
            a_predict.append(g(T.dot(self.X_test, self.weights[i])))
            z_predict.append(g(T.mul(
                self.layers[i + 1].activation_crossvalidation(
                    g(T.add(self.biases[i][0, :], a_predict[i]))
                    if self.H.L.use_bias else a_predict[i]),
                1 - self.layers[i + 1].dropout)))
        else:
            a_predict.append(g(T.dot(z_predict[i - 1], self.weights[i])))
            if len(self.layers) > i:
                z_predict.append(g(self.layers[i + 1].activation_crossvalidation(
                    g(T.add(self.biases[i][0, :], a_predict[i]))
                    if self.H.L.use_bias else a_predict[i])))
    self.feedforward_predict = z_predict[-1]
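# Sketch of the "mean net multiplication" used in the evaluation chains above:
# at test time no units are dropped; instead activations are scaled by
# (1 - dropout) so their expected magnitude matches the dropout-trained net.
# Simplified hypothetical NumPy forward pass that scales every layer (the
# chains above apply the factor only where dropout was active in training):
import numpy as np

def feedforward_mean_net(X, weights, dropouts, act=lambda x: 1.0 / (1.0 + np.exp(-x))):
    z = X
    for w, p in zip(weights, dropouts):
        z = act(z.dot(w)) * (1.0 - p)   # scale activations instead of dropping units
    return z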
a1 = T.fmatrix("a1") e1 = T.fmatrix("e1") idx = T.iscalar("idx") bsize = T.fscalar("bsize") alpha = T.fscalar("alpha") cv_size = T.fscalar("cv_size") drop_input = lambda rand: T.reshape( bino_input[rand:rand + (batch_size * dim_visible)], (batch_size, dim_visible)) input_drop = drop_input(rdm.random_integers(low=0, high=sample_range_dropout)) h = T.nnet.sigmoid(T.add(T.dot(v, w_vh), w_h)) u_w_plus = function([], updates=[(wu_vh, g(T.add(wu_vh, T.dot(v.T, h)))), (wu_v, g(T.add(T.sum(v[:], axis=0), wu_v))), (wu_h, g(T.add(T.sum(h[:], axis=0), wu_h)))]) u_w_minus = function([], updates=[(wu_vh, g(T.sub(wu_vh, T.dot(v.T, h)))), (wu_v, g(T.sub(T.sum(v[:], axis=0), wu_v))), (wu_h, g(T.sub(T.sum(h[:], axis=0), wu_h)))]) sample = lambda rdm: T.reshape( uniform_sample[rdm:rdm + (dim_hidden * batch_size)], (batch_size, dim_hidden)) gibbs = T.cast(T.gt(h, sample(rdm.random_integers(low=0, high=sample_range))), 'float32')
t1 = T.fmatrix("t1") a1 = T.fmatrix("a1") e1 = T.fmatrix("e1") idx = T.iscalar("idx") bsize = T.fscalar("bsize") alpha = T.fscalar("alpha") cv_size = T.fscalar("cv_size") drop_input = lambda rand: T.reshape(bino_input[rand:rand + (batch_size*dim_visible)],(batch_size,dim_visible)) input_drop = drop_input(rdm.random_integers(low=0, high=sample_range_dropout)) h = T.nnet.sigmoid(T.add(T.dot(v,w_vh),w_h)) u_w_plus = function([],updates=[(wu_vh, g(T.add(wu_vh,T.dot(v.T,h)))), (wu_v, g(T.add(T.sum(v[:],axis=0),wu_v))), (wu_h, g(T.add(T.sum(h[:],axis=0),wu_h))) ]) u_w_minus = function([],updates=[(wu_vh, g(T.sub(wu_vh,T.dot(v.T,h)))), (wu_v, g(T.sub(T.sum(v[:],axis=0),wu_v))), (wu_h, g(T.sub(T.sum(h[:],axis=0),wu_h))) ]) sample = lambda rdm: T.reshape(uniform_sample[rdm:rdm + (dim_hidden*batch_size)],(batch_size,dim_hidden)) gibbs = T.cast(T.gt(h, sample(rdm.random_integers(low=0, high=sample_range))),'float32')