def sgdWithLrsClip(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
                   si_lr=.001, focused_w_lr=.01, momentum=.9, verbose=False):
    ''' Same as sgdWithLrs, but applies clips after the updates '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    #momentum_params_list = []
    f32 = np.float32
    if verbose:
        print("Params List", params)

    for param, grad in zip(params, grads):
        if verbose:
            print("param name", param.name, "shape:", param.eval().shape)
            #print("param name", param.name, "shape:", param.get_value().shape)
        #grad = clip_tensor(grad, -0.001, 0.001)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            updates[param] = clip_tensor(updates[param], f32(0.01), f32(0.99))
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            updates[param] = clip_tensor(updates[param], f32(0.01), f32(0.5))
        elif param.name.find('focus') >= 0 and param.name.find('W') >= 0:
            updates[param] = param - (focused_w_lr * grad)
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            #updates[param] = clip_tensor(updates[param], -0.5, 0.5)
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            #if param.name.find('W') >= 0:
            #    print(param, grad, learning_rate)
    return updates
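# Hedged usage sketch for sgdWithLrsClip: it assumes the snippet lives in the same
# module as the function above (which relies on module-level `np` and `clip_tensor`),
# that `clip_tensor` is a thin wrapper around theano.tensor.clip, and that focus-layer
# parameters carry 'focus' plus 'mu', 'si' or 'W' in their names. The tiny DenseLayer
# model here is a stand-in only; its plain 'W'/'b' params take the default SGD branch.
import theano
import theano.tensor as T
import lasagne


def clip_tensor(t_var, minval, maxval):
    # assumed implementation of the helper used inside sgdWithLrsClip
    return T.clip(t_var, minval, maxval)


x = T.matrix('x')
t = T.matrix('t')
l_in = lasagne.layers.InputLayer((None, 8), x)
network = lasagne.layers.DenseLayer(l_in, 1, nonlinearity=None)  # stand-in for a focused model

prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.squared_error(prediction, t).mean()
params = lasagne.layers.get_all_params(network, trainable=True)

# separate learning rates for the focus mu/si/W groups, plain SGD + momentum elsewhere
updates = sgdWithLrsClip(loss, params, learning_rate=0.01, mu_lr=0.01,
                         si_lr=0.001, focused_w_lr=0.01, momentum=0.9)
train_fn = theano.function([x, t], loss, updates=updates)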
def get_network(model):
    input_data = tensor.dmatrix('x')
    targets_var = tensor.dmatrix('y')

    network = layers.InputLayer((model['batch_size'], model['input_vars']), input_data)
    nonlin = nonlinearities.rectify
    if model['hidden_nonlinearity'] != 'ReLu':
        nonlin = nonlinearities.tanh
    prev_layer = network
    for l in range(model['nlayers']):
        fc = layers.DenseLayer(prev_layer, model['units'], nonlinearity=nonlin)
        if model['dropout']:
            fc = layers.DropoutLayer(fc, 0.5)
        prev_layer = fc

    output_lin = None
    if model['output_mode'] == OUTPUT_LOG:
        output_lin = nonlinearities.tanh
    output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin)

    predictions = layers.get_output(output_layer)
    if model['output_mode'] == OUTPUT_BOUNDED:
        (minth, maxth) = model['maxmin'][model['control']]
        maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth)
        mint = theano.shared(np.ones((model['batch_size'], 1)) * minth)
        predictions = tensor.min(tensor.concatenate([maxt, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))
        predictions = tensor.max(tensor.concatenate([mint, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))

    loss = objectives.squared_error(predictions, targets_var)
    loss = objectives.aggregate(loss, mode='mean')
    params = layers.get_all_params(output_layer)

    test_prediction = layers.get_output(output_layer, deterministic=True)
    test_loss = objectives.squared_error(test_prediction, targets_var)
    test_loss = test_loss.mean()

    updates_sgd = updates.sgd(loss, params, learning_rate=model['lr'])
    ups = updates.apply_momentum(updates_sgd, params, momentum=0.9)

    train_fn = theano.function([input_data, targets_var], loss, updates=ups)
    pred_fn = theano.function([input_data], predictions)
    val_fn = theano.function([input_data, targets_var], test_loss)
    return {'train': train_fn, 'eval': val_fn, 'pred': pred_fn, 'layers': output_layer}
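# Hypothetical configuration dict for get_network above; the key values, the toy
# arrays and the 'throttle' control name are illustrative assumptions based only on
# the keys the function reads. OUTPUT_BOUNDED is the constant referenced in the code.
import numpy as np

model = {
    'batch_size': 32,
    'input_vars': 10,
    'hidden_nonlinearity': 'ReLu',        # anything else falls back to tanh
    'nlayers': 2,
    'units': 64,
    'dropout': False,
    'output_mode': OUTPUT_BOUNDED,        # triggers the min/max output clamping branch
    'maxmin': {'throttle': (0.0, 1.0)},   # per-control (min, max) bounds
    'control': 'throttle',
    'lr': 1e-3,
}
net = get_network(model)

x_batch = np.random.rand(32, 10)          # dmatrix inputs, so float64
y_batch = np.random.rand(32, 1)
print('train loss:', net['train'](x_batch, y_batch))
print('eval loss :', net['eval'](x_batch, y_batch))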
def get_cost_updates(self, corruption_level, learning_rate, noise=0.0, momentum=0):
    """ This function computes the cost and the updates for one training
        step of the dA """
    tilde_x = self.get_corrupted_input(self.x, corruption_level, noise)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    L = - T.sum(self.desired * T.log(z) + (1 - self.desired) * T.log(1 - z), axis=1)
    cost = T.mean(L)
    # adagrad with momentum on cost
    updates_ada = adagrad(cost, self.params, learning_rate=learning_rate)
    updates = apply_momentum(updates_ada, self.params, momentum=momentum)
    return (cost, updates)
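# Sketch of how the returned (cost, updates) pair is typically compiled into a
# training step; `da`, `train_set_x`, `train_set_y` (Theano shared variables) and
# `batch_size` are assumed names, not part of the code above.
index = T.lscalar('index')
cost, updates = da.get_cost_updates(corruption_level=0.3, learning_rate=0.1,
                                    noise=0.0, momentum=0.9)
train_da = theano.function(
    [index], cost, updates=updates,
    givens={da.x: train_set_x[index * batch_size:(index + 1) * batch_size],
            da.desired: train_set_y[index * batch_size:(index + 1) * batch_size]})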
def build_model(n_input, n_hidden, optimizer=adagrad, l2_weight=1e-4, l1_weight=1e-2):
    ''' build NN model to estimate the model function '''
    global LR
    input_A = L.InputLayer((None, n_input), name='A')
    layer_A = L.DenseLayer(input_A, n_hidden, b=None, nonlinearity=identity)
    input_B = L.InputLayer((None, n_input), name='B')
    layer_B = L.DenseLayer(input_B, n_hidden, b=None, nonlinearity=identity)
    merge_layer = L.ElemwiseSumLayer((layer_A, layer_B))
    output_layer = L.DenseLayer(merge_layer, 1, b=None, nonlinearity=identity)  # output is scalar

    x1 = T.matrix('x1')
    x2 = T.matrix('x2')
    y = T.matrix('y')
    out = L.get_output(output_layer, {input_A: x1, input_B: x2})
    params = L.get_all_params(output_layer)
    loss = T.mean(squared_error(out, y))

    # add l1 penalty
    l1_penalty = regularize_layer_params([layer_A, layer_B, output_layer], l1)
    # add l2 penalty
    l2_penalty = regularize_layer_params([layer_A, layer_B, output_layer], l2)
    # get loss + penalties
    loss = loss + l1_penalty * l1_weight + l2_penalty * l2_weight

    updates_sgd = optimizer(loss, params, learning_rate=LR)
    updates = apply_momentum(updates_sgd, params, momentum=0.9)
    # updates = optimizer(loss, params, learning_rate=LR)

    f_train = theano.function([x1, x2, y], loss, updates=updates)
    f_test = theano.function([x1, x2, y], loss)
    f_out = theano.function([x1, x2], out)

    return f_train, f_test, f_out, output_layer
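# Hypothetical usage of build_model, assuming it runs in the same module so the
# LR global read by the builder is visible; the toy arrays and loop length are
# invented for illustration only.
import numpy as np
import theano

LR = 0.01
f_train, f_test, f_out, output_layer = build_model(n_input=5, n_hidden=8)

x1 = np.random.rand(64, 5).astype(theano.config.floatX)
x2 = np.random.rand(64, 5).astype(theano.config.floatX)
y = np.random.rand(64, 1).astype(theano.config.floatX)

for epoch in range(20):
    train_loss = f_train(x1, x2, y)       # one adagrad + momentum step
print('held-in loss:', f_test(x1, x2, y))
print('prediction shape:', f_out(x1, x2).shape)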
def get_cost_updates(self, corrupted_input, learning_rate):
    """ This function computes the cost and the updates for one training
        step of the dA """
    tilde_x = corrupted_input
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    #z = corrupted_input
    # note : we sum over the size of a datapoint; if we are using
    #        minibatches, L will be a vector, with one entry per
    #        example in minibatch
    # L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z))
    L = categorical_crossentropy(z, self.x)
    #L = (self.x * T.log(z) + (1 - self.x) * T.log(1 - z))
    #cost = L.mean()
    # temp = (self.x * T.log(z) + (1 - self.x) * T.log(1 - z))
    # L = -T.sum(temp)
    # note : L is now a vector, where each element is the
    #        cross-entropy cost of the reconstruction of the
    #        corresponding example of the minibatch. We need to
    #        compute the average of all these to get the cost of
    #        the minibatch
    cost = T.mean(L)
    # print cost
    reg = 1e-8 * lasagne.regularization.l2(self.params[0])
    cost = cost + reg
    # compute the gradients of the cost of the `dA` with respect
    # to its parameters (unused here; SGD + momentum builds the updates below)
    gparams = T.grad(cost, self.params, add_names=True)
    updates_sgd = sgd(cost, self.params, learning_rate)
    updates_dic = apply_momentum(updates_sgd, self.params, momentum=0.9)
    updates = updates_dic.items()  # generate the list of updates
    # updates = [
    #     (param, param - learning_rate * gparam)
    #     for param, gparam in zip(self.params, gparams)
    # ]
    return (cost, updates)
def sgdWithLrs(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
               si_lr=.001, focused_w_lr=.01, momentum=.9):
    '''
    This function provides SGD with different learning rates for the focus
    params mu, si, w
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    momentum_params_list = []
    print(params)
    for param, grad in zip(params, grads):
        # import pdb; pdb.set_trace()
        #grad = clip_tensor(grad, -0.01, 0.01)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            momentum_params_list.append(param)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            #momentum_params_list.append(param)
        elif param.name.find('focus') >= 0:
            updates[param] = param - (focused_w_lr * grad)
            momentum_params_list.append(param)
        else:
            updates[param] = param - learning_rate * grad
            momentum_params_list.append(param)
        #print (param, grad, learning_rate)
    return apply_momentum(updates, params=momentum_params_list, momentum=momentum)
def get_network(model):
    input_data = tensor.dmatrix('x')
    targets_var = tensor.dmatrix('y')

    network = layers.InputLayer((model['batch_size'], model['input_vars']), input_data)
    nonlin = nonlinearities.rectify
    if model['hidden_nonlinearity'] != 'ReLu':
        nonlin = nonlinearities.tanh
    prev_layer = network
    for l in range(model['nlayers']):
        W = None
        if model['hidden_nonlinearity'] == 'ReLu':
            W = lasagne.init.GlorotUniform('relu')
        else:
            W = lasagne.init.GlorotUniform(1)
        fc = layers.DenseLayer(prev_layer, model['units'], nonlinearity=nonlin, W=W)
        if model['dropout']:
            fc = layers.DropoutLayer(fc, 0.5)
        prev_layer = fc

    output_lin = None
    if model['output_mode'] == OUTPUT_LOG:
        output_lin = nonlinearities.tanh
    output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin)

    predictions = layers.get_output(output_layer)
    if model['output_mode'] != OUTPUT_LOG:
        (minth, maxth) = model['maxmin'][model['control']]
        maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth)
        mint = theano.shared(np.ones((model['batch_size'], 1)) * minth)
        predictions = tensor.min(tensor.concatenate([maxt, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))
        predictions = tensor.max(tensor.concatenate([mint, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))

    if model['output_mode'] == OUTPUT_NO:
        prediction_unboun = layers.get_output(output_layer)
        loss = objectives.squared_error(prediction_unboun, targets_var)
    else:
        loss = objectives.squared_error(predictions, targets_var)
    loss = objectives.aggregate(loss, mode='mean')
    params = layers.get_all_params(output_layer)

    # test_prediction = layers.get_output(output_layer, deterministic=True)  # fix for dropout
    test_loss = objectives.squared_error(predictions, targets_var)
    test_loss = test_loss.mean()

    if model['hidden_nonlinearity'] == 'ReLu':
        model['lr'] *= 0.5
    updates_sgd = updates.sgd(loss, params, learning_rate=model['lr'])
    ups = updates.apply_momentum(updates_sgd, params, momentum=0.9)

    train_fn = theano.function([input_data, targets_var], loss, updates=ups)
    pred_fn = theano.function([input_data], predictions)
    # pred_fn = theano.function([input_data], prediction_unboun)
    val_fn = theano.function([input_data, targets_var], test_loss)
    return {'train': train_fn, 'eval': val_fn, 'pred': pred_fn, 'layers': output_layer}
def rmsprop_momentum(loss, params, eta=1e-3, alpha=0.9, **kwargs):
    rms = updt.rmsprop(loss, params, learning_rate=eta, **kwargs)
    return updt.apply_momentum(rms, params, momentum=alpha)
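# Minimal sketch of plugging rmsprop_momentum into a training function; assumes
# `updt` is lasagne.updates imported under that alias, as used above, and the small
# softmax classifier is illustrative only.
import theano
import theano.tensor as T
import lasagne
import lasagne.updates as updt

x = T.matrix('x')
t = T.ivector('t')
l_in = lasagne.layers.InputLayer((None, 100), x)
l_out = lasagne.layers.DenseLayer(l_in, 10, nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(prediction, t).mean()
params = lasagne.layers.get_all_params(l_out, trainable=True)

# RMSProp step sizes first, then classical momentum applied on top of those updates
updates = rmsprop_momentum(loss, params, eta=1e-3, alpha=0.9)
train_fn = theano.function([x, t], loss, updates=updates)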
all_params = get_all_params(layers, trainable=True)

# compute loss
generation = lasagne.layers.get_output(net)
generation = generation.dimshuffle([0, 2, 3, 1])

# mean squared error
train_loss = lasagne.objectives.squared_error(
    generation.reshape((generation.shape[0], -1)),
    img_batch_target.reshape((img_batch_target.shape[0], -1)))
train_loss = train_loss.sum(axis=1)
train_loss = train_loss.mean()

# update
lrn_rate = T.cast(theano.shared(options['learning_rate']), 'floatX')  # we can use a dynamic learning rate
optimizer = sgd
updates_sgd = optimizer(train_loss, all_params, learning_rate=lrn_rate)
updates = apply_momentum(updates_sgd, all_params, momentum=0.95)

# train
_train = theano.function([img_batch, pose_code, img_batch_target], train_loss,
                         updates=updates, allow_input_downcast=True)

# ------------ training ----------------
print("Train...")
if options['start_epoch'] == 0:
    start_epoch = 0
else:
    model.load_model(options['init_model_from'])
    start_epoch = options['start_epoch']
nb_epoch = options['max_epochs']
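# Hypothetical continuation of the epoch loop: `iterate_minibatches`, the
# options['batch_size'] key and the loss bookkeeping are illustrative assumptions;
# only `_train`, `start_epoch` and `nb_epoch` come from the code above.
for epoch in range(start_epoch, nb_epoch):
    epoch_loss, n_batches = 0.0, 0
    for img, pose, target in iterate_minibatches(options['batch_size']):
        epoch_loss += _train(img, pose, target)
        n_batches += 1
    print("epoch %d  mean loss %.4f" % (epoch, epoch_loss / max(n_batches, 1)))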
def sgdWithWeightSupress(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
                         si_lr=.001, focused_w_lr=.01, momentum=.9, verbose=False):
    '''
    This update function masks focus weights after they are updated.
    The idea is that weights outside of the focus function must be suppressed
    to prevent weight memory when the focus changes its position.
    To do this I get the mu and si values of the focus layer, calculate a Gauss
    window, scale it so the center is 1 but the outside is close to 0, and then
    multiply it with the weights.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    #momentum_params_list = []
    if verbose:
        print(params)
    for param, grad in zip(params, grads):
        #grad = clip_tensor(grad, -0.001, 0.001)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.99)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.5)
        elif param.name.find('focus') >= 0 and param.name.find('W') >= 0:
            param_layer_name = param.name.split(".")[0]
            mu_name = param_layer_name + '.mu'
            si_name = param_layer_name + ".si"
            mu_si_w = get_params_values_wkey(params, [mu_name, si_name, param.name])
            #print("Hey weight shape::", mu_si_w[param.name].shape)
            from focusing import U_numeric
            us = U_numeric(np.linspace(0, 1, mu_si_w[param.name].shape[0]),
                           mu_si_w[mu_name], mu_si_w[si_name], 1, normed=False)
            updates[param] = (param - (focused_w_lr * grad))
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            # here we are masking the weights, so they can not stay out of the envelope
            us[us > 0.1] = 1.0
            updates[param] = updates[param] * us.T
            #updates[param] = clip_tensor(updates[param], -0.5, 0.5)
        else:
            updates[param] = param - learning_rate * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
        #print (param, grad, learning_rate)
    return updates
def sgdWithLrLayers(loss_or_grads, params, learning_rate=.01, mu_lr=.01,
                    si_lr=.001, focused_w_lr=.01, momentum=.9):
    '''
    This function updates each layer's parameters with a different learning rate.
    Under development.
    '''
    from collections import OrderedDict
    from lasagne.updates import get_or_compute_grads, apply_momentum

    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    #momentum_params_list = []
    #print(params)
    for param, grad in zip(params, grads):
        # import pdb; pdb.set_trace()
        grad = clip_tensor(grad, -0.01, 0.01)
        if param.name.find('focus') >= 0 and param.name.find('mu') >= 0:
            updates[param] = param - mu_lr * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum / 2)
            updates[param] = clip_tensor(updates[param], 0.05, 0.95)
            #momentum_params_list.append(param)
            #print (param, mu_lr)
            #print (param, grad, mu_lr)
        elif param.name.find('focus') >= 0 and param.name.find('si') >= 0:
            updates[param] = param - si_lr * grad
            #momentum_params_list.append(param)
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            updates[param] = clip_tensor(updates[param], 0.01, 0.5)
            #print (param, si_lr)
            #print (param, grad, si_lr)
            #print (param, grad, scaler_lr)
        elif param.name.find('focus') >= 0 and (param.name.find('W') >= 0 or
                                                param.name.find('bias') >= 0):
            level = int(str.split(param.name, '-')[1].split('.')[0])
            #print(param.name, level)
            updates[param] = param - (learning_rate * (1. / (level + 1))) * grad
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            if param.name.find('W') >= 0:
                updates[param] = clip_tensor(updates[param], -0.4, 0.4)
            #momentum_params_list.append(param)
            #print (param, focused_w_lr)
        elif param.name.find('W') >= 0 or param.name.find('b') >= 0:
            if param.name.find('-') >= 0:
                level = int(str.split(param.name, '-')[1].split('.')[0])
                updates[param] = param - (learning_rate * (1. / level)) * grad
                updates = apply_momentum(updates, params=[param], momentum=momentum)
            else:
                updates[param] = param - learning_rate * grad
                #momentum_params_list.append(param)
                updates = apply_momentum(updates, params=[param], momentum=momentum)
            if param.name.find('W') >= 0:
                updates[param] = clip_tensor(updates[param], -0.4, 0.4)
            if param.name.find('b') >= 0:
                updates[param] = clip_tensor(updates[param], -1.0, 1.0)
        else:
            updates[param] = param - learning_rate * grad
            #momentum_params_list.append(param)
            updates = apply_momentum(updates, params=[param], momentum=momentum)
            if param.name.find('beta') >= 0:
                updates[param] = clip_tensor(updates[param], -1., 1.)
        #print (param, grad, learning_rate)
    return updates