def default_training(self): input_var = tensor.tensor4('inputs') target_var = tensor.ivector('targets') loss, _ = loss_acc(self.model, input_var, target_var, deterministic=False) loss += regularize_layer_params(get_all_layers(self.model), l2, tags={ 'regularizable': True, 'layer_weight': False }) * 1e-4 # TODO : does this count as weight decay (...*1e-4) or not? # the learning rate is 1/100 of the normal learning rate # ... but we just adapt the decay loss += regularize_layer_params( get_all_layers(self.model), l2, tags={'layer_weight': True}) * 1e-6 params = get_all_params(self.model, trainable=True) # updates = adam(loss, params, learning_rate=self.learning_rate) updates = self.momentum_method(loss, params, momentum=self.momentum, learning_rate=self.learning_rate) for weight in get_all_params(self.model, trainable=True, tags={'layer_weight': True}): # all residual weights are in [-1, 1] assert weight in updates updates[weight] = tensor.minimum( 1.0, tensor.maximum(-1.0, updates[weight])) self.set_training(input_var, target_var, loss, updates)
def objective(layers, loss_function, target, aggregate=aggregate, deterministic=False, l1=0, l2=0, get_output_kw=None): """ Default implementation of the NeuralNet Objective. :param layers: The underlying layers of the NeuralNetwork :param loss_function: The callable loss function to use :param target: the expected output :param aggregate: the aggregation function to use :param deterministic: Whether or not to get a deterministic output :param l1: Optional l1 regularization parameter :param l2: Optional l2 regularization parameter :param get_output_kw: optional kwargs to pass to :meth:`NeuralNetwork.get_output` :return: The total calculated loss """ if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] network_output = get_output( output_layer, deterministic=deterministic, **get_output_kw) loss = aggregate(loss_function(network_output, target)) if l1: loss += regularization.regularize_layer_params( layers.values(), regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params( layers.values(), regularization.l2) * l2 return loss
def objective(layers, loss_function, target, aggregate=aggregate, aggregation_weights=None, deterministic=False, l1=0, l2=0, get_output_kw=None): if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] network_output = get_output( output_layer, deterministic=deterministic, **get_output_kw) if isfunction(aggregation_weights): weights = aggregation_weights(layers) else: weights = aggregation_weights loss = aggregate(loss_function(network_output, target), weights) if l1: loss += regularization.regularize_layer_params( layers.values(), regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params( layers.values(), regularization.l2) * l2 return loss
def cost_network(network, target_var, l1_reg, l2_reg, learn, train_layers=[], output_layer=[]): #for key in dont_train: # network[key].params[network[key].W].remove("trainable") # network[key].params[network[key].b].remove("trainable") #Basic loss is negative loss likelihood network_out = network[output_layer] prediction = lasagne.layers.get_output(network_out) loss = T.mean(T.nnet.categorical_crossentropy(prediction,target_var)) #Shared costs l1_penalty = regularize_layer_params(network_out, l1) * l1_reg l2_penalty = regularize_layer_params(network_out, l2) * l2_reg cost = loss + l2_penalty + l1_penalty #params = lasagne.layers.get_all_params(network_out, trainable=True) #print(params) params=[] for p in train_layers: params.append(network[p].get_params(trainable=True)) params = [item for sublist in params for item in sublist] print([i.eval().shape for i in params]) print(params) print(train_layers) print("----") updates = lasagne.updates.sgd(cost, params, learning_rate=learn) return([cost, updates, loss])
def objective(layers, loss_function, target, aggregate=aggregate, deterministic=False, l1=0, l2=0, tv=0, get_output_kw=None): if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] network_output = get_output( output_layer, deterministic=deterministic, **get_output_kw) loss = aggregate(loss_function(network_output, target)) if l1: loss += regularization.regularize_layer_params( layers[-2], regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params( layers[-2], regularization.l2) * l2 if tv: loss += T.mean(T.abs_(network_output[:, 1:] - network_output[:, :-1]))*tv return loss
def compile_logistic_model(self, lamda, input_params=None): X,Y = self.X,self.Y net = self.build_model(X) network = net['l_out'] self.net_logistic = network prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, Y) loss = loss.mean() for key in net.keys(): loss += lamda*regularize_layer_params(net[key], l2) + \ lamda*regularize_layer_params(net[key], l1) if input_params: print"Compiling classifier with input params..." lasagne.layers.set_all_param_values( network, [i.get_value() for i in input_params]) params = lasagne.layers.get_all_params(network) self.inst_params = params updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=0.01, momentum=0.9) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_prediction = T.argmax(test_prediction, axis=1) train = theano.function([X, Y], loss, updates=updates, allow_input_downcast=True) predict = theano.function([X], test_prediction, allow_input_downcast=True) print "Done Compiling logistic model..." return train,predict
def objective(layers, loss_function, target, aggregate=aggregate, mode='mean', weights=None, deterministic=False, l1=0, l2=0, l3=0, l3_layers=[], get_output_kw=None): if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] network_output = get_output( output_layer, deterministic=deterministic, **get_output_kw) loss = aggregate(loss_function(network_output, target), weights=weights, mode=mode) if l1: loss += regularization.regularize_layer_params( layers.values(), regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params( layers.values(), regularization.l2) * l2 if l3: for layer in l3_layers: loss += regularization.regularize_layer_params( layer, regularization.l2) * l3 return loss
def objective(layers, loss_function, target, aggregate=aggregate, aggregation_weights=None, deterministic=False, l1=0, l2=0, get_output_kw=None): if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] network_output = get_output(output_layer, deterministic=deterministic, **get_output_kw) if isfunction(aggregation_weights): weights = aggregation_weights(layers) else: weights = aggregation_weights loss = aggregate(loss_function(network_output, target), weights) if l1: loss += regularization.regularize_layer_params(layers.values(), regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params(layers.values(), regularization.l2) * l2 return loss
def build_qn_type_model(self, from_scratch=False): qtype, qembd = self.qtype, self.qembd qX, mask = self.qX, self.lstm_mask if from_scratch: #q_bow_net = self.build_question_boW(qX) #q_bow = lasagne.layers.get_output(q_bow_net['l_embd']) #l2_penalty_qbow = regularize_layer_params(q_bow_net['l_embd'], l2) #qbow_params = lasagne.layers.get_all_params(q_bow_net['l_embd']) #qembd = T.sum(q_bow,axis=1) q_lstm_net = self.build_qn_classifier_lstm(qX, mask) qlstm_params = lasagne.layers.get_all_params(q_lstm_net['l_dense']) l2_penalty_qlstm = regularize_layer_params(q_lstm_net['l_dense'], l2) #l2_penalty_qlstm += regularize_layer_params(q_lstm_net['l_lstm'], l2) qembd = lasagne.layers.get_output(q_lstm_net['l_dense']) q_type_net = self.build_qn_classifier_mlp(qembd) q_type_pred = lasagne.layers.get_output(q_type_net['l_out'], deterministic=False) l2_penalty_mlp = regularize_layer_params(q_type_net['l_out'], l2) loss = lasagne.objectives.categorical_crossentropy(q_type_pred, qtype) loss = loss.mean() + l2_penalty_mlp loss += l2_penalty_qlstm params = [] qmlp_params = lasagne.layers.get_all_params(q_type_net['l_out']) for p in qmlp_params: params.append(p) for p in qlstm_params: params.append(p) all_grads = T.grad(loss, params) if self.grad_clip != None: all_grads = [ T.clip(g, self.grad_clip[0], self.grad_clip[1]) for g in all_grads ] updates = lasagne.updates.adam(all_grads, params, learning_rate=0.003) qtype_test_pred = lasagne.layers.get_output(q_type_net['l_out'], deterministic=True) qtype_test_pred = T.argmax(qtype_test_pred, axis=1) print "Compiling..." self.timer.set_checkpoint('compile') if from_scratch: train = theano.function([qX, mask, qtype], loss, updates=updates, allow_input_downcast=True) qtype_predict = theano.function([qX, mask], qtype_test_pred, allow_input_downcast=True) else: train = theano.function([qembd, qtype], loss, updates=updates, allow_input_downcast=True) qtype_predict = theano.function([qembd], qtype_test_pred, allow_input_downcast=True) print "Compile time(mins)", self.timer.print_checkpoint('compile') print "Done Compiling qtype model..." return train, qtype_predict
def compile_logistic_model(self, lamda, input_params=None): X, Y = self.X, self.Y net = self.build_model(X) network = net['l_out'] self.net_logistic = network prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, Y) loss = loss.mean() for key in net.keys(): loss += lamda*regularize_layer_params(net[key], l2) + \ lamda*regularize_layer_params(net[key], l1) if input_params: print "Compiling classifier with input params..." lasagne.layers.set_all_param_values( network, [i.get_value() for i in input_params]) params = lasagne.layers.get_all_params(network) self.inst_params = params updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_prediction = T.argmax(test_prediction, axis=1) train = theano.function([X, Y], loss, updates=updates, allow_input_downcast=True) predict = theano.function([X], test_prediction, allow_input_downcast=True) print "Done Compiling logistic model..." return train, predict
def _get_loss_updates(self, L1_reg = 0.0, L2_reg = 0.001, update_fn = lasagne.updates.nesterov_momentum, max_norm = None, deterministic = False, momentum = 0.9, **kwargs): """ Returns Theano expressions for the network's loss function and parameter updates. Parameters: L1_reg: float for L1 weight regularization coefficient. L2_reg: float for L2 weight regularization coefficient. max_norm: If not None, constraints the norm of gradients to be less than max_norm. deterministic: True or False. Determines if the output of the network is calculated determinsitically. update_fn: lasagne update function. Default: Stochastic Gradient Descent with Nesterov momentum **kwargs: additional parameters to provide to update_fn. For example: momentum Returns: loss: Theano expression for a penalized negative log likelihood. updates: Theano expression to update the parameters using update_fn. """ loss = ( self._negative_log_likelihood(self.E, deterministic) + regularize_layer_params(self.network,l1) * L1_reg + regularize_layer_params(self.network, l2) * L2_reg ) if max_norm: grads = T.grad(loss,self.params) scaled_grads = lasagne.updates.total_norm_constraint(grads, max_norm) updates = update_fn( scaled_grads, self.params, **kwargs ) else: updates = update_fn( loss, self.params, **kwargs ) if momentum: updates = lasagne.updates.apply_nesterov_momentum(updates, self.params, self.learning_rate, momentum=momentum) # If the model was loaded from file, reload params if self.restored_update_params: for p, value in zip(updates.keys(), self.restored_update_params): p.set_value(value) self.restored_update_params = None # Store last update function to be later saved self.updates = updates return loss, updates
def _get_loss_updates(self, L1_reg=0.0, L2_reg=0.001, update_fn=lasagne.updates.nesterov_momentum, max_norm=None, deterministic=False, momentum=0.9, **kwargs): """ Returns Theano expressions for the network's loss function and parameter updates. Parameters: L1_reg: float for L1 weight regularization coefficient. L2_reg: float for L2 weight regularization coefficient. max_norm: If not None, constraints the norm of gradients to be less than max_norm. deterministic: True or False. Determines if the output of the network is calculated determinsitically. update_fn: lasagne update function. Default: Stochastic Gradient Descent with Nesterov momentum **kwargs: additional parameters to provide to update_fn. For example: momentum Returns: loss: Theano expression for a penalized negative log likelihood. updates: Theano expression to update the parameters using update_fn. """ loss = (self._negative_log_likelihood(self.E, deterministic) + regularize_layer_params(self.network, l1) * L1_reg + regularize_layer_params(self.network, l2) * L2_reg) if max_norm: grads = T.grad(loss, self.params) scaled_grads = lasagne.updates.total_norm_constraint( grads, max_norm) updates = update_fn(scaled_grads, self.params, **kwargs) else: updates = update_fn(loss, self.params, **kwargs) if momentum: updates = lasagne.updates.apply_nesterov_momentum( updates, self.params, self.learning_rate, momentum=momentum) # If the model was loaded from file, reload params if self.restored_update_params: for p, value in zip(updates.keys(), self.restored_update_params): p.set_value(value) self.restored_update_params = None # Store last update function to be later saved self.updates = updates return loss, updates
def cost_ELBO(self, Y=None, X=None, padleft=False, sample_strategy='with_symb_noise', regularize_evolution_weights=False): """ """ if Y is None: Y = self.Y if X is None: X = self.X mrec = self.get_RecModel() mgen = self.get_GenModel() postX = self.get_symb_postX(Y, X, sample_strategy) if regularize_evolution_weights: from lasagne.layers import get_all_layers from lasagne.regularization import regularize_layer_params, l2 lat_ev_layers = get_all_layers(self.lat_ev_model.NNEvolve) lat_weights_regloss = regularize_layer_params(lat_ev_layers[1], l2) Nsamps = Y.shape[0] LogDensity = mgen.compute_LogDensity(Y, postX, padleft=padleft) Entropy = mrec.compute_Entropy(Y, postX) ELBO = (LogDensity + Entropy if not regularize_evolution_weights else LogDensity + Entropy + lat_weights_regloss) costs_func = theano.function( inputs=self.CostsInputDict['ELBO'], outputs=[ELBO / Nsamps, LogDensity / Nsamps, Entropy / Nsamps]) return ELBO, costs_func
def define_loss(network, targets): prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, targets) loss = loss.mean() test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, targets) test_loss = test_loss.mean() if params.REGULARIZATION: regularization_penalty = regularize_layer_params(network, l2) * params.REGULARIZATION_WEIGHT loss = loss + regularization_penalty test_loss = test_loss + regularization_penalty acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), targets), dtype=theano.config.floatX) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([inputs, targets], [test_prediction, test_loss, acc]) return loss, val_fn
def build_train_loss(self, train_output, target_values): l2_penalty = regularize_layer_params(self.layers, l2) * self.l2_reg_weight loss = self.msq_err(train_output, target_values) loss += l2_penalty return loss
def build_model(n_input, n_hidden, optimizer=adagrad, l2_weight=1e-4, l1_weight=1e-2): ''' build NN model to estimating model function ''' global LR input_A = L.InputLayer((None, n_input), name='A') layer_A = L.DenseLayer(input_A, n_hidden, b=None, nonlinearity=identity) input_B = L.InputLayer((None, n_input), name='B') layer_B = L.DenseLayer(input_B, n_hidden, b=None, nonlinearity=identity) merge_layer = L.ElemwiseSumLayer((layer_A, layer_B)) output_layer = L.DenseLayer(merge_layer, 1, b=None, nonlinearity=identity) # output is scalar x1 = T.matrix('x1') x2 = T.matrix('x2') y = T.matrix('y') out = L.get_output(output_layer, {input_A: x1, input_B: x2}) params = L.get_all_params(output_layer) loss = T.mean(squared_error(out, y)) # add l1 penalty l1_penalty = regularize_layer_params([layer_A, layer_B, output_layer], l1) # add l2 penalty l2_penalty = regularize_layer_params([layer_A, layer_B, output_layer], l2) # get loss + penalties loss = loss + l1_penalty * l1_weight + l2_penalty * l2_weight updates_sgd = optimizer(loss, params, learning_rate=LR) updates = apply_momentum(updates_sgd, params, momentum=0.9) # updates = optimizer(loss,params,learning_rate=LR) f_train = theano.function([x1, x2, y], loss, updates=updates) f_test = theano.function([x1, x2, y], loss) f_out = theano.function([x1, x2], out) return f_train, f_test, f_out, output_layer
def objective_with_L2(layers, loss_function, target, aggregate=aggregate, deterministic=False, get_output_kw=None): reg = regularize_layer_params([layers["hidden5"]], l2) loss = objective(layers, loss_function, target, aggregate, deterministic, get_output_kw) if deterministic is False: return loss + reg * lambda_regularization else: return loss
def build_train_loss(self, train_output, target_values): l2_penalty = regularize_layer_params(self.layers, l2) * self.l2_reg_weight loss = T.nnet.categorical_crossentropy( train_output, target_values).mean() loss += l2_penalty return loss
def _get_loss_updates(self, L1_reg = 0.0, L2_reg = 0.001, update_fn = lasagne.updates.nesterov_momentum, max_norm = None, deterministic = False, **kwargs): """ Returns Theano expressions for the network's loss function and parameter updates. Parameters: L1_reg: float for L1 weight regularization coefficient. L2_reg: float for L2 weight regularization coefficient. max_norm: If not None, constraints the norm of gradients to be less than max_norm. deterministic: True or False. Determines if the output of the network is calculated determinsitically. update_fn: lasagne update function. Default: Stochastic Gradient Descent with Nesterov momentum **kwargs: additional parameters to provide to update_fn. For example: momentum Returns: loss: Theano expression for a penalized negative log likelihood. updates: Theano expression to update the parameters using update_fn. """ loss = ( self._negative_log_likelihood(self.E, deterministic) + regularize_layer_params(self.network,l1) * L1_reg + regularize_layer_params(self.network, l2) * L2_reg ) if max_norm: grads = T.grad(loss,self.params) scaled_grads = lasagne.updates.total_norm_constraint(grads, max_norm) updates = update_fn( grads, self.params, **kwargs ) return loss, updates updates = update_fn( loss, self.params, **kwargs ) return loss, updates
def test_regularize_layer_params_single_layer(self, layers): from lasagne.regularization import regularize_layer_params l_1, l_2, l_3 = layers penalty = Mock(return_value=0) loss = regularize_layer_params(l_2, penalty) assert penalty.call_count == 1 penalty.assert_any_call(l_2.W)
def build_mlp(self, input_var=None, dropout_rate=0.5, l2_reg=0., l1_reg=0.): # This creates an MLP of two hidden layers of 800 units each, followed by # a softmax output layer of 10 units. It applies 20% dropout to the input # data and 50% dropout to the hidden layers. # Input layer, specifying the expected input shape of the network # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and # linking it to the given Theano variable `input_var`, if any: l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var) # Apply 20% dropout to the input data: #l_in_drop = lasagne.layers.DropoutLayer(l_in, p=dropout_rate) # Add a fully-connected layer of 800 units, using the linear rectifier, and # initializing weights with Glorot's scheme (which is the default anyway): l_hid1 = lasagne.layers.DenseLayer( l_in, num_units=800, nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform()) # We'll now add dropout of 50%: self.l2_penalty = regularize_layer_params(l_hid1, l2) self.l1_penalty = regularize_layer_params(l_hid1, l1) l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=dropout_rate) # Another 800-unit layer: #l_hid2 = lasagne.layers.DenseLayer( # l_hid1_drop, num_units=800, # nonlinearity=lasagne.nonlinearities.rectify) # 50% dropout again: #l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=dropout_rate) # Finally, we'll add the fully-connected output layer, of 10 softmax units: l_out = lasagne.layers.DenseLayer( l_hid1_drop, num_units=10, nonlinearity=lasagne.nonlinearities.softmax) # Each layer is linked to its incoming layer(s), so we only need to pass # the output layer to give access to a network in Lasagne: return l_out
def build_qn_type_model(self, from_scratch=False): qtype,qembd = self.qtype,self.qembd qX, mask = self.qX, self.lstm_mask if from_scratch: #q_bow_net = self.build_question_boW(qX) #q_bow = lasagne.layers.get_output(q_bow_net['l_embd']) #l2_penalty_qbow = regularize_layer_params(q_bow_net['l_embd'], l2) #qbow_params = lasagne.layers.get_all_params(q_bow_net['l_embd']) #qembd = T.sum(q_bow,axis=1) q_lstm_net = self.build_qn_classifier_lstm(qX, mask) qlstm_params = lasagne.layers.get_all_params(q_lstm_net['l_dense']) l2_penalty_qlstm = regularize_layer_params(q_lstm_net['l_dense'], l2) #l2_penalty_qlstm += regularize_layer_params(q_lstm_net['l_lstm'], l2) qembd = lasagne.layers.get_output(q_lstm_net['l_dense']) q_type_net = self.build_qn_classifier_mlp(qembd) q_type_pred = lasagne.layers.get_output(q_type_net['l_out'],deterministic=False) l2_penalty_mlp = regularize_layer_params(q_type_net['l_out'], l2) loss = lasagne.objectives.categorical_crossentropy(q_type_pred, qtype) loss = loss.mean() + l2_penalty_mlp loss += l2_penalty_qlstm params = [] qmlp_params = lasagne.layers.get_all_params(q_type_net['l_out']) for p in qmlp_params: params.append(p) for p in qlstm_params: params.append(p) all_grads = T.grad(loss, params) if self.grad_clip != None: all_grads = [T.clip(g, self.grad_clip[0], self.grad_clip[1]) for g in all_grads] updates = lasagne.updates.adam(all_grads, params, learning_rate=0.003) qtype_test_pred = lasagne.layers.get_output(q_type_net['l_out'],deterministic=True) qtype_test_pred = T.argmax(qtype_test_pred, axis=1) print "Compiling..." self.timer.set_checkpoint('compile') if from_scratch: train = theano.function([qX,mask, qtype], loss, updates=updates, allow_input_downcast=True) qtype_predict = theano.function([qX,mask], qtype_test_pred, allow_input_downcast=True) else: train = theano.function([qembd, qtype], loss, updates=updates, allow_input_downcast=True) qtype_predict = theano.function([qembd], qtype_test_pred, allow_input_downcast=True) print "Compile time(mins)", self.timer.print_checkpoint('compile') print "Done Compiling qtype model..." return train, qtype_predict
def loss_function(net, prediction, targets): # We use L2 Norm for regularization l2_reg = regularization.regularize_layer_params( net, regularization.l2) * cfg.L2_WEIGHT # Calculate the loss loss = calc_loss(prediction, targets) + l2_reg return loss
def get_loss(prediction,in_var,target_var,all_layers,l1_reg=True): loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() l_hid=all_layers["out"] reg_param=0.001 if(l1_reg): l1_penalty = regularize_layer_params(l_hid, l1) * reg_param return loss + l1_penalty else: return loss
def weight_decay_objective(layers, loss_function, target, penalty_conv=1e-8, penalty_conv_type = l2, penalty_output=1e-8, penalty_output_type = l2, aggregate=aggregate, deterministic=False, get_output_kw={}): ''' Defines L2 weight decay on network weights. ''' net_out = get_output(layers[-1], deterministic=deterministic, **get_output_kw) loss = loss_function(net_out, target) p1 = penalty_conv * regularize_layer_params(layers[1], penalty_conv_type) p2 = penalty_output * regularize_layer_params(layers[-1], penalty_output_type) losses = loss + p1 + p2 return aggregate(losses)
def build_nn(cls, n_model, n_units=100): ### current params + response + grads in_l = layers.InputLayer(shape=( None, 2 * n_model + 1, ), name='input_params') dense1 = layers.DenseLayer(in_l, num_units=n_units, nonlinearity=nonlinearities.tanh) out_l = layers.DenseLayer(dense1, num_units=n_model, nonlinearity=nonlinearities.linear) reg = \ regularization.regularize_layer_params(dense1, regularization.l2) + \ regularization.regularize_layer_params(out_l, regularization.l2) return in_l, out_l, reg
def build_mlp(self, input_var=None, dropout_rate=0.5, l2_reg=0., l1_reg=0.): l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var) l_in_drop = lasagne.layers.DropoutLayer(l_in, p=dropout_rate) self.l2_penalty = regularize_layer_params(l_in_drop, l2) self.l1_penalty = regularize_layer_params(l_in_drop, l1) l_out = lasagne.layers.DenseLayer( l_in_drop, num_units=10, nonlinearity=lasagne.nonlinearities.softmax) return l_out
def compile_conv_ae(hyper_params,preproc): l_hid,l_out,in_var=build_conv_ae(hyper_params) params = lasagne.layers.get_all_params(l_out, trainable=True) target_var = T.ivector('targets') reconstruction = lasagne.layers.get_output(l_out) reduction=lasagne.layers.get_output(l_hid) loss = lasagne.objectives.squared_error(reconstruction, in_var).mean() l1_penalty = regularize_layer_params(l_hid, l1) * 0.0001 loss+=l1_penalty updates=lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.001, momentum=0.8) return ConvAutoencoder(hyper_params,l_out,preproc,in_var, reduction,reconstruction,loss,updates)
def objective( output_layer, regularize_layers, target, loss_function=squared_error, aggregate=aggregate, deterministic=False, l1=0, l2=0, tv=0, ): network_output = layers.get_output(output_layer, deterministic=deterministic) loss = aggregate(loss_function(network_output, target)) for layer in regularize_layers: if l1: loss += regularization.regularize_layer_params(layer, regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params(layer, regularization.l2) * l2 if tv: loss += T.mean(T.abs_(network_output[:, 1:] - network_output[:, :-1])) * tv return loss
def weight_decay_objective(layers, loss_function, target, penalty_conv=1e-8, penalty_conv_type=l2, penalty_output=1e-8, penalty_output_type=l2, aggregate=aggregate, deterministic=False, get_output_kw={}): ''' Defines L2 weight decay on network weights. ''' net_out = get_output(layers[-1], deterministic=deterministic, **get_output_kw) loss = loss_function(net_out, target) p1 = penalty_conv * regularize_layer_params(layers[1], penalty_conv_type) p2 = penalty_output * regularize_layer_params(layers[-1], penalty_output_type) losses = loss + p1 + p2 return aggregate(losses)
def objective(layers, loss_function, target, aggregate=aggregate, deterministic=False, l1=0, l2=0, get_output_kw=None): """ Default implementation of the NeuralNet objective. :param layers: The underlying layers of the NeuralNetwork :param loss_function: The callable loss function to use :param target: the expected output :param aggregate: the aggregation function to use :param deterministic: Whether or not to get a deterministic output :param l1: Optional l1 regularization parameter :param l2: Optional l2 regularization parameter :param get_output_kw: optional kwargs to pass to :meth:`NeuralNetwork.get_output` :return: The total calculated loss """ if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] network_output = get_output(output_layer, deterministic=deterministic, **get_output_kw) loss = aggregate(loss_function(network_output, target)) if l1: loss += regularization.regularize_layer_params(layers.values(), regularization.l1) * l1 if l2: loss += regularization.regularize_layer_params(layers.values(), regularization.l2) * l2 return loss
def _init_train_fn(self): """ Initialize Theano function to compute loss and update weights using Adam for a single epoch and minibatch. """ input_var = tensor5('input') output_var = T.lvector('output') one_hot = T.extra_ops.to_one_hot(output_var, self.num_classes, dtype='int64') # output_one_hot = T.extra_ops.to_one_hot(output_var, self.num_classes, dtype='int64') # Compute losses by iterating over the input variable (a 5D tensor where each "row" represents a clip that # has some number of frames. [losses, predictions], updates = theano.scan(fn=lambda X_clip, output: self.model.clip_loss(X_clip, output), outputs_info=None, sequences=[input_var, one_hot]) loss = losses.mean() output_layer = self.model.layer('fc8') l2_penalty = regularization.regularize_layer_params(output_layer, regularization.l2) * self.reg * 0.5 for layer_key in self.tuning_layers: layer = self.model.layer(layer_key) l2_penalty += regularization.regularize_layer_params(layer, regularization.l2) * self.reg * 0.5 loss += l2_penalty # Get params for output layer and update using Adam params = output_layer.get_params(trainable=True) adam_update = lasagne.updates.adam(loss, params, learning_rate=self.output_lr) # Combine update expressions returned by theano.scan() with update expressions returned from the adam update updates.update(adam_update) for layer_key in self.tuning_layers: layer = self.model.layer(layer_key) layer_params = layer.get_params(trainable=True) layer_adam_updates = lasagne.updates.adam(loss, layer_params, learning_rate=self.tuning_lr) updates.update(layer_adam_updates) self.train_function = theano.function([input_var, output_var], [loss, predictions], updates=updates)
def build_discriminator_lstm(params, gate_params, cell_params): from lasagne.layers import InputLayer, DenseLayer, concat from lasagne.layers.recurrent import LSTMLayer from lasagne.regularization import l2, regularize_layer_params # from layers import MinibatchLayer # input layers l_in = InputLayer( shape=params['input_shape'], name='d_in') l_mask = InputLayer( shape=params['mask_shape'], name='d_mask') # recurrent layers for bidirectional network l_forward = LSTMLayer( l_in, params['n_units'], grad_clipping=params['grad_clip'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, nonlinearity=params['non_linearities'][0], only_return_final=True, mask_input=l_mask) l_backward = LSTMLayer( l_in, params['n_units'], grad_clipping=params['grad_clip'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, nonlinearity=params['non_linearities'][1], only_return_final=True, mask_input=l_mask, backwards=True) # concatenate output of forward and backward layers l_concat = concat([l_forward, l_backward], axis=1) # minibatch layer on forward and backward layers # l_minibatch = MinibatchLayer(l_concat, num_kernels=100) # output layer l_out = DenseLayer( l_concat, num_units=params['n_output_units'], nonlinearity=params['non_linearities'][2]) regularization = regularize_layer_params( l_out, l2) * params['regularization'] class Discriminator: def __init__(self, l_in, l_mask, l_out): self.l_in = l_in self.l_mask = l_mask self.l_out = l_out self.regularization = regularization return Discriminator(l_in, l_mask, l_out)
def objective(layers, loss_function, target, aggregate=aggregate, deterministic=False, get_output_kw=None): if get_output_kw is None: get_output_kw = {} output_layer = layers[-1] first_layer = layers[1] network_output = lasagne.layers.get_output( output_layer, deterministic=deterministic, **get_output_kw) if not deterministic: losses = loss_function(network_output, target) \ + l2 * regularization.regularize_network_params( output_layer, regularization.l2) \ + l1 * regularization.regularize_layer_params( first_layer, regularization.l1) else: losses = loss_function(network_output, target) return aggregate(losses)
def define_updates(network, inputs, targets): prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(T.clip(prediction, 0.00001, 0.99999), targets) loss = loss.mean() test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(T.clip(test_prediction, 0.00001, 0.99999), targets) test_loss = test_loss.mean() l2_loss = regularize_layer_params(network, l2) * params.L2_LAMBDA loss = loss + l2_loss test_loss = test_loss + l2_loss acc = T.mean(T.eq(T.argmax(prediction, axis=1), targets), dtype=theano.config.floatX) test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), targets), dtype=theano.config.floatX) l_r = theano.shared(np.array(params.LEARNING_RATE, dtype=theano.config.floatX)) # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD), but Lasagne offers plenty more. network_params = lasagne.layers.get_all_params(network, trainable=True) if params.OPTIMIZATION == "MOMENTUM": updates = lasagne.updates.momentum(loss, network_params, learning_rate=l_r, momentum=params.MOMENTUM) elif params.OPTIMIZATION == "ADAM": updates = lasagne.updates.adam(loss, network_params, learning_rate=l_r) elif params.OPTIMIZATION == "RMSPROP": updates = lasagne.updates.adam(loss, network_params) prediction_binary = T.argmax(prediction, axis=1) test_prediction_binary = T.argmax(test_prediction, axis=1) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([inputs, targets], [loss, l2_loss, acc, prediction_binary], updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([inputs, targets], [test_loss, l2_loss, test_acc, test_prediction_binary]) return train_fn, val_fn, l_r
def __init__(self, *args, **kwargs): super(TrainerMixin, self).__init__(*args, **kwargs) input_var = tensor.tensor4('inputs') target_var = tensor.ivector('targets') loss, _ = loss_acc(self.model, input_var, target_var, deterministic=False) layers = get_all_layers(self.model) decay = regularize_layer_params(layers, l2) * 0.0001 loss = loss + decay params = get_all_params(self.model, trainable=True) updates = momentum(loss, params, momentum=0.9, learning_rate=self.learning_rate) self.set_training(input_var, target_var, loss, updates)
def nll_l2(predictions, targets, net, batch_size, num_samples, rw=None, train_clip=False, thresh=3, weight_decay=0.00001, **kwargs): if rw is None: rw = theano.shared(np.cast[theano.config.floatX](0)) print('Weight decay:', weight_decay) loss = categorical_crossentropy(predictions, targets).mean() loss += rg.regularize_layer_params(ll.get_all_layers(net), rg.l2) * weight_decay return loss, rw
def compile_train_function(neural_network, lr, w_dacy): input_var = neural_network['input'].input_var output_var = T.lvector() # Variable symbolic predicted = lasagne.layers.get_output(neural_network['out'], inputs=input_var) # Answer of output loss = lasagne.objectives.categorical_crossentropy(predicted, output_var) # Function of error loss = loss.mean() """ Regularize L2 (avoid over-fitting) Only to function of train Lreg = L + λ*∑(w^2) where: L --> loss λ --> weight decay w --> weight """ loss += w_dacy * regularize_layer_params(neural_network['out'], l2) # Regularize L2 # Accuracy rate y_pred = T.argmax(predicted, axis=1) acc = T.eq(y_pred, output_var) acc = acc.mean() valid_predicted = lasagne.layers.get_output(neural_network['out'], inputs=input_var) # Validation answer of output valid_loss = lasagne.objectives.categorical_crossentropy(valid_predicted, output_var) # Validation function of error valid_loss = valid_loss.mean() # Validation accuracy rate valid_y_pred = T.argmax(valid_predicted, axis=1) valid_acc = T.eq(valid_y_pred, output_var) valid_acc = valid_acc.mean() # Parameters updating params = lasagne.layers.get_all_params(neural_network['out']) updates = lasagne.updates.sgd(loss, params, lr) # Compile function train_fn = theano.function([input_var, output_var], [loss, acc], updates=updates) valid_fn = theano.function([input_var, output_var], [valid_loss, valid_acc]) return train_fn, valid_fn
def build_mlp(size_x, lstm_size, input_var=None): lstm_nonlinearity = lasagne.nonlinearities.sigmoid gate_parameters = lasagne.layers.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.)) cell_parameters = lasagne.layers.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), # Setting W_cell to None denotes that no # cell connection will be used. W_cell=None, b=lasagne.init.Constant(0.), # By convention, the cell nonlinearity is tanh in an LSTM. nonlinearity=lasagne.nonlinearities.tanh) l_in = InputLayer((None, None, size_x), input_var=input_var) batch_size, seqlen, _ = l_in.input_var.shape l_lstm = LSTMLayer(l_in, lstm_size, learn_init=True, nonlinearity=lstm_nonlinearity, ingate=gate_parameters, forgetgate=gate_parameters, cell=cell_parameters, outgate=gate_parameters, grad_clipping=100.) l2_penalty = regularize_layer_params(l_lstm, l2) l_reshape = lasagne.layers.ReshapeLayer(l_lstm, (-1, lstm_size)) # Now, we can apply feed-forward layers as usual. l_dense = lasagne.layers.DenseLayer( l_reshape, num_units=1, nonlinearity=None) # Now, the shape will be n_batch*n_timesteps, 1. We can then reshape to # batch_size, seqlen to get a single value # for each timstep from each sequence l_out = lasagne.layers.ReshapeLayer(l_dense, (batch_size, seqlen, size_x)) # l1_penalty = regularize_layer_params(l_out, l2) return l_out, l2_penalty # , l1_penalty
def default_training(self): """Set the training (updates) for this trainer.""" input_var = tensor.tensor4('inputs') target_var = tensor.ivector('targets') errors = OrderedDict() loss, acc = loss_acc(self.model, input_var, target_var, deterministic=False) errors['train_acc'] = acc errors['classification error'] = loss layers = get_all_layers(self.model) decay = regularize_layer_params(layers, l2) * self.weight_decay errors['weight decay'] = decay loss = loss + decay params = get_all_params(self.model, trainable=True) updates = self.momentum_method(loss, params, momentum=self.momentum, learning_rate=self.learning_rate) self.set_training(input_var, target_var, loss, updates, values=errors)
def _get_loss_updates(self, L1_reg=0.0, L2_reg=0.001, update_fn=lasagne.updates.nesterov_momentum, max_norm=None, deterministic=False, momentum=0.9, **kwargs): loss = (self._negative_log_likelihood(self.network_1, self.E, deterministic) + self._negative_log_likelihood(self.network_2, self.E, deterministic) + regularize_layer_params(self.network_1, l1) * L1_reg + regularize_layer_params(self.network_1, l2) * L2_reg + regularize_layer_params(self.network_2, l1) * L1_reg + regularize_layer_params(self.network_2, l2) * L2_reg + (regularize_layer_params(self.network_1, l2) - regularize_layer_params(self.network_2, l2)) * L2_reg) if max_norm: grads = T.grad(loss, self.params) scaled_grads = lasagne.updates.total_norm_constraint( grads, max_norm) updates = update_fn(scaled_grads, self.params, **kwargs) else: updates = update_fn(loss, self.params, **kwargs) if momentum: updates = lasagne.updates.apply_nesterov_momentum( updates, self.params, self.learning_rate, momentum=momentum) # If the model was loaded from file, reload params if self.restored_update_params: for p, value in zip(updates.keys(), self.restored_update_params): p.set_value(value) self.restored_update_params = None # Store last update function to be later saved self.updates = updates return loss, updates
prediction = T.clip(prediction, 0.0000001, 0.9999999) #binary crossentropy is the best choice for a multi-class sigmoid output loss = T.mean(objectives.binary_crossentropy(prediction, targets)) return loss #theano variable for the class targets targets = T.matrix('targets', dtype=theano.config.floatX) #get the network output prediction = l.get_output(NET) #we use L2 Norm for regularization l2_reg = regularization.regularize_layer_params(NET, regularization.l2) * L2_WEIGHT #calculate the loss if MULTI_LABEL: loss = calc_loss_multi(prediction, targets) + l2_reg else: loss = calc_loss(prediction, targets) + l2_reg ################# ACCURACY FUNCTION ##################### def calc_accuracy(prediction, targets): #we can use the lasagne objective categorical_accuracy to determine the top1 single label accuracy a = T.mean(objectives.categorical_accuracy(prediction, targets, top_k=1)) return a
def main(model='cnn', num_epochs=5): # Load the dataset print "Loading data..." X_train, y_train, X_val, y_val, X_test, y_test = load_data_set() # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs', dtype=theano.config.floatX) target_var = T.ivector('targets') # Create neural network model (depending on first command line parameter) print "Building model and compiling functions..." if model == 'cnn': network = build_cnn(input_var) else: print "Unrecognized model type %r.", model return # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) # loss = lasagne.objectives.binary_crossentropy(prediction,target_var) loss = loss.mean() l2_penalty = regularize_layer_params(network, l2) * 1e-1 loss += l2_penalty acc = T.mean(T.eq(T.argmax(prediction, axis=1), target_var), dtype=theano.config.floatX) # We could add some weight decay as well here, see lasagne.regularization. # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.001, momentum=0.9) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) # test_loss = lasagne.objectives.binary_crossentropy(test_prediction,target_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], [loss, acc], updates=updates, allow_input_downcast=True) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True) test_pre = theano.function([input_var, target_var], [prediction], on_unused_input='ignore') # Finally, launch the training loop. 
print("Starting training...") # We iterate over epochs: train_out = open("result/train_loss.txt", 'w') val_out = open("result/val_loss.txt", 'w') for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_acc = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True): inputs, targets = batch # print type(targets), targets err, acc = train_fn(inputs, targets) train_err += err train_acc += acc # train_err += train_fn(inputs, targets) train_batches += 1 train_out.write(str(train_err) + "\r\n") # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 val_out.write(str(val_err) + "\r\n") # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" training accuracy:\t\t{:.2f} %".format(train_acc / train_batches * 100)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100)) # After training, we compute and print the test error: train_out.close() val_out.close() test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatches(X_test, y_test, 5, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 pre = test_pre(inputs, targets) print "预测概率:", pre print("Final results:") print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
def main(num_epochs=200): # Load the dataset print("Loading data...") datasets = load_data() X_train, y_train = datasets[0] X_test, y_test = datasets[1] # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs') target_var = T.ivector('targets') learnrate=0.02 # Create neural network model (depending on first command line parameter) print("Building model and compiling functions...") network = build_cnn(input_var) # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): l2_penalty = regularize_layer_params(network, l2) prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean()+0.1*l2_penalty # We could add some weight decay as well here, see lasagne.regularization. params = lasagne.layers.get_all_params(network, trainable=True) #optimizer: #updates = lasagne.updates.adadelta(loss, params,learning_rate=learnrate) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=learnrate, momentum=0.9) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) best_acc = 0 # Finally, launch the training loop. 
print("Starting training...") # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() if epoch % 8 == 7: learnrate*= 0.96 #updates = lasagne.updates.adadelta(loss, params,learning_rate=learnrate) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=learnrate, momentum=0.9) train_fn = theano.function([input_var, target_var], loss, updates=updates) for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=False): inputs, targets = batch train_err += train_fn(inputs, targets) train_batches += 1 # And a full pass over the validation data: test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatches(X_test, y_test,batch_size, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 test_err = test_err / test_batches test_acc = test_acc / test_batches # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" test loss:\t\t{:.6f}".format(test_err)) print(" validation accuracy:\t\t{:.2f} %".format( test_acc * 100)) if test_acc > best_acc: best_acc = test_acc np.savez('model10.npz', *lasagne.layers.get_all_param_values(network)) print("final accuracy is:\t\t{:.6f}".format(best_acc * 100)) print('*****************************************************\n'*2) return best_acc
def main(num_epochs=100): # Load the dataset print("Loading data...") datasets = load_data() X_train, y_train = datasets[0], datasets[1] X_val, y_val = datasets[2], datasets[3] # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs') target_var = T.ivector('targets') learnrate=0.005 # Create neural network model (depending on first command line parameter) print("Building model and compiling functions...") network = build_cnn(input_var) # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): l2_penalty = regularize_layer_params(network, l2) prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean()+0.01*l2_penalty # We could add some weight decay as well here, see lasagne.regularization. # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=learnrate, momentum=0.9) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) best_val_loss=10 improvement_threshold=0.999 best_acc=0 # Finally, launch the training loop. 
print("Starting training...") # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() if epoch % 8 == 7: learnrate*=0.96 updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=learnrate, momentum=0.9) for batch in iterate_minibatches(X_train, y_train,BATCHSIZE, shuffle=False): inputs, targets = batch train_err += train_fn(inputs, targets) train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(X_val, y_val, BATCHSIZE, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format( val_acc / val_batches * 100)) if val_err/val_batches < best_val_loss*improvement_threshold: np.savez('best_model_omit5_v2.npz', *lasagne.layers.get_all_param_values(network)) best_val_loss=val_err/val_batches print(" best validation loss\t\t{:.6f}".format(best_val_loss)) if val_acc / val_batches>best_acc: best_acc=val_acc / val_batches np.savez('best_classification_model_omit5_v2.npz', *lasagne.layers.get_all_param_values(network)) print(' saved best classification model')
split = data.split_data(labels, args.seeds) maxf = get_maxf(features) trainx, trainy = constuct_dataset(features, labels, label_set, split[0], maxf) testx, testy = constuct_dataset(features, labels, label_set, split[1], maxf) allx, ally = constuct_dataset(features, labels, label_set, features.keys(), maxf) input_var = sparse.csr_matrix(name = 'x', dtype = 'float32') un_var = sparse.csr_matrix(name = 'ux', dtype = 'float32') target_var = T.imatrix('targets') ent_target = T.ivector('ent_targets') network, l_entropy = build_model(input_var, maxf + 1, trainy.shape[1], args.ent_reg > 0, un_var) prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() + regularize_layer_params(network, l2) * args.param_reg if args.ent_reg > 0.0: ent_pred = lasagne.layers.get_output(l_entropy) loss += lasagne.objectives.binary_crossentropy(ent_pred, ent_target).mean() * args.ent_reg params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=args.learning_rate, momentum = 0.9) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis = 1)), dtype=theano.config.floatX)
def create_nnet(input_dims, action_dims, observation_dims, value_dims, learning_rate, grad_clip=None, l1_weight=None, l2_weight=None, num_hidden_units=20, num_hidden_action_units=None, num_hidden_observ_units=None, num_hidden_value_units=None, batch_size=32, max_train_epochs=1, hidden_nonlinearity=nonlinearities.rectify, output_nonlinearity=None, update_method=updates.sgd): commonlayers = [] commonlayers.append(layers.InputLayer(shape=(None, input_dims))) commonlayers.append(DenseLayer(commonlayers[-1], num_hidden_units, nonlinearity=hidden_nonlinearity)) if num_hidden_action_units is None: actionlayers = [DenseLayer(commonlayers[-1], action_dims, nonlinearity=output_nonlinearity)] else: actionlayers = [DenseLayer(commonlayers[-1], num_hidden_action_units, nonlinearity=output_nonlinearity)] actionlayers.append(DenseLayer(actionlayers[-1], action_dims, nonlinearity=output_nonlinearity)) if num_hidden_observ_units is None: observlayers = [DenseLayer(commonlayers[-1], observation_dims, nonlinearity=output_nonlinearity)] else: observlayers = [DenseLayer(commonlayers[-1], num_hidden_observ_units, nonlinearity=output_nonlinearity)] observlayers.append(DenseLayer(observlayers[-1], observation_dims, nonlinearity=output_nonlinearity)) if num_hidden_value_units is None: dvaluelayers = [DenseLayer(commonlayers[-1], value_dims, nonlinearity=output_nonlinearity)] else: dvaluelayers = [DenseLayer(commonlayers[-1], num_hidden_value_units, nonlinearity=output_nonlinearity)] dvaluelayers.append(DenseLayer(dvaluelayers[-1], value_dims, nonlinearity=output_nonlinearity)) actvallayers = [layers.ConcatLayer([actionlayers[-1], dvaluelayers[-1]])] obsvallayers = [layers.ConcatLayer([observlayers[-1], dvaluelayers[-1]])] concatlayers = [layers.ConcatLayer([actionlayers[-1], observlayers[-1], dvaluelayers[-1]])] action_prediction = layers.get_output(actionlayers[-1]) dvalue_prediction = layers.get_output(dvaluelayers[-1]) actval_prediction = layers.get_output(actvallayers[-1]) obsval_prediction = layers.get_output(obsvallayers[-1]) concat_prediction = layers.get_output(concatlayers[-1]) input_var = commonlayers[0].input_var action_target = T.matrix(name="action_target", dtype=floatX) dvalue_target = T.matrix(name="value_target", dtype=floatX) actval_target = T.matrix(name="actval_target", dtype=floatX) obsval_target = T.matrix(name="obsval_target", dtype=floatX) concat_target = T.matrix(name="concat_target", dtype=floatX) action_loss = objectives.squared_error(action_prediction, action_target).mean() obsval_loss = objectives.squared_error(obsval_prediction, obsval_target).mean() dvalue_loss = objectives.squared_error(dvalue_prediction, dvalue_target).mean() actval_loss = objectives.squared_error(actval_prediction, actval_target).mean() concat_loss = objectives.squared_error(concat_prediction, concat_target).mean() if l1_weight is not None: action_l1penalty = regularize_layer_params(commonlayers + actionlayers, l1) * l1_weight obsval_l1penalty = regularize_layer_params(commonlayers + observlayers + dvaluelayers, l1) * l1_weight dvalue_l1penalty = regularize_layer_params(commonlayers + dvaluelayers, l1) * l1_weight actval_l1penalty = regularize_layer_params(commonlayers + actionlayers + dvaluelayers, l1) * l1_weight concat_l1penalty = regularize_layer_params(commonlayers + actionlayers + observlayers + dvaluelayers, l1) * l1_weight action_loss += action_l1penalty obsval_loss += obsval_l1penalty dvalue_loss += dvalue_l1penalty actval_loss += actval_l1penalty concat_loss += concat_l1penalty if l2_weight is not None: 
action_l2penalty = regularize_layer_params(commonlayers + actionlayers, l2) * l2_weight obsval_l2penalty = regularize_layer_params(commonlayers + observlayers + dvaluelayers, l2) * l2_weight dvalue_l2penalty = regularize_layer_params(commonlayers + dvaluelayers, l2) * l2_weight actval_l2penalty = regularize_layer_params(commonlayers + actionlayers + dvaluelayers, l2) * l2_weight concat_l2penalty = regularize_layer_params(commonlayers + actionlayers + observlayers + dvaluelayers, l2) * l2_weight action_loss += action_l2penalty obsval_loss += obsval_l2penalty dvalue_loss += dvalue_l2penalty actval_loss += actval_l2penalty concat_loss += concat_l2penalty action_params = layers.get_all_params(actionlayers[-1], trainable=True) obsval_params = layers.get_all_params(obsvallayers[-1], trainable=True) dvalue_params = layers.get_all_params(dvaluelayers[-1], trainable=True) actval_params = layers.get_all_params(actvallayers[-1], trainable=True) concat_params = layers.get_all_params(concatlayers[-1], trainable=True) if grad_clip is not None: action_grads = theano.grad(action_loss, action_params) obsval_grads = theano.grad(obsval_loss, obsval_params) dvalue_grads = theano.grad(dvalue_loss, dvalue_params) actval_grads = theano.grad(actval_loss, actval_params) concat_grads = theano.grad(concat_loss, concat_params) action_grads = [updates.norm_constraint(grad, grad_clip, range(grad.ndim)) for grad in action_grads] obsval_grads = [updates.norm_constraint(grad, grad_clip, range(grad.ndim)) for grad in obsval_grads] dvalue_grads = [updates.norm_constraint(grad, grad_clip, range(grad.ndim)) for grad in dvalue_grads] actval_grads = [updates.norm_constraint(grad, grad_clip, range(grad.ndim)) for grad in actval_grads] concat_grads = [updates.norm_constraint(grad, grad_clip, range(grad.ndim)) for grad in concat_grads] action_updates = update_method(action_grads, action_params, learning_rate) obsval_updates = update_method(obsval_grads, obsval_params, learning_rate) dvalue_updates = update_method(dvalue_grads, dvalue_params, learning_rate) actval_updates = update_method(actval_grads, actval_params, learning_rate) concat_updates = update_method(concat_grads, concat_params, learning_rate) else: action_updates = update_method(action_loss, action_params, learning_rate) obsval_updates = update_method(obsval_loss, obsval_params, learning_rate) dvalue_updates = update_method(dvalue_loss, dvalue_params, learning_rate) actval_updates = update_method(actval_loss, actval_params, learning_rate) concat_updates = update_method(concat_loss, concat_params, learning_rate) fit_action = theano.function([input_var, action_target], action_loss, updates=action_updates) fit_obsval = theano.function([input_var, obsval_target], obsval_loss, updates=obsval_updates) fit_dvalue = theano.function([input_var, dvalue_target], dvalue_loss, updates=dvalue_updates) fit_actval = theano.function([input_var, actval_target], actval_loss, updates=actval_updates) fit_concat = theano.function([input_var, concat_target], concat_loss, updates=concat_updates) predict_action = theano.function([input_var], action_prediction) predict_obsval = theano.function([input_var], obsval_prediction) predict_dvalue = theano.function([input_var], dvalue_prediction) predict_actval = theano.function([input_var], actval_prediction) predict_concat = theano.function([input_var], concat_prediction) nnet = Mock( fit_action=fit_action, fit_obsval=fit_obsval, fit_value=fit_dvalue, fit_actval=fit_actval, fit_both=fit_concat, predict_action=predict_action, 
predict_obsval=predict_obsval, predict_value=predict_dvalue, predict_actval=predict_actval, predict_both=predict_concat, ) return nnet
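# Hypothetical usage of the returned mock network; the dimensions, batch size
# and l2_weight below are illustrative assumptions, not values from the source.
import numpy as np

nnet = create_nnet(input_dims=8, action_dims=2, observation_dims=8, value_dims=1,
                   learning_rate=0.01, l2_weight=1e-4)
states = np.random.randn(32, 8).astype(floatX)
action_targets = np.random.randn(32, 2).astype(floatX)
loss_value = nnet.fit_action(states, action_targets)  # one update step on the action head
q_estimates = nnet.predict_action(states)             # forward pass through the action head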
def main(num_epochs=500):
    # Load the dataset
    print("Loading data...")
    #X_train, y_train, X_val, y_val, X_test, y_test= pull_data()
    trainX, trainY, valX, valY, testX, testY = pull_data()
    trainX = normalize(trainX.reshape(trainX.shape[0], 1, DIM, DIM))
    valX = normalize(valX.reshape(valX.shape[0], 1, DIM, DIM))
    testX = normalize(testX.reshape(testX.shape[0], 1, DIM, DIM))
    trainY = trainY - 1
    valY = valY - 1
    testY = testY - 1
    trainX, trainY = shuffle(trainX, trainY)
    valX, valY = shuffle(valX, valY)
    testX, testY = shuffle(testX, testY)
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    output_var = T.ivector('targets')
    model = build_cnn(input_var)
    print "[X] CNN defining its goals."
    model_params = lasagne.layers.get_all_params(model, trainable=True)
    sh_lr = theano.shared(lasagne.utils.floatX(LEARNING_RATE))
    # Output expressions are built twice: with dropout/noise enabled for training
    # and deterministically for scoring.
    noisy_output = lasagne.layers.get_output(model, input_var, deterministic=False)
    true_output = lasagne.layers.get_output(model, input_var, deterministic=True)
    noisy_prediction = T.argmax(noisy_output, 1)
    true_prediction = T.argmax(true_output, 1)
    l2_loss = regularize_layer_params(model, l2) * L2_REG
    ## Loss expressions
    noisy_cost = T.mean(T.nnet.categorical_crossentropy(noisy_output, output_var)) + l2_loss
    true_cost = T.mean(T.nnet.categorical_crossentropy(true_output, output_var)) + l2_loss
    ## Error values
    noisy_error = 1.0 - T.mean(lasagne.objectives.categorical_accuracy(noisy_output, output_var))
    true_error = 1.0 - T.mean(lasagne.objectives.categorical_accuracy(true_output, output_var))
    ## Stochastic gradient descent updates
    #updates = lasagne.updates.sgd(noisy_cost, model_params, learning_rate=sh_lr)
    ## Stochastic gradient descent with Nesterov momentum
    updates = lasagne.updates.nesterov_momentum(
        noisy_cost, model_params, learning_rate=sh_lr, momentum=0.99)
    train = theano.function([input_var, output_var], [noisy_cost, noisy_error],
                            updates=updates, allow_input_downcast=True)
    get_score = theano.function([input_var, output_var], [true_cost, true_error],
                                allow_input_downcast=True)
    best_validation_cost = np.inf
    best_iter = 0
    n_train_batches = int(np.ceil(trainX.shape[0] / float(BATCH_SIZE)))
    plot_iters = []
    plot_train_cost = []
    plot_train_error = []
    plot_valid_cost = []
    plot_valid_error = []
    plot_test_cost = []
    plot_test_error = []
    epoch = 0
    # start_time was never set in the original snippet although it is used in the
    # runtime report below; it is assumed here to mark the start of training.
    start_time = timeit.default_timer()
    print "[X] CNN begins its training."
    try:
        while True:
            epoch = epoch + 1
            for minibatch_index in xrange(n_train_batches):
                iter = (epoch - 1) * n_train_batches + minibatch_index
                if iter % 100 == 0:
                    print "[O] Training at iteration %d." % iter
                cost_ij = train(
                    trainX[minibatch_index * BATCH_SIZE:np.minimum((minibatch_index + 1) * BATCH_SIZE, trainX.shape[0])],
                    trainY[minibatch_index * BATCH_SIZE:np.minimum((minibatch_index + 1) * BATCH_SIZE, trainY.shape[0])])
                if (iter + 1) % VALIDATION_FREQUENCY == 0:
                    train_cost, train_error = get_score(trainX, trainY)
                    valid_cost, valid_error = get_score(valX, valY)
                    test_cost, test_error = get_score(testX, testY)
                    plot_train_cost.append(train_cost)
                    plot_train_error.append(train_error)
                    plot_valid_cost.append(valid_cost)
                    plot_valid_error.append(valid_error)
                    plot_test_cost.append(test_cost)
                    plot_test_error.append(test_error)
                    plot_iters.append(iter)
                    ## plotting functions
                    if not os.path.exists(FIGURE_SAVE_DIR):
                        os.makedirs(FIGURE_SAVE_DIR)
                    plot_curves(plot_iters, plot_train_cost, plot_valid_cost,
                                'Training Cost', 'Validation Cost', 'train_val_cost.pdf')
                    plot_curves(plot_iters, plot_train_error, plot_valid_error,
                                'Training Error', 'Validation Error', 'train_val_error.pdf')
                    #plot_cm(train_pred, trainY, 'Confusion Matrix on the Training Set', 'cm_train.pdf')
                    #plot_cm(valid_pred, valY, 'Confusion Matrix on the Validation Set', 'cm_valid.pdf')
                    #plot_cm(test_pred, testY, 'Confusion Matrix on the Test Set', 'cm_test.pdf')
                    print "--> Epoch %i, minibatch %i/%i has training true cost \t %f." % (epoch, minibatch_index + 1, n_train_batches, train_cost)
                    print "--> Epoch %i, minibatch %i/%i has validation true cost \t %f and error of \t %f %%." % (epoch, minibatch_index + 1, n_train_batches, valid_cost, valid_error)
                    if valid_cost < best_validation_cost:
                        print "----> New best score found!"
                        print "--> Test cost of %f and test error of %f." % (test_cost, test_error)
                        if not os.path.exists(PARAM_SAVE_DIR):
                            os.makedirs(PARAM_SAVE_DIR)
                        for f in glob.glob(PARAM_SAVE_DIR + '/*'):
                            os.remove(f)
                        all_param_values = lasagne.layers.get_all_param_values(model)
                        joblib.dump(all_param_values, os.path.join(PARAM_SAVE_DIR, 'params.pkl'))
                        print "----> Parameters saved."
                        best_validation_cost = valid_cost
                        best_iter = iter
    except KeyboardInterrupt:
        pass
    end_time = timeit.default_timer()
    print "--> Best validation score of %f." % best_validation_cost
    print "--> Total runtime %.2f minutes." % ((end_time - start_time) / 60.)
    print "[X] Saving the scores."
    joblib.dump(plot_iters, os.path.join(PARAM_SAVE_DIR, "iters.pkl"))
    joblib.dump(plot_train_cost, os.path.join(PARAM_SAVE_DIR, "train_cost.pkl"))
    joblib.dump(plot_train_error, os.path.join(PARAM_SAVE_DIR, "train_error.pkl"))
    joblib.dump(plot_valid_cost, os.path.join(PARAM_SAVE_DIR, "valid_cost.pkl"))
    joblib.dump(plot_valid_error, os.path.join(PARAM_SAVE_DIR, "valid_error.pkl"))
    joblib.dump(plot_test_cost, os.path.join(PARAM_SAVE_DIR, "test_cost.pkl"))
    joblib.dump(plot_test_error, os.path.join(PARAM_SAVE_DIR, "test_error.pkl"))
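# Sketch (assumed, not from the original script): restoring the best parameters
# that the loop above dumped with joblib.
best_params = joblib.load(os.path.join(PARAM_SAVE_DIR, 'params.pkl'))
lasagne.layers.set_all_param_values(model, best_params)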
def main(num_epochs=200): # Load the dataset print("Loading data...") datasets = load_data() X_train, y_train = datasets[0] X_test, y_test = datasets[1] # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs') target_var = T.ivector('targets') learnrate = 0.01 # Create neural network model (depending on first command line parameter) print("Building model and compiling functions...") network = build_cnn(input_var) # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): l2_penalty = regularize_layer_params(network, l2) l1_penalty = regularize_layer_params(network, l1) prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() + 5 * l2_penalty + l1_penalty # We could add some weight decay as well here, see lasagne.regularization. # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. params = lasagne.layers.get_all_params(network, trainable=True) #updates = lasagne.updates.adadelta(loss, params) updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=learnrate, momentum=0.9) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) best_acc = 0 # Finally, launch the training loop. 
print("Starting training...") # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() if epoch % 8 == 7: learnrate *= 0.96 #updates = lasagne.updates.adadelta(loss, params,learning_rate=learnrate) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=learnrate, momentum=0.9) train_fn = theano.function([input_var, target_var], loss, updates=updates) for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=False): inputs, targets = batch train_err += train_fn(inputs, targets) train_batches += 1 # And a full pass over the validation data: test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 test_err = test_err / test_batches test_acc = test_acc / test_batches # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" test loss:\t\t{:.6f}".format(test_err)) print(" validation accuracy:\t\t{:.2f} %".format(test_acc * 100)) if test_acc > best_acc: best_acc = test_acc return best_acc
def main(num_epochs=100, num_points=1200, compute_flag='cpu'):
    # Arguments passed as string need to be converted to int
    num_epochs = int(num_epochs)
    num_points = int(num_points)
    # Define name of output files
    results_file_name = 'exp_' + str(num_epochs) + '_' + str(num_points) + '_' + compute_flag + '.csv'
    network_file_name = 'network_' + str(num_epochs) + '_' + str(num_points) + '_' + compute_flag
    print 'Saving file to: %s' % results_file_name
    print 'Number of points: %d ' % num_points
    print 'Compute Flag: %s ' % compute_flag
    save_file(results_file_name)
    Deep_learner = DCNN_network.DCNN_network()
    # Define the input tensor
    input_var = T.tensor4('inputs')
    # Define the output tensor (in this case it is a real value or reflectivity)
    if compute_flag == 'gpu3_softmax':
        output_var = T.ivector('targets')
    else:
        output_var = T.fcol('targets')
    # User input to decide which experiment to run, cpu runs were performed
    # to check if the network was working correctly
    if compute_flag == 'cpu':
        network, l_hidden1 = Deep_learner.build_CNN(input_var)
    elif compute_flag == 'cpu2':
        network, l_hidden1 = Deep_learner.build_CNN_2(input_var)
    elif compute_flag == 'cpu3':
        network, l_hidden1 = Deep_learner.build_CNN_3(input_var)
    elif compute_flag == 'gpu2':
        print('gpu2 experiment')
        network, l_hidden1 = Deep_learner.build_DCNN_2(input_var)
    elif compute_flag == 'gpu3':
        print('gpu3 experiment')
        network, l_hidden1 = Deep_learner.build_DCNN_3(input_var)
    elif compute_flag == 'deep':
        network, l_hidden1 = Deep_learner.build_DCNN_deep(input_var)
    elif compute_flag == 'gpu3_softmax':
        network, l_hidden1 = Deep_learner.build_DCNN_3_softmax(input_var)
    else:
        network, l_hidden1 = Deep_learner.build_DCNN(input_var)
    train_prediction = lasagne.layers.get_output(network)
    test_prediction = lasagne.layers.get_output(network)
    if compute_flag == 'gpu3_softmax':
        loss = lasagne.objectives.categorical_crossentropy(train_prediction, output_var)
        loss = loss.mean()
    else:
        # Define the mean square error objective function
        loss = T.mean(lasagne.objectives.squared_error(train_prediction, output_var))
        test_loss = T.mean(lasagne.objectives.squared_error(test_prediction, output_var))
    # Add an l1 regularization on the fully connected dense layer
    l1_penalty = regularize_layer_params(l_hidden1, l1)
    loss = loss + l1_penalty
    # NOTE: this reuses the (already penalized) training loss; in the squared-error
    # branch it overwrites the test_loss expression defined above.
    test_loss = loss + l1_penalty
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.0000001, momentum=0.9)
    train_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), output_var), dtype=theano.config.floatX)
    # Define theano function which generates and compiles C code for the optimization problem
    train_fn = theano.function([input_var, output_var], [loss, train_acc], updates=updates)
    # test_fn = theano.function([input_var, output_var], test_loss, updates=updates)
    base_path = '/home/an67a/deep_nowcaster/data/dataset2/'
    training_set_list = os.listdir(base_path)
    training_set_list = filter(lambda x: x[-4:] == '.pkl' and 'val' not in x, training_set_list)
    validation_set_list = os.listdir(base_path)
    validation_set_list = filter(lambda x: x[-4:] == '.pkl' and 'val' in x, validation_set_list)
    experiment_start_time = time.time()
    # Load Data Set
    DataSet = []
    print('Loading data set...')
    for file_name in training_set_list[:3]:
        print file_name
        temp_file = file(base_path + file_name, 'rb')
        X_train, Y_train = cPickle.load(temp_file)
        temp_file.close()
        Y_train = Y_train.reshape(-1, ).astype('uint8')
        DataSet.append((X_train, Y_train))
    print('Start training...')
    for epoch in range(num_epochs):
        print('Epoch number : %d ' % epoch)
        train_err = 0
        train_batches = 0
        train_acc = 0
        start_time = time.time()
        for data in DataSet:
            # for file_name in training_set_list:
            #     print file_name
            #     temp_file = file(base_path + file_name,'rb')
            #     X_train,Y_train = cPickle.load(temp_file)
            #     Y_train = Y_train.astype('uint8')
            #     temp_file.close()
            for batch in iterate_minibatches(data[0], data[1], 1059, shuffle=False):
                inputs, targets = batch
                err, acc = train_fn(inputs, targets)
                train_err += err
                train_acc += acc
                train_batches += 1
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(train_acc / train_batches * 100))
        append_file(results_file_name, epoch + 1, round(train_err / train_batches, 2),
                    round((train_acc / train_batches) * 100, 2))
        # Dump the network file every 100 epochs
        if (epoch + 1) % 100 == 0:
            print('creating network file')
            network_file = file('/home/an67a/deep_nowcaster/output/' + network_file_name + '_' + str(epoch + 1) + '.pkl', 'wb')
            cPickle.dump(network, network_file, protocol=cPickle.HIGHEST_PROTOCOL)
            network_file.close()
    time_taken = round(time.time() - experiment_start_time, 2)
    print('The experiment took {:.3f}s'.format(time.time() - experiment_start_time))
    append_file(results_file_name, 'The experiment took', time_taken, 0)
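# Assumed variant, not the original experiment: per-layer penalty coefficients
# via regularize_layer_params_weighted instead of a single unscaled l1 term on
# l_hidden1 (the 1e-2 / 1e-4 weights are illustrative).
from lasagne.regularization import regularize_layer_params_weighted, l2
penalty = regularize_layer_params_weighted({l_hidden1: 1e-2, network: 1e-4}, l2)
loss_weighted = loss + penalty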
def prepare_functions(): from lasagne.regularization import regularize_layer_params_weighted, regularize_layer_params from lasagne.regularization import l1, l2 """ This prepares the theano/lasagne functions for use in the training functions """ observations = T.matrix('observations') srng = RandomStreams(seed=42) predictions = T.vector('predictions') predictions_ct = theano.gradient.disconnected_grad_(predictions) discounted_reward = T.vector('actual') r = T.vector('random') # Set up random sampling used in some policies rv_u = srng.uniform(size=(1, )) r = theano.function([], rv_u) # Set up the network D_network = QNetwork(observations) q_values = lasagne.layers.get_output(D_network) probabilities = lasagne.nonlinearities.softmax(q_values) D_params = lasagne.layers.get_all_params(D_network, trainable=True) get_q_values = theano.function([observations], q_values) l1_penalty = 1e-4 * regularize_layer_params( lasagne.layers.get_all_layers(D_network), l1) # Policies: # Policy1: 'greedy_choice': Greedy # Policy2: ' weighted_choice': chooses actions based upon probabilities policyname = 'greedy' # policyname='greedy' if policyname == 'greedy': actions = T.argmax(q_values, axis=1) elif policyname == 'weighted': actions = T.argmax( T.abs_(T.extra_ops.cumsum(probabilities, axis=1) - r()), axis=1) else: raise Exception policy_action = theano.function([observations], actions, name=policyname) prediction = q_values[:, actions].reshape((-1, )) get_prediction = theano.function([observations], prediction) D_obj = lasagne.objectives.squared_error(prediction, discounted_reward )\ .mean(axis=0, keepdims=False)# + l1_penalty D_updates = lasagne.updates.adam(D_obj, D_params, learning_rate=LEARN_RATE) D_train = theano.function([observations, discounted_reward], D_obj, updates=D_updates, name='D_training') functions = {} functions['get_q_values'] = get_q_values functions['policy_action'] = policy_action functions['D_train'] = D_train functions['D_params'] = D_params functions['D_network'] = D_network functions['get_params'] = lasagne.layers.get_all_params(D_network) functions['get_all_param_values'] = lasagne.layers.get_all_param_values( D_network) return functions
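# Hypothetical driver sketch for the returned dict; OBS_DIM, the batch arrays and
# the environment interaction are assumptions, not part of the original code.
import numpy as np

fns = prepare_functions()
observation = np.random.randn(1, OBS_DIM).astype('float32')
action = fns['policy_action'](observation)[0]           # pick an action under the current policy
q_values = fns['get_q_values'](observation)              # inspect the Q estimates
# after collecting an episode, fit the Q-network on the discounted returns:
# loss = fns['D_train'](episode_observations, episode_discounted_rewards)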
l_hidden2 = lasagne.layers.DenseLayer(l_hid1_drop, num_units=600, nonlinearity=lasagne.nonlinearities.sigmoid) l_out = lasagne.layers.DenseLayer(l_hidden2, num_units=10, nonlinearity=lasagne.nonlinearities.softmax) # get the prediction of network train_prediction = lasagne.layers.get_output(l_out) #f = theano.function([X], prediction) # Loss function for train train_loss = lasagne.objectives.categorical_crossentropy(train_prediction, y) train_loss = train_loss.mean() # Regularization layer1_reg = reg.regularize_layer_params(l_hidden1, reg.l1)*Lambda layer2_reg = reg.regularize_layer_params(l_hidden2, reg.l1)*Lambda train_loss = train_loss + layer1_reg + layer2_reg # train params and updates params = lasagne.layers.get_all_params(l_out, trainable=True) updates = lasagne.updates.nesterov_momentum( train_loss, params, learning_rate=0.01, momentum=0.9) # train function train_fn = theano.function([X, y], train_loss, updates=updates) # ############################################################## # Test side # Test prediction
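# The snippet breaks off at the test-side comment; a minimal sketch of how the
# evaluation pass would typically continue, assuming the X and y symbolic
# variables already used by train_fn above.
test_prediction = lasagne.layers.get_output(l_out, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, y).mean()
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), y),
                  dtype=theano.config.floatX)
val_fn = theano.function([X, y], [test_loss, test_acc])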
all_out = [l_out] all_out.extend(l_hids) train_out = lasagne.layers.get_output( all_out, deterministic=False) hids_out_train = train_out[1:] train_out = train_out[0] eval_out = lasagne.layers.get_output( all_out, deterministic=True) hids_out_eval = eval_out[1:] eval_out = eval_out[0] cost_train = T.mean(networks.calc_cross_ent(train_out, sym_y, paras)) if paras["L2_reg"] > 0: cost_train += paras["L2_reg"] * regularize_layer_params(l_out, l2) if paras["L1_reg"] > 0: cost_train += paras["L1_reg"] * regularize_layer_params(l_out, l1) cost_eval = networks.calc_cross_ent(eval_out, sym_y, paras) all_params = lasagne.layers.get_all_params(l_out, trainable=True) updates, norm = networks.gradient_updates(cost_train, all_params, paras, sh_lr, update_function=eval(paras["optimizer"])) print("compiling f_eval...") fun_inp = [sym_x, sym_y] if paras["rnn_type"] != "lstm": hids.pop(-2)
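# The snippet breaks off before f_eval is compiled; a plausible continuation
# under the names defined above (the exact output list in the original may differ).
f_eval = theano.function(fun_inp, [cost_eval] + hids_out_eval)
f_train = theano.function(fun_inp, [cost_train, norm], updates=updates)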
def main(): print("Building network ...") # Note in Rocktaschel's paper he first used a linear layer to transform wordvector # into vector of size K_HIDDEN. I'm assuming that this is equivalent to update W. # Input layer for premise input_var_type = T.TensorType('int32', [False] * 2) var_name = "input" input_var_prem = input_var_type(var_name) input_var_hypo = input_var_type(var_name) l_in_prem = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH_PREM), input_var=input_var_prem) # Mask layer for premise l_mask_prem = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH_PREM)) # Input layer for hypothesis l_in_hypo = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH_HYPO), input_var=input_var_hypo) # Mask layer for hypothesis l_mask_hypo = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH_HYPO)) # Word embedding layers l_in_prem_hypo = lasagne.layers.ConcatLayer([l_in_prem, l_in_hypo], axis=1) l_in_embedding = lasagne.layers.EmbeddingLayer(l_in_prem_hypo, VOCAB_SIZE, WORD_VECTOR_SIZE, W=word_vector_init, name='EmbeddingLayer') # Adding this linear layer didn't increase the accuracy, so I comment it out # l_in_linear = lasagne.layers.EmbeddingChangeLayer(l_in_embedding, K_HIDDEN, nonlinearity=lasagne.nonlinearities.linear) l_in_embedding_dropout = lasagne.layers.DropoutLayer(l_in_embedding, p=DROPOUT_RATE, rescale=True) l_in_prem_embedding = lasagne.layers.SliceLayer(l_in_embedding_dropout, slice(0, MAX_LENGTH_PREM), axis=1) l_in_hypo_embedding = lasagne.layers.SliceLayer(l_in_embedding_dropout, slice(MAX_LENGTH_PREM, MAX_LENGTH_PREM + MAX_LENGTH_HYPO), axis=1) # LSTM layer for premise l_lstm_prem = lasagne.layers.LSTMLayer_withCellOut(l_in_prem_embedding, K_HIDDEN, peepholes=False, grad_clipping=GRAD_CLIP, nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask_prem, only_return_final=False) # The slicelayer extracts the cell output of the premise sentence l_lstm_prem_out = lasagne.layers.SliceLayer(l_lstm_prem, -1, axis=1) # LSTM layer for hypothesis # LSTM for premise and LSTM for hypothesis have different parameters l_lstm_hypo = lasagne.layers.LSTMLayer(l_in_hypo_embedding, K_HIDDEN, peepholes=False, grad_clipping=GRAD_CLIP, nonlinearity=lasagne.nonlinearities.tanh, cell_init=l_lstm_prem_out, mask_input=l_mask_hypo) l_lstm_hypo_dropout = lasagne.layers.DropoutLayer(l_lstm_hypo, p=DROPOUT_RATE, rescale=True) # Isolate the last hidden unit output l_hypo_out = lasagne.layers.SliceLayer(l_lstm_hypo_dropout, -1, axis=1) # A softmax layer create probability distribution of the prediction l_out = lasagne.layers.DenseLayer(l_hypo_out, num_units=NUM_LABELS, W=lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax) # The output of the net network_output_train = lasagne.layers.get_output(l_out, deterministic=False) network_output_test = lasagne.layers.get_output(l_out, deterministic=True) # Theano tensor for the targets target_values = T.ivector('target_output') # The loss function is calculated as the mean of the cross-entropy cost = lasagne.objectives.categorical_crossentropy(network_output_train, target_values).mean() from lasagne.regularization import l2, regularize_layer_params l2_penalty = regularize_layer_params(l_out, l2) * REGU cost = cost + l2_penalty # Retrieve all parameters from the network all_params = lasagne.layers.get_all_params(l_out) # Compute ADAM updates for training print("Computing updates ...") # updates = lasagne.updates.adam(cost, all_params, learning_rate=LEARNING_RATE, beta1=0.9, beta2=0.999, epsilon=1e-08) updates = lasagne.updates.adam(cost, all_params, 
masks=[('EmbeddingLayer.W', embedding_w_mask)], learning_rate=LEARNING_RATE, beta1=0.9, beta2=0.999, epsilon=1e-08) """ # Test test_prediction = lasagne.layers.get_output(l_out, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_values).mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) """ # Theano functions for training and computing cost train_acc = T.mean(T.eq(T.argmax(network_output_test, axis=1), target_values), dtype=theano.config.floatX) print("Compiling functions ...") train = theano.function([l_in_prem.input_var, l_mask_prem.input_var, l_in_hypo.input_var, l_mask_hypo.input_var, target_values], [cost, train_acc], updates=updates, allow_input_downcast=True) # Theano function computing the validation loss and accuracy val_acc = T.mean(T.eq(T.argmax(network_output_test, axis=1), target_values), dtype=theano.config.floatX) validate = theano.function([l_in_prem.input_var, l_mask_prem.input_var, l_in_hypo.input_var, l_mask_hypo.input_var, target_values], [cost, val_acc], allow_input_downcast=True) print("Training ...") print('Regularization strength: ', REGU) print('Learning rate: ', LEARNING_RATE) print('Dropout rate: ', DROPOUT_RATE) print('Hidden size: ', K_HIDDEN) sys.stdout.flush() try: for epoch in range(NUM_EPOCHS): n = 0 avg_cost = 0.0 count = 0 sub_epoch = 0 train_acc = 0 while n < TRAIN_SIZE: X_prem, X_prem_mask, X_hypo, X_hypo_mask, y = get_batch_data(n, data_train) err, acc = train(X_prem, X_prem_mask, X_hypo, X_hypo_mask, y) avg_cost += err train_acc += acc n += BATCH_SIZE count += 1 if (n / BATCH_SIZE) % (TRAIN_SIZE / BATCH_SIZE / 5) == 0: sub_epoch += 1 avg_cost /= count print("Sub epoch {} average loss = {}, accuracy = {}".format(sub_epoch, avg_cost, train_acc / count * 100)) avg_cost = 0 count = 0 train_acc = 0 # Calculate validation accuracy m = 0 val_err = 0 val_acc = 0 val_batches = 0 while m < VAL_SIZE: X_prem, X_prem_mask, X_hypo, X_hypo_mask, y = get_batch_data(m, data_val) err, acc = validate(X_prem, X_prem_mask, X_hypo, X_hypo_mask, y) val_err += err val_acc += acc val_batches += 1 m += BATCH_SIZE print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format( val_acc / val_batches * 100)) sys.stdout.flush() except KeyboardInterrupt: pass
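# Clarifying note (not in the original): regularize_layer_params(l_out, l2)
# penalizes only the parameters of l_out itself; to apply weight decay to every
# layer below it (LSTMs and embedding included), Lasagne provides:
l2_penalty_all = lasagne.regularization.regularize_network_params(l_out, l2) * REGU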
def main(num_epochs=NUM_EPOCHS): print("Building network ...") # First, we build the network, starting with an input layer # Recurrent layers expect input of shape # (batch size, max sequence length, number of features) l_in = lasagne.layers.InputLayer(shape=(N_BATCH, WINDOW, 20)) l_forward = lasagne.layers.LSTMLayer( l_in, N_HIDDEN, grad_clipping=GRAD_CLIP, only_return_final=True) l_backward = lasagne.layers.LSTMLayer( l_in, N_HIDDEN, grad_clipping=GRAD_CLIP, only_return_final=True, backwards=True) # Now, we'll concatenate the outputs to combine them. l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward]) # Our output layer is a simple dense connection, with 1 output unit l_out = lasagne.layers.DenseLayer( l_concat, num_units=3, nonlinearity=lasagne.nonlinearities.softmax) target_values = T.ivector('target_output') prediction = lasagne.layers.get_output(l_out) loss = lasagne.objectives.categorical_crossentropy(prediction, target_values) l1_penalty = regularize_layer_params(l_out, l1) loss = loss.mean() + lamda * l1_penalty acc = T.mean(T.eq(T.argmax(prediction, axis=1), target_values),dtype=theano.config.floatX) all_params = lasagne.layers.get_all_params(l_out) LEARNING_RATE = .01 print("Computing updates ...") updates = lasagne.updates.nesterov_momentum(loss, all_params,LEARNING_RATE,0.95) # Theano functions for training and computing cost print("Compiling functions ...") train = theano.function([l_in.input_var, target_values], loss, updates=updates) valid = theano.function([l_in.input_var, target_values], [loss, acc]) accuracy = theano.function( [l_in.input_var, target_values],acc ) result = theano.function([l_in.input_var],prediction) best_acc=0 print("Training ...") try: for epoch in range(NUM_EPOCHS): if epoch % 50 == 49: LEARNING_RATE *= 0.5 updates = lasagne.updates.nesterov_momentum(loss, all_params,LEARNING_RATE,0.95) train = theano.function([l_in.input_var, target_values], loss, updates=updates) train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(train_data, train_label, N_BATCH, WINDOW): inputs, targets = batch train_err += train(inputs, targets) train_batches += 1 val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(valid_data, valid_label, N_BATCH, WINDOW): inputs, targets = batch err, acc = valid(inputs, targets) val_err += err val_acc += acc val_batches += 1 val_acc = val_acc / val_batches if val_acc > best_acc: best_acc = val_acc # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format( epoch + 1, NUM_EPOCHS, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format( val_acc * 100)) except KeyboardInterrupt: pass
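# Hypothetical follow-up (test_data and test_label are assumed names): scoring a
# held-out set with the accuracy function compiled above once training finishes.
test_acc = 0
test_batches = 0
for inputs, targets in iterate_minibatches(test_data, test_label, N_BATCH, WINDOW):
    test_acc += accuracy(inputs, targets)
    test_batches += 1
print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))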