def test_apply_penalty(self):
    from lasagne.regularization import apply_penalty, l2
    A = T.vector()
    B = T.matrix()

    assert apply_penalty([], l2) == 0

    assert equal_computations([apply_penalty(A, l2)],
                              [l2(A)])

    assert equal_computations([apply_penalty([A, B], l2)],
                              [sum([l2(A), l2(B)])])
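# --- Hedged usage sketch (not part of the test above) ----------------------
# Shows how apply_penalty is typically folded into a training loss. The
# network, the target variable, and the 1e-4 coefficient are illustrative
# assumptions only.
import theano.tensor as T
import lasagne
from lasagne.regularization import apply_penalty, l2

l_in = lasagne.layers.InputLayer(shape=(None, 100))
l_hid = lasagne.layers.DenseLayer(l_in, num_units=50)
l_out = lasagne.layers.DenseLayer(l_hid, num_units=10,
                                  nonlinearity=lasagne.nonlinearities.softmax)

targets = T.ivector('targets')
prediction = lasagne.layers.get_output(l_out)
data_loss = lasagne.objectives.categorical_crossentropy(prediction, targets).mean()

# Sum the L2 penalty over every regularizable parameter and scale it.
reg_params = lasagne.layers.get_all_params(l_out, regularizable=True)
loss = data_loss + 1e-4 * apply_penalty(reg_params, l2)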
def build_model_focusing(input_feas, classes, hidden_count, batchnorm=True):
    # Initializers and nonlinearities
    ini = lasagne.init.HeUniform()
    nonlin = lasagne.nonlinearities.rectify
    linear = lasagne.nonlinearities.linear
    softmax = lasagne.nonlinearities.softmax

    # Input layer
    l_in = lasagne.layers.InputLayer(shape=(None, input_feas))

    l_focus1 = FocusedLayer1D(l_in, num_units=hidden_count, nonlinearity=linear,
                              name='focus1', trainMus=UPDATE_MU,
                              trainSis=UPDATE_SI, initMu=INIT_MU, W=ini,
                              withWeights=WITH_WEIGHTS,
                              bias=lasagne.init.Constant(0.0),
                              initSigma=INIT_SI, scaler=INIT_SCALER,
                              weight_gain=1.0, trainScaler=UPDATE_SCAlER,
                              trainWs=True)

    if batchnorm:
        # Without batch normalization the weights grow very large
        l_bn = lasagne.layers.NonlinearityLayer(
            lasagne.layers.BatchNormLayer(l_focus1), nonlinearity=nonlin)
    else:
        l_bn = lasagne.layers.NonlinearityLayer(l_focus1, nonlinearity=nonlin)

    #l_drop1 = lasagne.layers.dropout(l_bn, p=0.1)

    # Output layer
    l_out = lasagne.layers.DenseLayer(l_bn, num_units=classes,
                                      nonlinearity=softmax, W=ini, name='output')

    penalty = l2(l_out.W) * 1e-3
    if WITH_WEIGHTS:
        penalty += l2(l_focus1.W) * 1e-4 + (l1(l_focus1.W) * 1e-6) + l2(l_focus1.si) * 1e-2
    if not USE_PENALTY:
        penalty = penalty * 0

    return l_out, penalty
def build_model(input_feas, classes, hidden_count, batchnorm=True):
    # Initializers and nonlinearities
    ini = lasagne.init.GlorotUniform()
    nonlin = lasagne.nonlinearities.rectify
    softmax = lasagne.nonlinearities.softmax
    lin = lasagne.nonlinearities.linear

    # Input layer
    l_in = lasagne.layers.InputLayer(shape=(None, input_feas))

    # Dense layers
    l_dense1 = lasagne.layers.DenseLayer(l_in, num_units=hidden_count,
                                         nonlinearity=lin, W=ini,
                                         name="dense1", b=None)

    if batchnorm:
        # Without batch normalization the weights grow very large
        l_bn = lasagne.layers.NonlinearityLayer(
            lasagne.layers.BatchNormLayer(l_dense1), nonlinearity=nonlin)
    else:
        l_bn = lasagne.layers.NonlinearityLayer(l_dense1, nonlinearity=nonlin)

    #l_dense2 = lasagne.layers.DenseLayer(l_dense1, num_units=4, nonlinearity=lasagne.nonlinearities.tanh, W=ini, name='dense2')
    #l_drop1 = lasagne.layers.dropout(l_bn, p=0.1)

    # Output layer
    l_out = lasagne.layers.DenseLayer(l_bn, num_units=classes,
                                      nonlinearity=softmax, W=ini, name='output')

    penalty = (l2(l_dense1.W) * 1e-4) + (l1(l_dense1.W) * 1e-6) + (l2(l_out.W) * 1e-3)
    if not USE_PENALTY:
        penalty = penalty * 0
    #penalty = (l2(l_dense1.W)*1e-30)#(l2(l_dense1.W)*1e-3)+(l1(l_dense1.W)*1e-6) +(l2(l_out.W)*1e-3)

    return l_out, penalty
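# --- Hedged usage sketch (not part of build_model above) -------------------
# Shows how the (l_out, penalty) pair returned by build_model is typically
# turned into a training function. The symbolic variable names, the sizes,
# and the Adam learning rate are assumptions for illustration; build_model's
# global flags (USE_PENALTY etc.) are assumed to be defined elsewhere.
import theano
import theano.tensor as T
import lasagne

input_var = T.matrix('inputs')
target_var = T.ivector('targets')

l_out, penalty = build_model(input_feas=64, classes=10, hidden_count=32)

prediction = lasagne.layers.get_output(l_out, input_var)
data_loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
loss = data_loss + penalty  # the L1/L2 terms built inside build_model

params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var, target_var], loss, updates=updates)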
def get_loss(self, input=None, target=None, aggregation=None, **kwargs):
    """
    Get loss scalar expression

    :parameters:
        - input : (default `None`) an expression that results in the
          input data that is passed to the network
        - target : (default `None`) an expression that results in the
          desired output that the network is being trained to generate
          given the input
        - aggregation : None to use the value passed to the constructor
          or a value to override it
        - kwargs : additional keyword arguments passed to `input_layer`'s
          `get_output` method

    :returns:
        - output : loss expression
    """
    network_output = lasagne.layers.get_output(self.input_layer, input,
                                               **kwargs)
    if target is None:
        target = self.target_var
    if aggregation not in self._valid_aggregation:
        raise ValueError('aggregation must be \'mean\', \'sum\', '
                         'or None, not {0}'.format(aggregation))
    if aggregation is None:
        aggregation = self.aggregation

    losses = self.loss_function(network_output, target) + \
        self.l2_strength * l2(self.input_layer)

    if aggregation is None or aggregation == 'mean':
        return losses.mean()
    elif aggregation == 'sum':
        return losses.sum()
    else:
        raise RuntimeError('This should have been caught earlier')
def get_loss(self, input=None, target=None, deterministic=False, **kwargs):
    loss = super(RMSE, self).get_loss(input=input, target=target,
                                      deterministic=deterministic, **kwargs)
    loss = loss ** 0.5 + self.alpha * l2(self.input_layer)
    return loss
def synth_compiled(input_layer, output_layer, I):
    which_class = 0
    LAM = 0.1

    theImage = theano.shared(I, name='theImage')
    params = [theImage]

    # The [0, which_class] indexing is needed to get a scalar gradient
    # (otherwise we would get one number per image); we want a single image anyway.
    classNeuron = get_output(output_layer, deterministic=True,
                             inputs=theImage)[0, which_class]

    # Negate so that the class score is maximized while ADAM minimizes the loss;
    # the L2 term keeps the synthesized image small.
    regularized_score = -(classNeuron - LAM * l2(theImage))

    theGrad = T.grad(regularized_score, theImage)
    updates = lasagne.updates.adam([theGrad], params, learning_rate=0.1)
    synth_fn = theano.function([], [regularized_score], updates=updates)

    terr = []
    bar = progressbar.ProgressBar()
    for i in bar(range(1000)):
        terr.append(synth_fn())  # this also updates the params (= the image)
        # print(terr[-1])

    Isynth = np.array(theImage.eval())
    figure()
    imshow(Isynth[0, 0])
    figure()
    imshow(Isynth[0, 1])
def get_cost_prior(self):
    prior_cost = 0
    params = self.get_params()
    for param in params:
        if param.name == 'W':
            prior_cost += regularization.l2(param).sum()
    return prior_cost
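# --- Hedged usage sketch (assumption, not from the original class) ---------
# Illustrates how a prior cost like this is usually combined with a data
# term: `model`, its get_cost_data method, and the 1e-4 prior weight are
# hypothetical names used only for illustration.
import theano.tensor as T

total_cost = model.get_cost_data() + 1e-4 * model.get_cost_prior()
grads = T.grad(total_cost, model.get_params())
sgd_updates = [(p, p - 0.01 * g) for p, g in zip(model.get_params(), grads)]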
def get_loss(self, input=None, target=None, deterministic=False, **kwargs):
    loss = super(MyObjective, self).get_loss(input=input, target=target,
                                             deterministic=deterministic,
                                             **kwargs)
    if not deterministic:
        return loss + self.magicnum * l2(self.input_layer)
    else:
        return loss
def get_loss(self, input=None, target=None, deterministic=False, **kwargs):
    loss = super(L2Regularization, self).get_loss(input=input, target=target,
                                                  deterministic=deterministic,
                                                  **kwargs)
    if not deterministic:
        return loss + self.alpha * l2(self.input_layer)
    else:
        return loss
def get_loss(self, input=None, target=None, deterministic=False, **kwargs):
    loss = super(WeightDecayObjective, self).get_loss(input=input, target=target,
                                                      deterministic=deterministic,
                                                      **kwargs)
    if not deterministic:
        return loss + self.weight_decay * regularization.l2(self.input_layer)
    else:
        return loss
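# --- Hedged sketch (assumption, not from the original code) ----------------
# One way the `weight_decay` attribute used by get_loss above could be set:
# a constructor that stores it and forwards everything else to the old-style
# lasagne Objective base class. The constructor signature of Objective is an
# assumption here; get_loss would be the method shown above.
class WeightDecayObjective(Objective):
    def __init__(self, input_layer, weight_decay=1e-4, **kwargs):
        super(WeightDecayObjective, self).__init__(input_layer, **kwargs)
        self.weight_decay = weight_decay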
def get_loss(self, input=None, target=None, deterministic=False, **kwargs):
    loss = super(RMSE, self).get_loss(input=input, target=target,
                                      deterministic=deterministic, **kwargs)
    loss = loss ** 0.5 + self.alpha * l2(self.input_layer)
    return loss
def get_loss(self, input=None, target=None, deterministic=False, **kwargs):
    loss = super(L2Regularization, self).get_loss(input=input, target=target,
                                                  deterministic=deterministic,
                                                  **kwargs)
    if not deterministic:
        return loss + self.alpha * l2(self.input_layer)
    else:
        return loss
def reset():
    if any(np.isnan(scale.get_value()) for scale in scales):
        for scale in scales:
            scale.set_value(1.)
        for l in l_hiddens:
            l.b.set_value(Constant()(l.b.get_value().shape))
            l.W.set_value(Orthogonal()(l.W.get_value().shape))
        l_out.b.set_value(Constant()(l_out.b.get_value().shape))
        l_out.W.set_value(Orthogonal()(l_out.W.get_value().shape))
        for p in (p for u in (updates_ada, updates_other, updates_scal)
                  for p in u if p not in get_all_params(l_out)):
            p.set_value(Constant()(p.get_value().shape))


chunky_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2) \
    - l2(l_hiddens[0].W) \
    + l2(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
chunky_l1 = apply_penalty(get_all_params(l_out, regularizable=True), l1) \
    - l1(l_hiddens[0].W) \
    + l1(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)

#l_out2 = DenseLayer(dropout(l_hiddens2[-1]), num_units=y.shape[1])
#l_out = lasagne.layers.NonlinearityLayer(lasagne.layers.ElemwiseSumLayer((l_out1,l_out2),.5), softmax)
#categorical_crossentropy(get_output(l_out)[train_indice])

target = T.fmatrix(name="target")
#f=theano.function([l_in.input_var],get_output(l_out),allow_input_downcast=True)
#f(X[0,:].toarray())

loss = categorical_crossentropy(get_output(l_out), target).mean()
# train_loss_smoo=categorical_crossentropy(get_output(l_out,deterministic=True)[train_indices,],target[train_indices,]).mean()
# valid_loss=categorical_crossentropy(get_output(l_out)[valid_indices,],target[valid_indices,]).mean()
def build_model(var_x, input_size_x, var_y, input_size_y, layer_sizes,
                weight_init=lasagne.init.GlorotUniform(), drop_prob=None,
                train_gamma_layer=None, **kwargs):
    layer_types = Params.LAYER_TYPES

    # Create x to y network
    model_x, hidden_x, weights_x, biases_x, prediction_y, hooks_x, dropouts_x = \
        build_single_channel(var_x, input_size_x, input_size_y, layer_sizes,
                             layer_types, weight_init,
                             lasagne.init.Constant(0.), drop_prob, 'x',
                             train_gamma_layer=train_gamma_layer)

    weights_y = [transpose_recursive(w) for w in reversed(weights_x)]
    bias_y = lasagne.init.Constant(0.)

    model_y, hidden_y, weights_y, biases_y, prediction_x, hooks_y, dropouts_y = \
        build_single_channel(var_y, input_size_y, input_size_x,
                             list(reversed(layer_sizes)),
                             list(reversed(layer_types)),
                             weights_y, bias_y, drop_prob, 'y', dropouts_x,
                             train_gamma_layer)

    reversed_hidden_y = list(reversed(hidden_y))

    hooks = {}
    if "BatchNormalizationLayer:movingavg" in hooks_x:
        # Merge the two dictionaries
        hooks = hooks_x
        hooks["BatchNormalizationLayer:movingavg"].extend(
            hooks_y["BatchNormalizationLayer:movingavg"])
        # hooks["WhiteningLayer:movingavg"].extend(hooks_y["WhiteningLayer:movingavg"])

    loss_x = Params.LOSS_X * lasagne.objectives.squared_error(var_x, prediction_x).sum(axis=1).mean()
    loss_y = Params.LOSS_Y * lasagne.objectives.squared_error(var_y, prediction_y).sum(axis=1).mean()

    if len(hidden_x) % 2 == 0:
        middle_layer = int(len(hidden_x) / 2.) - 1
    else:
        middle_layer = int(floor(float(len(hidden_x)) / 2.))

    hooks_temp = {}
    layer_x = lasagne.layers.get_output(hidden_x[Params.TEST_LAYER],
                                        moving_avg_hooks=hooks_temp)
    layer_y = lasagne.layers.get_output(reversed_hidden_y[Params.TEST_LAYER],
                                        moving_avg_hooks=hooks_temp)

    loss_l2 = Params.L2_LOSS * lasagne.objectives.squared_error(layer_x, layer_y).sum(axis=1).mean()

    loss_weight_decay = 0

    shrinkage = Params.SHRINKAGE
    cov_x = T.dot(layer_x.T, layer_x) / T.cast(layer_x.shape[0], dtype=T.config.floatX)
    cov_y = T.dot(layer_y.T, layer_y) / T.cast(layer_x.shape[0], dtype=T.config.floatX)

    # mu_x = T.nlinalg.trace(cov_x) / layer_x.shape[1]
    # mu_y = T.nlinalg.trace(cov_y) / layer_y.shape[1]
    # cov_x = (1. - shrinkage) * cov_x + shrinkage * mu_x * T.identity_like(cov_x)
    # cov_y = (1. - shrinkage) * cov_y + shrinkage * mu_y * T.identity_like(cov_y)

    # loss_withen_x = Params.WITHEN_REG_X * T.mean(T.sum(abs(cov_x - T.identity_like(cov_x)), axis=0))
    # loss_withen_y = Params.WITHEN_REG_Y * T.mean(T.sum(abs(cov_y - T.identity_like(cov_y)), axis=0))
    loss_withen_x = Params.WITHEN_REG_X * (T.sqrt(T.sum(T.sum(cov_x ** 2))) - T.sqrt(T.sum(T.diag(cov_x) ** 2)))
    loss_withen_y = Params.WITHEN_REG_Y * (T.sqrt(T.sum(T.sum(cov_y ** 2))) - T.sqrt(T.sum(T.diag(cov_y) ** 2)))

    loss_weight_decay += lasagne.regularization.regularize_layer_params(model_x, penalty=l2) * Params.WEIGHT_DECAY
    loss_weight_decay += lasagne.regularization.regularize_layer_params(model_y, penalty=l2) * Params.WEIGHT_DECAY

    gamma_x = lasagne.layers.get_all_params(model_x, gamma=True)
    gamma_y = lasagne.layers.get_all_params(model_y, gamma=True)

    loss_gamma = T.constant(0)
    loss_gamma += sum(l2(gamma) for gamma in gamma_x) * Params.GAMMA_COEF
    loss_gamma += sum(l2(gamma) for gamma in gamma_y) * Params.GAMMA_COEF

    loss = loss_x + loss_y + loss_l2 + loss_weight_decay + loss_withen_x + loss_withen_y + loss_gamma

    output = {
        'loss_x': loss_x,
        'loss_y': loss_y,
        'loss_l2': loss_l2,
        'loss_weight_decay': loss_weight_decay,
        'loss_gamma': loss_gamma,
        'loss_withen_x': loss_withen_x,
        'loss_withen_y': loss_withen_y,
        'mean_x': T.mean(T.mean(layer_x, axis=0)),
        'mean_y': T.mean(T.mean(layer_y, axis=0)),
        'var_x': T.mean(T.var(layer_x, axis=0)),
        'var_y': T.mean(T.var(layer_y, axis=0)),
        'var_mean_x': T.var(T.mean(layer_x, axis=0)),
        'var_mean_y': T.var(T.mean(layer_y, axis=0))
    }

    return model_x, model_y, hidden_x, reversed_hidden_y, loss, output, hooks
y_train = T.cast(theano.shared(np.load('/root/proj/MIT_dumped/y_train.npy')), 'int32')
y_test = T.cast(theano.shared(np.load('/root/proj/MIT_dumped/y_test.npy')), 'int32')

# load datasets
X_train_fc7 = theano.shared(np.load('/root/proj/MIT_dumped/X_train_fc7.npy').astype(theano.config.floatX))
X_test_fc7 = theano.shared(np.load('/root/proj/MIT_dumped/X_test_fc7.npy').astype(theano.config.floatX))

all_params = layers.get_all_params(output)
objective = objectives.Objective(output, loss_function=objectives.multinomial_nll)
loss_train = objective.get_loss([X_batch_one, X_batch_two], target=y_batch)

LEARNING_RATE = 0.0122
MOMENTUM = 0.9
REG = .0009
reg_loss = regularization.l2(output) * REG
total_loss = loss_train + reg_loss
upds = updates.nesterov_momentum(total_loss, all_params, LEARNING_RATE, MOMENTUM)

pred = T.argmax(
    output.get_output([X_batch_one, X_batch_two], deterministic=True), axis=1)
accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

print "begin compiling"
givens = {X_batch_one: X_train_fc6[batch_index*batch_size:(batch_index+1)*batch_size],
          X_batch_two: X_train_fc7[batch_index*batch_size:(batch_index+1)*batch_size],
          y_batch: y_train[batch_index*batch_size:(batch_index+1)*batch_size]}
train = theano.function([batch_index], loss_train, updates=upds, givens=givens)
test = theano.function([], accuracy,
                       givens={X_batch_one: X_test_fc6,
                               X_batch_two: X_test_fc7,
                               y_batch: y_test})

num_epochs = 1000
for epoch in range(num_epochs):
    print "epoch %s" % epoch
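# --- Hedged sketch (assumption, not shown in the snippet above) ------------
# The script above relies on symbolic variables declared earlier in the
# original file. These declarations mirror how the givens/updates are used,
# but the exact originals are not shown here.
import theano.tensor as T

batch_size = 128                  # assumed minibatch size
batch_index = T.iscalar('batch_index')
X_batch_one = T.matrix('x_fc6')   # fc6 features for one minibatch
X_batch_two = T.matrix('x_fc7')   # fc7 features for one minibatch
y_batch = T.ivector('y')          # integer class labels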
def build_model(var_x, input_size_x, var_y, input_size_y, layer_sizes,
                weight_init=lasagne.init.GlorotUniform()):
    """
    Creates a bi-directional model containing two channels, one from var_x to the
    reconstruction of var_y and one in the opposite direction; the returned value
    also contains the composite loss term. The loss term is composed of:
    1. The reconstruction loss between X and X' and between Y and Y'
       (X' and Y' are the outputs of the two channels)
    2. The reconstruction loss of the OUTPUT_LAYER from both channels
    3. The covariance regularization, which aims to decorrelate each output internally
    4. The gamma regularization, equal to the sum of the squared norms of 1/gamma
       (the batch normalization parameter)
    5. Weight decay
    :param var_x: theano variable for the input x view
    :param input_size_x: size of x dimensionality
    :param var_y: theano variable for the input y view
    :param input_size_y: size of y dimensionality
    :param layer_sizes: array containing the sizes of hidden layers
    :param weight_init: initialization function for the weights
    :return:
    """
    layer_types = Params.LAYER_TYPES

    # Create x to y network
    model_x, hidden_x, weights_x, biases_x, prediction_y, hooks_x, dropouts_x = build_single_channel(
        var_x, input_size_x, input_size_y, layer_sizes, layer_types,
        weight_init, lasagne.init.Constant(0.), 'x')

    weights_y = [transpose_recursive(w) for w in reversed(weights_x)]
    bias_y = lasagne.init.Constant(0.)

    model_y, hidden_y, weights_y, biases_y, prediction_x, hooks_y, dropouts_y = build_single_channel(
        var_y, input_size_y, input_size_x, list(reversed(layer_sizes)),
        list(reversed(layer_types)), weights_y, bias_y, 'y', dropouts_x)

    reversed_hidden_y = list(reversed(hidden_y))

    hooks = {}
    if "BatchNormalizationLayer:movingavg" in hooks_x:
        # Merge the two dictionaries
        hooks = hooks_x
        hooks["BatchNormalizationLayer:movingavg"].extend(
            hooks_y["BatchNormalizationLayer:movingavg"])
        # hooks["WhiteningLayer:movingavg"].extend(hooks_y["WhiteningLayer:movingavg"])

    loss_x = Params.LOSS_X * lasagne.objectives.squared_error(
        var_x, prediction_x).sum(axis=1).mean()
    loss_y = Params.LOSS_Y * lasagne.objectives.squared_error(
        var_y, prediction_y).sum(axis=1).mean()

    hooks_temp = {}
    layer_x = lasagne.layers.get_output(hidden_x[Params.OUTPUT_LAYER],
                                        moving_avg_hooks=hooks_temp)
    layer_y = lasagne.layers.get_output(reversed_hidden_y[Params.OUTPUT_LAYER],
                                        moving_avg_hooks=hooks_temp)

    loss_l2 = Params.L2_LOSS * lasagne.objectives.squared_error(
        layer_x, layer_y).sum(axis=1).mean()

    loss_weight_decay = 0

    cov_x = T.dot(layer_x.T, layer_x) / T.cast(layer_x.shape[0], dtype=T.config.floatX)
    cov_y = T.dot(layer_y.T, layer_y) / T.cast(layer_x.shape[0], dtype=T.config.floatX)

    loss_withen_x = Params.WITHEN_REG_X * (T.sqrt(T.sum(T.sum(cov_x ** 2))) -
                                           T.sqrt(T.sum(T.diag(cov_x) ** 2)))
    loss_withen_y = Params.WITHEN_REG_Y * (T.sqrt(T.sum(T.sum(cov_y ** 2))) -
                                           T.sqrt(T.sum(T.diag(cov_y) ** 2)))

    loss_weight_decay += lasagne.regularization.regularize_layer_params(
        model_x, penalty=l2) * Params.WEIGHT_DECAY
    loss_weight_decay += lasagne.regularization.regularize_layer_params(
        model_y, penalty=l2) * Params.WEIGHT_DECAY

    gamma_x = lasagne.layers.get_all_params(model_x, gamma=True)
    gamma_y = lasagne.layers.get_all_params(model_y, gamma=True)

    loss_gamma = T.constant(0)
    loss_gamma += sum(l2(1 / gamma) for gamma in gamma_x) * Params.GAMMA_COEF
    loss_gamma += sum(l2(1 / gamma) for gamma in gamma_y) * Params.GAMMA_COEF

    loss = loss_x + loss_y + loss_l2 + loss_weight_decay + loss_withen_x + loss_withen_y + loss_gamma

    output = {
        'loss_x': loss_x,
        'loss_y': loss_y,
        'loss_l2': loss_l2,
        'loss_weight_decay': loss_weight_decay,
        'loss_gamma': loss_gamma,
        'loss_withen_x': loss_withen_x,
        'loss_withen_y': loss_withen_y,
        'mean_x': T.mean(T.mean(layer_x, axis=0)),
        'mean_y': T.mean(T.mean(layer_y, axis=0)),
        'var_x': T.mean(T.var(layer_x, axis=0)),
        'var_y': T.mean(T.var(layer_y, axis=0)),
        'var_mean_x': T.var(T.mean(layer_x, axis=0)),
        'var_mean_y': T.var(T.mean(layer_y, axis=0))
    }

    return model_x, model_y, hidden_x, reversed_hidden_y, loss, output, hooks
def synthesize_image(input_layer, output_layer, inputshape, which_class,
                     gradient_steps, gradient_stepsize, LAM, chopNonlin=True,
                     I0=None):
    """
    Does gradient ascent in image space to maximize a certain class score,
    hence producing an image that maximizes that class.

    :param input_layer:
    :param output_layer:
    :param inputshape:
    :param gradient_steps:
    :param gradient_stepsize:
    :param chopNonlin: maximize the class score before or after the nonlinearity;
                       =True -> maximize the unnormalized score
    :param I0: optionally, an image from which to start the optimization.
               Could be a natural image in which we want to enhance the features
               of the class. If None, a random initialization is used.
    :return:
    """
    assert inputshape[0] == 1
    input_var = input_layer.input_var

    if chopNonlin:
        before_non = _get_output_before_nonlinearity(output_layer, deterministic=True)
        classNeuron = before_non[0, which_class]
    else:
        # The [0, which_class] indexing is needed to get a scalar gradient
        # (otherwise we would get one number per image); we want a single image anyway.
        classNeuron = get_output(output_layer, deterministic=True)[0, which_class]

    # Mind the SIGN: we MAXIMIZE class_prob, hence the l2 term must be subtracted.
    regularized_score = classNeuron - LAM * l2(input_var)

    theGrad = T.grad(regularized_score, input_var)
    gradient_fn = theano.function([input_var], theGrad)
    score_fn = theano.function([input_var], regularized_score)

    # starting point of the gradient ascent:
    if I0 is None:
        # I = np.zeros(inputshape, dtype='float32')
        I = np.random.normal(0, 1, inputshape).astype('float32')
    else:
        I = np.copy(I0)  # otherwise we would modify the original image as a side effect
    assert I.shape == inputshape

    # ------------------------------------------------------------------
    # loss = -regularized_score  # we want to maximize the score, hence the minus sign
    # params = [theano.shared(I, name='theImage')]
    # params = [theano.shared(input_var)]
    # updates = lasagne.updates.adam(loss, params)  # **optimizer_params
    # train_fn = theano.function([input_var, target_var], [loss, train_acc], updates=updates)
    # ------------------------------------------------------------------

    I_progress = []
    score_progress = []
    bar = progressbar.ProgressBar()
    for i in bar(range(gradient_steps)):
        if i % 10 == 0:
            I_progress.append(I.copy())
        gr = gradient_fn(I)
        I += gr * gradient_stepsize
        the_score = score_fn(I)
        score_progress.append(the_score)
        # print("%d\t%.3f" % (i, the_score))

    plt.figure()
    plt.plot(score_progress)

    return I_progress, score_progress
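# --- Hedged usage sketch (not part of synthesize_image above) --------------
# Example invocation; the network layers, the 28x28 single-channel shape,
# and all hyperparameter values are placeholders.
I_progress, score_progress = synthesize_image(
    input_layer=l_in, output_layer=l_out,
    inputshape=(1, 1, 28, 28),      # one image, one channel
    which_class=3,                  # class whose score we maximize
    gradient_steps=500, gradient_stepsize=1.0,
    LAM=0.01,                       # weight of the L2 image penalty
    chopNonlin=True)                # maximize the pre-softmax score
plt.figure()
plt.imshow(I_progress[-1][0, 0])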
for i in xrange(0):
    l_hiddens.append(DenseLayer(dropout(l_hiddens[-1]), num_units=100,
                                nonlinearity=rectify))

l_out = DenseLayer(dropout(l_hiddens[-1]), num_units=y.shape[1],
                   nonlinearity=softmax, W=Orthogonal())


def reset():
    if any(np.isnan(scale.get_value()) for scale in scales):
        for scale in scales:
            scale.set_value(1.)
        for l in l_hiddens:
            l.b.set_value(Constant()(l.b.get_value().shape))
            l.W.set_value(Orthogonal()(l.W.get_value().shape))
        l_out.b.set_value(Constant()(l_out.b.get_value().shape))
        l_out.W.set_value(Orthogonal()(l_out.W.get_value().shape))
        for p in (p for u in (updates_ada, updates_other, updates_scal)
                  for p in u if p not in get_all_params(l_out)):
            p.set_value(Constant()(p.get_value().shape))


chunky_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2) \
    - l2(l_hiddens[0].W) \
    + l2(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
chunky_l1 = apply_penalty(get_all_params(l_out, regularizable=True), l1) \
    - l1(l_hiddens[0].W) \
    + l1(l_hiddens[0].W / T.reshape(vscale, (206279, 1)))
simple_l2 = apply_penalty(get_all_params(l_out, regularizable=True), l2)

#l_out2 = DenseLayer(dropout(l_hiddens2[-1]), num_units=y.shape[1])
#l_out = lasagne.layers.NonlinearityLayer(lasagne.layers.ElemwiseSumLayer((l_out1,l_out2),.5), softmax)
#categorical_crossentropy(get_output(l_out)[train_indice])

target = T.fmatrix(name="target")
#f=theano.function([l_in.input_var],get_output(l_out),allow_input_downcast=True)
#f(X[0,:].toarray())

loss = categorical_crossentropy(get_output(l_out), target).mean()
# train_loss_smoo=categorical_crossentropy(get_output(l_out,deterministic=True)[train_indices,],target[train_indices,]).mean()
# valid_loss=categorical_crossentropy(get_output(l_out)[valid_indices,],target[valid_indices,]).mean()
# valid_loss_smoo=categorical_crossentropy(get_output(l_out,deterministic=True)[valid_indices,],target[valid_indices,]).mean()