def test_batch_normalization_test():
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean, var, axes, eps)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var],
                            known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5)  # dvar
def batch_norm(input_, gamma, beta, running_mean, running_var, is_training,
               axes='per-activation'):
    if is_training:
        # returns:
        #   batch-normalized output
        #   batch mean
        #   batch variance
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, _, _, running_mean, running_var = batch_normalization_train(
            input_,
            gamma,
            beta,
            running_mean=running_mean,
            running_var=running_var,
            axes=axes,
            running_average_factor=0.9,
        )
    else:
        out = batch_normalization_test(
            input_,
            gamma,
            beta,
            running_mean,
            running_var,
            axes=axes,
        )
    return out, running_mean, running_var
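# --- Usage sketch for the batch_norm wrapper above (not part of the original
# snippet). It shows one way to compile separate training and inference
# functions, feeding the returned running statistics back into the shared
# variables via theano.function's `updates`. The feature size (5), the 2D
# input, and all variable names here are assumptions for illustration.
import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
gamma = theano.shared(numpy.ones((1, 5), dtype=theano.config.floatX), name='gamma')
beta = theano.shared(numpy.zeros((1, 5), dtype=theano.config.floatX), name='beta')
running_mean = theano.shared(numpy.zeros((1, 5), dtype=theano.config.floatX),
                             name='running_mean')
running_var = theano.shared(numpy.ones((1, 5), dtype=theano.config.floatX),
                            name='running_var')

# training graph: batch_norm also returns the updated running statistics
out_train, new_mean, new_var = batch_norm(x, gamma, beta,
                                          running_mean, running_var,
                                          is_training=True)
train_fn = theano.function(
    [x], out_train,
    updates=[(running_mean, T.unbroadcast(new_mean, 0)),
             (running_var, T.unbroadcast(new_var, 0))])

# inference graph: reuse the stored running statistics, no updates needed
out_test, _, _ = batch_norm(x, gamma, beta, running_mean, running_var,
                            is_training=False)
test_fn = theano.function([x], out_test)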
def batchNorm(x, train, gamma, beta, RM, RV, ax):
    values_train, _, _, newRM, newRV = batch_normalization_train(
        x, gamma, beta, axes=ax, running_mean=RM, running_var=RV)
    values = ifelse(T.neq(train, 1),
                    batch_normalization_test(x, gamma, beta, RM, RV, axes=ax),
                    values_train)
    return values, newRM, newRV
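# --- A possible way to drive the ifelse-based batchNorm above from a single
# compiled function (a sketch, not part of the original snippet). The iscalar
# `train` flag, the (1, 4)-shaped parameters, and the conditional updates are
# assumptions chosen for illustration: the running statistics are only
# overwritten when train == 1.
import numpy
import theano
import theano.tensor as T
from theano.ifelse import ifelse

x = T.matrix('x')
train = T.iscalar('train')  # 1 during training, 0 at inference time
gamma = theano.shared(numpy.ones((1, 4), dtype=theano.config.floatX), name='gamma')
beta = theano.shared(numpy.zeros((1, 4), dtype=theano.config.floatX), name='beta')
RM = theano.shared(numpy.zeros((1, 4), dtype=theano.config.floatX), name='RM')
RV = theano.shared(numpy.ones((1, 4), dtype=theano.config.floatX), name='RV')

y, newRM, newRV = batchNorm(x, train, gamma, beta, RM, RV, 'per-activation')

f = theano.function(
    [x, train], y,
    updates=[(RM, ifelse(T.eq(train, 1), T.unbroadcast(newRM, 0), RM)),
             (RV, ifelse(T.eq(train, 1), T.unbroadcast(newRV, 0), RV))])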
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        out, batch_mean, batch_invstd, new_running_mean, new_running_var = batch_normalization_train(
            activation, self.gamma, self.beta,
            running_mean=self.running_mean, running_var=self.running_var)
        self.running_update = [
            (self.running_mean, new_running_mean),
            (self.running_var, new_running_var),
        ]

        # how it updates exactly:
        # batch_var = 1 / (batch_invstd * batch_invstd)
        # self.running_update = [
        #     (self.running_mean, 0.9*self.running_mean + 0.1*batch_mean),
        #     (self.running_var, 0.9*self.running_var + 0.1*batch_var),
        # ]
    else:
        out = batch_normalization_test(activation, self.gamma, self.beta,
                                       self.running_mean, self.running_var)
    return self.f(out)
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        out, b_mean, b_invstd, new_mean, new_var = batch_normalization_train(
            activation, self.gamma, self.beta,
            running_mean=self.running_mean, running_var=self.running_var
        )
        # keep the update rules for mean and var on the layer so they can be
        # collected outside (e.g. when building the training function)
        self.running_update = [
            (self.running_mean, new_mean),
            (self.running_var, new_var)
        ]
    else:
        out = batch_normalization_test(
            activation, self.gamma, self.beta,
            self.running_mean, self.running_var
        )
    if self.af is None:
        return out
    else:
        return self.af(out)
def forward(self, Z, is_training):
    a = Z.dot(self.W)
    if is_training:
        out, batch_mean, batch_invstd, new_rn_mean, new_rn_var = batch_normalization_train(
            a, self.gamma, self.beta,
            running_mean=self.rn_mean, running_var=self.rn_var)
        self.running_update = [(self.rn_mean, new_rn_mean),
                               (self.rn_var, new_rn_var)]
    else:
        out = batch_normalization_test(a, self.gamma, self.beta,
                                       self.rn_mean, self.rn_var)
    return self.f(out)
def test_batch_normalization_broadcastable():
    # check if the broadcastable pattern is preserved by the optimizations
    x, dy, scale, bias, mean, var = (T.scalar(n).dimshuffle(['x'] * 5)
                                     for n in ('x', 'dy', 'scale', 'bias', 'mean', 'var'))

    # forward pass
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'spatial')
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var, 'spatial')
    # backward pass
    grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})
    # compile
    f = theano.function([x, scale, bias, mean, var, dy],
                        [out_train, x_mean, x_invstd, out_test] +
                        grads_train + grads_test)
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        # returns:
        #   batch-normalized output
        #   batch mean
        #   batch variance
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, batch_mean, batch_invstd, new_running_mean, new_running_var = batch_normalization_train(
            activation,
            self.gamma,
            self.beta,
            running_mean=self.running_mean,
            running_var=self.running_var,
        )
        self.running_update = [
            (self.running_mean, new_running_mean),
            (self.running_var, new_running_var),
        ]

        # if you don't trust the built-in bn function:
        # batch_var = 1 / (batch_invstd * batch_invstd)
        # self.running_update = [
        #     (self.running_mean, 0.9*self.running_mean + 0.1*batch_mean),
        #     (self.running_var, 0.9*self.running_var + 0.1*batch_var),
        # ]
    else:
        out = batch_normalization_test(
            activation,
            self.gamma,
            self.beta,
            self.running_mean,
            self.running_var
        )
    return self.f(out)
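# --- A sketch of how the `self.running_update` pairs collected by forward()
# above might be consumed when compiling the training function (not part of
# the original snippet; `model.layers`, `cost`, `params`, and the SGD update
# rule are assumptions for illustration). The layers' forward() must already
# have been called with is_training=True so that running_update is populated.
import theano
import theano.tensor as T

def build_train_fn(model, X, targets, cost, params, lr=0.01):
    # standard SGD updates for the trainable parameters
    grads = T.grad(cost, params)
    updates = [(p, p - lr * g) for p, g in zip(params, grads)]
    # fold in the running-mean / running-var updates produced by each BN layer
    for layer in model.layers:
        updates += getattr(layer, 'running_update', [])
    return theano.function([X, targets], cost, updates=updates)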
def batch_norm(input_, gamma, beta, running_mean, running_var, is_training,
               axes='per-activation'):
    if is_training:
        # returns:
        #   batch-normalized output
        #   batch mean
        #   batch variance
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, _, _, new_running_mean, new_running_var = batch_normalization_train(
            input_,
            gamma,
            beta,
            running_mean=running_mean,
            running_var=running_var,
            axes=axes,
            running_average_factor=0.9,
        )
    else:
        new_running_mean = None
        new_running_var = None  # just to ensure we don't try to use them
        out = batch_normalization_test(
            input_,
            gamma,
            beta,
            running_mean,
            running_var,
            axes=axes,
        )
    return out, new_running_mean, new_running_var
def forward(self, prev_layer, train):
    self.drop = self.rng.binomial(size=prev_layer.shape, p=1 - self.dropout_rate)
    prev_layer = prev_layer * self.drop
    self.Z = T.dot(prev_layer, self.weights)
    if self.batch_norm == True:
        if train == True:
            self.Z, _, _, self.n_running_mean, self.n_running_variance = batch_normalization_train(
                self.Z, self.gamma, self.beta,
                running_mean=self.running_mean, running_var=self.running_variance)
            self.n_norm_params = [self.n_running_mean, self.n_running_variance]
        else:
            self.Z = batch_normalization_test(self.Z, self.gamma, self.beta,
                                              self.running_mean, self.running_variance)
    else:
        self.Z += self.biases
        self.n_norm_params = []
    if self.activation == 'relu':
        self.A = T.nnet.nnet.relu(self.Z)
    elif self.activation == 'sigmoid':
        self.A = T.nnet.nnet.sigmoid(self.Z)
    elif self.activation == 'tanh':
        # note: 2*sigmoid(z) - 1 equals tanh(z / 2), not tanh(z)
        self.A = 2 * T.nnet.nnet.sigmoid(self.Z) - 1
    elif self.activation == 'leaky_relu':
        self.A = T.nnet.nnet.relu(self.Z, alpha=0.1)
    elif self.activation == 'softmax':
        self.A = T.nnet.nnet.softmax(self.Z)
    else:
        raise ValueError('Activation Error')
    return self.A
def test_batch_normalization_train_broadcast():
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ('scale', 'bias',
                                                                'running_mean',
                                                                'running_var'))

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]

            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)

            inputs = [numpy.asarray(numpy.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def __init__(self,
             input,
             nkerns,
             input_shape,
             id,
             filter_shape=(3, 3),
             poolsize=(2, 2),
             pooltype='max',
             batch_norm=False,
             border_mode='valid',
             stride=(1, 1),
             rng=None,
             borrow=True,
             activation='relu',
             input_params=None,
             verbose=2,
             ):
    super(conv_pool_layer_2d, self).__init__(id=id, type='conv_pool', verbose=verbose)
    if verbose >= 3:
        print "... Creating conv pool layer"

    if rng is None:
        rng = numpy.random

    # To copy weights previously created or some weird initializations
    if input_params is not None:
        init_w = input_params[0]
        init_b = input_params[1]
        if batch_norm is True:
            init_gamma = input_params[2]
            init_beta = input_params[3]
            init_mean = input_params[4]
            init_var = input_params[5]

    mini_batch_size = input_shape[0]
    channels = input_shape[1]
    width = input_shape[3]
    height = input_shape[2]
    # srng = RandomStreams(rng.randint(1, 2147462579))

    # Initialize the parameters of this layer.
    w_shp = (nkerns, channels, filter_shape[0], filter_shape[1])

    if input_params is None:
        # fan_in = filter_shape[0]*filter_shape[1]
        # fan_out = filter_shape[0]*filter_shape[1] / numpy.prod(poolsize)
        # w_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.w = theano.shared(
            value=
            # numpy.asarray(rng.uniform(low=-w_bound, high=w_bound, size=w_shp),
            numpy.asarray(0.01 * rng.standard_normal(size=w_shp),
                          dtype=theano.config.floatX),
            borrow=borrow, name='filterbank')
        self.b = theano.shared(value=numpy.zeros((nkerns,), dtype=theano.config.floatX),
                               name='bias', borrow=borrow)
        if batch_norm is True:
            self.gamma = theano.shared(value=numpy.ones(
                (nkerns,), dtype=theano.config.floatX),
                name='gamma', borrow=borrow)
            self.beta = theano.shared(value=numpy.zeros(
                (nkerns,), dtype=theano.config.floatX),
                name='beta', borrow=borrow)
            self.running_mean = theano.shared(value=numpy.zeros(
                (nkerns,), dtype=theano.config.floatX),
                name='population_mean', borrow=borrow)
            self.running_var = theano.shared(value=numpy.ones(
                (nkerns,), dtype=theano.config.floatX),
                name='population_var', borrow=borrow)
    else:
        self.w = init_w
        self.b = init_b
        if batch_norm is True:
            self.gamma = init_gamma
            self.beta = init_beta
            self.running_mean = init_mean
            self.running_var = init_var

    # Perform the convolution part
    convolver = convolver_2d(input=input,
                             filters=self.w,
                             subsample=stride,
                             filter_shape=w_shp,
                             image_shape=input_shape,
                             border_mode=border_mode,
                             verbose=verbose)

    conv_out = convolver.out
    conv_out_shp = (mini_batch_size, nkerns,
                    convolver.out_shp[0], convolver.out_shp[1])

    self.conv_out = conv_out
    if not poolsize == (1, 1):
        pooler = pooler_2d(input=conv_out,
                           img_shp=conv_out_shp,
                           mode=pooltype,
                           ds=poolsize,
                           verbose=verbose)
        pool_out = pooler.out
        pool_out_shp = pooler.out_shp
    else:
        pool_out = conv_out
        pool_out_shp = conv_out_shp

    """
    Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    deep network training by reducing internal covariate shift." arXiv preprint
    arXiv:1502.03167 (2015).
""" if batch_norm is True: batch_norm_out,_,_,mean,var = batch_normalization_train( inputs = pool_out + \ self.b.dimshuffle('x', 0, 'x', 'x'), gamma = self.gamma, beta = self.beta, axes ='spatial', running_mean = self.running_mean, running_var = self.running_var ) mean = theano.tensor.unbroadcast(mean, 0) var = theano.tensor.unbroadcast(var, 0) self.updates[self.running_mean] = mean self.updates[self.running_var] = var + 0.001 batch_norm_inference = batch_normalization_test ( inputs = pool_out + \ self.b.dimshuffle('x', 0, 'x', 'x'), gamma = self.gamma, beta = self.beta, axes = 'spatial', mean = self.running_mean, var = self.running_var ) else: batch_norm_out = pool_out + self.b.dimshuffle('x', 0, 'x', 'x') batch_norm_inference = batch_norm_out batch_norm_out_shp = pool_out_shp self.output, self.output_shape = _activate( x=batch_norm_out, activation=activation, input_size=batch_norm_out_shp, verbose=verbose, dimension=2) self.inference, _ = _activate(x=batch_norm_inference, activation=activation, input_size=batch_norm_out_shp, verbose=verbose, dimension=2) # store parameters of this layer and do some book keeping. self.params = [self.w, self.b] self.active_params = [self.w, self.b] if batch_norm is True: self.params.append(self.gamma) self.params.append(self.beta) self.active_params.append(self.gamma) self.active_params.append(self.beta) self.params.append(self.running_mean) # inactive params self.params.append(self.running_var) # inactive params self.L1 = abs(self.w).sum() # if batch_norm is True : self.L1 = self.L1 # + abs(self.gamma).sum() self.L2 = (self.w**2).sum() # if batch_norm is True: self.L2 = self.L2 # + (self.gamma**2).sum() # Just doing this for print_layer method to use. self.nkerns = nkerns self.filter_shape = filter_shape self.poolsize = poolsize self.stride = stride self.input_shape = input_shape self.num_neurons = nkerns self.activation = activation self.batch_norm = batch_norm
def __init__(self,
             input,
             nkerns,
             input_shape,
             id,
             output_shape,
             filter_shape=(3, 3),
             poolsize=(1, 1),
             pooltype='max',
             batch_norm=False,
             border_mode='valid',
             stride=(1, 1),
             rng=None,
             borrow=True,
             activation='relu',
             input_params=None,
             verbose=2,
             ):
    super(deconv_layer_2d, self).__init__(id=id, type='deconv', verbose=verbose)
    if verbose >= 3:
        print "... Creating deconv layer"

    if rng is None:
        rng = numpy.random

    create_w = False
    create_b = False
    create_bn = False

    # To copy weights previously created or some weird initializations
    if not input_params is None:
        if input_params[0] is None:
            create_w = True
        if input_params[1] is None:
            create_b = True
        if batch_norm is True:
            if input_params[2] is None:
                create_bn = True
    else:
        create_w = True
        create_b = True
        create_bn = True

    mini_batch_size = input_shape[0]
    channels = input_shape[1]
    width = input_shape[3]
    height = input_shape[2]
    # srng = RandomStreams(rng.randint(1, 2147462579))

    # Initialize the parameters of this layer.
    w_shp = (nkerns, output_shape[2], filter_shape[0], filter_shape[1])
    o_shp = (input_shape[0], output_shape[2], output_shape[0], output_shape[1])

    if create_w is True:
        self.w = theano.shared(value=numpy.asarray(
            0.01 * rng.standard_normal(size=w_shp),
            dtype=theano.config.floatX),
            borrow=borrow, name='filterbank')
    else:
        self.w = input_params[0]

    if create_b is True:
        self.b = theano.shared(value=numpy.zeros(
            (output_shape[2],), dtype=theano.config.floatX),
            name='bias', borrow=borrow)
    else:
        self.b = input_params[1]

    if batch_norm is True:
        if create_bn is True:
            self.gamma = theano.shared(value=numpy.ones(
                (output_shape[2],), dtype=theano.config.floatX),
                name='gamma', borrow=borrow)
            self.beta = theano.shared(value=numpy.zeros(
                (output_shape[2],), dtype=theano.config.floatX),
                name='beta', borrow=borrow)
            self.running_mean = theano.shared(value=numpy.zeros(
                (output_shape[2],), dtype=theano.config.floatX),
                name='population_mean', borrow=borrow)
            self.running_var = theano.shared(value=numpy.ones(
                (output_shape[2],), dtype=theano.config.floatX),
                name='population_var', borrow=borrow)
        else:
            self.gamma = input_params[2]
            self.beta = input_params[3]
            self.running_mean = input_params[4]
            self.running_var = input_params[5]

    # Perform the convolution part
    convolver = deconvolver_2d(input=input,
                               filters=self.w,
                               output_shape=o_shp,
                               subsample=stride,
                               filter_shape=w_shp,
                               image_shape=input_shape,
                               border_mode=border_mode,
                               verbose=verbose)

    conv_out = convolver.out
    conv_out_shp = o_shp

    self.conv_out = conv_out
    if not poolsize == (1, 1):
        raise Exception("Unpool operation not yet supported by deconv layer")
        """ #pragma: no cover
        pooler = pooler_2d(
            input = conv_out,
            img_shp = conv_out_shp,
            mode = pooltype,
            ds = poolsize,
            verbose = verbose
        )
        pool_out = pooler.out
        pool_out_shp = pooler.out_shp
        """
    else:
        unpool_out = conv_out
        unpool_out_shp = conv_out_shp

    """
    Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    deep network training by reducing internal covariate shift." arXiv preprint
    arXiv:1502.03167 (2015).
""" if batch_norm is True: batch_norm_out,_,_,mean,var = batch_normalization_train( inputs = unpool_out + \ self.b.dimshuffle('x', 0, 'x', 'x'), gamma = self.gamma, beta = self.beta, axes ='spatial', running_mean = self.running_mean, running_var = self.running_var ) mean = theano.tensor.unbroadcast(mean, 0) var = theano.tensor.unbroadcast(var, 0) var = var + 0.000001 self.updates[self.running_mean] = mean self.updates[self.running_var] = var batch_norm_inference = batch_normalization_test ( inputs = unpool_out + \ self.b.dimshuffle('x', 0, 'x', 'x'), gamma = self.gamma, beta = self.beta, axes = 'spatial', mean = self.running_mean, var = self.running_var ) else: batch_norm_out = unpool_out + self.b.dimshuffle('x', 0, 'x', 'x') batch_norm_inference = batch_norm_out batch_norm_out_shp = unpool_out_shp if type(activation) is tuple: if activation[0] == 'maxout': raise Exception( 'Deconvolution layer does not support maxout activation') self.output, self.output_shape = _activate( x=batch_norm_out, activation=activation, input_size=batch_norm_out_shp, verbose=verbose, dimension=2) self.inference, _ = _activate(x=batch_norm_inference, activation=activation, input_size=batch_norm_out_shp, verbose=verbose, dimension=2) # store parameters of this layer and do some book keeping. self.params = [self.w, self.b] self.active_params = [self.w, self.b] if batch_norm is True: self.params.append(self.gamma) self.params.append(self.beta) self.active_params.append(self.gamma) self.active_params.append(self.beta) self.params.append(self.running_mean) # inactive params self.params.append(self.running_var) # inactive params self.L1 = abs(self.w).sum() # if batch_norm is True : self.L1 = self.L1 # + abs(self.gamma).sum() self.L2 = (self.w**2).sum() # if batch_norm is True: self.L2 = self.L2 # + (self.gamma**2).sum() # Just doing this for print_layer method to use. self.nkerns = nkerns self.filter_shape = filter_shape self.poolsize = poolsize self.stride = stride self.input_shape = input_shape self.num_neurons = nkerns self.activation = activation self.batch_norm = batch_norm
def __init__(self,
             input,
             num_neurons,
             input_shape,
             id,
             rng=None,
             input_params=None,
             borrow=True,
             activation='relu',
             batch_norm=True,
             verbose=2):
    super(dot_product_layer, self).__init__(id=id, type='dot_product', verbose=verbose)
    if verbose >= 3:
        print "... Creating dot product layer"

    if rng is None:
        rng = numpy.random

    create = False
    if input_params is None:
        create = True
    elif input_params[0] is None:
        create = True
    if create is True:
        w_values = numpy.asarray(0.01 * rng.standard_normal(
            size=(input_shape[1], num_neurons)), dtype=theano.config.floatX)
        if activation == 'sigmoid':
            w_values *= 4
        self.w = theano.shared(value=w_values, name='weights')
    else:
        self.w = input_params[0]

    create = False
    if input_params is None:
        create = True
    elif input_params[1] is None:
        create = True
    if create is True:
        b_values = numpy.zeros((num_neurons,), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name='bias')
    else:
        self.b = input_params[1]

    if batch_norm is True:
        create = False
        if input_params is None:
            create = True
        elif input_params[2] is None:
            create = True
        if create is True:
            gamma_values = numpy.ones((1, num_neurons), dtype=theano.config.floatX)
            self.gamma = theano.shared(value=gamma_values, name='gamma')
            beta_values = numpy.zeros((1, num_neurons), dtype=theano.config.floatX)
            self.beta = theano.shared(value=beta_values, name='beta')
            self.running_mean = theano.shared(value=numpy.zeros(
                (1, num_neurons), dtype=theano.config.floatX),
                name='population_mean', borrow=borrow)
            self.running_var = theano.shared(value=numpy.ones(
                (1, num_neurons), dtype=theano.config.floatX),
                name='population_var', borrow=borrow)
        else:
            self.gamma = input_params[2]
            self.beta = input_params[3]
            self.running_mean = input_params[4]
            self.running_var = input_params[5]

    linear_fit = T.dot(input, self.w) + self.b

    if batch_norm is True:
        batch_norm_out, _, _, mean, var = batch_normalization_train(
            inputs=linear_fit,
            gamma=self.gamma,
            beta=self.beta,
            running_mean=self.running_mean,
            running_var=self.running_var)

        mean = theano.tensor.unbroadcast(mean, 0)
        var = theano.tensor.unbroadcast(var, 0)
        self.updates[self.running_mean] = mean
        self.updates[self.running_var] = var + 0.001

        batch_norm_inference = batch_normalization_test(
            inputs=linear_fit,
            gamma=self.gamma,
            beta=self.beta,
            mean=self.running_mean,
            var=self.running_var)
    else:
        batch_norm_out = linear_fit
        batch_norm_inference = batch_norm_out

    batch_norm_shp = (input_shape[0], num_neurons)
    self.output, self.output_shape = _activate(x=batch_norm_out,
                                               activation=activation,
                                               input_size=batch_norm_shp,
                                               verbose=verbose,
                                               dimension=1)

    self.inference, _ = _activate(x=batch_norm_inference,
                                  activation=activation,
                                  input_size=batch_norm_shp,
                                  verbose=verbose,
                                  dimension=1)

    # parameters of the model
    if batch_norm is True:
        self.params = [self.w, self.b, self.gamma, self.beta,
                       self.running_mean, self.running_var]
        self.active_params = [self.w, self.b, self.gamma, self.beta]
    else:
        self.params = [self.w, self.b]
        self.active_params = [self.w, self.b]

    self.L1 = abs(self.w).sum()
    # if batch_norm is True: self.L1 = self.L1 + abs(self.gamma).sum()
    self.L2 = (self.w**2).sum()
    # if batch_norm is True: self.L2 = self.L2 + (self.gamma**2).sum()

    """
    Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    deep network training by reducing internal covariate shift." arXiv preprint
    arXiv:1502.03167 (2015).
    """

    if verbose >= 3:
        print "... Dot Product layer is created with output shape " + str(self.output_shape)

    self.num_neurons = num_neurons
    self.activation = activation
    self.batch_norm = batch_norm
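# --- A sketch of how the per-layer `self.updates` dictionaries populated by the
# layer classes above (conv_pool, deconv, dot_product) might be gathered when
# compiling the training function (not part of the original snippets; the
# `net.layers` collection, `cost`, `params`, and the plain SGD rule are
# assumptions for illustration).
import theano
import theano.tensor as T

def compile_training_function(net, inputs, cost, params, lr=0.01):
    # gradient-descent updates for the trainable parameters
    grads = T.grad(cost, params)
    updates = [(p, p - lr * g) for p, g in zip(params, grads)]
    # each batch-norm layer stores {running_mean: new_mean, running_var: new_var}
    for layer in net.layers:
        updates += list(getattr(layer, 'updates', {}).items())
    return theano.function(inputs, cost, updates=updates)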