def call_on_params(self, x, params):
    k = self.hidden_depth + 2
    ws = params[0:k]
    bs = params[k:2 * k]
    if self.use_bn:
        bn = params[2 * k:]
        gammas = bn[:k - 1]
        betas = bn[k - 1:]
    h = T.dot(x, ws[0]) + bs[0]
    if self.hidden_activation:
        h = self.hidden_activation(h)
    if self.use_bn:
        h, _m, _s = batch_normalization_train(h, gamma=gammas[0], beta=betas[0])
    for j in range(self.hidden_depth):
        h = T.dot(h, ws[j + 1]) + bs[j + 1]
        if self.hidden_activation:
            h = self.hidden_activation(h)
        if self.use_bn:
            h, _m, _s = batch_normalization_train(h, gamma=gammas[1 + j], beta=betas[1 + j])
    y = T.dot(h, ws[-1]) + bs[-1]
    if self.output_activation:
        y = self.output_activation(y)
    return y
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        # batch_normalization_train returns:
        #   batch-normalized output
        #   batch mean
        #   batch inverse standard deviation
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, batch_mean, batch_invstd, new_running_mean, new_running_var = batch_normalization_train(
            activation,
            self.gamma,
            self.beta,
            running_mean=self.running_mean,
            running_var=self.running_var,
        )
        self.running_update = [
            (self.running_mean, new_running_mean),
            (self.running_var, new_running_var),
        ]

        # if you don't trust the built-in bn function:
        # batch_var = 1 / (batch_invstd * batch_invstd)
        # self.running_update = [
        #     (self.running_mean, 0.9*self.running_mean + 0.1*batch_mean),
        #     (self.running_var, 0.9*self.running_var + 0.1*batch_var),
        # ]
    else:
        out = batch_normalization_test(activation, self.gamma, self.beta,
                                       self.running_mean, self.running_var)
    return self.f(out)
def batch_norm(input_, gamma, beta, running_mean, running_var, is_training,
               axes='per-activation'):
    if is_training:
        # batch_normalization_train returns:
        #   batch-normalized output
        #   batch mean
        #   batch inverse standard deviation
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, _, _, running_mean, running_var = batch_normalization_train(
            input_,
            gamma,
            beta,
            running_mean=running_mean,
            running_var=running_var,
            axes=axes,
            running_average_factor=0.9,
        )
    else:
        out = batch_normalization_test(
            input_,
            gamma,
            beta,
            running_mean,
            running_var,
            axes=axes,
        )
    return out, running_mean, running_var
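The wrapper above only builds the symbolic graph; the caller still has to push the returned running statistics back into the shared variables. A minimal usage sketch under assumed names and shapes (the 64-unit feature size, the shared variables, and train_fn/predict_fn are illustrative, not part of the original example):

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
x = T.matrix('x')

# assumed parameter and statistics buffers for a 64-unit layer
gamma = theano.shared(np.ones(64, dtype=floatX), name='gamma')
beta = theano.shared(np.zeros(64, dtype=floatX), name='beta')
running_mean = theano.shared(np.zeros(64, dtype=floatX), name='running_mean')
running_var = theano.shared(np.ones(64, dtype=floatX), name='running_var')

# training graph: the helper also returns the updated running statistics,
# which are committed to the shared variables via `updates`
out_train, new_mean, new_var = batch_norm(
    x, gamma, beta, running_mean, running_var, is_training=True)
train_fn = theano.function(
    [x], out_train,
    updates=[(running_mean, new_mean), (running_var, new_var)])

# inference graph: uses the stored population statistics, no updates needed
out_test, _, _ = batch_norm(
    x, gamma, beta, running_mean, running_var, is_training=False)
predict_fn = theano.function([x], out_test)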
def test_batch_normalization_train_without_running_averages():
    # compile and run batch_normalization_train without running averages
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)
    # check if the abstract Ops have been replaced
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * np.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * np.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = np.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = np.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Dy)
def test_batch_normalization_broadcastable():
    # check if the broadcastable pattern is preserved by the optimizations
    x, dy, scale, bias, mean, var = (T.scalar(n).dimshuffle(["x"] * 5)
                                     for n in ("x", "dy", "scale", "bias", "mean", "var"))

    # forward pass
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, "spatial")
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var, "spatial")
    # backward pass
    grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})
    # compile
    f = theano.function(
        [x, scale, bias, mean, var, dy],
        [out_train, x_mean, x_invstd, out_test] + grads_train + grads_test,
    )
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
def batchNorm(x, train, gamma, beta, RM, RV, ax):
    values_train, _, _, newRM, newRV = batch_normalization_train(
        x, gamma, beta, axes=ax, running_mean=RM, running_var=RV)
    values = ifelse(T.neq(train, 1),
                    batch_normalization_test(x, gamma, beta, RM, RV, axes=ax),
                    values_train)
    return values, newRM, newRV
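Because ifelse takes a symbolic condition, a single compiled function can serve both phases when the train flag is passed in as an input. A sketch of one way this helper might be wired up; the shared variables and the 128-unit feature size are assumed for illustration, and the running averages are committed only when the flag is 1:

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
x = T.matrix('x')
train_flag = T.iscalar('train_flag')  # 1 during training, 0 at test time

gamma = theano.shared(np.ones(128, dtype=floatX), name='gamma')
beta = theano.shared(np.zeros(128, dtype=floatX), name='beta')
RM = theano.shared(np.zeros(128, dtype=floatX), name='running_mean')
RV = theano.shared(np.ones(128, dtype=floatX), name='running_var')

y, newRM, newRV = batchNorm(x, train_flag, gamma, beta, RM, RV, 'per-activation')

# only commit the running averages when actually training
updates = [(RM, T.switch(T.eq(train_flag, 1), newRM, RM)),
           (RV, T.switch(T.eq(train_flag, 1), newRV, RV))]
fn = theano.function([x, train_flag], y, updates=updates)
# fn(X_batch, 1) -> uses batch statistics, running averages updated
# fn(X_batch, 0) -> uses population statistics, running averages untouched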
def input_layer(self, input_data):
    if self.perform_normalization == "all" \
            or self.perform_normalization == "only input":
        gamma = theano.shared(1.)
        bias = theano.shared(0.)
        running_mean = theano.shared(0.)
        running_var = theano.shared(0.)
        normalized_input_data, _, _, \
            new_running_mean, new_running_var = \
            batch_normalization_train(input_data, gamma, bias,
                                      axes=(0, 1),
                                      running_mean=running_mean,
                                      running_var=running_var)
        output = normalized_input_data.reshape(self.convolution_input_shape)
        self.updates.append((running_mean, new_running_mean))
        self.updates.append((running_var, new_running_var))
    else:
        output = input_data.reshape(self.convolution_input_shape)
    return output
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        out, batch_mean, batch_invstd, new_running_mean, new_running_var = batch_normalization_train(
            activation, self.gamma, self.beta,
            running_mean=self.running_mean, running_var=self.running_var)
        self.running_update = [
            (self.running_mean, new_running_mean),
            (self.running_var, new_running_var),
        ]

        # how the running averages are updated internally:
        # batch_var = 1 / (batch_invstd * batch_invstd)
        # self.running_update = [
        #     (self.running_mean, 0.9*self.running_mean + 0.1*batch_mean),
        #     (self.running_var, 0.9*self.running_var + 0.1*batch_var),
        # ]
    else:
        out = batch_normalization_test(activation, self.gamma, self.beta,
                                       self.running_mean, self.running_var)
    return self.f(out)
def test_batch_normalization_train_without_running_averages():
    # compile and run batch_normalization_train without running averages
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)
    # check if the abstract Ops have been replaced
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Dy)
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        out, b_mean, b_invstd, new_mean, new_var = batch_normalization_train(
            activation, self.gamma, self.beta,
            running_mean=self.running_mean, running_var=self.running_var)
        # store the update rules for the running mean and var on the layer
        # so they can be collected outside
        self.running_update = [
            (self.running_mean, new_mean),
            (self.running_var, new_var),
        ]
    else:
        out = batch_normalization_test(
            activation, self.gamma, self.beta,
            self.running_mean, self.running_var)
    if self.af is None:
        return out
    else:
        return self.af(out)
def convolution_layer(self, input_data):
    """ Weight and bias for the layer """
    filter_weights = theano.shared(
        np.asarray(np.random.normal(0, 1, size=self.filter_shape),
                   dtype=theano.config.floatX),
        name="Filter weights", borrow=True)
    bias_convolution = theano.shared(
        np.zeros((self.number_of_filters,), dtype=theano.config.floatX),
        borrow=True)

    """ Convolution """
    convolution = causal_conv1d(input=input_data,
                                filters=filter_weights,
                                filter_shape=self.filter_shape,
                                input_shape=self.convolution_input_shape)
    convolution_output = convolution + bias_convolution.dimshuffle("x", 0, "x")

    if self.perform_normalization == "all":
        gamma = theano.shared(1.)
        bias = theano.shared(0.)
        running_mean = theano.shared(0.)
        running_var = theano.shared(0.)
        normalized_output, _, _, \
            new_running_mean, new_running_var = \
            batch_normalization_train(convolution_output, gamma, bias,
                                      axes=(0, 1, 2),
                                      running_mean=running_mean,
                                      running_var=running_var)
        self.updates.append((running_mean, new_running_mean))
        self.updates.append((running_var, new_running_var))
        activation_output = self.activation(normalized_output)
    else:
        activation_output = self.activation(convolution_output)

    """ Add parameters to be updated """
    self.parameters.append(filter_weights)
    self.parameters.append(bias_convolution)
    return activation_output
def call(self, x):
    out, mean, std, newmean, newvar = BN.batch_normalization_train(
        inputs=x, gamma=self.gamma, beta=self.beta, axes='per-activation',
        running_mean=self.mean, running_var=self.var)
    updates = [(self.mean, T.cast(newmean, 'float32')),
               (self.var, T.cast(newvar, 'float32'))]
    return out, updates
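The call method above hands the update pairs back to the caller instead of applying them, so they have to be merged into whatever theano.function does the training. A rough sketch of that wiring; `layer` is a hypothetical instance of the class and the cost and learning rate are placeholders:

import theano
import theano.tensor as T

x = T.matrix('x')
out, bn_updates = layer.call(x)  # `layer` is an assumed instance of the class above

cost = out.sum()  # placeholder cost, purely for illustration
grads = T.grad(cost, wrt=[layer.gamma, layer.beta])
sgd_updates = [(p, p - 0.01 * g)
               for p, g in zip([layer.gamma, layer.beta], grads)]

# the running mean/var updates must ride along with the parameter updates,
# otherwise the population statistics are never refreshed
train_fn = theano.function([x], cost, updates=sgd_updates + bn_updates)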
def fully_connected_layer(self, input_data):
    """ Weight and bias for the layer """
    W_fully_connected = theano.shared(
        np.asarray(np.random.normal(0, 1, size=self.fully_connected_layer_shape),
                   dtype=theano.config.floatX),
        name="W fully connected", borrow=True)
    bias_fully_connected = theano.shared(
        np.zeros((self.fully_connected_layer_shape[1],), dtype=theano.config.floatX),
        name="bias fully connected", borrow=True)

    dot_output = T.dot(input_data, W_fully_connected) + bias_fully_connected

    if self.perform_normalization == "all":
        gamma = theano.shared(1.)
        bias = theano.shared(0.)
        running_mean = theano.shared(0.)
        running_var = theano.shared(0.)
        normalized_output, _, _, \
            new_running_mean, new_running_var = \
            batch_normalization_train(dot_output, gamma, bias,
                                      axes=(0, 1),
                                      running_mean=running_mean,
                                      running_var=running_var)
        self.updates.append((running_mean, new_running_mean))
        self.updates.append((running_var, new_running_var))
        output_fully_connected = self.activation(normalized_output)
    else:
        output_fully_connected = self.activation(dot_output)

    """ Add parameters to be updated """
    self.parameters.append(W_fully_connected)
    self.parameters.append(bias_fully_connected)
    return output_fully_connected
def output_layer(self, input_data):
    """ Weight and bias for the layer """
    W_output_layer = theano.shared(
        np.asarray(np.random.normal(0, 1, size=self.output_layer_shape),
                   dtype=theano.config.floatX),
        name="W output", borrow=True)
    bias_output_layer = theano.shared(
        np.zeros((self.output_layer_shape[1],), dtype=theano.config.floatX),
        name="bias output", borrow=True)

    dot_output = T.dot(input_data, W_output_layer) + bias_output_layer

    if self.perform_normalization:
        gamma = theano.shared(1.)
        bias = theano.shared(0.)
        running_mean = theano.shared(0.)
        running_var = theano.shared(0.)
        normalized_output, _, _, \
            new_running_mean, new_running_var = \
            batch_normalization_train(dot_output, gamma, bias,
                                      axes=(0, 1),
                                      running_mean=running_mean,
                                      running_var=running_var)
        self.updates.append((running_mean, new_running_mean))
        self.updates.append((running_var, new_running_var))
        output = self.classification_method(normalized_output)
    else:
        output = self.classification_method(dot_output)

    """ Add parameters to be updated """
    self.parameters.append(W_output_layer)
    self.parameters.append(bias_output_layer)
    return output
def forward(self, Z, is_training):
    a = Z.dot(self.W)
    if is_training:
        out, batch_mean, batch_invstd, new_rn_mean, new_rn_var = batch_normalization_train(
            a, self.gamma, self.beta,
            running_mean=self.rn_mean, running_var=self.rn_var)
        self.running_update = [(self.rn_mean, new_rn_mean),
                               (self.rn_var, new_rn_var)]
    else:
        out = batch_normalization_test(a, self.gamma, self.beta,
                                       self.rn_mean, self.rn_var)
    return self.f(out)
def test_batch_normalization_broadcastable():
    # check if the broadcastable pattern is preserved by the optimizations
    x, dy, scale, bias, mean, var = (T.scalar(n).dimshuffle(['x'] * 5)
                                     for n in ('x', 'dy', 'scale', 'bias', 'mean', 'var'))

    # forward pass
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'spatial')
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var, 'spatial')
    # backward pass
    grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})
    # compile
    f = theano.function([x, scale, bias, mean, var, dy],
                        [out_train, x_mean, x_invstd, out_test] +
                        grads_train + grads_test)
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
def forward(self, X, is_training, decay=0.9):
    Z = X.dot(self.W)
    if is_training:
        Z, batch_mean, batch_invstd, new_rn_mean, new_rn_var = batch_normalization_train(
            Z, self.gamma, self.betta,
            running_mean=self.rn_mean, running_var=self.rn_var)
        self.rn_update = [(self.rn_mean, new_rn_mean),
                          (self.rn_var, new_rn_var)]
    else:
        Z = batch_normalization_test(Z, self.gamma, self.betta,
                                     self.rn_mean, self.rn_var)
    return self.f(Z)
def forward(self, X, is_training):
    activation = X.dot(self.W)
    if is_training:
        # batch_normalization_train returns:
        #   batch-normalized output
        #   batch mean
        #   batch inverse standard deviation
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, batch_mean, batch_invstd, new_running_mean, new_running_var = batch_normalization_train(
            activation,
            self.gamma,
            self.beta,
            running_mean=self.running_mean,
            running_var=self.running_var,
        )
        self.running_update = [
            (self.running_mean, new_running_mean),
            (self.running_var, new_running_var),
        ]

        # if you don't trust the built-in bn function:
        # batch_var = 1 / (batch_invstd * batch_invstd)
        # self.running_update = [
        #     (self.running_mean, 0.9*self.running_mean + 0.1*batch_mean),
        #     (self.running_var, 0.9*self.running_var + 0.1*batch_var),
        # ]
    else:
        out = batch_normalization_test(
            activation, self.gamma, self.beta,
            self.running_mean, self.running_var)
    return self.f(out)
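A sketch of how the running_update list stored by this layer might be consumed at compile time; `layer`, `X`, `train_fn`, and `predict_fn` are assumed names used only for illustration:

import theano
import theano.tensor as T

X = T.matrix('X')

# build both phases from the same layer object
out_train = layer.forward(X, is_training=True)    # populates layer.running_update
out_test = layer.forward(X, is_training=False)    # uses the stored running statistics

# the running-average updates must be attached to the training function,
# otherwise the shared running_mean/running_var never change
train_fn = theano.function([X], out_train, updates=layer.running_update)
predict_fn = theano.function([X], out_test)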
def batch_norm(input_, gamma, beta, running_mean, running_var, is_training,
               axes='per-activation'):
    if is_training:
        # batch_normalization_train returns:
        #   batch-normalized output
        #   batch mean
        #   batch inverse standard deviation
        #   running mean (for later use as population mean estimate)
        #   running var (for later use as population var estimate)
        out, _, _, new_running_mean, new_running_var = batch_normalization_train(
            input_,
            gamma,
            beta,
            running_mean=running_mean,
            running_var=running_var,
            axes=axes,
            running_average_factor=0.9,
        )
    else:
        new_running_mean = None
        new_running_var = None  # just to ensure we don't try to use them
        out = batch_normalization_test(
            input_,
            gamma,
            beta,
            running_mean,
            running_var,
            axes=axes,
        )
    return out, new_running_mean, new_running_var
def get_output(self, input, **kwargs):
    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = self.beta.dimshuffle(pattern)
    gamma = self.gamma.dimshuffle(pattern)
    mean = self.mean.dimshuffle(pattern)
    var = self.var.dimshuffle(pattern)

    if not self.deterministic:
        normalized, _, _, mean_, var_ = bn.batch_normalization_train(
            input, gamma, beta, self.axes_org, self.epsilon, self.alpha, mean, var)

        # Update running mean and variance
        # Tricks adopted from Lasagne implementation
        # http://lasagne.readthedocs.io/en/latest/modules/layers/normalization.html
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_var = theano.clone(self.var, share_inputs=False)
        running_mean.default_update = mean_.dimshuffle(self.non_bc_axes)
        running_var.default_update = var_.dimshuffle(self.non_bc_axes)
        self.mean += 0 * running_mean
        self.var += 0 * running_var
    else:
        normalized = bn.batch_normalization_test(
            input, gamma, beta, mean, var, self.axes_org, self.epsilon)
        # normalized, _, _, _, _ = bn.batch_normalization_train(
        #     input, gamma, beta, self.axes_org, self.epsilon, 0, mean, var)
        # normalized = (input - mean) * (gamma / T.sqrt(var)) + beta
    return self.activation(normalized)
def forward(self, prev_layer, train):
    self.drop = self.rng.binomial(size=prev_layer.shape, p=1 - self.dropout_rate)
    prev_layer = prev_layer * self.drop
    self.Z = T.dot(prev_layer, self.weights)

    if self.batch_norm == True:
        if train == True:
            self.Z, _, _, self.n_running_mean, self.n_running_variance = batch_normalization_train(
                self.Z, self.gamma, self.beta,
                running_mean=self.running_mean,
                running_var=self.running_variance)
            self.n_norm_params = [self.n_running_mean, self.n_running_variance]
        else:
            self.Z = batch_normalization_test(self.Z, self.gamma, self.beta,
                                              self.running_mean,
                                              self.running_variance)
    else:
        self.Z += self.biases
        self.n_norm_params = []

    if self.activation == 'relu':
        self.A = T.nnet.nnet.relu(self.Z)
    elif self.activation == 'sigmoid':
        self.A = T.nnet.nnet.sigmoid(self.Z)
    elif self.activation == 'tanh':
        self.A = 2 * T.nnet.nnet.sigmoid(self.Z) - 1
    elif self.activation == 'leaky_relu':
        self.A = T.nnet.nnet.relu(self.Z, alpha=0.1)
    elif self.activation == 'softmax':
        self.A = T.nnet.nnet.softmax(self.Z)
    else:
        raise ValueError('Activation Error')
    return self.A
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n)
                                                         for n in ('x', 'scale', 'bias',
                                                                   'running_mean', 'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd, out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n)
                                                         for n in ('x', 'scale', 'bias',
                                                                   'running_mean', 'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd, out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def test_batch_normalization_train_broadcast():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype("x")
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ["x"] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n) for n in
                                                      ("scale", "bias",
                                                       "running_mean", "running_var"))

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = bn.batch_normalization_train(
                x, scale, bias, axes, eps,
                running_average_factor, running_mean, running_var,
            )
            # batch_normalization_train with broadcasted variables
            train_bc = bn.batch_normalization_train(
                x, scale_bc, bias_bc, axes, eps,
                running_average_factor, running_mean_bc, running_var_bc,
            )
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = bn.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = bn.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]

            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)

            inputs = [np.asarray(np.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor3, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n) for n in (
                "x", "scale", "bias", "running_mean", "running_var"))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            (out, x_mean, x_invstd, out_running_mean,
             out_running_var) = bn.batch_normalization_train(
                x, scale, bias, axes, eps,
                running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = (running_mean * (1 - running_average_factor) +
                                 x_mean2 * running_average_factor)
            out_running_var2 = (running_var * (1 - running_average_factor) +
                                (m / (m - 1)) * x_var2 * running_average_factor)

            # backward pass
            dy = vartype("dy")
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})

            # second-order backward pass
            dx = vartype("dinputs")
            dscale = vartype("dscale")
            dbias = vartype("dbias")
            grad_grads = T.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict({grads[0]: dx,
                                         grads[1]: dscale,
                                         grads[2]: dbias}),
                consider_constant=[x, dy, scale, bias, x_mean, x_invstd,
                                   running_mean, running_var],
                return_disconnected="zero")
            # reference second-order backward pass
            grad_grads2 = T.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict({grads2[0]: dx,
                                         grads2[1]: dscale,
                                         grads2[2]: dbias}),
                consider_constant=[x, dy, scale, bias, x_mean2, x_var2,
                                   running_mean, running_var],
                return_disconnected="zero")

            # compile
            f = theano.function(
                [x, scale, bias, running_mean, running_var, dy, dx, dscale, dbias],
                [out, x_mean, x_invstd, out_running_mean, out_running_var,
                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                grads + grads2 + grad_grads + grad_grads2)

            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])

            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * np.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = np.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = np.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = np.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = np.random.randn(*param_shape).astype(theano.config.floatX)
                Dx = 4 + 3 * np.random.randn(*data_shape).astype(theano.config.floatX)
                Dscale = -1 + 2 * np.random.randn(*param_shape).astype(theano.config.floatX)
                Dbias = np.random.randn(*param_shape).astype(theano.config.floatX)

                outputs = f(X, Scale, Bias, Running_mean, Running_var,
                            Dy, Dx, Dscale, Dbias)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                utt.assert_allclose(np.nan_to_num(outputs[4]),
                                    np.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3],
                                    rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
                # compare second-order gradients
                utt.assert_allclose(outputs[16], outputs[16 + 3], atol=1e-4)  # ddx
                utt.assert_allclose(outputs[17], outputs[17 + 3])  # ddy
                utt.assert_allclose(outputs[18], outputs[18 + 3],
                                    rtol=3e-4, atol=1e-4)  # ddscale
def __init__(self,
             input,
             nkerns,
             input_shape,
             id,
             output_shape,
             filter_shape=(3, 3),
             poolsize=(1, 1),
             pooltype='max',
             batch_norm=False,
             border_mode='valid',
             stride=(1, 1),
             rng=None,
             borrow=True,
             activation='relu',
             input_params=None,
             verbose=2):

    super(deconv_layer_2d, self).__init__(id=id, type='deconv', verbose=verbose)
    if verbose >= 3:
        print "... Creating deconv layer"

    if rng is None:
        rng = numpy.random

    create_w = False
    create_b = False
    create_bn = False

    # To copy weights previously created or some weird initializations
    if input_params is not None:
        if input_params[0] is None:
            create_w = True
        if input_params[1] is None:
            create_b = True
        if batch_norm is True:
            if input_params[2] is None:
                create_bn = True
    else:
        create_w = True
        create_b = True
        create_bn = True

    mini_batch_size = input_shape[0]
    channels = input_shape[1]
    width = input_shape[3]
    height = input_shape[2]
    # srng = RandomStreams(rng.randint(1, 2147462579))

    # Initialize the parameters of this layer.
    w_shp = (nkerns, output_shape[2], filter_shape[0], filter_shape[1])
    o_shp = (input_shape[0], output_shape[2], output_shape[0], output_shape[1])

    if create_w is True:
        self.w = theano.shared(value=numpy.asarray(
            0.01 * rng.standard_normal(size=w_shp),
            dtype=theano.config.floatX),
            borrow=borrow, name='filterbank')
    else:
        self.w = input_params[0]

    if create_b is True:
        self.b = theano.shared(value=numpy.zeros(
            (output_shape[2],), dtype=theano.config.floatX),
            name='bias', borrow=borrow)
    else:
        self.b = input_params[1]

    if batch_norm is True:
        if create_bn is True:
            self.gamma = theano.shared(value=numpy.ones(
                (output_shape[2],), dtype=theano.config.floatX),
                name='gamma', borrow=borrow)
            self.beta = theano.shared(value=numpy.zeros(
                (output_shape[2],), dtype=theano.config.floatX),
                name='beta', borrow=borrow)
            self.running_mean = theano.shared(value=numpy.zeros(
                (output_shape[2],), dtype=theano.config.floatX),
                name='population_mean', borrow=borrow)
            self.running_var = theano.shared(value=numpy.ones(
                (output_shape[2],), dtype=theano.config.floatX),
                name='population_var', borrow=borrow)
        else:
            self.gamma = input_params[2]
            self.beta = input_params[3]
            self.running_mean = input_params[4]
            self.running_var = input_params[5]

    # Perform the convolution part
    convolver = deconvolver_2d(input=input,
                               filters=self.w,
                               output_shape=o_shp,
                               subsample=stride,
                               filter_shape=w_shp,
                               image_shape=input_shape,
                               border_mode=border_mode,
                               verbose=verbose)
    conv_out = convolver.out
    conv_out_shp = o_shp
    self.conv_out = conv_out

    if not poolsize == (1, 1):
        raise Exception(" Unpool operation not yet supported by deconv layer")
        """  # pragma: no cover
        pooler = pooler_2d(
            input=conv_out,
            img_shp=conv_out_shp,
            mode=pooltype,
            ds=poolsize,
            verbose=verbose)
        pool_out = pooler.out
        pool_out_shp = pooler.out_shp
        """
    else:
        unpool_out = conv_out
        unpool_out_shp = conv_out_shp

    # Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    # deep network training by reducing internal covariate shift."
    # arXiv preprint arXiv:1502.03167 (2015).
    if batch_norm is True:
        batch_norm_out, _, _, mean, var = batch_normalization_train(
            inputs=unpool_out + self.b.dimshuffle('x', 0, 'x', 'x'),
            gamma=self.gamma,
            beta=self.beta,
            axes='spatial',
            running_mean=self.running_mean,
            running_var=self.running_var)

        mean = theano.tensor.unbroadcast(mean, 0)
        var = theano.tensor.unbroadcast(var, 0)
        var = var + 0.000001

        self.updates[self.running_mean] = mean
        self.updates[self.running_var] = var

        batch_norm_inference = batch_normalization_test(
            inputs=unpool_out + self.b.dimshuffle('x', 0, 'x', 'x'),
            gamma=self.gamma,
            beta=self.beta,
            axes='spatial',
            mean=self.running_mean,
            var=self.running_var)
    else:
        batch_norm_out = unpool_out + self.b.dimshuffle('x', 0, 'x', 'x')
        batch_norm_inference = batch_norm_out

    batch_norm_out_shp = unpool_out_shp

    if type(activation) is tuple:
        if activation[0] == 'maxout':
            raise Exception('Deconvolution layer does not support maxout activation')

    self.output, self.output_shape = _activate(x=batch_norm_out,
                                               activation=activation,
                                               input_size=batch_norm_out_shp,
                                               verbose=verbose,
                                               dimension=2)
    self.inference, _ = _activate(x=batch_norm_inference,
                                  activation=activation,
                                  input_size=batch_norm_out_shp,
                                  verbose=verbose,
                                  dimension=2)

    # store parameters of this layer and do some book keeping.
    self.params = [self.w, self.b]
    self.active_params = [self.w, self.b]
    if batch_norm is True:
        self.params.append(self.gamma)
        self.params.append(self.beta)
        self.active_params.append(self.gamma)
        self.active_params.append(self.beta)
        self.params.append(self.running_mean)  # inactive params
        self.params.append(self.running_var)   # inactive params

    self.L1 = abs(self.w).sum()
    # if batch_norm is True: self.L1 = self.L1 + abs(self.gamma).sum()
    self.L2 = (self.w ** 2).sum()
    # if batch_norm is True: self.L2 = self.L2 + (self.gamma**2).sum()

    # Just doing this for print_layer method to use.
    self.nkerns = nkerns
    self.filter_shape = filter_shape
    self.poolsize = poolsize
    self.stride = stride
    self.input_shape = input_shape
    self.num_neurons = nkerns
    self.activation = activation
    self.batch_norm = batch_norm
def __init__(self,
             input,
             nkerns,
             input_shape,
             id,
             filter_shape=(3, 3),
             poolsize=(2, 2),
             pooltype='max',
             batch_norm=False,
             border_mode='valid',
             stride=(1, 1),
             rng=None,
             borrow=True,
             activation='relu',
             input_params=None,
             verbose=2):

    super(conv_pool_layer_2d, self).__init__(id=id, type='conv_pool', verbose=verbose)
    if verbose >= 3:
        print "... Creating conv pool layer"

    if rng is None:
        rng = numpy.random

    # To copy weights previously created or some weird initializations
    if input_params is not None:
        init_w = input_params[0]
        init_b = input_params[1]
        if batch_norm is True:
            init_gamma = input_params[2]
            init_beta = input_params[3]
            init_mean = input_params[4]
            init_var = input_params[5]

    mini_batch_size = input_shape[0]
    channels = input_shape[1]
    width = input_shape[3]
    height = input_shape[2]
    # srng = RandomStreams(rng.randint(1, 2147462579))

    # Initialize the parameters of this layer.
    w_shp = (nkerns, channels, filter_shape[0], filter_shape[1])

    if input_params is None:
        # fan_in = filter_shape[0]*filter_shape[1]
        # fan_out = filter_shape[0]*filter_shape[1] / numpy.prod(poolsize)
        # w_bound = numpy.sqrt(6. / (fan_in + fan_out))
        # self.w = theano.shared(value=numpy.asarray(
        #     rng.uniform(low=-w_bound, high=w_bound, size=w_shp), ...))
        self.w = theano.shared(value=numpy.asarray(
            0.01 * rng.standard_normal(size=w_shp),
            dtype=theano.config.floatX),
            borrow=borrow, name='filterbank')
        self.b = theano.shared(value=numpy.zeros(
            (nkerns,), dtype=theano.config.floatX),
            name='bias', borrow=borrow)
        if batch_norm is True:
            self.gamma = theano.shared(value=numpy.ones(
                (nkerns,), dtype=theano.config.floatX),
                name='gamma', borrow=borrow)
            self.beta = theano.shared(value=numpy.zeros(
                (nkerns,), dtype=theano.config.floatX),
                name='beta', borrow=borrow)
            self.running_mean = theano.shared(value=numpy.zeros(
                (nkerns,), dtype=theano.config.floatX),
                name='population_mean', borrow=borrow)
            self.running_var = theano.shared(value=numpy.ones(
                (nkerns,), dtype=theano.config.floatX),
                name='population_var', borrow=borrow)
    else:
        self.w = init_w
        self.b = init_b
        if batch_norm is True:
            self.gamma = init_gamma
            self.beta = init_beta
            self.running_mean = init_mean
            self.running_var = init_var

    # Perform the convolution part
    convolver = convolver_2d(input=input,
                             filters=self.w,
                             subsample=stride,
                             filter_shape=w_shp,
                             image_shape=input_shape,
                             border_mode=border_mode,
                             verbose=verbose)
    conv_out = convolver.out
    conv_out_shp = (mini_batch_size, nkerns, convolver.out_shp[0], convolver.out_shp[1])
    self.conv_out = conv_out

    if not poolsize == (1, 1):
        pooler = pooler_2d(input=conv_out,
                           img_shp=conv_out_shp,
                           mode=pooltype,
                           ds=poolsize,
                           verbose=verbose)
        pool_out = pooler.out
        pool_out_shp = pooler.out_shp
    else:
        pool_out = conv_out
        pool_out_shp = conv_out_shp

    # Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    # deep network training by reducing internal covariate shift."
    # arXiv preprint arXiv:1502.03167 (2015).
    if batch_norm is True:
        batch_norm_out, _, _, mean, var = batch_normalization_train(
            inputs=pool_out + self.b.dimshuffle('x', 0, 'x', 'x'),
            gamma=self.gamma,
            beta=self.beta,
            axes='spatial',
            running_mean=self.running_mean,
            running_var=self.running_var)

        mean = theano.tensor.unbroadcast(mean, 0)
        var = theano.tensor.unbroadcast(var, 0)
        self.updates[self.running_mean] = mean
        self.updates[self.running_var] = var + 0.001

        batch_norm_inference = batch_normalization_test(
            inputs=pool_out + self.b.dimshuffle('x', 0, 'x', 'x'),
            gamma=self.gamma,
            beta=self.beta,
            axes='spatial',
            mean=self.running_mean,
            var=self.running_var)
    else:
        batch_norm_out = pool_out + self.b.dimshuffle('x', 0, 'x', 'x')
        batch_norm_inference = batch_norm_out

    batch_norm_out_shp = pool_out_shp

    self.output, self.output_shape = _activate(x=batch_norm_out,
                                               activation=activation,
                                               input_size=batch_norm_out_shp,
                                               verbose=verbose,
                                               dimension=2)
    self.inference, _ = _activate(x=batch_norm_inference,
                                  activation=activation,
                                  input_size=batch_norm_out_shp,
                                  verbose=verbose,
                                  dimension=2)

    # store parameters of this layer and do some book keeping.
    self.params = [self.w, self.b]
    self.active_params = [self.w, self.b]
    if batch_norm is True:
        self.params.append(self.gamma)
        self.params.append(self.beta)
        self.active_params.append(self.gamma)
        self.active_params.append(self.beta)
        self.params.append(self.running_mean)  # inactive params
        self.params.append(self.running_var)   # inactive params

    self.L1 = abs(self.w).sum()
    # if batch_norm is True: self.L1 = self.L1 + abs(self.gamma).sum()
    self.L2 = (self.w ** 2).sum()
    # if batch_norm is True: self.L2 = self.L2 + (self.gamma**2).sum()

    # Just doing this for print_layer method to use.
    self.nkerns = nkerns
    self.filter_shape = filter_shape
    self.poolsize = poolsize
    self.stride = stride
    self.input_shape = input_shape
    self.num_neurons = nkerns
    self.activation = activation
    self.batch_norm = batch_norm
def test_batch_normalization_train_broadcast():
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n) for n in
                                                      ('scale', 'bias',
                                                       'running_mean', 'running_var'))

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]

            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)

            inputs = [numpy.asarray(numpy.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def __init__(self,
             input,
             input_shape,
             id,
             rng=None,
             borrow=True,
             input_params=None,
             verbose=2):

    super(batch_norm_layer_2d, self).__init__(id=id, type='batch_norm', verbose=verbose)
    if verbose >= 3:
        print "... Creating batch norm layer"

    if rng is None:
        rng = numpy.random

    # To copy weights previously created or some weird initializations
    if input_params is not None:
        init_gamma = input_params[0]
        init_beta = input_params[1]
        init_mean = input_params[2]
        init_var = input_params[3]

    channels = input_shape[1]

    if input_params is None:
        self.gamma = theano.shared(value=numpy.ones(
            (channels,), dtype=theano.config.floatX),
            name='gamma', borrow=borrow)
        self.beta = theano.shared(value=numpy.zeros(
            (channels,), dtype=theano.config.floatX),
            name='beta', borrow=borrow)
        self.running_mean = theano.shared(value=numpy.zeros(
            (channels,), dtype=theano.config.floatX),
            name='population_mean', borrow=borrow)
        self.running_var = theano.shared(value=numpy.ones(
            (channels,), dtype=theano.config.floatX),
            name='population_var', borrow=borrow)
    else:
        self.gamma = init_gamma
        self.beta = init_beta
        self.running_mean = init_mean
        self.running_var = init_var

    # Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    # deep network training by reducing internal covariate shift."
    # arXiv preprint arXiv:1502.03167 (2015).
    self.output, _, _, mean, var = batch_normalization_train(
        inputs=input,
        gamma=self.gamma,
        beta=self.beta,
        axes='spatial',
        running_mean=self.running_mean,
        running_var=self.running_var)

    mean = theano.tensor.unbroadcast(mean, 0)
    var = theano.tensor.unbroadcast(var, 0)
    self.updates[self.running_mean] = mean
    self.updates[self.running_var] = var + 0.001

    self.inference = batch_normalization_test(inputs=input,
                                               gamma=self.gamma,
                                               beta=self.beta,
                                               axes='spatial',
                                               mean=self.running_mean,
                                               var=self.running_var)

    # store parameters of this layer and do some book keeping.
    self.params = [self.gamma, self.beta, self.running_mean, self.running_var]
    self.active_params = [self.gamma, self.beta]
    self.input_shape = input_shape
    self.output_shape = input_shape
def __init__(self,
             input,
             num_neurons,
             input_shape,
             id,
             rng=None,
             input_params=None,
             borrow=True,
             activation='relu',
             batch_norm=True,
             verbose=2):

    super(dot_product_layer, self).__init__(id=id, type='dot_product', verbose=verbose)
    if verbose >= 3:
        print "... Creating dot product layer"

    if rng is None:
        rng = numpy.random

    create = False
    if input_params is None:
        create = True
    elif input_params[0] is None:
        create = True
    if create is True:
        w_values = numpy.asarray(
            0.01 * rng.standard_normal(size=(input_shape[1], num_neurons)),
            dtype=theano.config.floatX)
        if activation == 'sigmoid':
            w_values *= 4
        self.w = theano.shared(value=w_values, name='weights')
    else:
        self.w = input_params[0]

    create = False
    if input_params is None:
        create = True
    elif input_params[1] is None:
        create = True
    if create is True:
        b_values = numpy.zeros((num_neurons,), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name='bias')
    else:
        self.b = input_params[1]

    if batch_norm is True:
        create = False
        if input_params is None:
            create = True
        elif input_params[2] is None:
            create = True
        if create is True:
            gamma_values = numpy.ones((1, num_neurons), dtype=theano.config.floatX)
            self.gamma = theano.shared(value=gamma_values, name='gamma')
            beta_values = numpy.zeros((1, num_neurons), dtype=theano.config.floatX)
            self.beta = theano.shared(value=beta_values, name='beta')
            self.running_mean = theano.shared(value=numpy.zeros(
                (1, num_neurons), dtype=theano.config.floatX),
                name='population_mean', borrow=borrow)
            self.running_var = theano.shared(value=numpy.ones(
                (1, num_neurons), dtype=theano.config.floatX),
                name='population_var', borrow=borrow)
        else:
            self.gamma = input_params[2]
            self.beta = input_params[3]
            self.running_mean = input_params[4]
            self.running_var = input_params[5]

    linear_fit = T.dot(input, self.w) + self.b

    if batch_norm is True:
        batch_norm_out, _, _, mean, var = batch_normalization_train(
            inputs=linear_fit,
            gamma=self.gamma,
            beta=self.beta,
            running_mean=self.running_mean,
            running_var=self.running_var)

        mean = theano.tensor.unbroadcast(mean, 0)
        var = theano.tensor.unbroadcast(var, 0)
        self.updates[self.running_mean] = mean
        self.updates[self.running_var] = var + 0.001

        batch_norm_inference = batch_normalization_test(
            inputs=linear_fit,
            gamma=self.gamma,
            beta=self.beta,
            mean=self.running_mean,
            var=self.running_var)
    else:
        batch_norm_out = linear_fit
        batch_norm_inference = batch_norm_out

    batch_norm_shp = (input_shape[0], num_neurons)
    self.output, self.output_shape = _activate(x=batch_norm_out,
                                               activation=activation,
                                               input_size=batch_norm_shp,
                                               verbose=verbose,
                                               dimension=1)
    self.inference, _ = _activate(x=batch_norm_out,
                                  activation=activation,
                                  input_size=batch_norm_shp,
                                  verbose=verbose,
                                  dimension=1)

    # parameters of the model
    if batch_norm is True:
        self.params = [self.w, self.b, self.gamma, self.beta,
                       self.running_mean, self.running_var]
        self.active_params = [self.w, self.b, self.gamma, self.beta]
    else:
        self.params = [self.w, self.b]
        self.active_params = [self.w, self.b]

    self.L1 = abs(self.w).sum()
    # if batch_norm is True: self.L1 = self.L1 + abs(self.gamma).sum()
    self.L2 = (self.w ** 2).sum()
    # if batch_norm is True: self.L2 = self.L2 + (self.gamma**2).sum()

    # Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating
    # deep network training by reducing internal covariate shift."
    # arXiv preprint arXiv:1502.03167 (2015).

    if verbose >= 3:
        print "... Dot Product layer is created with output shape " + str(self.output_shape)

    self.num_neurons = num_neurons
    self.activation = activation
    self.batch_norm = batch_norm