def test_batch_normalization(): def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B np.random.seed(1234) X = 1 + np.random.random([10, 20]).astype("float32") B = 1 + np.random.random([20]).astype("float32") G = 1 + np.random.random([20]).astype("float32") M = 1 + np.random.random([20]).astype("float32") V = 1 + np.random.random([20]).astype("float32") x = tt.matrix("x") b = tt.vector("b") g = tt.vector("g") m = tt.vector("m") v = tt.vector("v") bn_ref_op = bn_ref(x, g, b, m, v) f_ref = theano.function([x, g, b, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ["low_mem", "high_mem"]: bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode) f = theano.function([x, g, b, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) def bn_f(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode) utt.verify_grad(bn_f, [X, G, B, M, V]) bn_ref_op = bn_ref( x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True) ) f_ref = theano.function([x, b, g], [bn_ref_op]) res_ref = f_ref(X, G, B) for mode in ["low_mem", "high_mem"]: bn_op = bn.batch_normalization( x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode, ) f = theano.function([x, b, g], [bn_op]) res = f(X, G, B) utt.assert_allclose(res_ref, res) def bn_f(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode) utt.verify_grad( bn_f, [X, G, B, X.mean(axis=0)[np.newaxis], X.std(axis=0)[np.newaxis]] )
def conv_bn(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma.dimshuffle('x', 0, 'x', 'x'), beta.dimshuffle('x', 0, 'x', 'x'), mean.dimshuffle('x', 0, 'x', 'x'), std.dimshuffle('x', 0, 'x', 'x'), mode=mode)
def apply(self, input_, application_call, i=None): if self._training_mode: mean, stdev = self._compute_training_statistics(input_) else: mean, stdev = self._prepare_population_statistics(i) # Useful for filtration of calls that were already made in # training mode when doing graph transformations. # Very important to cast to bool, as self._training_mode is # normally a list (to support nested context managers), which would # otherwise get passed by reference and be remotely mutated. application_call.metadata['training_mode'] = bool(self._training_mode) # Useful for retrieving a list of updates for population # statistics. Ditch the broadcastable first axis, though, to # make it the same dimensions as the population mean and stdev # shared variables. application_call.metadata['offset'] = mean[0] application_call.metadata['divisor'] = stdev[0] # Give these quantities roles in the graph. _add_role_and_annotate(mean, BATCH_NORM_OFFSET, [self, application_call]) _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR, [self, application_call]) scale = _add_batch_axis(self.scale) shift = _add_batch_axis(self.shift) # Heavy lifting is done by the Theano utility function. normalized = bn.batch_normalization( input_, scale, shift, mean, stdev, mode=('low_mem' if self.conserve_memory else 'high_mem')) return normalized
def bn_f(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
def apply(self, input_, application_call): if self._training_mode: mean, stdev = self._compute_training_statistics(input_) else: mean, stdev = self._prepare_population_statistics() # Useful for filtration of calls that were already made in # training mode when doing graph transformations. # Very important to cast to bool, as self._training_mode is # normally a list (to support nested context managers), which would # otherwise get passed by reference and be remotely mutated. application_call.metadata['training_mode'] = bool(self._training_mode) # Useful for retrieving a list of updates for population # statistics. Ditch the broadcastable first axis, though, to # make it the same dimensions as the population mean and stdev # shared variables. application_call.metadata['offset'] = mean[0] application_call.metadata['divisor'] = stdev[0] # Give these quantities roles in the graph. _add_role_and_annotate(mean, BATCH_NORM_OFFSET, [self, application_call]) _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR, [self, application_call]) scale = _add_batch_axis(self.scale) shift = _add_batch_axis(self.shift) # Heavy lifting is done by the Theano utility function. normalized = bn.batch_normalization(input_, scale, shift, mean, stdev, mode=('low_mem' if self.conserve_memory else 'high_mem')) return normalized
def test_bn(): def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B numpy.random.seed(1234) X = 1 + numpy.random.random([10, 20]).astype("float32") B = 1 + numpy.random.random([20]).astype("float32") G = 1 + numpy.random.random([20]).astype("float32") M = 1 + numpy.random.random([20]).astype("float32") V = 1 + numpy.random.random([20]).astype("float32") x = theano.tensor.matrix("x") b = theano.tensor.vector("b") g = theano.tensor.vector("g") m = theano.tensor.vector("m") v = theano.tensor.vector("v") bn_ref_op = bn_ref(x, g, b, m, v) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ["low_mem", "high_mem"]: bn_op = batch_normalization(x, g, b, m, v, mode=mode) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) def bn(inputs, gamma, beta, mean, std): return batch_normalization(inputs, gamma, beta, mean, std, mode=mode) utt.verify_grad(bn, [X, G, B, M, V]) bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True)) f_ref = theano.function([x, b, g], [bn_ref_op]) res_ref = f_ref(X, G, B) for mode in ["low_mem", "high_mem"]: bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode) f = theano.function([x, b, g], [bn_op]) res = f(X, G, B) utt.assert_allclose(res_ref, res) def bn(inputs, gamma, beta, mean, std): return batch_normalization(inputs, gamma, beta, mean, std, mode=mode) utt.verify_grad(batch_normalization, [X, G, B, X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def test_batch_normalization(): def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B numpy.random.seed(1234) X = 1 + numpy.random.random([10, 20]).astype('float32') B = 1 + numpy.random.random([20]).astype('float32') G = 1 + numpy.random.random([20]).astype('float32') M = 1 + numpy.random.random([20]).astype('float32') V = 1 + numpy.random.random([20]).astype('float32') x = theano.tensor.matrix('x') b = theano.tensor.vector('b') g = theano.tensor.vector('g') m = theano.tensor.vector('m') v = theano.tensor.vector('v') bn_ref_op = bn_ref(x, g, b, m, v) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ['low_mem', 'high_mem']: bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) def bn_f(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode) utt.verify_grad(bn_f, [X, G, B, M, V]) bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True)) f_ref = theano.function([x, b, g], [bn_ref_op]) res_ref = f_ref(X, G, B) for mode in ['low_mem', 'high_mem']: bn_op = bn.batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode) f = theano.function([x, b, g], [bn_op]) res = f(X, G, B) utt.assert_allclose(res_ref, res) def bn_f(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode) utt.verify_grad(bn_f, [X, G, B, X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def conv_bn(inputs, gamma, beta, mean, std): return batch_normalization( inputs, gamma.dimshuffle("x", 0, "x", "x"), beta.dimshuffle("x", 0, "x", "x"), mean.dimshuffle("x", 0, "x", "x"), std.dimshuffle("x", 0, "x", "x"), mode=mode, )
def _inference(self, input_): output = bn.batch_normalization(input_, self.gamma.dimshuffle(*self.pattern), self.beta.dimshuffle(*self.pattern), self.pop_means.dimshuffle(*self.pattern), tensor.sqrt(self.pop_vars.dimshuffle(*self.pattern) + self.epsilon), mode='low_mem') return output
def _inference(self, input_): output = bn.batch_normalization( input_, self.gamma.dimshuffle(*self.pattern), self.beta.dimshuffle(*self.pattern), self.pop_means.dimshuffle(*self.pattern), tensor.sqrt( self.pop_vars.dimshuffle(*self.pattern) + self.epsilon), mode='low_mem') return output
def get_reconstructed_input(self, hidden): lin_output = T.dot(hidden, self.W_prime) + self.b_prime bn_output = batch_normalization( inputs=lin_output, gamma=self.gamma_o, beta=self.beta_o, mean=lin_output.mean((0,), keepdims=True), std=lin_output.std((0,), keepdims=True), mode="low_mem", ) return self.actv_fcn(bn_output)
def get_hidden_values(self, input): lin_output = T.dot(input, self.W) + self.b bn_output = batch_normalization( inputs=lin_output, gamma=self.gamma_h, beta=self.beta_h, mean=lin_output.mean((0,), keepdims=True), std=lin_output.std((0,), keepdims=True), mode="low_mem", ) return self.actv_fcn(bn_output)
def _training(self, input_): self.batch_means = input_.mean(axis=self.axes, keepdims=False, dtype=floatX) self.batch_vars = input_.var(axis=self.axes, keepdims=False) output = bn.batch_normalization(input_, self.gamma.dimshuffle(*self.pattern), self.beta.dimshuffle(*self.pattern), self.batch_means.dimshuffle(*self.pattern), tensor.sqrt(self.batch_vars.dimshuffle(*self.pattern) + self.epsilon), mode='low_mem') return output
def test_bn_feature_maps(): def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B np.random.seed(1234) X = 1 + np.random.random([2, 3, 4, 4]).astype("float32") B = 1 + np.random.random([3]).astype("float32") G = 1 + np.random.random([3]).astype("float32") M = 1 + np.random.random([3]).astype("float32") V = 1 + np.random.random([3]).astype("float32") x = theano.tensor.tensor4("x") b = theano.tensor.vector("b") g = theano.tensor.vector("g") m = theano.tensor.vector("m") v = theano.tensor.vector("v") bn_ref_op = bn_ref( x, g.dimshuffle("x", 0, "x", "x"), b.dimshuffle("x", 0, "x", "x"), m.dimshuffle("x", 0, "x", "x"), v.dimshuffle("x", 0, "x", "x"), ) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ["low_mem", "high_mem"]: bn_op = bn.batch_normalization( x, g.dimshuffle("x", 0, "x", "x"), b.dimshuffle("x", 0, "x", "x"), m.dimshuffle("x", 0, "x", "x"), v.dimshuffle("x", 0, "x", "x"), mode=mode, ) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) def conv_bn(inputs, gamma, beta, mean, std): return bn.batch_normalization( inputs, gamma.dimshuffle("x", 0, "x", "x"), beta.dimshuffle("x", 0, "x", "x"), mean.dimshuffle("x", 0, "x", "x"), std.dimshuffle("x", 0, "x", "x"), mode=mode, ) utt.verify_grad(conv_bn, [X, G, B, M, V])
def test_bn_feature_maps(): def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B numpy.random.seed(1234) X = 1 + numpy.random.random([10, 20, 4, 4]).astype("float32") B = 1 + numpy.random.random([20]).astype("float32") G = 1 + numpy.random.random([20]).astype("float32") M = 1 + numpy.random.random([20]).astype("float32") V = 1 + numpy.random.random([20]).astype("float32") x = theano.tensor.tensor4("x") b = theano.tensor.vector("b") g = theano.tensor.vector("g") m = theano.tensor.vector("m") v = theano.tensor.vector("v") bn_ref_op = bn_ref( x, g.dimshuffle("x", 0, "x", "x"), b.dimshuffle("x", 0, "x", "x"), m.dimshuffle("x", 0, "x", "x"), v.dimshuffle("x", 0, "x", "x"), ) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ["low_mem", "high_mem"]: bn_op = batch_normalization( x, g.dimshuffle("x", 0, "x", "x"), b.dimshuffle("x", 0, "x", "x"), m.dimshuffle("x", 0, "x", "x"), v.dimshuffle("x", 0, "x", "x"), mode=mode, ) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) def conv_bn(inputs, gamma, beta, mean, std): return batch_normalization( inputs, gamma.dimshuffle("x", 0, "x", "x"), beta.dimshuffle("x", 0, "x", "x"), mean.dimshuffle("x", 0, "x", "x"), std.dimshuffle("x", 0, "x", "x"), mode=mode, ) utt.verify_grad(conv_bn, [X, G, B, M, V])
def __init__(self, input1, n_in, n_out, W_values=None, b_values=None, activation=T.tanh, batch_norm=True): self.input1 = input1 self.W = theano.shared(value=W_values, name='W', borrow=True) self.b = theano.shared(value=b_values, name='b', borrow=True) lin_output = T.dot(input1, self.W) + self.b # self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') # self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') # bn_output = batch_normalization(inputs = lin_output, # gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True), # std = lin_output.std((0,), keepdims = True), # mode='high_mem') # self.output1 = ( # bn_output if activation is None # else activation(bn_output) # ) if batch_norm: self.gamma = theano.shared(value=numpy.ones( (n_out, ), dtype=theano.config.floatX), name='gamma', borrow=True) self.beta = theano.shared(value=numpy.zeros( (n_out, ), dtype=theano.config.floatX), name='beta', borrow=True) # bn_output = batch_normalization(inputs = self.linear, # gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True), # std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem') # xmean = lin_output.mean(0, keepdims=True) # xstd = T.sqrt(lin_output.std(0, keepdims=True)**2+1e-6) bn_output = batch_normalization( inputs=lin_output, gamma=self.gamma, beta=self.beta, mean=lin_output.mean(0, keepdims=True), std=T.sqrt(lin_output.std(0, keepdims=True)**2 + 1e-6), mode='high_mem') self.output1 = T.clip(bn_output, 0, 40) self.params = [self.W, self.b, self.gamma, self.beta] else: self.output1 = (lin_output if activation is None else activation(lin_output)) self.params = [self.W, self.b]
def bn_layer(x, a, b, normParam, params, phase): ''' Apply BN. # phase = 0 : BN eval with m1v1, BN ups weighter average # phase = 1 : BN eval with m2v2, no BN ups ''' minAlpha = params.movingAvMin iterStep = params.movingAvStep # compute mean & variance if params.model == 'convnet': mean1 = T.mean(x, axis=(0, 2, 3)) var1 = T.var(x, axis=(0, 2, 3)) else: mean1 = T.mean(x, axis=0) var1 = T.var(x, axis=0) # moving average as a proxi for validation model alpha = (1. - phase) * T.maximum(minAlpha, 1. / normParam['iter']) mean2 = (1. - alpha) * normParam['mean'] + alpha * mean1 var2 = (1. - alpha) * normParam['var'] + alpha * var1 mean = (1. - phase) * mean2 + phase * mean1 var = (1. - phase) * var1 + phase * var1 std = T.sqrt(var + eps) # apply transformation: if params.model == 'convnet': x = bn.batch_normalization(x, a.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'), mean.dimshuffle('x', 0, 'x', 'x'), std.dimshuffle('x', 0, 'x', 'x'), mode='high_mem') else: x = bn.batch_normalization(x, a, b, mean, std) updateBN = [mean2, var2, mean1, var1, normParam['iter'] + iterStep] return x, updateBN
def _training(self, input_): self.batch_means = input_.mean(axis=self.axes, keepdims=False, dtype=floatX) self.batch_vars = input_.var(axis=self.axes, keepdims=False) output = bn.batch_normalization( input_, self.gamma.dimshuffle(*self.pattern), self.beta.dimshuffle(*self.pattern), self.batch_means.dimshuffle(*self.pattern), tensor.sqrt( self.batch_vars.dimshuffle(*self.pattern) + self.epsilon), mode='low_mem') return output
def get_cnn2_log_prob(self, X, Z, w1, w2, w3, b3, w4, b4, gamma1, beta1, gamma2, beta2, gamma3, beta3, pool_horiz, n_conv, dropout, deterministic): l1 = T.nnet.relu( T.nnet.conv2d(X, w1, border_mode='valid', subsample=(1, 1))) bn1 = batch_normalization(inputs = l1, gamma = gamma1, beta = beta1, mean = l1.mean((0,), keepdims=True), \ std = T.ones_like(l1.var((0,), keepdims = True)), mode='high_mem') l2 = max_pool_2d(bn1, ds=(1, pool_horiz), st=(1, 1), ignore_border=True) l3 = T.nnet.relu( T.nnet.conv2d(l2, w2, border_mode='valid', subsample=(1, 1))) bn2 = batch_normalization(inputs = l3, gamma = gamma2, beta = beta2, mean = l3.mean((0,), keepdims=True), \ std = T.ones_like(l3.var((0,), keepdims = True)), mode='high_mem') l4 = max_pool_2d(bn2, ds=(1, pool_horiz), st=(1, 1), ignore_border=True) l5 = l4.reshape((X.shape[0], n_conv)) l5 = self.add_dropout(l5, dropout, deterministic) l6 = T.nnet.relu(T.dot(l5, w3) + b3) l6 = self.add_dropout(l6, dropout, deterministic) l7 = T.concatenate([l6, Z], axis=1) l8 = T.dot(l7, w4) + b4 bn3 = batch_normalization(inputs = l8, gamma = gamma3, beta = beta3, mean = l8.mean((0,), keepdims=True), \ std = T.ones_like(l8.var((0,), keepdims = True)), mode='high_mem') #self.helper_fn = theano.function(inputs=[X, Z], outputs=[bn3], allow_input_downcast=True) log_prob = T.nnet.logsoftmax(bn3) return log_prob
def get_output(self, input, **kwargs): input_mean = input.mean(self.axes) # input_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon)) input_std = T.sqrt(input.var(self.axes) + self.epsilon) # Decide whether to use the stored averages or mini-batch statistics use_averages = self.deterministic if use_averages: mean = self.mean std = self.std else: mean = input_mean std = input_std # Decide whether to update the stored averages update_averages = self.update_averages and not use_averages if update_averages: # Trick: To update the stored statistics, we create memory-aliased # clones of the stored statistics: running_mean = theano.clone(self.mean, share_inputs=False) running_std = theano.clone(self.std, share_inputs=False) # set a default update for them: running_mean.default_update = ((1 - self.alpha) * running_mean + self.alpha * input_mean) running_std.default_update = ((1 - self.alpha) * running_std + self.alpha * input_std) # and make sure they end up in the graph without participating in # the computation (this way their default_update will be collected # and applied, but the computation will be optimized away): mean += 0 * running_mean std += 0 * running_std # prepare dimshuffle pattern inserting broadcastable axes as needed param_axes = iter(list(range(input.ndim - len(self.axes)))) pattern = ['x' if input_axis in self.axes else next(param_axes) for input_axis in range(input.ndim)] # apply dimshuffle pattern to all parameters beta = 0 if self.beta is None else self.beta.dimshuffle(pattern) gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern) mean = mean.dimshuffle(pattern) std = std.dimshuffle(pattern) # normalize # normalized = (input - mean) * (gamma * std) + beta normalized = batch_normalization( input, gamma, beta, mean, std, mode='low_mem') return self.activation(normalized)
def bn_layer(x, a, b, normParam, params, phase): ''' Apply BN. # phase = 0 : BN eval with m1v1, BN ups weighter average # phase = 1 : BN eval with m2v2, no BN ups ''' minAlpha = params.movingAvMin iterStep = params.movingAvStep # compute mean & variance if params.model == 'convnet': mean1 = T.mean(x, axis = (0, 2, 3)) var1 = T.var(x, axis = (0, 2, 3)) else: mean1 = T.mean(x, axis = 0) var1 = T.var(x, axis = 0) # moving average as a proxi for validation model alpha = (1.-phase)*T.maximum(minAlpha, 1./normParam['iter']) mean2 = (1.-alpha)*normParam['mean'] + alpha*mean1 var2 = (1.-alpha)*normParam['var'] + alpha*var1 mean = (1.-phase)*mean2 + phase*mean1 var = (1.-phase)*var1 + phase*var1 std = T.sqrt(var+eps) # apply transformation: if params.model == 'convnet': x = bn.batch_normalization(x, a.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'), mean.dimshuffle('x', 0, 'x', 'x'), std.dimshuffle('x', 0, 'x', 'x'), mode='high_mem') else: x = bn.batch_normalization(x, a, b, mean, std) updateBN = [mean2, var2, mean1, var1, normParam['iter']+iterStep] return x, updateBN
def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh, bn=False): self.input = input if W is None: W_values = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) if b is None: b_values = numpy.zeros((n_out, ), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) self.W, self.b = W, b lin_output = T.dot(input, self.W) + self.b if bn: self.gamma = theano.shared(value=numpy.ones( (n_out, ), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value=numpy.zeros( (n_out, ), dtype=theano.config.floatX), name='beta') mean = lin_output.mean(0, keepdims=True) std = T.sqrt(lin_output.std(0, keepdims=True)**2 + 0.01) output = batch_normalization(inputs=lin_output, gamma=self.gamma, beta=self.beta, mean=mean, std=std) else: output = lin_output self.output = (output if activation is None else activation(output)) # parameters of the model self.params = [self.W, self.b, self.gamma, self.beta ] if bn else [self.W, self.b]
def test_bn_feature_maps(): def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B numpy.random.seed(1234) X = 1 + numpy.random.random([2, 3, 4, 4]).astype('float32') B = 1 + numpy.random.random([3]).astype('float32') G = 1 + numpy.random.random([3]).astype('float32') M = 1 + numpy.random.random([3]).astype('float32') V = 1 + numpy.random.random([3]).astype('float32') x = theano.tensor.tensor4('x') b = theano.tensor.vector('b') g = theano.tensor.vector('g') m = theano.tensor.vector('m') v = theano.tensor.vector('v') bn_ref_op = bn_ref(x, g.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'), m.dimshuffle('x', 0, 'x', 'x'), v.dimshuffle('x', 0, 'x', 'x')) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ['low_mem', 'high_mem']: bn_op = bn.batch_normalization(x, g.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'), m.dimshuffle('x', 0, 'x', 'x'), v.dimshuffle('x', 0, 'x', 'x'), mode=mode) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) def conv_bn(inputs, gamma, beta, mean, std): return bn.batch_normalization(inputs, gamma.dimshuffle('x', 0, 'x', 'x'), beta.dimshuffle('x', 0, 'x', 'x'), mean.dimshuffle('x', 0, 'x', 'x'), std.dimshuffle('x', 0, 'x', 'x'), mode=mode) utt.verify_grad(conv_bn, [X, G, B, M, V])
def __init__(self, x, n_in, n_out, dropout_on, layer=0, act=T.nnet.sigmoid, w = None, b = None, dropout_rate=0.3): if w==None: w = theano.shared( value=w_init(n_in, n_out), name='w'+str(layer), borrow=True ) if b==None: b = theano.shared( value=b_init(n_out), name='b'+str(layer), borrow=True ) self.w = w self.b = b self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') rng = np.random.RandomState(42) srng = RandomStreams(rng.randint(10**9)) mask = srng.binomial(n=1, p=1-dropout_rate, size=x.shape) cast_mark = T.cast(mask, theano.config.floatX) drop_input = T.switch(dropout_on, x*cast_mark,x*(1-dropout_rate)) lin_output = T.dot(drop_input, self.w) + self.b bn_output = batch_normalization(inputs = lin_output, gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True), std = lin_output.std((0,), keepdims = True), mode='low_mem') self.output = ( bn_output if act is None else act(bn_output) ) self.params = [self.w, self.b]
def __init__(self, rng, input, filter_shape, image_shape, use_bn = 1): assert image_shape[1] == filter_shape[1] self.input = input # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit fan_in = numpy.prod(filter_shape[1:]) # each unit in the lower layer receives a gradient from: # "num output feature maps * filter height * filter width" / # pooling size fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) W_bound = numpy.sqrt(2. /(fan_in + fan_out)) W_value = rng.normal(loc = 0., scale = W_bound, size = filter_shape) self.W = theano.shared(W_value, name = 'W', borrow = True) conv_out = conv2d(input = self.input, filters = self.W) # pooled_out = pool.pool_2d(input = conv_out, # ds=poolsize, ignore_border=True) b_bound = numpy.sqrt(2. /fan_out) b_value = rng.normal(loc = 0, scale = b_bound, size=(filter_shape[0],)) self.b = theano.shared(b_value, name = 'b', borrow = True) self.linear = conv_out + self.b.dimshuffle('x', 0, 'x','x') if use_bn == 1: self.gamma = theano.shared(value = numpy.ones((filter_shape[0],), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX), name='beta') self.linear_shuffle = self.linear.dimshuffle(0, 2, 3, 1) self.linear_res = self.linear_shuffle.reshape( (self.linear.shape[0]*self.linear.shape[2]*self.linear.shape[3], self.linear.shape[1])) bn_output = batch_normalization(inputs = self.linear_shuffle, gamma = self.gamma, beta = self.beta, mean = self.linear_res.mean((0,), keepdims=True), std = T.std(self.linear_res, axis=0), mode='high_mem') self.output = T.nnet.relu( bn_output.dimshuffle(0, 3, 1, 2) ) self.params = [self.W, self.b, self.gamma, self.beta] else: self.output = T.nnet.relu(self.linear) self.params = [self.W, self.b]
def test_BNComposite(): try: orig = theano.config.compute_test_value theano.config.compute_test_value = "raise" def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B np.random.seed(1234) X = 1 + np.random.random([10, 20]).astype("float32") B = 1 + np.random.random([20]).astype("float32") G = 1 + np.random.random([20]).astype("float32") M = 1 + np.random.random([20]).astype("float32") V = 1 + np.random.random([20]).astype("float32") x = theano.tensor.matrix("x") b = theano.tensor.vector("b") g = theano.tensor.vector("g") m = theano.tensor.vector("m") v = theano.tensor.vector("v") x.tag.test_value = np.random.rand(2, 2).astype(theano.config.floatX) b.tag.test_value = np.random.rand(2).astype(theano.config.floatX) g.tag.test_value = np.random.rand(2).astype(theano.config.floatX) m.tag.test_value = np.random.rand(2).astype(theano.config.floatX) v.tag.test_value = np.random.rand(2).astype(theano.config.floatX) bn_ref_op = bn_ref(x, g, b, m, v) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ["low_mem", "high_mem"]: bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) finally: theano.config.compute_test_value = orig
def test_BNComposite(): try: orig = theano.config.compute_test_value theano.config.compute_test_value = 'raise' def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B np.random.seed(1234) X = 1 + np.random.random([10, 20]).astype('float32') B = 1 + np.random.random([20]).astype('float32') G = 1 + np.random.random([20]).astype('float32') M = 1 + np.random.random([20]).astype('float32') V = 1 + np.random.random([20]).astype('float32') x = theano.tensor.matrix('x') b = theano.tensor.vector('b') g = theano.tensor.vector('g') m = theano.tensor.vector('m') v = theano.tensor.vector('v') x.tag.test_value = np.random.rand(2, 2).astype(theano.config.floatX) b.tag.test_value = np.random.rand(2).astype(theano.config.floatX) g.tag.test_value = np.random.rand(2).astype(theano.config.floatX) m.tag.test_value = np.random.rand(2).astype(theano.config.floatX) v.tag.test_value = np.random.rand(2).astype(theano.config.floatX) bn_ref_op = bn_ref(x, g, b, m, v) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ['low_mem', 'high_mem']: bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) finally: theano.config.compute_test_value = orig
def test_BNComposite(): try: orig = theano.config.compute_test_value theano.config.compute_test_value = 'raise' def bn_ref(x, G, B, M, V): n = (x - M) / V return n * G + B numpy.random.seed(1234) X = 1 + numpy.random.random([10, 20]).astype('float32') B = 1 + numpy.random.random([20]).astype('float32') G = 1 + numpy.random.random([20]).astype('float32') M = 1 + numpy.random.random([20]).astype('float32') V = 1 + numpy.random.random([20]).astype('float32') x = theano.tensor.matrix('x') b = theano.tensor.vector('b') g = theano.tensor.vector('g') m = theano.tensor.vector('m') v = theano.tensor.vector('v') x.tag.test_value = numpy.random.rand(2, 2).astype(theano.config.floatX) b.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX) g.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX) m.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX) v.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX) bn_ref_op = bn_ref(x, g, b, m, v) f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) res_ref = f_ref(X, G, B, M, V) for mode in ['low_mem', 'high_mem']: bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode) f = theano.function([x, b, g, m, v], [bn_op]) res = f(X, G, B, M, V) utt.assert_allclose(res_ref, res) finally: theano.config.compute_test_value = orig
def __init__(self, rng, is_train, input, n_in, n_out, dropout_rate=0.5, W=None, b=None, activation=ReLu): self.input = input p = dropout_rate W = numpy.asarray(numpy.random.normal(loc=0.0, scale=0.05, size=(n_in, n_out)), dtype=theano.config.floatX) self.W = theano.shared(W, borrow=True) b = numpy.zeros((n_out, ), dtype=theano.config.floatX) self.b = theano.shared(value=b, borrow=True) linearOutput = T.dot(self.input, self.W) + self.b train_output = drop(input=np.cast[theano.config.floatX](1. / p) * linearOutput, p=dropout_rate, rng=rng) tempOutPut = T.switch(T.neq(is_train, 0), train_output, linearOutput) bnOutput = bn.batch_normalization(inputs=tempOutPut, gamma=1., beta=0, mean=T.mean(tempOutPut), std=T.std(tempOutPut)) self.output = activation(bnOutput) self.params = [self.W, self.b]
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=70, filter_size=[3, 1], maxSentLen=70, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered" "words keep random initialization" rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' 'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in' 'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) ' embed_input_l = init_embeddings[sents_ids_l.flatten( )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) '''create parameters for attentive convolution function ''' gate_filter_shape = (emb_size, 1, emb_size, 1) conv_W_pre, conv_b_pre = create_conv_para(rng, filter_shape=gate_filter_shape) conv_W_gate, conv_b_gate = create_conv_para(rng, filter_shape=gate_filter_shape) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W2_context, conv_b2_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate, conv_b_gate, conv_W2, conv_b2, conv_W2_context ] "A gated convolution layer to form more expressive word representations in each sentence" "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)" conv_layer_gate_l = Conv_with_Mask_with_Gate( rng, input_tensor3=embed_input_l, mask_matrix=sents_mask_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate) conv_layer_gate_r = Conv_with_Mask_with_Gate( rng, input_tensor3=embed_input_r, mask_matrix=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate) ''' attentive convolution function, two sizes of filter_width 3&1 are used. 
Multi-channel ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=conv_layer_gate_l.output_tensor3, input_tensor3_r=conv_layer_gate_r.output_tensor3, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=conv_layer_gate_l.output_tensor3, input_tensor3_r=conv_layer_gate_r.output_tensor3, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_W2, b=conv_b2, W_context=conv_W2_context, b_context=conv_b2_context) attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r "Batch normalization for the four output sentence representation vectors" gamma = theano.shared(np.asarray(rng.uniform( low=-1.0 / math.sqrt(hidden_size[0]), high=1.0 / math.sqrt(hidden_size[0]), size=(hidden_size[0])), dtype=theano.config.floatX), borrow=True) beta = theano.shared(np.zeros((hidden_size[0]), dtype=theano.config.floatX), borrow=True) bn_params = [gamma, beta] bn_attentive_sent_embeddings_l = batch_normalization( inputs=attentive_sent_embeddings_l, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True), std=attentive_sent_embeddings_l.std((0, ), keepdims=True), mode='low_mem') bn_attentive_sent_embeddings_r = batch_normalization( inputs=attentive_sent_embeddings_r, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True), std=attentive_sent_embeddings_r.std((0, ), keepdims=True), mode='low_mem') bn_attentive_sent_embeddings_l2 = batch_normalization( inputs=attentive_sent_embeddings_l2, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True), std=attentive_sent_embeddings_l2.std((0, ), keepdims=True), mode='low_mem') bn_attentive_sent_embeddings_r2 = batch_normalization( inputs=attentive_sent_embeddings_r2, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True), std=attentive_sent_embeddings_r2.std((0, ), keepdims=True), mode='low_mem') "Before logistic regression layer, we insert a hidden layer. 
Now form input to HL classifier" HL_layer_1_input = T.concatenate([ bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r, bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r, bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r, bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2, bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2, bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2 ], axis=1) HL_layer_1_input_size = 8 * hidden_size[0] "Create hidden layer parameters" HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, HL_layer_1_input_size, hidden_size[1]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] "Hidden Layer and batch norm to its output again" HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[1], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) gamma_HL = theano.shared(np.asarray(rng.uniform( low=-1.0 / math.sqrt(hidden_size[1]), high=1.0 / math.sqrt(hidden_size[1]), size=(hidden_size[1])), dtype=theano.config.floatX), borrow=True) beta_HL = theano.shared(np.zeros((hidden_size[1]), dtype=theano.config.floatX), borrow=True) bn_params_HL = [gamma_HL, beta_HL] bn_HL_output = batch_normalization(inputs=HL_layer_1.output, gamma=gamma_HL, beta=beta_HL, mean=HL_layer_1.output.mean( (0, ), keepdims=True), std=HL_layer_1.output.std( (0, ), keepdims=True), mode='low_mem') "Form input to LR classifier" LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1) LR_input_size = HL_layer_1_input_size + hidden_size[1] U_a = create_ensemble_para(rng, 3, LR_input_size) # (input_size, 3) LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] "Logistic Regression layer" layer_LR = LogisticRegression( rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [ init_embeddings ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL cost = loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2 and iter % 5 == 0): print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() dev_error_sum = 0.0 for dev_batch_id in dev_batch_start: # for each test batch dev_error_i = dev_model( dev_sents_l[dev_batch_id:dev_batch_id + batch_size], dev_masks_l[dev_batch_id:dev_batch_id + batch_size], dev_sents_r[dev_batch_id:dev_batch_id + batch_size], dev_masks_r[dev_batch_id:dev_batch_id + batch_size], dev_labels_store[dev_batch_id:dev_batch_id + batch_size]) dev_error_sum += dev_error_i dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start)) if dev_acc > max_acc_dev: max_acc_dev = dev_acc print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev ''' best dev model, test ''' error_sum = 0.0 for test_batch_id in test_batch_start: # for each test batch error_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i test_acc = 1.0 - error_sum / (len(test_batch_start)) if test_acc > max_acc_test: max_acc_test = test_acc print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test else: print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def __init__(self, input, rng, n_in, n_out, stochastic=False, binary=True): # initialize with 0 the weights W as a matrix of shape (n_in, n_out) self.high = 2.#numpy.float32(numpy.sqrt(6. / (float(n_in) + float(n_out) ))) self.W0 = numpy.float32(self.high/2.) #self.W = theano.shared(value=numpy.zeros((n_in, n_out), # dtype=theano.config.floatX), # name='W', borrow=True) #self.high = numpy.float32(2.) #self.W0 = numpy.float32(self.high/2.) W_values = numpy.asarray( rng.uniform( low= -1.,#numpy.sqrt(6. / (n_in + n_out)), high= 1.,#numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out) ), dtype=theano.config.floatX ) #srng = RandomStreams(seed=420) srng = theano.sandbox.rng_mrg.MRG_RandomStreams(420) self.W = theano.shared(value=W_values, name='W', borrow=True) # initialize the baises b as a vector of n_out 0s self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True) self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') def hard_sigma(w): p=T.clip((w+1)/2,0,1) return p if binary: Wb = hard_sigma(self.W/self.W0) if stochastic: #Wb = T.cast(numpy.random.binomial(n=1, p=T.ge(Wb), size=(n_in, n_out)), theano.config.floatX) Wb = srng.binomial(n=1, p=Wb, size=(n_in, n_out) ) else: Wb = T.round(Wb) # Leave below alone Wb = T.switch(Wb,self.W0, -self.W0) #Wb = T.cast(T.switch(Wb,self.W0, -self.W0), dtype=theano.config.floatX) #Wb = theano.shared(Wb.eval(), name='Wb', borrow=True) self.Wb = Wb else: self.Wb = self.W # parameters of the model self.linear = T.dot(input, self.Wb) + self.b bn_output = batch_normalization(inputs = self.linear, gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True), std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem') self.linear_output = bn_output self.y_pred = T.argmax(bn_output, axis=1) if binary: self.params = [self.W, self.Wb, self.gamma, self.beta, self.b] elif binary==False: self.params = [self.Wb, self.gamma, self.beta, self.b] self.len_params = len(self.params) self.n_in=n_in # keep track of model input self.input = input
def __init__(self, input, rng, n_in, n_out, stochastic=False, binary=True): """ Initialize the parameters of the logistic regression :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) # self.W = theano.shared( # value=numpy.zeros( # (n_in, n_out), # dtype=theano.config.floatX # ), # name='W', # borrow=True # ) W_values = numpy.asarray( rng.uniform( low= -1.,#numpy.sqrt(6. / (n_in + n_out)), high= 1.,#numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out) ), dtype=theano.config.floatX ) self.W = theano.shared(value=W_values, name='W', borrow=True) # initialize the biases b as a vector of n_out 0s self.b = theano.shared( value=numpy.zeros( (n_out,), dtype=theano.config.floatX ), name='b', borrow=True ) self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') self.high = 2. #numpy.float32(numpy.sqrt(6. / (n_in + n_out))) self.W0 = numpy.float32(self.high/2) #srng = RandomStreams(seed=420) srng = theano.sandbox.rng_mrg.MRG_RandomStreams(420) def hard_sigma(w): return T.clip((w+1.)/2,0,1) if binary: if stochastic: Wb = hard_sigma(self.W/self.W0) # using numpy was insanely slow and it caused issues with having to evaluate the function #Wb = T.cast(numpy.random.binomial(n=1, p=Wb, size=(n_in, n_out)), theano.config.floatX) Wb = srng.binomial(n=1, p=Wb, size=(n_in, n_out) ) # This works much better else: # T.ge is greater than or equal to #Wb = T.ge(Wb, 0) Wb = T.ge(self.W, 0) #Wb = T.round(Wb) Wb = T.switch(Wb, self.W0, -self.W0) self.Wb = Wb # The code below was way slower #Wb = T.cast(T.switch(Wb,self.W0, -self.W0), dtype=theano.config.floatX) #Wb = theano.shared(Wb.eval(), name='Wb', borrow=True) else: self.Wb = self.W # symbolic expression for computing the matrix of class-membership # probabilities # Where: # W is a matrix where column-k represent the separation hyperplane for # class-k # x is a matrix where row-j represents input training sample-j # b is a vector where element-k represent the free parameter of # hyperplane-k self.linear=T.dot(input, self.Wb) + self.b bn_output = batch_normalization(inputs = self.linear, gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True), std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem') self.p_y_given_x = T.nnet.softmax(bn_output) # symbolic description of how to compute prediction as class whose # probability is maximal self.y_pred = T.argmax(self.p_y_given_x, axis=1) # parameters of the model if binary: self.params = [self.W, self.Wb, self.gamma, self.beta, self.b] elif not binary: self.params = [self.Wb, self.gamma, self.beta, self.b] self.len_params = len(self.params) self.n_in=n_in # keep track of model input self.input = input
def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh): """ Typical hidden layer of a MLP: units are fully-connected and have sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) and the bias vector b is of shape (n_out,). NOTE : The nonlinearity used here is tanh Hidden unit activation is given by: tanh(dot(input,W) + b) :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dmatrix :param input: a symbolic tensor of shape (n_examples, n_in) :type n_in: int :param n_in: dimensionality of input :type n_out: int :param n_out: number of hidden units :type activation: theano.Op or function :param activation: Non linearity to be applied in the hidden layer """ self.input = input # `W` is initialized with `W_values` which is uniformely sampled # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) # for tanh activation function # the output of uniform if converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU # Note : optimal initialization of weights is dependent on the # activation function used (among other things). # For example, results presented in [Xavier10] suggest that you # should use 4 times larger initial weights for sigmoid # compared to tanh # We have no info for other function, so we use the same as # tanh. if W is None: W_values = numpy.asarray( rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out) ), dtype=theano.config.floatX ) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) if b is None: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') self.W = W self.b = b lin_output = T.dot(input, self.W) + self.b bn_output = batch_normalization(inputs = lin_output, gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True), std = lin_output.std((0,), keepdims = True), mode='low_mem') self.output = (T.clip(bn_output,0,20) if activation is 'relu' else activation(bn_output)) # self.output = ( # lin_output if activation is None # else activation(lin_output) # ) # parameters of the model self.params = [self.W, self.b, self.gamma, self.beta]
def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) self.W = theano.shared( value=numpy.zeros( (n_in, n_out), dtype=theano.config.floatX ), name='W', borrow=True ) # initialize the biases b as a vector of n_out 0s self.b = theano.shared( value=numpy.zeros( (n_out,), dtype=theano.config.floatX ), name='b', borrow=True ) self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out))) self.W0 = numpy.float32(self.high/2) Wb = T.switch(T.ge(self.W.get_value(),0),self.W0,-self.W0).eval() Wb = theano.shared(Wb, name='Wb', borrow=True) self.Wb = Wb # symbolic expression for computing the matrix of class-membership # probabilities # Where: # W is a matrix where column-k represent the separation hyperplane for # class-k # x is a matrix where row-j represents input training sample-j # b is a vector where element-k represent the free parameter of # hyperplane-k self.linear = T.dot(input, self.W) + self.b bn_output = batch_normalization(inputs = self.linear, gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True), std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem') self.linear.std((0,)) self.p_y_given_x = T.nnet.softmax(bn_output) # bn_output = lin_output bn_output = batch_normalization(inputs = self.p_y_given_x, gamma = self.gamma, beta = self.beta, mean = self.p_y_given_x.mean((0,), keepdims=True), std = T.ones_like(self.p_y_given_x.var((0,), keepdims = True)), mode='high_mem') self.p_y_given_x.std((0,)) # symbolic description of how to compute prediction as class whose # probability is maximal self.y_pred = T.argmax(self.p_y_given_x, axis=1) # parameters of the model self.params = [self.Wb, self.b, self.gamma, self.beta] # keep track of model input self.n_in = n_in self.input = input
def __init__(self, input, n_in, n_out, stochastic=False, binary=True): """ Initialize the parameters of the Support Vector Machine layer :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie :type binary: boolean :param binary: indicates whether to implement Binary Connect binarization for weights :type stochastic: boolean :param stochastic: indicate whether to implement a stochatic or deterministic Binary Connect layer """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out))) self.W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W', borrow=True) # initialize the baises b as a vector of n_out 0s self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True) self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') self.W0 = numpy.float32(self.high/2) # binarize weights either deterministically or stochastically, if indicated def hard_sigma(w): p=T.clip((w+1)/2,0,1) return p if stochastic: p = hard_sigma(self.W/self.W0) p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=(n_in, n_out)), theano.config.floatX) Wb = T.switch(p_mask,self.W0,-self.W0).eval() else: Wb = T.switch(T.ge(self.W.get_value(),0),self.W0,-self.W0).eval() if binary: Wb = theano.shared(Wb, name='Wb', borrow=True) self.Wb = Wb else: self.Wb=self.W # parameters of the model self.linear = T.dot(input, self.Wb) + self.b bn_output = batch_normalization(inputs = self.linear, gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True), std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem') self.linear_output = bn_output self.y_pred = T.argmax(bn_output, axis=1) if binary: self.params = [self.W, self.Wb, self.gamma, self.beta, self.b] elif binary==False: self.params = [self.Wb, self.gamma, self.beta, self.b] self.len_params = len(self.params) self.n_in=n_in # keep track of model input self.input = input
def __init__(self, rng, input, n_in, n_out, stochastic=False, binary=True, W=None, b=None, activation=T.nnet.relu): """ Typical hidden layer of a MLP: units are fully-connected and have sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) and the bias vector b is of shape (n_out,). NOTE : The nonlinearity used here is ReLU :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dmatrix :param input: a symbolic tensor of shape (n_examples, n_in) :type n_in: int :param n_in: dimensionality of input :type n_out: int :param n_out: number of hidden units :type binary: boolean :param binary: indicates whether to implement Binary Connect binarization for weights :type stochastic: boolean :param stochastic: indicate whether to implement a stochatic or deterministic Binary Connect layer :type activation: theano.Op or function :param activation: Non linearity to be applied in the hidden layer """ self.input = input # `W` is initialized with `W_values` which is uniformely sampled # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) # for tanh activation function # the output of uniform if converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU # Note : optimal initialization of weights is dependent on the # activation function used (among other things). # For example, results presented in [Xavier10] suggest that you # should use 4 times larger initial weights for sigmoid # compared to tanh # We have no info for other function, so we use the same as # tanh. if W is None: W_values = numpy.asarray( rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out) ), dtype=theano.config.floatX ) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) if b is None: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out))) self.W0 = numpy.float32(self.high/2) # binarize weights either deterministically or stochastically, if indicated def hard_sigma(w): p=T.clip((w+1)/2,0,1) return p if stochastic: p = hard_sigma(W/self.W0) p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=(n_in, n_out)), theano.config.floatX) Wb = T.switch(p_mask,self.W0,-self.W0).eval() else: Wb = T.switch(T.ge(W.get_value(),0),self.W0,-self.W0).eval() if binary: Wb = theano.shared(Wb, name='Wb', borrow=True) self.Wb = Wb else: self.Wb=W self.W = W self.b = b self.n_in=n_in self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma') self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta') lin_output = T.dot(input, self.Wb) + self.b # batch normalization at output bn_output = batch_normalization(inputs = lin_output, gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True), std = lin_output.std((0,), keepdims = True), mode='low_mem') self.output = ( bn_output if activation is None else activation(bn_output) ) # parameters of the model if binary: self.params = [self.W, self.Wb, self.gamma, self.beta, self.b] elif binary==False: self.params = [self.Wb, self.gamma, self.beta, self.b]
def __init__(self, input, n_in, n_out, stochastic=False, binary=True):
    """
    Initialize the parameters of the logistic regression

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie

    :type binary: boolean
    :param binary: indicates whether to implement Binary Connect
                   binarization for weights

    :type stochastic: boolean
    :param stochastic: indicates whether to implement a stochastic or
                       deterministic Binary Connect layer
    """
    # initialize the weights W as a zero matrix of shape (n_in, n_out)
    self.W = theano.shared(
        value=numpy.zeros(
            (n_in, n_out),
            dtype=theano.config.floatX
        ),
        name='W',
        borrow=True
    )
    # initialize the biases b as a vector of n_out 0s
    self.b = theano.shared(
        value=numpy.zeros(
            (n_out,),
            dtype=theano.config.floatX
        ),
        name='b',
        borrow=True
    )
    self.gamma = theano.shared(value=numpy.ones((n_out,), dtype=theano.config.floatX),
                               name='gamma')
    self.beta = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                              name='beta')
    self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out)))
    self.W0 = numpy.float32(self.high / 2)

    # binarize weights either deterministically or stochastically, if indicated
    def hard_sigma(w):
        p = T.clip((w + 1) / 2, 0, 1)
        return p

    if stochastic:
        p = hard_sigma(self.W / self.W0)
        p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=(n_in, n_out)),
                        theano.config.floatX)
        Wb = T.switch(p_mask, self.W0, -self.W0).eval()
    else:
        Wb = T.switch(T.ge(self.W.get_value(), 0), self.W0, -self.W0).eval()

    if binary:
        Wb = theano.shared(Wb, name='Wb', borrow=True)
        self.Wb = Wb
    else:
        self.Wb = self.W

    # symbolic expression for computing the matrix of class-membership
    # probabilities, where:
    # W is a matrix where column-k represents the separation hyperplane for
    # class-k
    # x is a matrix where row-j represents input training sample-j
    # b is a vector where element-k represents the free parameter of
    # hyperplane-k
    self.linear = T.dot(input, self.Wb) + self.b
    # batch normalize at the output
    # note: std is fixed to ones here, so this call only mean-centres the output
    bn_output = batch_normalization(inputs=self.linear,
                                    gamma=self.gamma, beta=self.beta,
                                    mean=self.linear.mean((0,), keepdims=True),
                                    std=T.ones_like(self.linear.var((0,), keepdims=True)),
                                    mode='high_mem')
    self.p_y_given_x = T.nnet.softmax(bn_output)

    # symbolic description of how to compute the prediction as the class
    # whose probability is maximal
    self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    # parameters of the model
    if binary:
        self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]
    else:
        self.params = [self.Wb, self.gamma, self.beta, self.b]
    self.len_params = len(self.params)
    self.n_in = n_in
    # keep track of model input
    self.input = input
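The three Binary Connect layers above all pass std=T.ones_like(...) to batch_normalization, so the call degenerates to gamma * (x - mean) + beta, i.e. mean-centring plus the learned affine. The following minimal sketch (not taken from those layers; the toy shapes and names are illustrative assumptions) makes that behaviour explicit:

# With std fixed to ones, batch_normalization only removes the batch mean.
import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

x = T.matrix('x')
centered = bn.batch_normalization(
    inputs=x,
    gamma=T.ones((x.shape[1],)),    # unit scale (illustrative)
    beta=T.zeros((x.shape[1],)),    # zero shift (illustrative)
    mean=x.mean((0,), keepdims=True),
    std=T.ones_like(x.var((0,), keepdims=True)),
    mode='high_mem')
f = theano.function([x], centered)
X = numpy.random.rand(4, 3).astype(theano.config.floatX)
# f(X) matches X - X.mean(axis=0): only the per-column mean is removed.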
def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2),
             pool_ignore_border=True, stochastic=False, binary=True):
    """
    Allocate a LeNetConvPoolLayer with shared variable internal parameters.

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.dtensor4
    :param input: symbolic image tensor, of shape image_shape

    :type filter_shape: tuple or list of length 4
    :param filter_shape: (number of filters, num input feature maps,
                          filter height, filter width)

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)

    :type poolsize: tuple or list of length 2
    :param poolsize: the downsampling (pooling) factor (#rows, #cols)

    :type binary: boolean
    :param binary: indicates whether to implement Binary Connect
                   binarization for weights

    :type stochastic: boolean
    :param stochastic: indicates whether to implement a stochastic or
                       deterministic Binary Connect layer
    """
    assert image_shape[1] == filter_shape[1]
    self.input = input

    # there are "num input feature maps * filter height * filter width"
    # inputs to each hidden unit
    fan_in = numpy.prod(filter_shape[1:])
    # each unit in the lower layer receives a gradient from:
    # "num output feature maps * filter height * filter width" / pooling size
    fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) //
               numpy.prod(poolsize))
    # initialize weights with random weights
    W_bound = numpy.sqrt(6. / (fan_in + fan_out))
    self.W = theano.shared(
        numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
            dtype=theano.config.floatX
        ),
        borrow=True
    )

    # the bias is a 1D tensor -- one bias per output feature map
    b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.b = theano.shared(value=b_values, borrow=True)

    self.high = numpy.float32(numpy.sqrt(6. / (fan_in + fan_out)))
    self.W0 = numpy.float32(self.high / 2)

    # binarize weights either deterministically or stochastically, if indicated
    def hard_sigma(w):
        p = T.clip((w + 1) / 2, 0, 1)
        return p

    if stochastic:
        p = hard_sigma(self.W / self.W0)
        p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=filter_shape),
                        theano.config.floatX)
        Wb = T.switch(p_mask, self.W0, -self.W0).eval()
    else:
        Wb = T.switch(T.ge(self.W.get_value(), 0), self.W0, -self.W0).eval()

    if binary:
        Wb = theano.shared(Wb, name='Wb', borrow=True)
        self.Wb = Wb
    else:
        self.Wb = self.W

    self.gamma = theano.shared(value=numpy.ones((image_shape[0], filter_shape[0],
                                                 (image_shape[3] - 2) // poolsize[0],
                                                 (image_shape[3] - 2) // poolsize[0]),
                                                dtype=theano.config.floatX),
                               name='gamma')
    self.beta = theano.shared(value=numpy.zeros((image_shape[0], filter_shape[0],
                                                 (image_shape[3] - 2) // poolsize[0],
                                                 (image_shape[3] - 2) // poolsize[0]),
                                                dtype=theano.config.floatX),
                              name='beta')

    # convolve input feature maps with filters
    conv_out = conv2d(
        input=input,
        filters=self.Wb,
        filter_shape=filter_shape,
        image_shape=image_shape
    )

    # downsample each feature map individually, using maxpooling
    pooled_out = downsample.max_pool_2d(
        input=conv_out,
        ds=poolsize,
        ignore_border=pool_ignore_border
    )

    # implement batch normalization at the output
    # (the original snippet passed the variance as std; the divisor should be
    # the standard deviation)
    bn_output = batch_normalization(inputs=pooled_out,
                                    gamma=self.gamma, beta=self.beta,
                                    mean=pooled_out.mean((0, 2, 3), keepdims=True),
                                    std=pooled_out.std((0, 2, 3), keepdims=True),
                                    mode='high_mem')

    # add the bias term. Since the bias is a vector (1D array), we first
    # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
    # thus be broadcasted across mini-batches and feature map width & height.
    self.output = T.nnet.relu(bn_output + self.b.dimshuffle('x', 0, 'x', 'x'))

    # store parameters of this layer
    if binary:
        self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]
    else:
        self.params = [self.Wb, self.gamma, self.beta, self.b]
    self.len_params = len(self.params)
    # keep track of model input
    self.input = input
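batch_normalization expects the standard deviation as its divisor, not the variance. The standalone sketch below shows the usual relation between the two; the feature-map variable and the eps value are assumptions chosen only for illustration, not part of the layer above:

import theano.tensor as T

feat = T.tensor4('feat')        # hypothetical feature map, (batch, channels, h, w)
eps = 1e-6                      # assumed numerical stabiliser
std = T.sqrt(feat.var((0, 2, 3), keepdims=True) + eps)
# 'std' is what batch_normalization expects as its divisor; passing the raw
# variance instead would over- or under-scale the normalized output.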
def __init__(self, input, image_shape, filter_shape, convstride, padsize,
             group, poolsize, poolstride, bias_init, lrn=False,
             lib_conv='cudnn', poolpadsize=(0, 0), caffe_style=False,
             Bn=False):
    '''
    lib_conv can be cudnn (recommended) or cudaconvnet
    '''
    self.filter_size = filter_shape
    self.convstride = convstride
    self.padsize = padsize
    self.poolsize = poolsize
    self.poolstride = poolstride
    self.channel = image_shape[0]
    self.lrn = lrn
    self.lib_conv = lib_conv
    # assert input.shape == image_shape
    assert group in [1, 2]

    self.filter_shape = np.asarray(filter_shape)
    self.image_shape = np.asarray(image_shape)

    if self.lrn:
        self.lrn_func = CrossChannelNormalization(alpha=0.0005, k=1)
        # self.lrn_func = CrossChannelNormalization(alpha=0.0005)

    if group == 1:
        self.W = Weight(self.filter_shape)
        self.b = Weight(self.filter_shape[3], bias_init, std=0)
    else:
        self.filter_shape[0] = self.filter_shape[0] / 2
        self.filter_shape[3] = self.filter_shape[3] / 2
        self.image_shape[0] = self.image_shape[0] / 2
        self.image_shape[3] = self.image_shape[3] / 2
        self.W0 = Weight(self.filter_shape)
        self.W1 = Weight(self.filter_shape)
        self.b0 = Weight(self.filter_shape[3], bias_init, std=0)
        self.b1 = Weight(self.filter_shape[3], bias_init, std=0)

    if lib_conv == 'cudaconvnet':
        self.conv_op = FilterActs(pad=self.padsize, stride=self.convstride,
                                  partial_sum=1)
        # Conv
        if group == 1:
            contiguous_input = gpu_contiguous(input)
            contiguous_filters = gpu_contiguous(self.W.val)
            conv_out = self.conv_op(contiguous_input, contiguous_filters)
            conv_out = conv_out + self.b.val.dimshuffle(0, 'x', 'x', 'x')
        else:
            contiguous_input0 = gpu_contiguous(input[:self.channel / 2, :, :, :])
            contiguous_filters0 = gpu_contiguous(self.W0.val)
            conv_out0 = self.conv_op(contiguous_input0, contiguous_filters0)
            conv_out0 = conv_out0 + self.b0.val.dimshuffle(0, 'x', 'x', 'x')

            contiguous_input1 = gpu_contiguous(input[self.channel / 2:, :, :, :])
            contiguous_filters1 = gpu_contiguous(self.W1.val)
            conv_out1 = self.conv_op(contiguous_input1, contiguous_filters1)
            conv_out1 = conv_out1 + self.b1.val.dimshuffle(0, 'x', 'x', 'x')

            conv_out = T.concatenate([conv_out0, conv_out1], axis=0)

        # ReLU
        self.output = T.maximum(conv_out, 0)

        # Pooling
        if self.poolsize != 1:
            self.pool_op = MaxPool(ds=poolsize, stride=poolstride)
            self.output = self.pool_op(self.output)

    elif lib_conv == 'cudnn':
        input_shuffled = input.dimshuffle(3, 0, 1, 2)  # c01b to bc01
        # in01out to outin01
        if group == 1:
            W_shuffled = self.W.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01
            conv_out = dnn.dnn_conv(img=input_shuffled,
                                    kerns=W_shuffled,
                                    subsample=(convstride, convstride),
                                    border_mode=padsize)
            conv_out = conv_out + self.b.val.dimshuffle('x', 0, 'x', 'x')
        else:
            W0_shuffled = self.W0.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01
            conv_out0 = dnn.dnn_conv(img=input_shuffled[:, :self.channel / 2, :, :],
                                     kerns=W0_shuffled,
                                     subsample=(convstride, convstride),
                                     border_mode=padsize)
            conv_out0 = conv_out0 + self.b0.val.dimshuffle('x', 0, 'x', 'x')

            W1_shuffled = self.W1.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01
            conv_out1 = dnn.dnn_conv(img=input_shuffled[:, self.channel / 2:, :, :],
                                     kerns=W1_shuffled,
                                     subsample=(convstride, convstride),
                                     border_mode=padsize)
            conv_out1 = conv_out1 + self.b1.val.dimshuffle('x', 0, 'x', 'x')

            conv_out = T.concatenate([conv_out0, conv_out1], axis=1)

        self.conv_out = conv_out

        if Bn:
            # Warning: this branch is only intended for the test phase; it
            # normalizes with stored population statistics.
            self.mean = theano.shared(
                value=np.zeros((1, filter_shape[3], 1, 1),
                               dtype=theano.config.floatX),
                broadcastable=[True, False, True, True],
                name='mean', borrow=True)
            self.var = theano.shared(
                value=np.ones((1, filter_shape[3], 1, 1),
                              dtype=theano.config.floatX),
                broadcastable=[True, False, True, True],
                name='var', borrow=True)
            self.gamma = theano.shared(value=np.ones((filter_shape[3],),
                                                     dtype=theano.config.floatX),
                                       name='gamma', borrow=True)
            self.beta = theano.shared(value=np.zeros((filter_shape[3],),
                                                     dtype=theano.config.floatX),
                                      name='beta', borrow=True)
            conv_out = batch_normalization(inputs=conv_out,
                                           gamma=self.gamma, beta=self.beta,
                                           mean=self.mean,
                                           std=T.sqrt(self.var),
                                           mode='high_mem')
        # ReLU
        self.Bn = conv_out
        self.output = T.maximum(conv_out, 0)

        # Pooling
        if caffe_style:
            self.output = self.output[:, :, ::-1, ::-1]
        if self.poolsize != 1:
            self.output = dnn.dnn_pool(self.output,
                                       ws=(poolsize, poolsize),
                                       stride=(poolstride, poolstride),
                                       pad=poolpadsize)
        if caffe_style:
            self.output = self.output[:, :, ::-1, ::-1]
        self.output = self.output.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
    else:
        raise NotImplementedError("lib_conv can only be cudaconvnet or cudnn")

    if group == 1:
        if Bn:
            # self.params = [self.W.val, self.b.val, self.beta, self.gamma,
            #                self.mean, self.var]
            self.params = [self.W.val, self.b.val]
            self.weight_type = ['W', 'b']
            # self.weight_type = ['W', 'b', 'b', 'b', 'b', 'b']
        else:
            self.params = [self.W.val, self.b.val]
            self.weight_type = ['W', 'b']
    else:
        self.params = [self.W0.val, self.b0.val, self.W1.val, self.b1.val]
        self.weight_type = ['W', 'b', 'W', 'b']

    print("conv ({}) layer with shape_in: {}".format(lib_conv, str(image_shape)))
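Because the Bn branch above normalizes with stored population statistics rather than batch statistics, at test time the whole operation collapses to a fixed per-channel affine transform. The NumPy-only sketch below illustrates this; the shapes and values are toy assumptions, not taken from a trained model:

import numpy as np

channels = 4
x = np.random.rand(2, channels, 5, 5).astype('float32')   # bc01 layout
mean = np.zeros((1, channels, 1, 1), dtype='float32')      # population mean
var = np.ones((1, channels, 1, 1), dtype='float32')        # population variance
gamma = np.ones((1, channels, 1, 1), dtype='float32')
beta = np.zeros((1, channels, 1, 1), dtype='float32')
# with fixed statistics, batch norm is just a per-channel scale and shift
y = gamma * (x - mean) / np.sqrt(var) + beta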
def __init__(self, rng, input, filter_shape, image_shape):
    """
    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.dtensor4
    :param input: symbolic image tensor, of shape image_shape

    :type filter_shape: tuple or list of length 4
    :param filter_shape: (number of filters, num input feature maps,
                          filter height, filter width)

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)
    """
    assert image_shape[1] == filter_shape[1]
    self.input = input

    # there are "num input feature maps * filter height * filter width"
    # inputs to each hidden unit
    fan_in = numpy.prod(filter_shape[1:])
    # each unit in the lower layer receives a gradient from:
    # "num output feature maps * filter height * filter width" / pooling size
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
    # initialize weights with random weights
    W_bound = numpy.sqrt(6. / (fan_in + fan_out))
    self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound,
                                                     high=W_bound,
                                                     size=filter_shape),
                                         dtype=theano.config.floatX),
                           borrow=True)

    # the bias is a 1D tensor -- one bias per output feature map
    b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.b = theano.shared(value=b_values, borrow=True)

    # convolve input feature maps with filters
    conv_out = conv.conv2d(input=input,
                           filters=self.W,
                           filter_shape=filter_shape,
                           image_shape=image_shape)

    # add the bias term. Since the bias is a vector (1D array), we first
    # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
    # thus be broadcasted across mini-batches and feature map width & height.
    # alpha_value = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + 0.25
    # self.alpha = theano.shared(value=alpha_value, borrow=True)
    linearOutput = conv_out + self.b.dimshuffle('x', 0, 'x', 'x')
    bnOutput = bn.batch_normalization(inputs=linearOutput,
                                      gamma=1., beta=0,
                                      mean=T.mean(linearOutput),
                                      std=T.std(linearOutput))
    self.output = ReLu(bnOutput)

    # store parameters of this layer
    self.params = [self.W, self.b]
def __init__(self, rng, input, n_in, n_out, W=None, b=None,
             activation=T.tanh, reluSlope=0.0):
    """
    Typical hidden layer of an MLP: units are fully connected and have a
    sigmoidal activation function. Weight matrix W is of shape (n_in, n_out)
    and the bias vector b is of shape (n_out,).

    NOTE : The nonlinearity used here is tanh

    Hidden unit activation is given by: tanh(dot(input, W) + b)

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.dmatrix
    :param input: a symbolic tensor of shape (n_examples, n_in)

    :type n_in: int
    :param n_in: dimensionality of input

    :type n_out: int
    :param n_out: number of hidden units

    :type activation: theano.Op or function
    :param activation: non-linearity to be applied in the hidden layer
    """
    self.input = input
    # end-snippet-1

    # `W` is initialized with `W_values`, which is uniformly sampled
    # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
    # for the tanh activation function.
    # The output of uniform is converted using asarray to dtype
    # theano.config.floatX so that the code is runnable on GPU.
    # Note: optimal initialization of weights is dependent on the
    # activation function used (among other things).
    # For example, results presented in [Xavier10] suggest that you
    # should use 4 times larger initial weights for sigmoid
    # compared to tanh.
    # We have no info for other functions, so we use the same as tanh.
    if W is None:
        W_values = numpy.asarray(rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)), dtype=theano.config.floatX)
        if activation == theano.tensor.nnet.sigmoid:
            W_values *= 4
        W = theano.shared(value=W_values, name='W', borrow=True)

    if b is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
        b = theano.shared(value=b_values, name='b', borrow=True)

    self.W = W
    self.b = b
    self.gamma = theano.shared(value=numpy.ones((n_out,), dtype=theano.config.floatX),
                               name='gamma')
    self.beta = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                              name='beta')

    lin_output = T.dot(input, self.W) + self.b
    '''
    self.output = (
        lin_output if activation is None
        else activation(lin_output)
    )
    '''
    # parameters of the model
    self.params = [self.W, self.b]
    self.lin_output = lin_output

    # note: std is fixed to ones here, so this call only mean-centres the output
    bn_output = batch_normalization(
        inputs=self.lin_output,
        gamma=self.gamma, beta=self.beta,
        mean=self.lin_output.mean((0,), keepdims=True),
        std=T.ones_like(self.lin_output.var((0,), keepdims=True)),
        mode='high_mem')

    # the batch-normalized variants are kept commented out; the layer
    # currently wires the un-normalized activations to self.output
    if activation is None:
        self.output = lin_output
        # self.output = bn_output
    elif activation is T.nnet.relu:
        self.output = T.nnet.relu(lin_output, reluSlope)
        # self.output = T.nnet.relu(bn_output, reluSlope)
    else:
        self.output = activation(lin_output)
        # self.output = activation(bn_output)
    self.bn_output = bn_output
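The hidden layer above builds bn_output but only exposes it as self.bn_output, feeding the un-normalized activations to self.output. A hedged, self-contained sketch of one way the normalized path could be made selectable is shown below; the use_bn flag, toy shapes, and variable names are assumptions for illustration and are not part of the original layer:

import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet.bn import batch_normalization

use_bn = True
x = T.matrix('x')   # stands in for dot(input, W) + b
gamma = theano.shared(numpy.ones(3, dtype=theano.config.floatX), name='gamma')
beta = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='beta')
normalized = batch_normalization(inputs=x, gamma=gamma, beta=beta,
                                 mean=x.mean((0,), keepdims=True),
                                 std=x.std((0,), keepdims=True),
                                 mode='high_mem')
# switch between the normalized and plain pre-activations before the nonlinearity
output = T.tanh(normalized if use_bn else x)
f = theano.function([x], output)
X = numpy.random.rand(5, 3).astype(theano.config.floatX)
Y = f(X)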