def test_batch_norm_exceptions(self):
    # Case 1: axis does not exist for the ``Input(10)`` layer.
    with self.assertRaises(ValueError):
        layers.Input(10) > layers.BatchNorm(axes=2)

    # Case 2: the connection has no input layer, so initialization
    # is expected to fail.
    with self.assertRaises(ValueError):
        no_input_connection = layers.Relu() > layers.BatchNorm()
        no_input_connection.initialize()
def test_batch_norm_storage(self):
    # Prediction error and the running statistics of the batch
    # normalization layer must be the same before the network gets
    # saved and after it gets loaded back.
    x_train, x_test, y_train, y_test = simple_classification()

    batch_norm = layers.BatchNorm()
    architecture = [
        layers.Input(10),
        layers.Relu(5),
        batch_norm,
        layers.Sigmoid(1),
    ]
    gdnet = algorithms.MinibatchGradientDescent(
        architecture,
        batch_size=10,
    )
    gdnet.train(x_train, y_train)

    def snapshot():
        # Collects everything that has to survive the round trip.
        return (
            gdnet.prediction_error(x_test, y_test),
            batch_norm.running_mean.get_value(),
            batch_norm.running_inv_std.get_value(),
        )

    error_before, mean_before, inv_std_before = snapshot()

    with tempfile.NamedTemporaryFile() as temp:
        storage.save(gdnet, temp.name)
        storage.load(gdnet, temp.name)

        error_after, mean_after, inv_std_after = snapshot()

    self.assertAlmostEqual(error_before, error_after)
    np.testing.assert_array_almost_equal(mean_before, mean_after)
    np.testing.assert_array_almost_equal(inv_std_before, inv_std_after)
def test_batch_norm_storage(self):
    # Prediction error and the running statistics of the batch
    # normalization layer must be the same before the network gets
    # saved and after it gets loaded back.
    x_train, x_test, y_train, y_test = simple_classification()

    batch_norm = layers.BatchNorm()
    architecture = [
        layers.Input(10),
        layers.Relu(5),
        batch_norm,
        layers.Sigmoid(1),
    ]
    gdnet = algorithms.GradientDescent(
        architecture,
        batch_size=10,
        verbose=True,  # keep it as `True`
    )
    gdnet.train(x_train, y_train, epochs=5)

    def snapshot():
        # Collects everything that has to survive the round trip.
        # Note that the second statistic is the running inverse
        # standard deviation, not the variance.
        return (
            gdnet.prediction_error(x_test, y_test),
            self.eval(batch_norm.running_mean),
            self.eval(batch_norm.running_inv_std),
        )

    error_before, mean_before, inv_std_before = snapshot()

    with tempfile.NamedTemporaryFile() as temp:
        storage.save(gdnet, temp.name)
        storage.load(gdnet, temp.name)

        error_after, mean_after, inv_std_after = snapshot()

    self.assertAlmostEqual(error_before, error_after)
    np.testing.assert_array_almost_equal(mean_before, mean_after)
    np.testing.assert_array_almost_equal(inv_std_before, inv_std_after)
def test_batch_norm_in_non_training_state(self):
    network = layers.join(
        layers.Input(10),
        layers.BatchNorm(),
    )
    input_value = tf.Variable(
        asfloat(np.random.random((30, 10))),
        name='input_value',
        dtype=tf.float32,
    )

    def count_update_ops():
        return len(tf.get_collection(tf.GraphKeys.UPDATE_OPS))

    # Nothing has been propagated through the network yet
    self.assertEqual(count_update_ops(), 0)

    # Output in the non-training mode must not register update ops
    output_value = network.output(input_value)
    self.assertEqual(count_update_ops(), 0)

    # Training mode is expected to register two update operations
    network.output(input_value, training=True)
    self.assertEqual(count_update_ops(), 2)

    # Without training, the running mean and std are supposed to be
    # equal to 0 and 1 respectively, so the input passes unchanged.
    output_value = self.eval(output_value)
    np.testing.assert_array_almost_equal(
        self.eval(input_value), output_value, decimal=4)
def test_conv_output_shape_when_input_unknown(self):
    # The block has no input layer, so only the number of output
    # filters is expected to be known.
    conv = layers.Convolution((3, 3, 32))
    relu = layers.Relu()
    batch_norm = layers.BatchNorm()
    block = layers.join(conv, relu, batch_norm)

    self.assertShapesEqual(block.input_shape, None)
    self.assertShapesEqual(block.output_shape, (None, None, None, 32))
def test_batchnorm_unsuitable_axes_values(self):
    # Requested axes include index 3, while the expected error
    # message states that the input has only 3 dimensions.
    expected_message = (
        "Batch normalization cannot be applied over one of "
        "the axis, because input has only 3 dimensions"
    )
    network = layers.join(
        layers.Input((10, 3)),
        layers.BatchNorm(axes=(0, 2, 3)),
    )

    with self.assertRaisesRegexp(LayerConnectionError, expected_message):
        network.create_variables()
def test_batchnorm_wrong_axes_values(self):
    # Network has no input layer, so variables of the batch
    # normalization layer are expected to be impossible to create.
    expected_message = (
        "Cannot initialize variables for the batch normalization "
        "layer, because input shape is undefined"
    )
    network = layers.join(
        layers.Relu(),
        layers.BatchNorm(),
    )

    with self.assertRaisesRegexp(
            WeightInitializationError, expected_message):
        network.create_variables()
def test_repeat_network(self):
    n_repeats = 5
    block = layers.join(
        layers.Convolution((3, 3, 32)),
        layers.Relu(),
        layers.BatchNorm(),
    )
    network = layers.repeat(block, n=n_repeats)

    # Block with 3 layers repeated 5 times gives 15 layers in total
    self.assertEqual(len(network), 15)
    self.assertShapesEqual(network.output_shape, (None, None, None, 32))
def test_simple_batch_norm(self):
    connection = layers.Input(10) > layers.BatchNorm()

    random_batch = np.random.random((30, 10))
    input_value = theano.shared(value=random_batch)
    output_value = connection.output(input_value).eval()

    # NOTE(review): ``normaltest`` appears to return a
    # (statistic, p-value) result, which would always be truthy, so
    # this assertion likely cannot fail -- confirm against the scipy
    # documentation before relying on it.
    self.assertTrue(stats.mstats.normaltest(output_value))

    # Normalized output is expected to have zero mean and unit std
    self.assertAlmostEqual(output_value.mean(), 0, places=3)
    self.assertAlmostEqual(output_value.std(), 1, places=3)
def test_batch_norm_as_shared_variable(self):
    # Shared variables passed as parameters must be used by the
    # layer as they are (same objects).
    ones = np.ones(2)
    gamma = theano.shared(value=asfloat(ones))
    beta = theano.shared(value=asfloat(2 * np.ones(2)))

    batch_norm = layers.BatchNorm(gamma=gamma, beta=beta)
    layers.Input(10) > batch_norm

    self.assertIs(gamma, batch_norm.gamma)
    self.assertIs(beta, batch_norm.beta)
def test_batchnorm_unknown_dimension(self):
    # Last dimension of the input is unknown, so variables for the
    # batch normalization layer cannot be created.
    network = layers.join(
        layers.Input((10, 10, None)),
        layers.BatchNorm(),
    )

    # The message is a regular expression. Raw strings are required
    # here, because sequences like ``\(`` and ``\?`` are invalid
    # escapes in ordinary string literals and trigger warnings on
    # modern Python versions. The resulting pattern is unchanged.
    message = (
        r"Cannot create variables for batch normalization, because "
        r"input has unknown dimension #3 \(0-based indices\). "
        r"Input shape: \(\?, 10, 10, \?\)"
    )

    with self.assertRaisesRegexp(WeightInitializationError, message):
        network.create_variables()
def test_batch_norm_between_layers(self):
    # Batch normalization placed between two layers must not
    # change the expected output shape of the network.
    n_samples = 30
    connection = layers.join(
        layers.Input(10),
        layers.Relu(40),
        layers.BatchNorm(),
        layers.Relu(1),
    )

    input_value = np.random.random((n_samples, 10))
    output_value = connection.output(input_value).eval()

    self.assertEqual(output_value.shape, (30, 1))
def test_batch_norm_gamma_beta_params(self):
    # Output is expected to be shifted by beta (mean) and scaled
    # by gamma (standard deviation).
    default_beta = -3.14
    default_gamma = 4.3

    batch_norm = layers.BatchNorm(gamma=default_gamma, beta=default_beta)
    connection = layers.join(layers.Input(10), batch_norm)

    random_batch = np.random.random((30, 10))
    input_value = theano.shared(value=random_batch)
    output_value = connection.output(input_value).eval()

    self.assertAlmostEqual(output_value.mean(), default_beta, places=3)
    self.assertAlmostEqual(output_value.std(), default_gamma, places=3)
def test_simple_batch_norm(self):
    connection = layers.Input(10) > layers.BatchNorm()

    random_batch = asfloat(np.random.random((30, 10)))
    input_value = tf.Variable(
        random_batch,
        name='input_value',
        dtype=tf.float32,
    )
    output_value = self.eval(connection.output(input_value))

    # NOTE(review): ``normaltest`` appears to return a
    # (statistic, p-value) result, which would always be truthy, so
    # this assertion likely cannot fail -- confirm against the scipy
    # documentation before relying on it.
    self.assertTrue(stats.mstats.normaltest(output_value))

    # Normalized output is expected to have zero mean and unit std
    self.assertAlmostEqual(output_value.mean(), 0, places=3)
    self.assertAlmostEqual(output_value.std(), 1, places=3)
def test_batch_norm_between_layers(self):
    # Batch normalization placed between two layers must not
    # change the expected output shape of the network.
    layer_sequence = [
        layers.Input(10),
        layers.Relu(40),
        layers.BatchNorm(),
        layers.Relu(1),
    ]
    connection = surgery.sew_together(layer_sequence)
    connection.initialize()

    input_value = np.random.random((30, 10))
    output_value = connection.output(input_value).eval()

    self.assertEqual(output_value.shape, (30, 1))
def ResidualUnit(n_in_filters, n_out_filters, stride, has_branch=False):
    """
    Build a residual unit from a three-convolution main branch and
    a residual branch, combined with ``Elementwise`` and ``Relu``.

    Parameters
    ----------
    n_in_filters : int
        Number of filters in the first two convolutions of the
        main branch.

    n_out_filters : int
        Number of filters in the last convolution of the main
        branch and in the convolution of the residual branch.

    stride : int or tuple
        Stride for the first convolution of the main branch and
        for the convolution of the residual branch.

    has_branch : bool
        When ``True``, the residual branch applies convolution
        followed by batch normalization. Otherwise the residual
        branch is an empty list. Defaults to ``False``.

    Returns
    -------
    Result of ``layers.join`` applied to both branches and the
    elementwise + relu connection.
    """
    main_branch = layers.join(
        # 1x1 convolution, the only one in this branch with strides
        layers.Convolution((n_in_filters, 1, 1), stride=stride, bias=None),
        layers.BatchNorm(),
        layers.Relu(),

        # 3x3 convolution
        layers.Convolution((n_in_filters, 3, 3), padding=1, bias=None),
        layers.BatchNorm(),
        layers.Relu(),

        # 1x1 convolution that produces the output filters
        layers.Convolution((n_out_filters, 1, 1), bias=None),
        layers.BatchNorm(),
    )

    if has_branch:
        residual_branch = layers.join(
            layers.Convolution(
                (n_out_filters, 1, 1), stride=stride, bias=None),
            layers.BatchNorm(),
        )
    else:
        residual_branch = []

    branches = [main_branch, residual_branch]
    return layers.join(
        branches,
        layers.Elementwise() > layers.Relu(),
    )
def test_batch_norm_between_layers(self):
    # Batch normalization placed between two layers must not
    # change the expected output shape of the network.
    network = layers.join(
        layers.Input(10),
        layers.Relu(40),
        layers.BatchNorm(),
        layers.Relu(1),
    )

    random_batch = asfloat(np.random.random((30, 10)))
    input_value = tf.Variable(
        random_batch,
        name='input_value',
        dtype=tf.float32,
    )

    training_output = network.output(input_value, training=True)
    output_value = self.eval(training_output)

    self.assertEqual(output_value.shape, (30, 1))
def test_batch_norm_gamma_beta_params(self):
    # Output is expected to be shifted by beta (mean) and scaled
    # by gamma (standard deviation).
    default_beta = -3.14
    default_gamma = 4.3

    batch_norm = layers.BatchNorm(gamma=default_gamma, beta=default_beta)
    connection = layers.join(layers.Input(10), batch_norm)

    random_batch = asfloat(np.random.random((30, 10)))
    input_value = tf.Variable(
        random_batch,
        name='input_value',
        dtype=tf.float32,
    )
    output_value = self.eval(connection.output(input_value))

    self.assertAlmostEqual(output_value.mean(), default_beta, places=3)
    self.assertAlmostEqual(output_value.std(), default_gamma, places=3)
def test_batch_norm_in_non_training_state(self):
    batch_norm = layers.BatchNorm()
    layers.Input(10) > batch_norm

    random_batch = np.random.random((30, 10))
    input_value = theano.shared(value=random_batch)

    # Updates are expected to appear only after the output
    # has been requested.
    self.assertEqual(len(batch_norm.updates), 0)
    batch_norm.output(input_value)
    self.assertEqual(len(batch_norm.updates), 2)

    with batch_norm.disable_training_state():
        # Without training, the running mean and std are supposed
        # to be equal to 0 and 1 respectively.
        output_value = batch_norm.output(input_value).eval()

    np.testing.assert_array_almost_equal(
        input_value.get_value(), output_value)
def test_batch_norm_as_shared_variable(self):
    # Variables passed as parameters must be used by the layer
    # as they are (same objects).
    gamma_value = asfloat(np.ones(2))
    gamma = tf.Variable(gamma_value, name='gamma', dtype=tf.float32)

    beta_value = asfloat(2 * np.ones(2))
    beta = tf.Variable(beta_value, name='beta', dtype=tf.float32)

    batch_norm = layers.BatchNorm(gamma=gamma, beta=beta)
    layers.Input(10) > batch_norm

    self.assertIs(gamma, batch_norm.gamma)
    self.assertIs(beta, batch_norm.beta)
def test_batch_norm_as_shared_variable(self):
    # Variables passed as parameters must be used by the layer
    # as they are (same objects).
    gamma_value = asfloat(np.ones((1, 2)))
    gamma = tf.Variable(gamma_value, name='gamma', dtype=tf.float32)

    beta_value = asfloat(2 * np.ones((1, 2)))
    beta = tf.Variable(beta_value, name='beta', dtype=tf.float32)

    batch_norm = layers.BatchNorm(gamma=gamma, beta=beta)
    network = layers.join(layers.Input(2), batch_norm)

    # NOTE(review): attribute access is kept for its side effect;
    # presumably it forces creation of the layer's variables --
    # confirm against the ``outputs`` implementation.
    network.outputs

    self.assertIs(gamma, batch_norm.gamma)
    self.assertIs(beta, batch_norm.beta)
def test_batch_norm_in_non_training_state(self):
    batch_norm = layers.BatchNorm()
    layers.Input(10) > batch_norm

    random_batch = asfloat(np.random.random((30, 10)))
    input_value = tf.Variable(
        random_batch,
        name='input_value',
        dtype=tf.float32,
    )

    # Updates are expected to appear only after the output
    # has been requested.
    self.assertEqual(len(batch_norm.updates), 0)
    batch_norm.output(input_value)
    self.assertEqual(len(batch_norm.updates), 2)

    with batch_norm.disable_training_state():
        # Without training, the running mean and std are supposed
        # to be equal to 0 and 1 respectively.
        output_value = self.eval(batch_norm.output(input_value))

    np.testing.assert_array_almost_equal(
        self.eval(input_value), output_value, decimal=4)
def test_storage_save_dict(self):
    # Two parallel branches that get concatenated and passed
    # through the softmax layer (7 layers in total).
    network = layers.join(
        layers.parallel([
            layers.Input(2, name='input-1'),
            layers.PRelu(1, name='prelu')
        ], [
            layers.Input(1, name='input-2'),
            layers.Sigmoid(4, name='sigmoid'),
            layers.BatchNorm(name='batch-norm'),
        ]),
        layers.Concatenate(name='concatenate'),
        layers.Softmax(3, name='softmax'),
    )
    dict_network = storage.save_dict(network)

    # Top-level structure
    self.assertItemsEqual(
        ('metadata', 'layers', 'graph'),
        dict_network.keys())

    # Metadata structure
    self.assertItemsEqual(
        ('created', 'language', 'library', 'version'),
        dict_network['metadata'].keys())

    self.assertEqual(len(dict_network['layers']), 7)

    def describe(class_name, name, **configs):
        # Expected description of a saved layer, without the
        # ``parameters`` key. Layer's name is always a part of
        # its configs.
        configs['name'] = name
        return {
            'class_name': class_name,
            'configs': configs,
            'name': name,
        }

    expected_layers = [
        describe('Input', 'input-1', shape=(2, )),
        describe('PRelu', 'prelu', alpha_axes=(-1, ), n_units=1),
        describe('Input', 'input-2', shape=(1, )),
        describe('Sigmoid', 'sigmoid', n_units=4),
        describe(
            'BatchNorm', 'batch-norm',
            alpha=0.1, axes=(0, ), epsilon=1e-05),
        describe('Concatenate', 'concatenate', axis=-1),
        describe('Softmax', 'softmax', n_units=3),
    ]

    # Parameters are checked only for presence and excluded from
    # the comparison. Copy protects the saved network from changes.
    actual_layers = []
    for index, saved_layer in enumerate(dict_network['layers']):
        self.assertIn(
            'parameters', saved_layer, msg="Layer #" + str(index))

        without_parameters = copy.deepcopy(saved_layer)
        del without_parameters['parameters']
        actual_layers.append(without_parameters)

    self.assertEqual(actual_layers, expected_layers)
mean = x_train.mean(axis=(0, 2, 3)).reshape((1, -1, 1, 1)) std = x_train.std(axis=(0, 2, 3)).reshape((1, -1, 1, 1)) x_train -= mean x_train /= std x_test -= mean x_test /= std target_scaler = OneHotEncoder() y_train = target_scaler.fit_transform(y_train.reshape((-1, 1))).todense() y_test = target_scaler.transform(y_test.reshape((-1, 1))).todense() network = algorithms.Adadelta( [ layers.Input((3, 32, 32)), layers.Convolution((64, 3, 3)) > layers.BatchNorm() > layers.PRelu(), layers.Convolution((64, 3, 3)) > layers.BatchNorm() > layers.PRelu(), layers.MaxPooling((2, 2)), layers.Convolution((128, 3, 3)) > layers.BatchNorm() > layers.PRelu(), layers.Convolution((128, 3, 3)) > layers.BatchNorm() > layers.PRelu(), layers.MaxPooling((2, 2)), layers.Reshape(), layers.Linear(1024) > layers.BatchNorm() > layers.PRelu(), layers.Linear(1024) > layers.BatchNorm() > layers.PRelu(), layers.Softmax(10), ], error='categorical_crossentropy', step=0.25, shuffle_data=True, batch_size=128, verbose=True,
mean = x_train.mean(axis=(0, 2, 3)) std = x_train.std(axis=(0, 2, 3)) x_train -= mean x_train /= std x_test -= mean x_test /= std return x_train, x_test, y_train, y_test network = algorithms.Adadelta( [ layers.Input((1, 28, 28)), layers.Convolution((32, 3, 3)) > layers.BatchNorm() > layers.Relu(), layers.Convolution((48, 3, 3)) > layers.BatchNorm() > layers.Relu(), layers.MaxPooling((2, 2)), layers.Convolution((64, 3, 3)) > layers.BatchNorm() > layers.Relu(), layers.MaxPooling((2, 2)), layers.Reshape(), layers.Linear(1024) > layers.BatchNorm() > layers.Relu(), layers.Softmax(10), ], # Using categorical cross-entropy as a loss function error='categorical_crossentropy', # Min-batch size batch_size=128,
residual_branch = layers.join( layers.Convolution((n_out_filters, 1, 1), stride=stride, bias=None), layers.BatchNorm(), ) return layers.join( [main_branch, residual_branch], layers.Elementwise() > layers.Relu(), ) resnet50 = layers.join( layers.Input((3, 224, 224)), layers.Convolution((64, 7, 7), stride=2, padding=3), layers.BatchNorm(), layers.Relu(), layers.MaxPooling((3, 3), stride=(2, 2), ignore_border=False), ResidualUnit(64, 256, stride=1, has_branch=True), ResidualUnit(64, 256, stride=1), ResidualUnit(64, 256, stride=1), ResidualUnit(128, 512, stride=2, has_branch=True), ResidualUnit(128, 512, stride=1), ResidualUnit(128, 512, stride=1), ResidualUnit(128, 512, stride=1), ResidualUnit(256, 1024, stride=2, has_branch=True), ResidualUnit(256, 1024, stride=1), ResidualUnit(256, 1024, stride=1), ResidualUnit(256, 1024, stride=1), ResidualUnit(256, 1024, stride=1), ResidualUnit(256, 1024, stride=1),
def resnet50(input_shape=(224, 224, 3), include_global_pool=True,
             in_out_ratio=32):
    """
    ResNet50 network architecture with random parameters. Parameters
    can be loaded using ``neupy.storage`` module.

    ResNet50 has roughly 25.5 million parameters.

    Parameters
    ----------
    input_shape : tuple
        Network's input shape. Defaults to ``(224, 224, 3)``.

    include_global_pool : bool
        Specifies if returned output should include global pooling
        layer. Defaults to ``True``.

    in_out_ratio : {4, 8, 16, 32}
        Every layer that applies strides reduces height and width of
        every image. There are 5 of these layers in ResNet and at
        the end each dimension gets reduced by a factor of ``32``.
        For example, 224x224 image will be reduced to 7x7 image
        patches. This parameter specifies what level of reduction we
        want to obtain after we've propagated network through all
        the convolution layers. Defaults to ``32``.

    Raises
    ------
    ValueError
        When ``in_out_ratio`` is not one of the supported values.

    Notes
    -----
    Because of the global pooling layer, ResNet50 can be applied to
    the images with variable sizes. The only limitation is that image
    size should be bigger than 32x32, otherwise network won't be able
    to apply all transformations to the image.

    Examples
    --------
    ResNet-50 for ImageNet classification

    >>> from neupy import architectures, algorithms
    >>>
    >>> resnet = architectures.resnet50()
    >>> resnet
    (?, 224, 224, 3) -> [... 187 layers ...] -> (?, 1000)
    >>>
    >>> optimizer = algorithms.Momentum(resnet)

    ResNet-50 for custom classification task

    >>> from neupy import architectures
    >>> resnet = architectures.resnet50(include_global_pool=False)
    >>> resnet
    (?, 224, 224, 3) -> [... 185 layers ...] -> (?, 7, 7, 2048)
    >>>
    >>> from neupy.layers import *
    >>> resnet = resnet >> GlobalPooling('avg') >> Softmax(21)
    >>> resnet
    (?, 224, 224, 3) -> [... 187 layers ...] -> (?, 21)

    ResNet-50 for image segmentation

    >>> from neupy import architectures
    >>> resnet = architectures.resnet50(
    ...     include_global_pool=False,
    ...     in_out_ratio=8,
    ... )
    >>> resnet
    (?, 224, 224, 3) -> [... 185 layers ...] -> (?, 28, 28, 2048)

    See Also
    --------
    :architecture:`vgg16` : VGG16 network
    :architecture:`squeezenet` : SqueezeNet network
    :architecture:`resnet50` : ResNet-50 network

    References
    ----------
    Deep Residual Learning for Image Recognition.
    https://arxiv.org/abs/1512.03385
    """
    # Strides and dilation rates for the last three groups of
    # residual units. Every stride that gets replaced with 1
    # is compensated with larger dilation rates.
    in_out_configs = {
        4: {'strides': [1, 1, 1], 'rates': [2, 4, 8]},
        8: {'strides': [2, 1, 1], 'rates': [1, 2, 4]},
        16: {'strides': [2, 2, 1], 'rates': [1, 1, 2]},
        32: {'strides': [2, 2, 2], 'rates': [1, 1, 1]},
    }

    if in_out_ratio not in in_out_configs:
        # Sorted list gives stable and readable message, since
        # formatting ``dict.keys()`` directly produces
        # ``dict_keys([...])`` on Python 3.
        raise ValueError(
            "Expected one of the following in_out_ratio values: {}, got "
            "{} instead.".format(sorted(in_out_configs), in_out_ratio))

    strides = in_out_configs[in_out_ratio]['strides']
    rates = in_out_configs[in_out_ratio]['rates']

    resnet = layers.join(
        layers.Input(input_shape),

        # Convolutional layer reduces image's height and width by
        # a factor of 2 (because of the stride). For the default
        # input shape: from (224, 224, 3) to (112, 112, 64)
        layers.Convolution(
            (7, 7, 64), stride=2, bias=None,
            padding='same', name='conv1'
        ),
        layers.BatchNorm(name='bn_conv1'),
        layers.Relu(),

        # Stride equal to 2 reduces image size by a factor of two.
        # For the default input shape:
        # from (112, 112, 64) to (56, 56, 64)
        layers.MaxPooling((3, 3), stride=2, padding="same"),

        # The branch option applies extra convolution + batch
        # normalization transformations to the residual
        ResidualUnit(64, name='2a', has_branch=True),
        ResidualUnit(64, name='2b'),
        ResidualUnit(64, name='2c'),

        # When stride=2 reduces width and height by factor of 2
        ResidualUnit(128, stride=strides[0], name='3a', has_branch=True),
        ResidualUnit(128, rate=rates[0], name='3b'),
        ResidualUnit(128, rate=rates[0], name='3c'),
        ResidualUnit(128, rate=rates[0], name='3d'),

        # When stride=2 reduces width and height by factor of 2
        ResidualUnit(256, rate=rates[0], name='4a',
                     stride=strides[1], has_branch=True),
        ResidualUnit(256, rate=rates[1], name='4b'),
        ResidualUnit(256, rate=rates[1], name='4c'),
        ResidualUnit(256, rate=rates[1], name='4d'),
        ResidualUnit(256, rate=rates[1], name='4e'),
        ResidualUnit(256, rate=rates[1], name='4f'),

        # When stride=2 reduces width and height by factor of 2
        ResidualUnit(512, rate=rates[1], name='5a',
                     stride=strides[2], has_branch=True),
        ResidualUnit(512, rate=rates[2], name='5b'),
        ResidualUnit(512, rate=rates[2], name='5c'),
    )

    if include_global_pool:
        resnet = layers.join(
            resnet,

            # Since the final residual unit has 2048 output filters,
            # global pooling will replace every output image with
            # single average value. Despite input image size, output
            # from this layer always will be a vector with 2048 values.
            layers.GlobalPooling('avg'),
            layers.Softmax(1000, name='fc1000'),
        )

    return resnet
def ResidualUnit(n_input_filters, stride=1, rate=1, has_branch=False,
                 name=None):
    """
    Build a residual unit from a three-convolution main branch and
    a residual branch, combined with elementwise operation and relu.

    Parameters
    ----------
    n_input_filters : int
        Number of filters in the first two convolutions of the
        main branch. Number of output filters is 4 times larger.

    stride : int or tuple
        Stride for the first convolution of the main branch and
        for the convolution of the residual branch. Defaults
        to ``1``.

    rate : int
        Dilation for the 3x3 convolution. Defaults to ``1``.

    has_branch : bool
        When ``True``, the residual branch applies convolution
        followed by batch normalization. Otherwise the residual
        branch is an ``Identity`` layer. Defaults to ``False``.

    name : str
        Suffix that identifies the unit in the layer names.
        NOTE(review): the default ``None`` cannot be concatenated
        with strings below, so in practice the name looks
        required -- confirm with the callers.

    Returns
    -------
    Result of ``layers.join`` applied to both branches followed by
    the ``Elementwise`` and ``Relu`` layers.
    """
    def conv_name(index):
        return 'res' + name + '_branch' + index

    def bn_name(index):
        return 'bn' + name + '_branch' + index

    n_output_filters = 4 * n_input_filters

    main_layers = [
        # The main purpose of this 1x1 convolution layer is to
        # reduce number of filters. For instance, for the tensor
        # with 256 filters it can be reduced to 64. This trick
        # allows to reduce computation by factor of 4.
        layers.Convolution(
            size=(1, 1, n_input_filters),
            stride=stride,
            bias=None,
            name=conv_name('2a'),
        ),
        layers.BatchNorm(name=bn_name('2a')),
        layers.Relu(),

        # This convolution layer applies 3x3 filter in order to
        # extract features.
        layers.Convolution(
            (3, 3, n_input_filters),
            padding='same',
            dilation=rate,
            bias=None,
            name=conv_name('2b'),
        ),
        layers.BatchNorm(name=bn_name('2b')),
        layers.Relu(),

        # Last layer reverses operations of the first layer. In
        # this case we increase number of filters. For instance,
        # from previously obtained 64 filters we can increase it
        # back to the 256 filters.
        layers.Convolution(
            (1, 1, n_output_filters),
            bias=None,
            name=conv_name('2c')
        ),
        layers.BatchNorm(name=bn_name('2c')),
    ]
    main_branch = layers.join(*main_layers)

    if not has_branch:
        # Identity layer defines residual connection, meaning that
        # output from this branch would be equal to its input
        residual_branch = layers.Identity('residual-' + name)
    else:
        residual_branch = layers.join(
            layers.Convolution(
                (1, 1, n_output_filters),
                stride=stride,
                bias=None,
                name=conv_name('1'),
            ),
            layers.BatchNorm(name=bn_name('1')),
        )

    # For the output from two branches we just combine results
    # with simple elementwise operation. The main purpose of
    # the residual connection is to build shortcuts for the
    # gradient during backpropagation.
    branches = (main_branch | residual_branch)
    return layers.join(
        branches,
        layers.Elementwise(),
        layers.Relu(),
    )
def ConvReluBN(*conv_args, **conv_kwargs):
    """
    Join convolution, relu and batch normalization layers into
    a single block.

    All positional and keyword arguments are passed to the
    ``layers.Convolution`` without changes. Batch normalization
    is created with ``epsilon=0.001``.
    """
    convolution = layers.Convolution(*conv_args, **conv_kwargs)
    activation = layers.Relu()
    normalization = layers.BatchNorm(epsilon=0.001)
    return layers.join(convolution, activation, normalization)
def test_batchnorm_wrong_axes(self):
    # Axis ``1`` is specified twice
    duplicated_axes = (0, 1, 1)
    expected_message = "Specified axes have to contain only unique values"

    with self.assertRaisesRegexp(ValueError, expected_message):
        layers.BatchNorm(axes=duplicated_axes)