def recurrent_hypernetwork(T, batch_size):
    X = layers.variable('data')
    label = layers.variable('softmax_label')
    loss = 0
    parameters = ({
        'weight': None,
        'bias': None
    }, {
        'weight': None,
        'bias': None
    }, {
        'weight': None,
        'bias': None
    })
    KERNEL_SHAPES = ((3, 3, 3 * 16), ) + ((3, 3, 16 * 16), ) * 2
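    # 3x3 kernels: the first maps 3 input channels to 16 filters; the remaining two map 16 channels to 16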
    for time in range(T):
        network = _extract_representations(X, parameters, batch_size)
        prediction = layers.pooling(X=network,
                                    mode='average',
                                    global_pool=True,
                                    kernel_shape=(1, 1),
                                    stride=(1, 1),
                                    pad=(0, 0))
        prediction = layers.flatten(prediction)
        prediction = layers.fully_connected(X=prediction, n_hidden_units=10)
        loss += layers.softmax_loss(prediction=prediction, label=label)
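        # regenerate the convolution weights for the next time step from the current representation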
        for index, weight in enumerate(
                _generate_parameters(network, KERNEL_SHAPES)):
            parameters[index]['weight'] = weight
    return loss
Example #2
def _rnn_attention_module(network, settings):
    global _n_rnn_attention_module
    prefix = 'rnn_attention_module%d' % _n_rnn_attention_module

    n_filters = settings['convolution_settings']['n_filters']
    memory_settings = {'n_filters': n_filters}
    X_weight = layers.variable('%s_X_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    h_weight = layers.variable('%s_h_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    rnn_bias = layers.variable('%s_rnn_bias' % prefix,
                               shape=(1, 4 * n_filters, 1, 1))
    rnn_parameters = (X_weight, h_weight, rnn_bias)
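    # memory is a pair of states, initialized to zero, that _write updates and _read queries at every layer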
    memory = (0, 0)

    kwargs = {
        key: value
        for key, value in settings['convolution_settings'].items()
    }
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)
    for index in range(settings['n_layers']):
        memory = _write(network, memory_settings, rnn_parameters,
                        memory)  # dynamic period of memory writing
        network = _read(memory_settings, memory)
        network = _normalized_convolution(X=network, **kwargs)
        network = _normalized_convolution(X=network, **kwargs)

    _n_rnn_attention_module += 1
    return network
def _lstm_attention_module(network, settings):
    prefix = 'lstm_attention_module'

    n_filters = settings['convolution_settings']['n_filters']
    memory_settings = {'n_filters': n_filters}
    X_weight = layers.variable('%s_X_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    h_weight = layers.variable('%s_h_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    lstm_bias = layers.variable('%s_lstm_bias' % prefix,
                                shape=(1, 4 * n_filters, 1, 1))
    lstm_parameters = (X_weight, h_weight, lstm_bias)
    memory = 0, 0

    kwargs = {
        key: value
        for key, value in settings['convolution_settings'].items()
    }
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)

    for index in range(settings['n_layers']):
        memory = _write(network, memory_settings, lstm_parameters, memory)
        network = _read(memory_settings, memory)
        network = _normalized_convolution(X=network, **kwargs)

    return network
Example #4
def drelu(X, shape):
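  # dual ReLU: clamp X element-wise between a learnable lower and an upper bound,
  # broadcast per feature for 2-D inputs and per channel for 4-D inputs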
  _, input_shape, _ = X.infer_shape(**shape)
  input_shape = input_shape[0]
  if len(input_shape) == 2: bound_shape = input_shape[1:]
  elif len(input_shape) == 4: bound_shape = (1, input_shape[1], 1, 1)
  global _n_drelus
  lower = variable('drelu%d_lower_bound' % _n_drelus, shape=bound_shape)
  upper = variable('drelu%d_upper_bound' % _n_drelus, shape=bound_shape)
  _n_drelus += 1
  return broadcast_minimum(upper, broadcast_maximum(lower, X))
Example #5
def _break_graph(operations, name, data_shape):
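    # rebuild the graph operation by operation; once the symbol named `name` appears,
    # record it and feed a fresh input variable of the same shape to the following operation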
    network = layers.variable('data')
    for operation in operations:
        if network.name == name:
            replaced = network
            shape = output_shape(replaced, data=data_shape)
            network = operation(layers.variable('%s_data' % name, shape=shape))
        else:
            network = operation(network)
    return replaced, network
Example #6
def elman(X, D, cache):
  time = cache.setdefault('time', -1)
  cache['time'] += 1

  WX = cache.setdefault('WX', layers.variable('X_weight'))
  WH = cache.setdefault('WH', layers.variable('H_weight'))
  bias = cache.setdefault('bias', layers.variable('elman_bias', shape=(1, D)))

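  # Elman recurrence: h_t = tanh(W_X x_t + W_H h_{t-1} + bias); the hidden term is absent at the first step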
  network = _rnn_linearity(X, D, WX) + (_rnn_linearity(cache['h'], D, WH) if 'h' in cache else 0)
  network = layers.broadcast_plus(network, bias)
  cache['h'] = layers.tanh(network)

  return cache
Example #7
def dual_activation_network(n_layers):
  shared_weight = layers.variable('shared_weight')
  shared_bias = layers.variable('shared_bias')
  network = layers.variable('data')
  network = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
  for i in range(n_layers):
    private = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
    shared = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1), weight=shared_weight, bias=shared_bias)
    network = private + shared
  network = layers.pooling(X=network, mode='average', global_pool=True, kernel_shape=(1, 1), stride=(1, 1), pad=(1, 1))
  network = layers.flatten(network)
  network = layers.fully_connected(X=network, n_hidden_units=10, name='linear_transition')
  network = layers.softmax_loss(prediction=network, normalization='batch', id='softmax')
  return network
Example #8
def attended_memory_network(settings):
    network = layers.variable('data')
    for module_settings in settings:
        if module_settings['operator'] == 'attended_memory_module':
            network = _attended_memory_module(network,
                                              module_settings['settings'])
        else:
            args = module_settings.get('args', tuple())
            kwargs = {
                key: value
                for key, value in module_settings.get('kwargs', {}).items()
            }
            if args: args = (network, ) + args
            else: kwargs['X'] = network
            network = getattr(layers, module_settings['operator'])(*args,
                                                                   **kwargs)
    network = layers.pooling(X=network,
                             mode='average',
                             global_pool=True,
                             kernel_shape=(1, 1),
                             stride=(1, 1),
                             pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10)
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')
    return network
Example #9
def build_network(n_layers):
    network = layers.variable('data')
    network = _convolution(X=network, n_filters=16)

    convolution_settings = {'n_filters': None}
    settings = {
        'convolution_settings': convolution_settings,
        'n_layers': n_layers,
        'weight_sharing': False
    }

    for n_filters in (16, 32):
        convolution_settings['n_filters'] = n_filters
        network = _rnn_attention_module(network, settings)
        network = _transit(network, n_filters * 2)

    convolution_settings['n_filters'] = 64
    network = _rnn_attention_module(network, settings)

    network = layers.pooling(X=network,
                             mode='average',
                             kernel_shape=(8, 8),
                             stride=(1, 1),
                             pad=(0, 0))
    network = layers.flatten(network)
    network = layers.batch_normalization(network, fix_gamma=False)
    network = layers.fully_connected(X=network, n_hidden_units=10)
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')

    return network
Example #10
def nin(settings):
    network = layers.variable('data')
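    # Network-in-Network: three mlpconv blocks (a 3x3 convolution followed by two 1x1 convolutions),
    # separated by transitions and ending in global average pooling over the class feature maps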
    network = _activated_convolution(X=network,
                                     kernel_shape=(3, 3),
                                     n_filters=192,
                                     stride=(1, 1),
                                     pad=(1, 1))
    network = _activated_convolution(X=network,
                                     kernel_shape=(1, 1),
                                     n_filters=160,
                                     stride=(1, 1),
                                     pad=(0, 0))
    network = _activated_convolution(X=network,
                                     kernel_shape=(1, 1),
                                     n_filters=96,
                                     stride=(1, 1),
                                     pad=(0, 0))
    network = _transit(network, settings['transition_mode'])
    network = _activated_convolution(X=network,
                                     kernel_shape=(3, 3),
                                     n_filters=192,
                                     stride=(1, 1),
                                     pad=(1, 1))
    network = _activated_convolution(X=network,
                                     kernel_shape=(1, 1),
                                     n_filters=192,
                                     stride=(1, 1),
                                     pad=(0, 0))
    network = _activated_convolution(X=network,
                                     kernel_shape=(1, 1),
                                     n_filters=192,
                                     stride=(1, 1),
                                     pad=(0, 0))
    network = _transit(network, settings['transition_mode'])
    network = _activated_convolution(X=network,
                                     kernel_shape=(3, 3),
                                     n_filters=192,
                                     stride=(1, 1),
                                     pad=(1, 1))
    network = _activated_convolution(X=network,
                                     kernel_shape=(1, 1),
                                     n_filters=192,
                                     stride=(1, 1),
                                     pad=(0, 0))
    network = _activated_convolution(X=network,
                                     kernel_shape=(1, 1),
                                     n_filters=10,
                                     stride=(1, 1),
                                     pad=(0, 0))
    network = layers.pooling(X=network,
                             mode='average',
                             global_pool=True,
                             kernel_shape=(1, 1),
                             stride=(1, 1),
                             pad=(0, 0))
    network = layers.flatten(network)
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')
    return network
Example #11
def build_network(args):
    network = layers.variable('data')
    network = _convolution(X=network, n_filters=16)

    for n_filters in (16, 32):
        network = _module(network, n_filters, args.n_layers)
        network = _transit(network, n_filters * 2)


    # network = _module(network, 64, args.n_layers)
    _, rnn_cache = _traced_module(network, args.rnn, 64, args.n_layers)

    # network = layers.batch_normalization(network, fix_gamma=False)
    network = layers.batch_normalization(rnn_cache['h'],
                                         fix_gamma=False,
                                         id='BN')
    network = layers.ReLU(network)

    network = layers.pooling(X=network,
                             mode='average',
                             kernel_shape=(8, 8),
                             stride=(1, 1),
                             pad=(0, 0))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10, id='linear')
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')

    return network
Example #12
def _attended_memory_module(network, settings):
    kwargs = {
        key: value
        for key, value in settings['convolution_settings'].items()
    }
    global _n_attended_memory_module
    prefix = 'attended_memory_module%d' % _n_attended_memory_module
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)
    memory = [network]
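    # memory accumulates every layer's output; _read attends over it to form the next layer's input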
    for index in range(settings['n_layers']):
        kwargs['X'] = _read(memory, settings)
        network = _normalized_convolution(**kwargs)
        memory.append(network)
    _n_attended_memory_module += 1
    return network
Example #13
def elman(X, n_filters, cache):
    time = cache.setdefault('time', -1)
    cache['time'] += 1

    WX = cache.setdefault('WX', layers.variable('X_weight'))
    WH = cache.setdefault('WH', layers.variable('H_weight'))
    bias = cache.setdefault(
        'bias', layers.variable('elman_bias', shape=(1, n_filters, 1, 1)))

    network = _rnn_convolution(X, n_filters, WX) + \
      (_rnn_convolution(cache['h'], n_filters, WH) if 'h' in cache else 0)

    network = layers.broadcast_plus(network, bias)
    # network = layers.batch_normalization(network, fix_gamma=False, id='ElmanBN%d' % time)

    cache['h'] = layers.tanh(network)

    return cache
Example #14
def _constant_attention_module(network, settings):
  global _n_constant_attention_module
  prefix = 'constant_attention_module%d' % _n_constant_attention_module

  memory = 0

  kwargs = {key : value for key, value in settings['convolution_settings'].items()}
  if settings['weight_sharing']:
    kwargs['weight'] = layers.variable('%s_weight' % prefix)
    kwargs['bias'] = layers.variable('%s_bias' % prefix)
  for index in range(settings['n_layers']):
    memory = _write(network, memory) # dynamic period of memory writing
    network = _read(memory)
    network = _normalized_convolution(X=network, **kwargs)
    network = _normalized_convolution(X=network, **kwargs)

  _n_constant_attention_module += 1
  return network
Example #15
def build_resnet(args):
  network = layers.variable('data')
  network = _convolution(X=network, n_filters=16)

  for n_filters in (16, 32):
    network = _module(network, n_filters, args.n_layers)
    network = _transit(network, n_filters * 2)
  
  return _traced_module(network, 64, args.n_layers)
Example #16
def naive_network(n_layers, weight_sharing):
    network = layers.variable('data')
    network = _normalized_convolution(X=network,
                                      n_filters=8,
                                      kernel_shape=(5, 5),
                                      stride=(1, 1),
                                      pad=(2, 2))
    network = layers.pooling(X=network,
                             mode='maximum',
                             kernel_shape=(2, 2),
                             stride=(2, 2),
                             pad=(0, 0))
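    # with weight sharing, all stacked convolutions below reuse a single weight/bias pair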
    if weight_sharing:
        shared_weight = layers.variable('shared_weight')
        shared_bias = layers.variable('shared_bias')
    for index in range(n_layers):
        if weight_sharing:
            network = _normalized_convolution(X=network,
                                              n_filters=8,
                                              kernel_shape=(3, 3),
                                              stride=(1, 1),
                                              pad=(1, 1),
                                              weight=shared_weight,
                                              bias=shared_bias)
        else:
            network = _normalized_convolution(X=network,
                                              n_filters=16,
                                              kernel_shape=(3, 3),
                                              stride=(1, 1),
                                              pad=(1, 1))
    network = layers.pooling(X=network,
                             mode='average',
                             global_pool=True,
                             kernel_shape=(1, 1),
                             stride=(1, 1),
                             pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10)
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')
    return network
Example #17
def lstm(X, n_filters, cache):
    WX = cache.setdefault('WX', layers.variable('X_weight'))
    WH = cache.setdefault('WH', layers.variable('H_weight'))
    bias = cache.setdefault(
        'bias', layers.variable('lstm_bias', shape=(1, n_filters * 4, 1, 1)))

    network = _rnn_convolution(X, n_filters * 4, WX) + \
      (_rnn_convolution(cache['h'], n_filters * 4, WH) if 'h' in cache else 0)
    network = layers.broadcast_plus(network, bias)

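    # split the pre-activation along the channel axis into the four LSTM gates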
    group = layers.slice(X=network, axis=1, n_outputs=4)
    i = layers.sigmoid(group[0])
    f = layers.sigmoid(group[1])
    o = layers.sigmoid(group[2])
    g = layers.tanh(group[3])

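    # cell update: c_t = f * c_{t-1} + i * g; hidden state: h_t = o * tanh(c_t)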
    cache['c'] = f * cache.get('c', 0) + i * g
    cache['h'] = o * layers.tanh(cache['c'])

    return cache
Example #18
def lstm(X, D, cache):
  time = cache.setdefault('time', -1)
  cache['time'] += 1

  WX = cache.setdefault('WX', layers.variable('X_weight'))
  WH = cache.setdefault('WH', layers.variable('H_weight'))
  bias = cache.setdefault('bias', layers.variable('lstm_bias', shape=(1, D * 4)))

  network = _rnn_linearity(X, D * 4, WX) + (_rnn_linearity(cache['h'], D * 4, WH) if 'h' in cache else 0)
  network = layers.broadcast_plus(network, bias)

  group = layers.slice(X=network, axis=1, n_outputs=4)
  i = layers.sigmoid(group[0])
  f = layers.sigmoid(group[1])
  o = layers.sigmoid(group[2])
  g = layers.tanh(group[3])

  cache['c'] = f * cache.get('c', 0) + i * g
  cache['h'] = o * layers.tanh(cache['c'])
 
  return cache
Example #19
def build_rnn(args):
  rnn_cache = {}

  for i in range(args.n_layers):
    X = layers.variable('data%d' % i)
    rnn_cache = globals()[args.rnn](X, args.n_hidden_units, rnn_cache)

  network = layers.fully_connected(X=rnn_cache['h'], n_hidden_units=10, id='linear')
  loss = layers.linear_regression_loss(network, id='criterion')
  # network = layers.softmax_loss(prediction=network, normalization='batch', id='criterion')

  return network, loss
def _lstm_attention_module(network, settings):
  global n_modules
  prefix = 'lstm_attention_module%d' % n_modules
  n_modules += 1

  n_filters = settings['convolution_settings']['n_filters']
  X_weight = layers.variable('%s_X_weight' % prefix, shape=(4 * n_filters, n_filters, 1, 1))
  h_weight = layers.variable('%s_h_weight' % prefix, shape=(4 * n_filters, n_filters, 1, 1))
  lstm_bias = layers.variable('%s_lstm_bias' % prefix, shape=(1, 4 * n_filters, 1, 1))
  lstm_parameters = (X_weight, h_weight, lstm_bias)
  memory = 0, 0

  kwargs = {key : value for key, value in settings['convolution_settings'].items()}

  if settings['weight_sharing']:
    # TODO
    kwargs['weight'] = layers.variable('%s_weight' % prefix)
    shared_gamma = layers.variable('shared_gamma')
    shared_beta = layers.variable('shared_beta')

  for index in range(settings['n_layers']):
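    # each block sums a residual identity path, a two-convolution path, and an LSTM memory read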
    from_identity = network

    memory = _write(network, n_filters * 4, lstm_parameters, memory)
    from_lstm = _read(n_filters, memory)

    network = _normalized_convolution(network, **kwargs)
    network = _normalized_convolution(network, **kwargs)

    network += from_identity + from_lstm
    # network += from_lstm

  return network
Example #21
def dropping_out_mlp(settings):
    network = layers.variable('data')
    network = layers.flatten(network)
    layer_settings = settings['layer_settings']
    for index, layer_setting in enumerate(layer_settings):
        n_hidden_units = layer_setting['n_hidden_units']
        p = layer_setting['p']
        network = _fully_connected(network, n_hidden_units, p)
    network = layers.fully_connected(X=network,
                                     n_hidden_units=settings['n_classes'])
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')
    return network
Example #22
def unattentioned_network(times, function=average, n_classes=10):
  # TODO simplify network structure
  network = layers.variable('data')
  cache = []
  for time in range(times):
    network = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
    cache.append(network)
  network = layers.batch_normalization(function(cache))
  network = _normalized_convolution(network, (3, 3), 16, (2, 2), (1, 1))
  network = _normalized_convolution(network, (3, 3), 16, (2, 2), (1, 1))
  network = layers.pooling(X=network, mode='average', kernel_shape=(8, 8), stride=(1, 1), pad=(0, 0))
  network = layers.fully_connected(X=network, n_hidden_units=n_classes)
  network = layers.softmax_loss(network, normalization='batch')
  return network
Example #23
def dense_network(settings, n_classes=10):
    network = layers.variable('data')
    network = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
    for module_settings in settings:
        network = _dense_module(network, module_settings)
    network = layers.pooling(X=network,
                             mode='average',
                             kernel_shape=(1, 1),
                             stride=(1, 1),
                             pad=(0, 0),
                             global_pool=True)
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=n_classes)
    network = layers.softmax_loss(network, normalization='batch')
    return network
def element_wise_stochastic_pooling_mlp(settings):
    network = layers.variable('data')
    network = layers.flatten(network)
    layer_settings = settings['layer_settings']
    for index, layer_setting in enumerate(layer_settings):
        n_hidden_units = layer_setting['n_hidden_units']
        mode = layer_setting['pooling_mode']
        p = layer_setting['p']  # the probability of using long path
        network = _fully_connected(network, settings['batch_size'],
                                   n_hidden_units, mode, p)
    network = layers.fully_connected(X=network,
                                     n_hidden_units=settings['n_classes'])
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')
    return network
Example #25
def residual_network(procedures):
    network = layers.variable('data')
    for index, procedure in enumerate(procedures):
        transit, recur = procedure
        network = transit(network, index)
        network = recur(network, index)
    network = layers.pooling(X=network,
                             mode='average',
                             global_pool=True,
                             kernel_shape=(1, 1),
                             stride=(1, 1),
                             pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network,
                                     n_hidden_units=10,
                                     name='linear_transition')
    network = layers.softmax_loss(prediction=network,
                                  normalization='batch',
                                  id='softmax')
    return network
Example #26
from mx_initializer import PReLUInitializer
from mx_solver import MXSolver

from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument('--gpu_index', type=int, default=0)
parser.add_argument('--n_residual_layers', type=int, required=True)
parser.add_argument('--postfix', type=str, default='')
args = parser.parse_args()

# TODO calculate receptive field

_convolution = lambda X: layers.convolution(
    X=X, n_filters=16, kernel_shape=(5, 5), stride=(1, 1), pad=(2, 2))

network = layers.variable('data')
for index in range(3):
    network = _convolution(network)
    network = layers.ReLU(network)
    network = layers.pooling(X=network,
                             mode='maximum',
                             kernel_shape=(2, 2),
                             stride=(2, 2),
                             pad=(0, 0))

shared_weight = layers.variable('shared_weight')
shared_gamma = layers.variable('shared_gamma')
shared_beta = layers.variable('shared_beta')

_convolution = lambda X: layers.convolution(
    X=X,


from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument('--gpu_index', type=int, required=True)
parser.add_argument('--n_layers', type=int, required=True)
parser.add_argument('--postfix', type=str, required=True)
args = parser.parse_args()

network = layers.variable('data')
for index in range(3):
    network = _normalized_convolution(X=network,
                                      n_filters=16,
                                      kernel_shape=(5, 5),
                                      stride=(1, 1),
                                      pad=(2, 2))
    network = layers.pooling(X=network,
                             mode='maximum',
                             kernel_shape=(2, 2),
                             stride=(2, 2),
                             pad=(0, 0))

convolution_settings = {
    'n_filters': 16,
    'kernel_shape': (3, 3),
Example #28
_n_drelus = 0
def drelu(X, shape):
  _, input_shape, _ = X.infer_shape(**shape)
  input_shape = input_shape[0]
  if len(input_shape) == 2: bound_shape = input_shape[1:]
  elif len(input_shape) == 4: bound_shape = (1, input_shape[1], 1, 1)
  global _n_drelus
  lower = variable('drelu%d_lower_bound' % _n_drelus, shape=bound_shape)
  upper = variable('drelu%d_upper_bound' % _n_drelus, shape=bound_shape)
  _n_drelus += 1
  return broadcast_minimum(upper, broadcast_maximum(lower, X))

class DReLUInitializer(PReLUInitializer):
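  # fill the drelu bound parameters with the given constants; defer every other parameter to PReLUInitializer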
  def __init__(self, lower, upper):
    super(DReLUInitializer, self).__init__()
    self._lower, self._upper = lower, upper
  def __call__(self, identifier, array):
    if 'lower' in identifier: array[:] = self._lower
    elif 'upper' in identifier: array[:] = self._upper
    else: super(DReLUInitializer, self).__call__(identifier, array)

if __name__ == '__main__':
  # X_SHAPE = (10000, 3072)
  X_SHAPE = (10000, 3, 32, 32)
  X = variable('data')
  network = drelu(X, {'data' : X_SHAPE})
  args = network.list_arguments()
  arg_shapes, output_shapes, _ = network.infer_shape(data=X_SHAPE)
  print(dict(zip(args, arg_shapes)))
  print(output_shapes)
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument('--gpu_index', type=int)
parser.add_argument('--n_plain_layers', type=int)
parser.add_argument('--postfix', type=str)
configs = parser.parse_args()


def _normalized_convolution(**args):
    network = layers.convolution(**args)
    network = layers.batch_normalization(network)
    network = layers.ReLU(network)
    return network


network = layers.variable('data')
network = _normalized_convolution(X=network,
                                  n_filters=16,
                                  kernel_shape=(5, 5),
                                  stride=(1, 1),
                                  pad=(2, 2))
network = layers.pooling(X=network,
                         mode='maximum',
                         kernel_shape=(2, 2),
                         stride=(2, 2),
                         pad=(0, 0))

weight = layers.variable('shared_convolution_weight')
for index in range(configs.n_plain_layers):
    network = _normalized_convolution(X=network,
                                      n_filters=16,