def recurrent_hypernetwork(T, batch_size):
    X = layers.variable('data')
    label = layers.variable('softmax_label')
    loss = 0
    parameters = (
        {'weight': None, 'bias': None},
        {'weight': None, 'bias': None},
        {'weight': None, 'bias': None},
    )
    KERNEL_SHAPES = ((3, 3, 3 * 16),) + ((3, 3, 16 * 16),) * 2
    for time in range(T):
        network = _extract_representations(X, parameters, batch_size)
        prediction = layers.pooling(X=network, mode='average', global_pool=True,
                                    kernel_shape=(1, 1), stride=(1, 1), pad=(0, 0))
        prediction = layers.flatten(prediction)
        prediction = layers.fully_connected(X=prediction, n_hidden_units=10)
        loss += layers.softmax_loss(prediction=prediction, label=label)
        # representations at step t generate the kernels used at step t + 1
        for index, weight in enumerate(
                _generate_parameters(network, KERNEL_SHAPES)):
            parameters[index]['weight'] = weight
    return loss

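# Hedged sketch of the _generate_parameters helper assumed above (it is not
# defined in this file): recurrent_hypernetwork expects it to yield one weight
# tensor per entry of KERNEL_SHAPES. One plausible form regresses each kernel
# from a globally pooled summary of the current representation; reshaping the
# flat vector into an actual convolution kernel is left to
# _extract_representations, which is also defined elsewhere.
def _generate_parameters(network, kernel_shapes):
    summary = layers.pooling(X=network, mode='average', global_pool=True,
                             kernel_shape=(1, 1), stride=(1, 1), pad=(0, 0))
    summary = layers.flatten(summary)
    for height, width, n_connections in kernel_shapes:
        # e.g. (3, 3, 3 * 16) -> a flat vector of 3 * 3 * 48 units
        yield layers.fully_connected(
            X=summary, n_hidden_units=height * width * n_connections)
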
_n_rnn_attention_module = 0


def _rnn_attention_module(network, settings):
    global _n_rnn_attention_module
    prefix = 'rnn_attention_module%d' % _n_rnn_attention_module
    n_filters = settings['convolution_settings']['n_filters']
    memory_settings = {'n_filters': n_filters}
    X_weight = layers.variable('%s_X_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    h_weight = layers.variable('%s_h_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    rnn_bias = layers.variable('%s_rnn_bias' % prefix,
                               shape=(1, 4 * n_filters, 1, 1))
    rnn_parameters = (X_weight, h_weight, rnn_bias)
    memory = (0, 0)
    kwargs = {key: value
              for key, value in settings['convolution_settings'].items()}
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)
    for index in range(settings['n_layers']):
        memory = _write(network, memory_settings, rnn_parameters, memory)
        # dynamic period of memory writing
        network = _read(memory_settings, memory)
        network = _normalized_convolution(X=network, **kwargs)
        network = _normalized_convolution(X=network, **kwargs)
    _n_rnn_attention_module += 1
    return network

def _lstm_attention_module(network, settings):
    prefix = 'lstm_attention_module'
    n_filters = settings['convolution_settings']['n_filters']
    memory_settings = {'n_filters': n_filters}
    X_weight = layers.variable('%s_X_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    h_weight = layers.variable('%s_h_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    lstm_bias = layers.variable('%s_lstm_bias' % prefix,
                                shape=(1, 4 * n_filters, 1, 1))
    lstm_parameters = (X_weight, h_weight, lstm_bias)
    memory = (0, 0)
    kwargs = {key: value
              for key, value in settings['convolution_settings'].items()}
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)
    for index in range(settings['n_layers']):
        memory = _write(network, memory_settings, lstm_parameters, memory)
        network = _read(memory_settings, memory)
        network = _normalized_convolution(X=network, **kwargs)
    return network

_n_drelus = 0


def drelu(X, shape):
    global _n_drelus
    _, input_shape, _ = X.infer_shape(**shape)
    input_shape = input_shape[0]
    if len(input_shape) == 2:
        bound_shape = input_shape[1:]
    elif len(input_shape) == 4:
        bound_shape = (1, input_shape[1], 1, 1)
    else:
        raise ValueError('drelu expects a 2D or 4D input')
    lower = variable('drelu%d_lower_bound' % _n_drelus, shape=bound_shape)
    upper = variable('drelu%d_upper_bound' % _n_drelus, shape=bound_shape)
    _n_drelus += 1
    return broadcast_minimum(upper, broadcast_maximum(lower, X))

def _break_graph(operations, name, data_shape):
    network = layers.variable('data')
    replaced = None  # the symbol whose output gets detached
    for operation in operations:
        if network.name == name:
            replaced = network
            shape = output_shape(replaced, data=data_shape)
            network = operation(layers.variable('%s_data' % name, shape=shape))
        else:
            network = operation(network)
    return replaced, network

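# Usage sketch for _break_graph, assuming `operations` is a sequence of
# callables applied in order and that `name` matches the .name of some
# intermediate symbol; the 'cut_point' label and both lambdas are hypothetical.
ops = (
    lambda X: layers.convolution(X=X, n_filters=16, kernel_shape=(3, 3),
                                 stride=(1, 1), pad=(1, 1), name='cut_point'),
    lambda X: layers.ReLU(X),
)
# `replaced` is the symbol named 'cut_point'; `network` is the remainder of
# the graph, rebuilt on a fresh 'cut_point_data' variable of matching shape.
replaced, network = _break_graph(ops, 'cut_point', (1, 3, 32, 32))
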
def elman(X, D, cache):
    cache.setdefault('time', -1)
    cache['time'] += 1
    WX = cache.setdefault('WX', layers.variable('X_weight'))
    WH = cache.setdefault('WH', layers.variable('H_weight'))
    bias = cache.setdefault('bias',
                            layers.variable('elman_bias', shape=(1, D)))
    network = _rnn_linearity(X, D, WX) + \
        (_rnn_linearity(cache['h'], D, WH) if 'h' in cache else 0)
    network = layers.broadcast_plus(network, bias)
    cache['h'] = layers.tanh(network)
    return cache

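# Minimal usage sketch for the cell above: unrolling three timesteps. The cache
# threads the shared parameters ('WX', 'WH', 'bias') and the hidden state 'h'
# through time; the per-step 'data%d' variables and D = 64 are hypothetical.
cache = {}
for t in range(3):
    cache = elman(layers.variable('data%d' % t), 64, cache)
hidden = cache['h']  # hidden state after the final step
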
def dual_activation_network(n_layers):
    shared_weight = layers.variable('shared_weight')
    shared_bias = layers.variable('shared_bias')
    network = layers.variable('data')
    network = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
    for i in range(n_layers):
        private = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
        shared = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1),
                                         weight=shared_weight, bias=shared_bias)
        network = private + shared
    network = layers.pooling(X=network, mode='average', global_pool=True,
                             kernel_shape=(1, 1), stride=(1, 1), pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10,
                                     name='linear_transition')
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

def attended_memory_network(settings):
    network = layers.variable('data')
    for module_settings in settings:
        if module_settings['operator'] == 'attended_memory_module':
            network = _attended_memory_module(network,
                                              module_settings['settings'])
        else:
            args = module_settings.get('args', tuple())
            kwargs = {key: value
                      for key, value in module_settings.get('kwargs', {}).items()}
            if args:
                args = (network,) + args
            else:
                kwargs['X'] = network
            network = getattr(layers, module_settings['operator'])(*args, **kwargs)
    network = layers.pooling(X=network, mode='average', global_pool=True,
                             kernel_shape=(1, 1), stride=(1, 1), pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10)
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

def build_network(n_layers):
    network = layers.variable('data')
    network = _convolution(X=network, n_filters=16)
    convolution_settings = {'n_filters': None}
    settings = {
        'convolution_settings': convolution_settings,
        'n_layers': n_layers,
        'weight_sharing': False,
    }
    for n_filters in (16, 32):
        convolution_settings['n_filters'] = n_filters
        network = _rnn_attention_module(network, settings)
        network = _transit(network, n_filters * 2)
    convolution_settings['n_filters'] = 64
    network = _rnn_attention_module(network, settings)
    network = layers.pooling(X=network, mode='average', kernel_shape=(8, 8),
                             stride=(1, 1), pad=(0, 0))
    network = layers.flatten(network)
    network = layers.batch_normalization(network, fix_gamma=False)
    network = layers.fully_connected(X=network, n_hidden_units=10)
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

def nin(settings):
    network = layers.variable('data')
    network = _activated_convolution(X=network, kernel_shape=(3, 3),
                                     n_filters=192, stride=(1, 1), pad=(1, 1))
    network = _activated_convolution(X=network, kernel_shape=(1, 1),
                                     n_filters=160, stride=(1, 1), pad=(0, 0))
    network = _activated_convolution(X=network, kernel_shape=(1, 1),
                                     n_filters=96, stride=(1, 1), pad=(0, 0))
    network = _transit(network, settings['transition_mode'])
    network = _activated_convolution(X=network, kernel_shape=(3, 3),
                                     n_filters=192, stride=(1, 1), pad=(1, 1))
    network = _activated_convolution(X=network, kernel_shape=(1, 1),
                                     n_filters=192, stride=(1, 1), pad=(0, 0))
    network = _activated_convolution(X=network, kernel_shape=(1, 1),
                                     n_filters=192, stride=(1, 1), pad=(0, 0))
    network = _transit(network, settings['transition_mode'])
    network = _activated_convolution(X=network, kernel_shape=(3, 3),
                                     n_filters=192, stride=(1, 1), pad=(1, 1))
    network = _activated_convolution(X=network, kernel_shape=(1, 1),
                                     n_filters=192, stride=(1, 1), pad=(0, 0))
    network = _activated_convolution(X=network, kernel_shape=(1, 1),
                                     n_filters=10, stride=(1, 1), pad=(0, 0))
    network = layers.pooling(X=network, mode='average', global_pool=True,
                             kernel_shape=(1, 1), stride=(1, 1), pad=(0, 0))
    network = layers.flatten(network)
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

def build_network(args):
    network = layers.variable('data')
    network = _convolution(X=network, n_filters=16)
    for n_filters in (16, 32):
        network = _module(network, n_filters, args.n_layers)
        network = _transit(network, n_filters * 2)
    # network = _module(network, 64, args.n_layers)
    _, rnn_cache = _traced_module(network, args.rnn, 64, args.n_layers)
    # network = layers.batch_normalization(network, fix_gamma=False)
    network = layers.batch_normalization(rnn_cache['h'], fix_gamma=False, id='BN')
    network = layers.ReLU(network)
    # note: the pooling below reads rnn_cache['h'] directly, bypassing the
    # batch-normalized, rectified branch built on the two lines above
    network = layers.pooling(X=rnn_cache['h'], mode='average',
                             kernel_shape=(8, 8), stride=(1, 1), pad=(0, 0))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10, id='linear')
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

_n_attended_memory_module = 0


def _attended_memory_module(network, settings):
    kwargs = {key: value
              for key, value in settings['convolution_settings'].items()}
    global _n_attended_memory_module
    prefix = 'attended_memory_module%d' % _n_attended_memory_module
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)
    memory = [network]
    for index in range(settings['n_layers']):
        kwargs['X'] = _read(memory, settings)
        network = _normalized_convolution(**kwargs)
        memory.append(network)
    _n_attended_memory_module += 1
    return network

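# Hedged sketch of the _read helper this module assumes (it is defined
# elsewhere; note other snippets in this dump use _read with different
# signatures): `memory` is the list of every feature map produced so far, and
# the simplest read is a uniform average over it. A learned attention over the
# entries would slot into the same signature.
def _read(memory, settings):
    return sum(memory) / float(len(memory))
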
def elman(X, n_filters, cache):
    time = cache.setdefault('time', -1)
    cache['time'] += 1
    WX = cache.setdefault('WX', layers.variable('X_weight'))
    WH = cache.setdefault('WH', layers.variable('H_weight'))
    bias = cache.setdefault(
        'bias', layers.variable('elman_bias', shape=(1, n_filters, 1, 1)))
    network = _rnn_convolution(X, n_filters, WX) + \
        (_rnn_convolution(cache['h'], n_filters, WH) if 'h' in cache else 0)
    network = layers.broadcast_plus(network, bias)
    # network = layers.batch_normalization(network, fix_gamma=False,
    #                                      id='ElmanBN%d' % time)
    cache['h'] = layers.tanh(network)
    return cache

_n_constant_attention_module = 0


def _constant_attention_module(network, settings):
    global _n_constant_attention_module
    prefix = 'constant_attention_module%d' % _n_constant_attention_module
    memory = 0
    kwargs = {key: value
              for key, value in settings['convolution_settings'].items()}
    if settings['weight_sharing']:
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        kwargs['bias'] = layers.variable('%s_bias' % prefix)
    for index in range(settings['n_layers']):
        memory = _write(network, memory)
        # dynamic period of memory writing
        network = _read(memory)
        network = _normalized_convolution(X=network, **kwargs)
        network = _normalized_convolution(X=network, **kwargs)
    _n_constant_attention_module += 1
    return network

def build_resnet(args):
    network = layers.variable('data')
    network = _convolution(X=network, n_filters=16)
    for n_filters in (16, 32):
        network = _module(network, n_filters, args.n_layers)
        network = _transit(network, n_filters * 2)
    return _traced_module(network, 64, args.n_layers)

def naive_network(n_layers, weight_sharing):
    network = layers.variable('data')
    network = _normalized_convolution(X=network, n_filters=8,
                                      kernel_shape=(5, 5), stride=(1, 1),
                                      pad=(2, 2))
    network = layers.pooling(X=network, mode='maximum', kernel_shape=(2, 2),
                             stride=(2, 2), pad=(0, 0))
    if weight_sharing:
        shared_weight = layers.variable('shared_weight')
        shared_bias = layers.variable('shared_bias')
    for index in range(n_layers):
        if weight_sharing:
            network = _normalized_convolution(X=network, n_filters=8,
                                              kernel_shape=(3, 3),
                                              stride=(1, 1), pad=(1, 1),
                                              weight=shared_weight,
                                              bias=shared_bias)
        else:
            network = _normalized_convolution(X=network, n_filters=16,
                                              kernel_shape=(3, 3),
                                              stride=(1, 1), pad=(1, 1))
    network = layers.pooling(X=network, mode='average', global_pool=True,
                             kernel_shape=(1, 1), stride=(1, 1), pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10)
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

def lstm(X, n_filters, cache):
    WX = cache.setdefault('WX', layers.variable('X_weight'))
    WH = cache.setdefault('WH', layers.variable('H_weight'))
    bias = cache.setdefault(
        'bias', layers.variable('lstm_bias', shape=(1, n_filters * 4, 1, 1)))
    network = _rnn_convolution(X, n_filters * 4, WX) + \
        (_rnn_convolution(cache['h'], n_filters * 4, WH) if 'h' in cache else 0)
    network = layers.broadcast_plus(network, bias)
    group = layers.slice(X=network, axis=1, n_outputs=4)
    i = layers.sigmoid(group[0])  # input gate
    f = layers.sigmoid(group[1])  # forget gate
    o = layers.sigmoid(group[2])  # output gate
    g = layers.tanh(group[3])     # candidate cell state
    cache['c'] = f * cache.get('c', 0) + i * g
    cache['h'] = o * layers.tanh(cache['c'])
    return cache

def lstm(X, D, cache):
    cache.setdefault('time', -1)
    cache['time'] += 1
    WX = cache.setdefault('WX', layers.variable('X_weight'))
    WH = cache.setdefault('WH', layers.variable('H_weight'))
    bias = cache.setdefault('bias',
                            layers.variable('lstm_bias', shape=(1, D * 4)))
    network = _rnn_linearity(X, D * 4, WX) + \
        (_rnn_linearity(cache['h'], D * 4, WH) if 'h' in cache else 0)
    network = layers.broadcast_plus(network, bias)
    group = layers.slice(X=network, axis=1, n_outputs=4)
    i = layers.sigmoid(group[0])  # input gate
    f = layers.sigmoid(group[1])  # forget gate
    o = layers.sigmoid(group[2])  # output gate
    g = layers.tanh(group[3])     # candidate cell state
    cache['c'] = f * cache.get('c', 0) + i * g
    cache['h'] = o * layers.tanh(cache['c'])
    return cache

def build_rnn(args):
    rnn_cache = {}
    for i in range(args.n_layers):
        X = layers.variable('data%d' % i)
        rnn_cache = globals()[args.rnn](X, args.n_hidden_units, rnn_cache)
    network = layers.fully_connected(X=rnn_cache['h'], n_hidden_units=10,
                                     id='linear')
    loss = layers.linear_regression_loss(network, id='criterion')
    # network = layers.softmax_loss(prediction=network, normalization='batch', id='criterion')
    return network, loss

n_modules = 0


def _lstm_attention_module(network, settings):
    global n_modules
    prefix = 'lstm_attention_module%d' % n_modules
    n_modules += 1
    n_filters = settings['convolution_settings']['n_filters']
    X_weight = layers.variable('%s_X_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    h_weight = layers.variable('%s_h_weight' % prefix,
                               shape=(4 * n_filters, n_filters, 1, 1))
    lstm_bias = layers.variable('%s_lstm_bias' % prefix,
                                shape=(1, 4 * n_filters, 1, 1))
    lstm_parameters = (X_weight, h_weight, lstm_bias)
    memory = (0, 0)
    kwargs = {key: value
              for key, value in settings['convolution_settings'].items()}
    if settings['weight_sharing']:
        # TODO
        kwargs['weight'] = layers.variable('%s_weight' % prefix)
        shared_gamma = layers.variable('shared_gamma')
        shared_beta = layers.variable('shared_beta')
    for index in range(settings['n_layers']):
        from_identity = network
        memory = _write(network, n_filters * 4, lstm_parameters, memory)
        from_lstm = _read(n_filters, memory)
        network = _normalized_convolution(network, **kwargs)
        network = _normalized_convolution(network, **kwargs)
        network += from_identity + from_lstm
        # network += from_lstm
    return network

def dropping_out_mlp(settings):
    network = layers.variable('data')
    network = layers.flatten(network)
    layer_settings = settings['layer_settings']
    for index, layer_setting in enumerate(layer_settings):
        n_hidden_units = layer_setting['n_hidden_units']
        p = layer_setting['p']
        network = _fully_connected(network, n_hidden_units, p)
    network = layers.fully_connected(X=network,
                                     n_hidden_units=settings['n_classes'])
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

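# Hedged sketch of the _fully_connected helper assumed above (defined
# elsewhere): an affine layer, a nonlinearity, and dropout with probability p.
# layers.dropout and its signature are assumptions about the wrapper's API.
def _fully_connected(network, n_hidden_units, p):
    network = layers.fully_connected(X=network, n_hidden_units=n_hidden_units)
    network = layers.ReLU(network)
    return layers.dropout(network, p=p)
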
def unattentioned_network(times, function=average, n_classes=10):
    # TODO simplify network structure
    network = layers.variable('data')
    cache = []
    for time in range(times):
        network = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
        cache.append(network)
    network = layers.batch_normalization(function(cache))
    network = _normalized_convolution(network, (3, 3), 16, (2, 2), (1, 1))
    network = _normalized_convolution(network, (3, 3), 16, (2, 2), (1, 1))
    network = layers.pooling(X=network, mode='average', kernel_shape=(8, 8),
                             stride=(1, 1), pad=(0, 0))
    network = layers.fully_connected(X=network, n_hidden_units=n_classes)
    network = layers.softmax_loss(network, normalization='batch')
    return network

def dense_network(settings, n_classes=10):
    network = layers.variable('data')
    network = _normalized_convolution(network, (3, 3), 16, (1, 1), (1, 1))
    for module_settings in settings:
        network = _dense_module(network, module_settings)
    network = layers.pooling(X=network, mode='average', kernel_shape=(1, 1),
                             stride=(1, 1), pad=(0, 0), global_pool=True)
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=n_classes)
    network = layers.softmax_loss(network, normalization='batch')
    return network

def element_wise_stochastic_pooling_mlp(settings):
    network = layers.variable('data')
    network = layers.flatten(network)
    layer_settings = settings['layer_settings']
    for index, layer_setting in enumerate(layer_settings):
        n_hidden_units = layer_setting['n_hidden_units']
        mode = layer_setting['pooling_mode']
        p = layer_setting['p']  # the probability of using the long path
        network = _fully_connected(network, settings['batch_size'],
                                   n_hidden_units, mode, p)
    network = layers.fully_connected(X=network,
                                     n_hidden_units=settings['n_classes'])
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

def residual_network(procedures):
    network = layers.variable('data')
    for index, procedure in enumerate(procedures):
        transit, recur = procedure
        network = transit(network, index)
        network = recur(network, index)
    network = layers.pooling(X=network, mode='average', global_pool=True,
                             kernel_shape=(1, 1), stride=(1, 1), pad=(1, 1))
    network = layers.flatten(network)
    network = layers.fully_connected(X=network, n_hidden_units=10,
                                     name='linear_transition')
    network = layers.softmax_loss(prediction=network, normalization='batch',
                                  id='softmax')
    return network

from argparse import ArgumentParser

from mx_initializer import PReLUInitializer
from mx_solver import MXSolver

parser = ArgumentParser()
parser.add_argument('--gpu_index', type=int, default=0)
parser.add_argument('--n_residual_layers', type=int, required=True)
parser.add_argument('--postfix', type=str, default='')
args = parser.parse_args()

# TODO calculate receptive field

_convolution = lambda X: layers.convolution(
    X=X, n_filters=16, kernel_shape=(5, 5), stride=(1, 1), pad=(2, 2))

network = layers.variable('data')
for index in range(3):
    network = _convolution(network)
    network = layers.ReLU(network)
network = layers.pooling(X=network, mode='maximum', kernel_shape=(2, 2),
                         stride=(2, 2), pad=(0, 0))

shared_weight = layers.variable('shared_weight')
shared_gamma = layers.variable('shared_gamma')
shared_beta = layers.variable('shared_beta')

_convolution = lambda X: layers.convolution(
    X=X,

    for index in range(settings['n_layers']):
        memory = _write(network, memory_settings, lstm_parameters, memory)
        network = _read(memory_settings, memory)
        network = _normalized_convolution(X=network, **kwargs)
    return network


from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('--gpu_index', type=int, required=True)
parser.add_argument('--n_layers', type=int, required=True)
parser.add_argument('--postfix', type=str, required=True)
args = parser.parse_args()

network = layers.variable('data')
for index in range(3):
    network = _normalized_convolution(X=network, n_filters=16,
                                      kernel_shape=(5, 5), stride=(1, 1),
                                      pad=(2, 2))
network = layers.pooling(X=network, mode='maximum', kernel_shape=(2, 2),
                         stride=(2, 2), pad=(0, 0))

convolution_settings = {
    'n_filters': 16,
    'kernel_shape': (3, 3),

_n_drelus = 0


def drelu(X, shape):
    global _n_drelus
    _, input_shape, _ = X.infer_shape(**shape)
    input_shape = input_shape[0]
    if len(input_shape) == 2:
        bound_shape = input_shape[1:]
    elif len(input_shape) == 4:
        bound_shape = (1, input_shape[1], 1, 1)
    else:
        raise ValueError('drelu expects a 2D or 4D input')
    lower = variable('drelu%d_lower_bound' % _n_drelus, shape=bound_shape)
    upper = variable('drelu%d_upper_bound' % _n_drelus, shape=bound_shape)
    _n_drelus += 1
    return broadcast_minimum(upper, broadcast_maximum(lower, X))


class DReLUInitializer(PReLUInitializer):
    def __init__(self, lower, upper):
        super(DReLUInitializer, self).__init__()
        self._lower, self._upper = lower, upper

    def __call__(self, identifier, array):
        if 'lower' in identifier:
            array[:] = self._lower
        elif 'upper' in identifier:
            array[:] = self._upper
        else:
            super(DReLUInitializer, self).__call__(identifier, array)


if __name__ == '__main__':
    # X_SHAPE = (10000, 3072)
    X_SHAPE = (10000, 3, 32, 32)
    X = variable('data')
    network = drelu(X, {'data': X_SHAPE})
    args = network.list_arguments()
    arg_shapes, output_shapes, _ = network.infer_shape(data=X_SHAPE)
    print(dict(zip(args, arg_shapes)))
    print(output_shapes)

parser = ArgumentParser()
parser.add_argument('--gpu_index', type=int)
parser.add_argument('--n_plain_layers', type=int)
parser.add_argument('--postfix', type=str)
configs = parser.parse_args()


def _normalized_convolution(**kwargs):
    network = layers.convolution(**kwargs)
    network = layers.batch_normalization(network)
    network = layers.ReLU(network)
    return network


network = layers.variable('data')
network = _normalized_convolution(X=network, n_filters=16,
                                  kernel_shape=(5, 5), stride=(1, 1),
                                  pad=(2, 2))
network = layers.pooling(X=network, mode='maximum', kernel_shape=(2, 2),
                         stride=(2, 2), pad=(0, 0))

weight = layers.variable('shared_convolution_weight')
for index in range(configs.n_plain_layers):
    network = _normalized_convolution(X=network, n_filters=16,