def mpusim_fully_connected(inputs, units, activation=None, use_bias=True, kernel_initializer=None, bias_initializer=tf.zeros_initializer(), kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, activations_datatype_size_byte=1, weights_datatype_size_byte=1, results_datatype_size_byte=4, systolic_array_height=256, systolic_array_width=256, activation_fifo_depth=8, accumulator_array_height=4096, log_file_output_dir='.', model_name='unnamed'): """ A wrapper around `mpusim_fc`. One difference to maintain backward-compatibility: Default weight initializer is variance_scaling_initializer(2.0). Variable Names: * ``W``: weights of shape [in_dim, out_dim] * ``b``: bias """ if kernel_initializer is None: if get_tf_version_tuple() <= (1, 12): kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0) # deprecated else: kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal') inputs = batch_flatten(inputs) with rename_get_variable({'kernel': 'W', 'bias': 'b'}): layer = mpusim_fc(units=units, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, activity_regularizer=activity_regularizer, activations_datatype_size_byte=activations_datatype_size_byte, weights_datatype_size_byte=weights_datatype_size_byte, results_datatype_size_byte=results_datatype_size_byte, systolic_array_height=systolic_array_height, systolic_array_width=systolic_array_width, activation_fifo_depth=activation_fifo_depth, accumulator_array_height=accumulator_array_height, log_file_output_dir=log_file_output_dir, model_name=model_name, _reuse=tf.get_variable_scope().reuse) ret = layer.apply(inputs, scope=tf.get_variable_scope()) ret = tf.identity(ret, name='output') ret.variables = VariableHolder(W=layer.kernel) if use_bias: ret.variables.b = layer.bias return ret
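# Minimal usage sketch for mpusim_fully_connected (hypothetical shapes and scope names, not part
# of the original code). Assumes TF1 graph mode and that the tensorpack-style helpers used above
# (batch_flatten, rename_get_variable, VariableHolder) are available in this module's context.
def _mpusim_fully_connected_example():
    x = tf.placeholder(tf.float32, [None, 7, 7, 64])   # flattened internally to [None, 3136]
    with tf.variable_scope('fc_example'):
        y = mpusim_fully_connected(
            x, units=1000, activation=tf.nn.relu,
            systolic_array_height=256, systolic_array_width=256,
            log_file_output_dir='.', model_name='fc_example')
    return y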
def FullyConnectedWithTrackedMults(x, out_dim, network_complexity=None,
                                   W_init=None, b_init=None,
                                   nl=tf.identity, use_bias=True):
    """
    Fully-Connected layer. Takes a N>1D tensor and returns a 2D tensor.
    It is an equivalent of `tf.layers.dense` except for naming conventions.

    Args:
        x (tf.Tensor): a tensor to be flattened except for the first dimension.
        out_dim (int): output dimension
        W_init: initializer for W. Defaults to `variance_scaling_initializer`.
        b_init: initializer for b. Defaults to zero.
        nl: a nonlinearity function
        use_bias (bool): whether to use bias.

    Returns:
        tf.Tensor: a NC tensor named ``output`` with attribute `variables`.

    Variable Names:

    * ``W``: weights of shape [in_dim, out_dim]
    * ``b``: bias
    """
    x = symbf.batch_flatten(x)
    if W_init is None:
        W_init = tf.contrib.layers.variance_scaling_initializer()
    if b_init is None:
        b_init = tf.constant_initializer()

    # if get_current_tower_context().is_main_training_tower:
    network_complexity['weights'] += out_dim * x.get_shape().as_list()[1]
    network_complexity['mults'] += out_dim * x.get_shape().as_list()[1]
    if use_bias:
        network_complexity['weights'] += out_dim

    W = tf.get_variable('W', (x.get_shape().as_list()[1], out_dim), initializer=W_init)
    if use_bias:
        b = tf.get_variable('b', out_dim, initializer=b_init)

    product = tf.matmul(x, W)

    ret = nl(tf.nn.bias_add(product, b) if use_bias else product, name='output')
    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
def Conv3D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1):
    """
    A wrapper around `tf.layers.Conv3D`. Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Accepts a 'split' argument for group conv (the group-conv path is not implemented yet).

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if split == 1:
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv3D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        # group conv is not implemented here; fail loudly instead of returning an undefined result
        raise NotImplementedError("[Conv3D] split > 1 (group conv) is not implemented")
    return ret
def GrConv2D(x, out_channel, kernel_shape,
             padding='SAME', stride=1, dilation_rate=1,
             W_init=None, b_init=None,
             nl=tf.identity, split=1, use_bias=True, data_format='channels_last'):
    if data_format == 'NHWC' or data_format == 'channels_last':
        data_format = 'channels_last'
    elif data_format == 'NCHW' or data_format == 'channels_first':
        data_format = 'channels_first'
    else:
        raise ValueError("[GrConv2D] Unknown data format: {}".format(data_format))

    in_shape = x.get_shape().as_list()
    channel_axis = 3 if data_format == 'channels_last' else 1
    in_channel = in_shape[channel_axis]
    assert in_channel is not None, "[GrConv2D] Input cannot have unknown channel!"
    assert in_channel % split == 0
    assert out_channel % split == 0

    kernel_shape = shape2d(kernel_shape)
    padding = padding.upper()
    filter_shape = kernel_shape + [in_channel // split, out_channel]
    stride = shape2d(stride)

    if W_init is None:
        W_init = tf.contrib.layers.variance_scaling_initializer()
    if b_init is None:
        b_init = tf.constant_initializer()

    with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
        layer = tf.layers.Conv2D(
            filters=out_channel,
            kernel_size=kernel_shape,
            strides=stride,
            padding=padding,
            data_format=data_format,
            dilation_rate=dilation_rate,
            activation=lambda x: nl(x, name='output'),
            use_bias=use_bias,
            kernel_initializer=W_init,
            bias_initializer=b_init,
            trainable=True)
        ret = layer.apply(x, scope=tf.get_variable_scope())

    ret.variables = VariableHolder(W=layer.kernel)
    if use_bias:
        ret.variables.b = layer.bias
    return ret
def Conv3DTranspose(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1, 1),
        padding='same',
        data_format='channels_last',
        activation=None,
        use_bias=False,
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None):
    """
    A wrapper around `tf.layers.Conv3DTranspose`. Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
        layer = tf.layers.Conv3DTranspose(
            filters,
            kernel_size,
            strides=strides,
            padding=padding,
            data_format=data_format,
            activation=activation,
            use_bias=use_bias,
            kernel_initializer=kernel_initializer,
            bias_initializer=bias_initializer,
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer,
            activity_regularizer=activity_regularizer)
        ret = layer.apply(inputs, scope=tf.get_variable_scope())

    ret.variables = VariableHolder(W=layer.kernel)
    if use_bias:
        ret.variables.b = layer.bias
    return tf.identity(ret, name='output')
def RescaleActivationLayer(inputs, decay=0.9, bit_a=8):
    in_shape = inputs.get_shape().as_list()
    moving_max = tf.get_variable('activation_max/EMA', [in_shape[-1]],
                                 initializer=tf.constant_initializer(), trainable=False)
    moving_min = tf.get_variable('activation_min/EMA', [in_shape[-1]],
                                 initializer=tf.constant_initializer(), trainable=False)
    named_inputs = tf.identity(inputs, name='rescaling_input_activation')
    # xn = (named_inputs - moving_min) / tf.pow(tf.constant(2.0), log2(moving_max) - tf.constant(float(bit_a)))
    # linearly rescale activations from [moving_min, moving_max] to [-0.5, 0.5]
    xn = (named_inputs - (moving_min + moving_max) / 2.0) / (moving_max - moving_min)
    named_xn = tf.identity(xn, name='rescaled_activation')
    # named_xn = tf.Print(named_xn, [named_xn])  # debug print, disabled

    ctx = get_current_tower_context()
    if ctx.is_main_training_tower:
        ret = update_ema(xn, moving_max, moving_min, decay)
    else:
        ret = tf.identity(xn, name='output')
    vh = ret.variables = VariableHolder(mean=moving_max, variance=moving_min)
    return ret
def Conv2DWithTrackedMults(x, out_channel, kernel_shape, network_complexity=None,
                           padding='SAME', stride=1,
                           W_init=None, b_init=None,
                           nl=tf.identity, split=1, use_bias=True, data_format='NHWC'):
    """
    2D convolution on 4D inputs.

    Args:
        x (tf.Tensor): a 4D tensor. Must have known number of channels, but can have other unknown dimensions.
        out_channel (int): number of output channel.
        kernel_shape: (h, w) tuple or a int.
        stride: (h, w) tuple or a int.
        padding (str): 'valid' or 'same'. Case insensitive.
        split (int): Split channels as used in Alexnet. Defaults to 1 (no split).
        W_init: initializer for W. Defaults to `variance_scaling_initializer`.
        b_init: initializer for b. Defaults to zero.
        nl: a nonlinearity function.
        use_bias (bool): whether to use bias.

    Returns:
        tf.Tensor named ``output`` with attribute `variables`.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    in_shape = x.get_shape().as_list()
    channel_axis = 3 if data_format == 'NHWC' else 1
    in_channel = in_shape[channel_axis]
    assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
    assert in_channel % split == 0
    assert out_channel % split == 0

    kernel_shape = shape2d(kernel_shape)
    padding = padding.upper()
    filter_shape = kernel_shape + [in_channel // split, out_channel]
    stride = shape4d(stride, data_format=data_format)

    if W_init is None:
        W_init = tf.contrib.layers.variance_scaling_initializer()
    if b_init is None:
        b_init = tf.constant_initializer()

    W = tf.get_variable('W', filter_shape, initializer=W_init)
    network_complexity['weights'] += filter_shape[0] * filter_shape[1] * filter_shape[2] * filter_shape[3]

    if use_bias:
        b = tf.get_variable('b', [out_channel], initializer=b_init)
        network_complexity['weights'] += out_channel

    assert split == 1
    # multiplications = output spatial positions * kernel volume;
    # stride[1] and stride[2] are the spatial strides for NHWC (shape4d yields [1, h, w, 1])
    xsh = x.get_shape().as_list()
    network_complexity['mults'] += xsh[1] * xsh[2] * filter_shape[0] * filter_shape[1] * \
        filter_shape[2] * filter_shape[3] // (stride[1] * stride[2])
    conv = tf.nn.conv2d(x, W, stride, padding, data_format=data_format)
    ret = nl(tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv, name='output')
    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
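# A quick worked example of the complexity bookkeeping above (hypothetical shapes, for
# illustration only): a 3x3 convolution from 16 to 32 channels over a 32x32 NHWC input
# with stride 1 adds
#     weights: 3 * 3 * 16 * 32           = 4608      (+32 if use_bias)
#     mults:   32 * 32 * 3 * 3 * 16 * 32 = 4718592
# to `network_complexity`, matching the formulas used in Conv2DWithTrackedMults.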
def Conv3D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1):
    """
    A wrapper around `tf.layers.Conv3D`. Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if split == 1:
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv3D(filters,
                                     kernel_size,
                                     strides=strides,
                                     padding=padding,
                                     data_format=data_format,
                                     dilation_rate=dilation_rate,
                                     activation=activation,
                                     use_bias=use_bias,
                                     kernel_initializer=kernel_initializer,
                                     bias_initializer=bias_initializer,
                                     kernel_regularizer=kernel_regularizer,
                                     bias_regularizer=bias_regularizer,
                                     activity_regularizer=activity_regularizer)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        # group conv implementation; mirrors the 2D group-conv code path but with 3D primitives.
        # Only the channels_last ('NDHWC') layout is handled here.
        data_format = get_data_format(data_format, tfmode=False)
        assert data_format == 'NHWC', "[Conv3D] group conv is only implemented for channels_last"
        in_shape = inputs.get_shape().as_list()
        channel_axis = 4  # NDHWC
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv3D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv now!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == (1, 1, 1) or get_tf_version_number() >= 1.5, \
            'TF>=1.5 required for group dilated conv'

        kernel_shape = [kernel_size] * 3 if isinstance(kernel_size, int) else list(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = [strides] * 3 if isinstance(strides, int) else list(strides)
        stride = [1] + stride + [1]

        kwargs = dict(data_format='NDHWC')
        if get_tf_version_number() >= 1.5:
            dilations = [dilation_rate] * 3 if isinstance(dilation_rate, int) else list(dilation_rate)
            kwargs['dilations'] = [1] + dilations + [1]

        W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        inputs = tf.split(inputs, split, channel_axis)
        # 3D filters are [d, h, w, in_channel // split, out_channel]; split along the output-channel axis
        kernels = tf.split(W, split, 4)
        outputs = [tf.nn.conv3d(i, k, stride, padding.upper(), **kwargs)
                   for i, k in zip(inputs, kernels)]
        conv = tf.concat(outputs, channel_axis)
        if activation is None:
            activation = tf.identity
        ret = activation(tf.nn.bias_add(conv, b, data_format='NHWC') if use_bias else conv,
                         name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
def mod_conv2d(x, y, fmaps, kernel, demodulate=True, gain=1, use_wscale=True, lrmul=1, fused_modconv=True, eps=1e-8, padding='SAME', name="mod_conv2d"): shape = x.get_shape().as_list() # [n, h, w, c] cin = shape[-1] with tf.variable_scope(name, reuse=tf.AUTO_REUSE): # Get weight w = get_weight([kernel, kernel, cin, fmaps], gain=gain, use_wscale=use_wscale, lrmul=lrmul, name='W') ww = w[tf.newaxis] # introduce minibatch dimension # Modulate s = get_bias( cin, base_std=0, use_wscale=use_wscale, lrmul=lrmul, name='bs') + 1 vh = VariableHolder(W=w, bs=s) s = tf.tile(s[tf.newaxis], [tf.shape(x)[0], 1]) # introduce minibatch dimension if y is not None: y_style, w_style = dense(y, cin, gain=gain, use_wscale=use_wscale, lrmul=lrmul) s = s + y_style vh.Ws = w_style ww = ww * tf.cast(s[:, tf.newaxis, tf.newaxis, :, tf.newaxis], w.dtype) # scale input feature maps # Demodulate if demodulate: d = tf.rsqrt( tf.reduce_sum(tf.square(ww), axis=[1, 2, 3], keepdims=True) + eps) # scaling factor ww = ww * d # Reshape/scale input if fused_modconv: x = tf.reshape(tf.transpose(x, [0, 3, 1, 2]), [1, -1, shape[1], shape[2]]) # [1, n*cin, h, w] w = tf.reshape(tf.transpose(ww, [1, 2, 3, 0, 4]), [kernel, kernel, cin, -1]) # [k, k, cin, n*cout] x = tf.nn.conv2d(x, tf.cast(w, x.dtype), data_format='NCHW', strides=[1, 1, 1, 1], padding=padding) out_shape = x.get_shape().as_list() x = tf.transpose( tf.reshape(x, [-1, fmaps, out_shape[2], out_shape[3]]), [0, 2, 3, 1]) else: x = x * tf.cast(s[:, tf.newaxis, tf.newaxis, :], x.dtype) x = tf.nn.conv2d(x, tf.cast(w, x.dtype), data_format='NHWC', strides=[1, 1, 1, 1], padding=padding) if demodulate: x = x * tf.cast(tf.reshape(d, [-1, 1, 1, fmaps]), x.dtype) ret = tf.identity(x) ret.variables = vh return ret
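# Minimal usage sketch for mod_conv2d (hypothetical shapes, not part of the original code).
# The style vector y modulates the per-input-channel scale of the kernel; with demodulate=True
# the kernel is rescaled so output feature maps keep roughly unit variance, in the spirit of
# StyleGAN2-style modulated convolution. Assumes TF1 graph mode and the get_weight/get_bias/dense
# helpers referenced above.
def _mod_conv2d_example():
    x = tf.placeholder(tf.float32, [None, 32, 32, 64])   # NHWC feature maps
    y = tf.placeholder(tf.float32, [None, 512])           # per-sample style codes
    return mod_conv2d(x, y, fmaps=128, kernel=3, demodulate=True, name='mod_conv_example')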
def BatchNorm_SplitGPU(x, use_local_stat=None, decay=0.9, epsilon=1e-5,
                       use_scale=True, use_bias=True,
                       gamma_init=tf.constant_initializer(1.0), data_format='NHWC',
                       internal_update=False, split_num=1):
    """
    BatchNorm whose training-time statistics are computed over `split_num` splits of the batch,
    simulating per-GPU batch statistics on a single device.
    """
    # print(split_num)  # debug
    if data_format == 'channels_last':
        data_format = 'NHWC'
    assert data_format == 'NHWC'
    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, use_scale, use_bias, gamma_init)

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)

    if use_local_stat:
        if ndims == 2:
            x = tf.reshape(x, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)

        inputs = tf.concat(tf.split(x, split_num, 0), -1)  # N/split_num x H x W x C*split_num
        beta_ = tf.reshape([beta] * split_num, [-1])
        gamma_ = tf.reshape([gamma] * split_num, [-1])
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            inputs, gamma_, beta_, epsilon=epsilon, is_training=True, data_format=data_format)
        xn = tf.concat(tf.split(xn, split_num, 3), 0)

        # Equivalent non-fused implementation:
        # inputs = tf.concat(tf.split(x, split_num, 0), -1)    # N/split_num x H x W x C*split_num
        # axis = [0, 1, 2]
        # batch_mean, batch_var = tf.nn.moments(inputs, axis)  # C*split_num
        # beta_ = tf.reshape([beta] * split_num, [-1])
        # gamma_ = tf.reshape([gamma] * split_num, [-1])
        # xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta_, gamma_, epsilon)
        # xn = tf.concat(tf.split(xn, split_num, 3), 0)

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_number() >= 1.4, \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
                x, gamma, beta,
                mean=moving_mean, variance=moving_var, epsilon=epsilon,
                data_format=data_format, is_training=False)
        else:
            if ndims == 4 and data_format == 'NCHW':
                [g, b, mm, mv] = [reshape_for_bn(_, ndims, n_out, data_format)
                                  for _ in [gamma, beta, moving_mean, moving_var]]
                xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
                    x, moving_mean, moving_var, beta, gamma, epsilon)
            batch_mean = tf.concat([moving_mean] * split_num, 0)
            batch_var = tf.concat([moving_var] * split_num, 0)

    # maintaining EMA only on one GPU is OK, even in replicated mode,
    # because training time doesn't use EMA
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean[:n_out], batch_var[:n_out],
                            moving_mean, moving_var, decay, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    assert batch_mean is not None, 'batch_mean outputs is None'
    return ret
def Conv(inputs, filters, kernel_size, strides=(1, 1), padding='same', data_format='channels_last',
         dilation_rate=(1, 1), activation=None, use_bias=True, kernel_initializer=None,
         bias_initializer=tf.zeros_initializer(), kernel_regularizer=None, bias_regularizer=None,
         activity_regularizer=None, split=1, norm=False):
    """
    Similar to `tf.layers.Conv2D`, but with some differences:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group convolution.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)  # deprecated
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if True:
        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = {"data_format": data_format}
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        # match the input dtype (e.g. tf.float16), since the default dtype of variables is tf.float32
        inputs_dtype = inputs.dtype
        W = tf.get_variable('parseweigth', filter_shape, dtype=inputs_dtype, initializer=kernel_initializer)

        if norm:
            use_bias = False
            W = tf.reshape(W, kernel_shape + [4, in_channel // 4, out_channel])
            W = tf.nn.softmax(W, 2)
            W = tf.reshape(W, filter_shape)

        # dynamics = tf.reduce_mean(inputs, 0)
        # dynamics = tf.transpose(dynamics, [1, 2, 0])
        # dynamics = tf.image.resize_images(dynamics, kernel_shape)
        # dynamics = tf.expand_dims(dynamics, -1)
        # W = W + 0.001 * dynamics  # tf.random_normal(shape=tf.shape(W), mean=0.0, stddev=0.012, dtype=tf.float32)
        # W = W * tf.random_uniform(shape=W.get_shape().as_list(), minval=0., maxval=2.)

        if use_bias:
            b = tf.get_variable('parsebias', [out_channel], dtype=inputs_dtype, initializer=bias_initializer)

        if split == 1:
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            try:
                conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
            except ValueError:
                log_once("CUDNN group convolution support is only available with "
                         "https://github.com/tensorflow/tensorflow/pull/25818 . "
                         "Will fall back to a loop-based slow implementation instead!", 'warn')
            if conv is None:
                # loop-based fallback for group convolution
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
def mpusim_depthwise_convolution2d(inputs,
                                   kernel_size,
                                   strides=(1, 1),
                                   padding='valid',
                                   depth_multiplier=1,
                                   data_format='channels_last',
                                   activation=None,
                                   use_bias=True,
                                   depthwise_initializer='glorot_uniform',
                                   bias_initializer='zeros',
                                   depthwise_regularizer=None,
                                   bias_regularizer=None,
                                   depthwise_constraint=None,
                                   bias_constraint=None,
                                   activations_datatype_size_byte=1,
                                   weights_datatype_size_byte=1,
                                   results_datatype_size_byte=4,
                                   systolic_array_height=256,
                                   systolic_array_width=256,
                                   activation_fifo_depth=8,
                                   accumulator_array_height=4096,
                                   log_file_output_dir='.',
                                   model_name='unnamed'):

    # depthwise_initializer = initializers.get(depthwise_initializer)
    # depthwise_regularizer = regularizers.get(depthwise_regularizer)
    # depthwise_constraint = constraints.get(depthwise_constraint)
    # bias_initializer = initializers.get(bias_initializer)

    data_format = get_data_format(data_format, keras_mode=False)
    input_shape = inputs.get_shape().as_list()
    strides = shape4d(strides, data_format=data_format)

    if len(input_shape) < 4:
        raise ValueError('Inputs to `mpusim_depthwise_conv2d` should have rank 4. '
                         'Received input shape:', str(input_shape))

    if data_format == 'NCHW':
        raise ValueError('mpusim_depthwise_convolution2d '
                         'only supports NHWC data format')
    else:
        channel_axis = 3

    if input_shape[channel_axis] is None:
        raise ValueError('The channel dimension of the inputs to '
                         '`mpusim_depthwise_convolution2d` '
                         'should be defined. Found `None`.')

    input_dim = int(input_shape[channel_axis])
    depthwise_kernel_shape = (kernel_size[0], kernel_size[1], input_dim, depth_multiplier)

    depthwise_kernel = tf.get_variable('W',
                                       shape=depthwise_kernel_shape,
                                       initializer=depthwise_initializer,
                                       regularizer=depthwise_regularizer,
                                       constraint=depthwise_constraint)

    if use_bias:
        biases = tf.get_variable('b',
                                 shape=(input_dim * depth_multiplier,),
                                 initializer=bias_initializer,
                                 regularizer=bias_regularizer,
                                 constraint=bias_constraint)

    result = mpusim_depthwise_conv2d(inputs,
                                     depthwise_kernel,
                                     strides=strides,
                                     padding=padding,
                                     data_format=data_format,
                                     activations_datatype_size_byte=activations_datatype_size_byte,
                                     weights_datatype_size_byte=weights_datatype_size_byte,
                                     results_datatype_size_byte=results_datatype_size_byte,
                                     systolic_array_height=systolic_array_height,
                                     systolic_array_width=systolic_array_width,
                                     activation_fifo_depth=activation_fifo_depth,
                                     accumulator_array_height=accumulator_array_height,
                                     log_file_output_dir=log_file_output_dir,
                                     model_name=model_name)

    if use_bias:
        result = tf.nn.bias_add(result, biases, data_format=data_format)

    if activation is not None:
        result = activation(result)

    result = tf.identity(result, name='output')
    result.variables = VariableHolder(W=depthwise_kernel)
    if use_bias:
        result.variables.b = biases
    return result
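# Minimal usage sketch for mpusim_depthwise_convolution2d (hypothetical shapes, not part of the
# original code): a depthwise 3x3 convolution whose computation is routed through the MPU
# simulator with a 256x256 systolic array. Assumes TF1 graph mode; explicit initializer objects
# are passed here instead of the string defaults.
def _mpusim_depthwise_example():
    x = tf.placeholder(tf.float32, [1, 56, 56, 32])   # NHWC feature maps
    with tf.variable_scope('depthwise_example'):
        return mpusim_depthwise_convolution2d(
            x, kernel_size=(3, 3), strides=(1, 1), padding='same',
            depth_multiplier=1,
            depthwise_initializer=tf.variance_scaling_initializer(),
            bias_initializer=tf.zeros_initializer(),
            systolic_array_height=256, systolic_array_width=256,
            log_file_output_dir='.', model_name='depthwise_example')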
def MaskedConv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        masking=False):
    """
    A wrapper around `tf.layers.Conv2D`. Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if (not masking) and (split == 1) and (dilation_rate == [1, 1]):
        # tf.layers.Conv2D has bugs with dilations (https://github.com/tensorflow/tensorflow/issues/26797)
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        if masking:
            assert split == 1, "Pruning group conv is not supported yet"

        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = dict(data_format=data_format)
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        if split == 1:
            if masking:
                W = pruning.apply_mask(W)
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            if get_tf_version_tuple() >= (1, 13):
                try:
                    conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
                except ValueError:
                    log_once("CUDNN group convolution support is only available with "
                             "https://github.com/tensorflow/tensorflow/pull/25818 . "
                             "Will fall back to a loop-based slow implementation instead!", 'warn')
            if conv is None:
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
def mpusim_conv2d(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        activations_datatype_size_byte=1,
        weights_datatype_size_byte=1,
        results_datatype_size_byte=4,
        systolic_array_height=256,
        systolic_array_width=256,
        activation_fifo_depth=8,
        accumulator_array_height=4096,
        log_file_output_dir='.',
        model_name='unnamed'):
    """
    Similar to `tf.layers.Conv2D`, but with some differences:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group convolution.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    # group conv implementation
    data_format = get_data_format(data_format, keras_mode=False)
    in_shape = inputs.get_shape().as_list()
    channel_axis = 3 if data_format == 'NHWC' else 1
    in_channel = in_shape[channel_axis]

    assert in_channel is not None, "[mpusim_conv2d] Input cannot have unknown channel!"
    assert in_channel % split == 0

    assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
        "Not supported by group conv or dilated conv!"

    out_channel = filters
    assert out_channel % split == 0
    assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

    kernel_shape = shape2d(kernel_size)
    filter_shape = kernel_shape + [in_channel // split, out_channel]
    stride = shape4d(strides, data_format=data_format)

    kwargs = dict(data_format=data_format)

    if get_tf_version_tuple() >= (1, 5):
        kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

    W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

    if use_bias:
        b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

    if split == 1:
        conv = mpu_sim_conv2d_lib.mpu_sim_conv2d(inputs,
                                                 W,
                                                 activations_datatype_size_byte,
                                                 weights_datatype_size_byte,
                                                 results_datatype_size_byte,
                                                 systolic_array_height,
                                                 systolic_array_width,
                                                 activation_fifo_depth,
                                                 accumulator_array_height,
                                                 log_file_output_dir,
                                                 model_name,
                                                 stride,
                                                 padding.upper(),
                                                 **kwargs)
    else:
        inputs = tf.split(inputs, split, channel_axis)
        kernels = tf.split(W, split, 3)
        outputs = [mpu_sim_conv2d_lib.mpu_sim_conv2d(input_block,
                                                     kernel_block,
                                                     activations_datatype_size_byte,
                                                     weights_datatype_size_byte,
                                                     results_datatype_size_byte,
                                                     systolic_array_height,
                                                     systolic_array_width,
                                                     activation_fifo_depth,
                                                     accumulator_array_height,
                                                     log_file_output_dir,
                                                     model_name,
                                                     stride,
                                                     padding.upper(),
                                                     **kwargs)
                   for input_block, kernel_block in zip(inputs, kernels)]
        conv = tf.concat(outputs, channel_axis)

    ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
    if activation is not None:
        ret = activation(ret)

    ret = tf.identity(ret, name='output')
    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
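# Minimal usage sketch for mpusim_conv2d (hypothetical shapes, not part of the original code):
# a grouped 3x3 convolution (split=2) whose compute is executed through mpu_sim_conv2d with a
# 256x256 systolic array. Assumes TF1 graph mode.
def _mpusim_conv2d_example():
    x = tf.placeholder(tf.float32, [1, 28, 28, 64])   # NHWC feature maps
    with tf.variable_scope('conv_example'):
        y = mpusim_conv2d(
            x, filters=128, kernel_size=3, strides=(1, 1), padding='same',
            split=2,                                    # group convolution with 2 groups
            systolic_array_height=256, systolic_array_width=256,
            log_file_output_dir='.', model_name='conv_example')
    return y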
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, internal_update=False): """ Mostly equivalent to `tf.layers.batch_normalization`, but different in the following: 1. Accepts `data_format` when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from `TowerContext`. 4. Support the `internal_update` option. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ # parse shapes shape = inputs.get_shape().as_list() ndims = len(shape) assert axis is not None # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_number() if not training and ctx.is_training: assert TF_version >= 1.4, \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable({ 'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA' }): if TF_version >= 1.5: layer = tf.layers.BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, virtual_batch_size=virtual_batch_size, fused=True, _reuse=tf.get_variable_scope().reuse) else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" layer = tf.layers.BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, fused=True, _reuse=tf.get_variable_scope().reuse) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. 
# because training time doesn't use EMA if ctx.is_main_training_tower: for v in layer.non_trainable_variables: add_model_variable(v) if not ctx.is_main_training_tower or internal_update: restore_collection(coll_bk) if training and internal_update: assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta return ret
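# Minimal usage sketch for the BatchNorm wrapper above (hypothetical shapes, not part of the
# original code). With internal_update=True the EMA update is attached via control dependencies,
# so the training loop does not need to run the ops collected in tf.GraphKeys.UPDATE_OPS.
# Assumes the call happens inside a TowerContext, as with the other layers in this module.
def _batchnorm_example(x):
    # x: a 4D NHWC tensor produced by a previous layer
    return BatchNorm(x, axis=3, momentum=0.9, epsilon=1e-5, internal_update=True)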
def AtrousConv2D(x, out_channel, kernel_shape, padding='SAME', rate=1, W_init=None, b_init=None, nl=tf.identity, use_bias=False, data_format='NHWC'): """ 2D AtrousConvolution on 4D inputs. Args: x (tf.Tensor): a 4D tensor. Must have known number of channels, but can have other unknown dimensions. out_channel (int): number of output channel. kernel_shape: (h, w) tuple or a int. stride: (h, w) tuple or a int. rate: A positive int32, In the literature, the same parameter is sometimes called input stride or dilation. padding (str): 'valid' or 'same'. Case insensitive. W_init: initializer for W. Defaults to `variance_scaling_initializer`. b_init: initializer for b. Defaults to zero. nl: a nonlinearity function. use_bias (bool): whether to use bias. Returns: tf.Tensor named ``output`` with attribute `variables`. Variable Names: * ``W``: weights * ``b``: bias """ in_shape = x.get_shape().as_list() channel_axis = 3 if data_format == 'NHWC' else 1 in_channel = in_shape[channel_axis] assert in_channel is not None, "[AtrousConv2D] Input cannot have unknown channel!" kernel_shape = shape2d(kernel_shape) padding = padding.upper() filter_shape = kernel_shape + [in_channel, out_channel] if W_init is None: W_init = tf.contrib.layers.variance_scaling_initializer() if b_init is None: b_init = tf.constant_initializer() W = tf.get_variable('W', filter_shape, initializer=W_init) if use_bias: b = tf.get_variable('b', [out_channel], initializer=b_init) conv = tf.nn.atrous_conv2d(x, W, rate, padding) ret = nl( tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv, name='output') ret.variables = VariableHolder(W=W) if use_bias: ret.variables.b = b return ret
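# Minimal usage sketch for AtrousConv2D (hypothetical shapes, not part of the original code):
# a 3x3 dilated convolution with rate 2, which enlarges the effective receptive field to 5x5
# without adding parameters. Assumes TF1 graph mode.
def _atrous_conv2d_example():
    x = tf.placeholder(tf.float32, [None, 65, 65, 256])   # NHWC feature maps
    with tf.variable_scope('atrous_example'):
        y = AtrousConv2D(x, out_channel=256, kernel_shape=3, rate=2, nl=tf.nn.relu)
    return y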