def convolution(inputs,
                num_outputs,
                kernel_size,
                stride=1,
                padding='SAME',
                data_format=None,
                rate=1,
                activation_fn=nn.relu,
                normalizer_fn=None,
                normalizer_params=None,
                weights_initializer=initializers.xavier_initializer(),
                weights_regularizer=None,
                biases_initializer=init_ops.zeros_initializer(),
                biases_regularizer=None,
                reuse=None,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
                use_spectral_norm=False,
                is_training=False,
                scope=None,
                conv_dims=None):
  """Adds an N-D convolution followed by an optional batch_norm layer.

  It is required that 1 <= N <= 3.

  `convolution` creates a variable called `weights`, representing the
  convolutional kernel, that is convolved (actually cross-correlated) with the
  `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is
  provided (such as `batch_norm`), it is then applied. Otherwise, if
  `normalizer_fn` is None and a `biases_initializer` is provided then a
  `biases` variable is created and added to the activations. Finally, if
  `activation_fn` is not `None`, it is applied to the activations as well.

  Performs atrous convolution with input stride/dilation rate equal to `rate`
  if a value > 1 for any dimension of `rate` is specified. In this case
  `stride` values != 1 are not supported.

  Args:
    inputs: A Tensor of rank N+2 of shape
      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
      not start with "NC" (default), or
      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
      with "NC".
    num_outputs: Integer, the number of output filters.
    kernel_size: A sequence of N positive integers specifying the spatial
      dimensions of the filters. Can be a single integer to specify the same
      value for all spatial dimensions.
    stride: A sequence of N positive integers specifying the stride at which
      to compute output. Can be a single integer to specify the same value for
      all spatial dimensions. Specifying any `stride` value != 1 is
      incompatible with specifying any `rate` value != 1.
    padding: One of `"VALID"` or `"SAME"`.
    data_format: A string or None. Specifies whether the channel dimension of
      the `input` and output is the last dimension (default, or if
      `data_format` does not start with "NC"), or the second dimension (if
      `data_format` starts with "NC"). For N=1, the valid values are "NWC"
      (default) and "NCW". For N=2, the valid values are "NHWC" (default) and
      "NCHW". For N=3, the valid values are "NDHWC" (default) and "NCDHW".
    rate: A sequence of N positive integers specifying the dilation rate to
      use for atrous convolution. Can be a single integer to specify the same
      value for all spatial dimensions. Specifying any `rate` value != 1 is
      incompatible with specifying any `stride` value != 1.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      Default is None, for no normalizer function.
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables
      or a dictionary containing a different list of collections per variable.
    outputs_collections: Collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    use_spectral_norm: Whether to apply spectral normalization to the
      convolution kernel (forwarded to the underlying layer class).
    is_training: Whether the layer is being built in training mode (forwarded
      to the underlying layer class).
    scope: Optional scope for `variable_scope`.
    conv_dims: Optional convolution dimensionality. When set, the
      corresponding convolution is used (e.g. 2 for Conv 2D, 3 for Conv 3D,
      ...). When left as None, the dimensionality is selected from the input
      rank (i.e. Conv ND, with N = input_rank - 2).

  Returns:
    A tensor representing the output of the operation.

  Raises:
    ValueError: If `data_format` is invalid.
    ValueError: Both `rate` and `stride` are not uniformly 1.
  """
  if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
    raise ValueError('Invalid data_format: %r' % (data_format,))

  layer_variable_getter = _build_variable_getter({'bias': 'biases',
                                                  'kernel': 'weights'})

  with variable_scope.variable_scope(
      scope, 'Conv', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    if conv_dims is not None and conv_dims + 2 != input_rank:
      raise ValueError('Convolution expects input with rank %d, got %d' %
                       (conv_dims + 2, input_rank))
    if input_rank == 3:
      layer_class = convolutional_layers.Convolution1D
    elif input_rank == 4:
      layer_class = MyConv2D
    elif input_rank == 5:
      layer_class = convolutional_layers.Convolution3D
    else:
      raise ValueError('Convolution not supported for input with rank',
                       input_rank)

    df = ('channels_first'
          if data_format and data_format.startswith('NC') else 'channels_last')
    layer = layer_class(
        filters=num_outputs,
        kernel_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=df,
        dilation_rate=rate,
        activation=None,
        use_bias=not normalizer_fn and biases_initializer,
        kernel_initializer=weights_initializer,
        bias_initializer=biases_initializer,
        kernel_regularizer=weights_regularizer,
        bias_regularizer=biases_regularizer,
        activity_regularizer=None,
        use_spectral_norm=use_spectral_norm,
        is_training=is_training,
        trainable=trainable,
        name=sc.name,
        dtype=inputs.dtype.base_dtype,
        _scope=sc,
        _reuse=reuse)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    _add_variable_to_collections(layer.kernel, variables_collections,
                                 'weights')
    if layer.use_bias:
      _add_variable_to_collections(layer.bias, variables_collections, 'biases')

    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
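
# Illustrative usage sketch (not part of the original module): a minimal call
# of the `convolution` wrapper above. Shapes and flag values are arbitrary; it
# assumes TF 1.x graph mode and that `MyConv2D` accepts the spectral-norm
# arguments exactly as they are forwarded in the call above.
def _example_convolution_usage():
  import tensorflow as tf
  images = tf.placeholder(tf.float32, [None, 64, 64, 3])  # NHWC input
  return convolution(images, num_outputs=32, kernel_size=3, stride=2,
                     use_spectral_norm=True, is_training=True,
                     scope='conv1')  # ReLU is applied by default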
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    biases_regularizer=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None,
                    quantizer=None,
                    weight_quantizer=None):
  """Quantized variant of the slim `fully_connected` layer.

  Uses `QDense` instead of the core `Dense` layer. If a `quantizer` is given,
  the outputs are quantized after the optional normalizer and again after the
  optional activation; `weight_quantizer` quantizes the kernel inside `QDense`.
  """
  if not isinstance(num_outputs, six.integer_types):
    raise ValueError('num_outputs should be int or long, got %s.' %
                     (num_outputs,))

  layer_variable_getter = layers._build_variable_getter({
      'bias': 'biases',
      'kernel': 'weights'
  })

  with variable_scope.variable_scope(
      scope, 'fully_connected', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    layer = QDense(units=num_outputs,
                   activation=None,
                   use_bias=not normalizer_fn and biases_initializer,
                   kernel_initializer=weights_initializer,
                   bias_initializer=biases_initializer,
                   kernel_regularizer=weights_regularizer,
                   bias_regularizer=biases_regularizer,
                   activity_regularizer=None,
                   trainable=trainable,
                   name=sc.name,
                   dtype=inputs.dtype.base_dtype,
                   _scope=sc,
                   _reuse=reuse,
                   quantizer=quantizer,
                   weight_quantizer=weight_quantizer)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    layers._add_variable_to_collections(layer.kernel, variables_collections,
                                        'weights')
    if layer.bias is not None:
      layers._add_variable_to_collections(layer.bias, variables_collections,
                                          'biases')

    # Apply normalizer function / layer.
    if normalizer_fn is not None:
      if not normalizer_params:
        normalizer_params = {}
      outputs = normalizer_fn(outputs, **normalizer_params)
    if quantizer is not None:  # quantize after normalization
      outputs = quantizer.quantize(outputs)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    if quantizer is not None:  # quantize after activation
      outputs = quantizer.quantize(outputs)

    return slim_utils.collect_named_outputs(outputs_collections,
                                            sc.original_name_scope, outputs)
def conv2d(inputs,
           num_outputs,
           kernel_size,
           stride=1,
           padding='SAME',
           data_format=None,
           rate=1,
           activation_fn=nn.relu,
           normalizer_fn=None,
           normalizer_params=None,
           weights_initializer=initializers.xavier_initializer(),
           weights_regularizer=None,
           biases_initializer=init_ops.zeros_initializer(),
           biases_regularizer=None,
           reuse=None,
           variables_collections=None,
           outputs_collections=None,
           trainable=True,
           scope=None,
           quantizer=None,
           weight_quantizer=None):
  """Quantized 2-D convolution, adapted from the slim `convolution` layer.

  Uses `QConv2D` instead of the core convolution layer and, if a `quantizer`
  is given, quantizes the outputs after normalization and after the activation.
  """
  if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
    raise ValueError('Invalid data_format: %r' % (data_format,))

  layer_variable_getter = layers._build_variable_getter({
      'bias': 'biases',
      'kernel': 'weights'
  })

  with variable_scope.variable_scope(
      scope, 'Conv', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    if input_rank == 4:
      layer_class = QConv2D  # convolutional.Conv2D
    else:
      raise ValueError('Convolution not supported for input with rank',
                       input_rank)

    df = ('channels_first'
          if data_format and data_format.startswith('NC') else 'channels_last')
    layer = layer_class(filters=num_outputs,
                        kernel_size=kernel_size,
                        strides=stride,
                        padding=padding,
                        data_format=df,
                        dilation_rate=rate,
                        activation=None,
                        use_bias=not normalizer_fn and biases_initializer,
                        kernel_initializer=weights_initializer,
                        bias_initializer=biases_initializer,
                        kernel_regularizer=weights_regularizer,
                        bias_regularizer=biases_regularizer,
                        activity_regularizer=None,
                        trainable=trainable,
                        name=sc.name,
                        dtype=inputs.dtype.base_dtype,
                        _scope=sc,
                        _reuse=reuse,
                        quantizer=quantizer,
                        weight_quantizer=weight_quantizer)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    layers._add_variable_to_collections(layer.kernel, variables_collections,
                                        'weights')
    if layer.use_bias:
      layers._add_variable_to_collections(layer.bias, variables_collections,
                                          'biases')

    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)
    if quantizer is not None:  # quantize after normalization
      outputs = quantizer.quantize(outputs)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    if quantizer is not None:  # quantize after activation
      outputs = quantizer.quantize(outputs)

    return slim_utils.collect_named_outputs(outputs_collections,
                                            sc.original_name_scope, outputs)
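
# Illustrative usage sketch (not part of the original module). `my_quantizer`
# stands in for whatever quantizer object the caller provides; the wrapper
# above only requires it to expose a `quantize(tensor)` method.
def _example_quantized_conv2d_usage(images, my_quantizer):
  return conv2d(images, num_outputs=64, kernel_size=3, stride=1,
                quantizer=my_quantizer,         # quantizes the activations
                weight_quantizer=my_quantizer,  # quantizes the kernel in QConv2D
                scope='qconv1')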
def dau_conv1d(inputs,
               filters,
               dau_units,
               max_kernel_size,
               stride=1,
               mu_learning_rate_factor=500,
               data_format=None,
               activation_fn=nn.relu,
               normalizer_fn=None,
               normalizer_params=None,
               weights_initializer=init_ops.random_normal_initializer(stddev=0.1),  # init_ops.glorot_uniform_initializer(),
               weights_regularizer=None,
               weights_constraint=None,
               mu1_initializer=None,
               mu1_regularizer=None,
               mu1_constraint=None,
               sigma_initializer=None,
               sigma_regularizer=None,
               sigma_constraint=None,
               biases_initializer=init_ops.zeros_initializer(),
               biases_regularizer=None,
               dau_unit_border_bound=0.01,
               dau_aggregation_forbid_positive_dim1=False,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None):
  """Adds a DAU 1-D convolution layer (`DAUConv1d`) in the slim layer style.

  Requires a rank-4 input and `data_format` of None or 'NCHW'.
  """
  if data_format not in [None, 'NCHW']:
    raise ValueError('Invalid data_format: %r' % (data_format,))

  layer_variable_getter = layers_contrib._build_variable_getter({
      'bias': 'biases',
      'weight': 'weights',
      'mu1': 'mu1',
      'sigma': 'sigma'
  })

  with variable_scope.variable_scope(
      scope, 'DAUConv', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    if input_rank != 4:
      raise ValueError('DAU convolution not supported for input with rank',
                       input_rank)

    df = ('channels_first'
          if data_format and data_format.startswith('NC') else 'channels_last')
    layer = DAUConv1d(filters,
                      dau_units,
                      max_kernel_size,
                      strides=stride,
                      data_format=df,
                      activation=None,
                      use_bias=not normalizer_fn and biases_initializer,
                      mu_learning_rate_factor=mu_learning_rate_factor,
                      weight_initializer=weights_initializer,
                      mu1_initializer=mu1_initializer,
                      sigma_initializer=sigma_initializer,
                      bias_initializer=biases_initializer,
                      weight_regularizer=weights_regularizer,
                      mu1_regularizer=mu1_regularizer,
                      sigma_regularizer=sigma_regularizer,
                      bias_regularizer=biases_regularizer,
                      activity_regularizer=None,
                      dau_unit_border_bound=dau_unit_border_bound,
                      dau_aggregation_forbid_positive_dim1=dau_aggregation_forbid_positive_dim1,
                      trainable=trainable,
                      unit_testing=False,
                      name=sc.name,
                      _scope=sc,
                      _reuse=reuse)

    # Create the DAU variables manually so that optional constraints can be
    # applied to them.
    dau_weights = (weights_constraint(layer.add_dau_weights_var(inputs.shape))
                   if weights_constraint is not None else None)
    dau_mu1 = (mu1_constraint(layer.add_dau_mu1_var(inputs.shape))
               if mu1_constraint is not None else None)
    dau_sigma = (sigma_constraint(layer.add_dau_sigma_var(inputs.shape))
                 if sigma_constraint is not None else None)
    layer.set_dau_variables_manually(dau_weights, dau_mu1, None, dau_sigma)

    outputs = layer.apply(inputs)

    # Add variables to collections.
    layers_contrib._add_variable_to_collections(layer.dau_weights,
                                                variables_collections,
                                                'weights')
    layers_contrib._add_variable_to_collections(layer.dau_mu1,
                                                variables_collections, 'mu1')
    layers_contrib._add_variable_to_collections(layer.dau_sigma,
                                                variables_collections, 'sigma')
    if layer.use_bias:
      layers_contrib._add_variable_to_collections(layer.bias,
                                                  variables_collections,
                                                  'biases')

    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils_contrib.collect_named_outputs(outputs_collections, sc.name,
                                               outputs)
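
# Illustrative usage sketch (not part of the original module). Argument values
# are placeholders only: `inputs` must be a rank-4 NCHW tensor, and
# `dau_units` / `max_kernel_size` must match whatever the DAUConv1d layer
# class actually expects.
def _example_dau_conv1d_usage(inputs):
  return dau_conv1d(inputs, filters=64, dau_units=(2, 1), max_kernel_size=9,
                    data_format='NCHW', scope='dau1')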
def cross_replica_batch_normalization(inputs, *args, **kwargs):
  """Batch normalization whose statistics are synchronized across replicas.

  A thin wrapper with the same keyword interface as slim `batch_norm`, backed
  by the `CrossReplicaBatchNormalization` layer class.
  """
  fused = kwargs.get('fused')
  if fused is None:
    fused = True

  # inputs = ops.convert_to_tensor(inputs)
  rank = inputs.get_shape().ndims

  if kwargs.get('data_format', DATA_FORMAT_NHWC) not in (DATA_FORMAT_NCHW,
                                                         DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')

  layer_variable_getter = _build_variable_getter()
  with variable_scope.variable_scope(
      kwargs.get('scope'), 'BatchNorm', [inputs], reuse=kwargs.get('reuse'),
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)

    # Check that we can use the core layer class.
    assert all([
        kwargs.get('batch_weights') is None,
        kwargs.get('updates_collections',
                   ops.GraphKeys.UPDATE_OPS) is ops.GraphKeys.UPDATE_OPS,
        not kwargs.get('zero_debias_moving_mean', False)
    ]), 'This function cannot be used.'

    # Construct and apply the layer.
    axis = 1 if kwargs.get('data_format',
                           DATA_FORMAT_NHWC) == DATA_FORMAT_NCHW else -1
    param_initializers = kwargs.get('param_initializers') or {}
    beta_initializer = param_initializers.get('beta',
                                              init_ops.zeros_initializer())
    gamma_initializer = param_initializers.get('gamma',
                                               init_ops.ones_initializer())
    moving_mean_initializer = param_initializers.get(
        'moving_mean', init_ops.zeros_initializer())
    moving_variance_initializer = param_initializers.get(
        'moving_variance', init_ops.ones_initializer())
    param_regularizers = kwargs.get('param_regularizers') or {}
    beta_regularizer = param_regularizers.get('beta')
    gamma_regularizer = param_regularizers.get('gamma')

    layer = CrossReplicaBatchNormalization(
        axis=axis,
        momentum=kwargs.get('decay', 0.999),
        epsilon=kwargs.get('epsilon', 0.001),
        center=kwargs.get('center', True),
        scale=kwargs.get('scale', False),
        beta_initializer=beta_initializer,
        gamma_initializer=gamma_initializer,
        moving_mean_initializer=moving_mean_initializer,
        moving_variance_initializer=moving_variance_initializer,
        beta_regularizer=beta_regularizer,
        gamma_regularizer=gamma_regularizer,
        trainable=kwargs.get('trainable', True),
        renorm=kwargs.get('renorm', False),
        renorm_clipping=kwargs.get('renorm_clipping'),
        renorm_momentum=kwargs.get('renorm_decay', 0.99),
        adjustment=kwargs.get('adjustment'),
        name=sc.name,
        _scope=sc,
        _reuse=kwargs.get('reuse'),
        fused=fused)
    outputs = layer.apply(inputs, training=kwargs.get('is_training', True))

    # Add variables to collections.
    _add_variable_to_collections(layer.moving_mean,
                                 kwargs.get('variables_collections'),
                                 'moving_mean')
    _add_variable_to_collections(layer.moving_variance,
                                 kwargs.get('variables_collections'),
                                 'moving_variance')
    if layer.beta is not None:
      _add_variable_to_collections(layer.beta,
                                   kwargs.get('variables_collections'), 'beta')
    if layer.gamma is not None:
      _add_variable_to_collections(layer.gamma,
                                   kwargs.get('variables_collections'),
                                   'gamma')

    if kwargs.get('activation_fn') is not None:
      outputs = kwargs.get('activation_fn')(outputs)
    return utils.collect_named_outputs(kwargs.get('outputs_collections'),
                                       sc.name, outputs)
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               activation_fn=None,
               param_initializers=None,
               param_regularizers=None,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               batch_weights=None,
               fused=False,
               data_format=DATA_FORMAT_NHWC,
               zero_debias_moving_mean=False,
               scope=None,
               renorm=False,
               renorm_clipping=None,
               renorm_decay=0.99,
               quantizer=None,
               use_quantized_weights=True):
  """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

    "Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift"

    Sergey Ioffe, Christian Szegedy

  Can be used as a normalizer function for conv2d and fully_connected.

  Note: when training, the moving_mean and moving_variance need to be updated.
  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
  need to be added as a dependency to the `train_op`. For example:

  ```python
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss)
  ```

  One can set updates_collections=None to force the updates in place, but that
  can have a speed penalty, especially in distributed settings.

  Args:
    inputs: A tensor with 2 or more dimensions, where the first dimension has
      `batch_size`. The normalization is over all but the last dimension if
      `data_format` is `NHWC` and the second dimension if `data_format` is
      `NCHW`.
    decay: Decay for the moving average. Reasonable values for `decay` are
      close to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9,
      etc. Lower the `decay` value (recommend trying `decay`=0.9) if the model
      experiences reasonably good training performance but poor validation
      and/or test performance. Try zero_debias_moving_mean=True for improved
      stability.
    center: If True, add offset of `beta` to normalized tensor. If False,
      `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is not used. When
      the next layer is linear (also e.g. `nn.relu`), this can be disabled
      since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    param_initializers: Optional initializers for beta, gamma, moving mean and
      moving variance.
    param_regularizers: Optional regularizer for beta and gamma.
    updates_collections: Collections to collect the update ops for
      computation. The update ops need to be executed with the train_op. If
      None, a control dependency would be added to make sure the updates are
      computed in place.
    is_training: Whether or not the layer is in training mode. In training
      mode it would accumulate the statistics of the moments into
      `moving_mean` and `moving_variance` using an exponential moving average
      with the given `decay`. When it is not in training mode then it would
      use the values of the `moving_mean` and the `moving_variance`.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    batch_weights: An optional tensor of shape `[batch_size]`, containing a
      frequency weight for each batch item. If present, then the batch
      normalization uses weighted mean and variance. (This can be used to
      correct for bias in training example selection.)
    fused: if `True`, use a faster, fused implementation based on
      nn.fused_batch_norm. If `None`, use the fused implementation if
      possible. Not usable here: quantization requires the non-fused path.
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
      pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
    scope: Optional scope for `variable_scope`.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax,
      rmin, dmax are set to inf, 0, inf, respectively.
    renorm_decay: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training and
      should be neither too small (which would add noise) nor too large (which
      would give stale estimates). Note that `decay` is still applied to get
      the means and variances for inference.
    quantizer: Optional quantizer object forwarded to `QBatchNormalization`.
    use_quantized_weights: Whether `QBatchNormalization` should use quantized
      weights; forwarded to the layer class.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If `fused` is True (fused batch norm does not support
      quantization).
    ValueError: If `batch_weights`, `param_regularizers` or `renorm` are set.
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If rank or channels dimension of `inputs` is undefined.
  """
  if fused:
    raise ValueError('Quantization is not supported for fused batch norm.')
  if batch_weights is not None:
    raise ValueError('Weighted mean and variance is not currently '
                     'supported for quantized batch norm.')
  if param_regularizers is not None:
    raise ValueError('Regularizers are not currently '
                     'supported for quantized batch norm.')
  if renorm:
    raise ValueError('Renorm is not supported for quantized batch norm.')

  # Only use _fused_batch_norm (1) if fused is set True or if it is
  # possible to use (currently it doesn't support batch weights,
  # renorm, and the case when rank is neither 2 nor 4),
  # and (2) if used with zero_debias_moving_mean, or an input shape of rank 2,
  # or non-default updates_collections (not implemented in
  # normalization_layers.BatchNormalization yet); otherwise use the fused
  # implementation in normalization_layers.BatchNormalization.
  inputs = ops.convert_to_tensor(inputs)
  rank = inputs.get_shape().ndims
  feature_supported = batch_weights is None and not renorm and rank in [2, 4]
  possible_to_fuse = fused is None and feature_supported
  if (fused or possible_to_fuse) and (
      zero_debias_moving_mean or rank == 2 or
      updates_collections is not ops.GraphKeys.UPDATE_OPS):
    return _fused_batch_norm(
        inputs,
        decay=decay,
        center=center,
        scale=scale,
        epsilon=epsilon,
        activation_fn=activation_fn,
        param_initializers=param_initializers,
        updates_collections=updates_collections,
        is_training=is_training,
        reuse=reuse,
        variables_collections=variables_collections,
        outputs_collections=outputs_collections,
        trainable=trainable,
        data_format=data_format,
        zero_debias_moving_mean=zero_debias_moving_mean,
        scope=scope)

  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')

  layer_variable_getter = slim_layers._build_variable_getter()
  with variable_scope.variable_scope(
      scope, 'BatchNorm', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)

    # Determine whether we can use the core layer class.
    if (batch_weights is None and
        updates_collections is ops.GraphKeys.UPDATE_OPS and
        not zero_debias_moving_mean):
      # Use the core layer class.
      axis = 1 if data_format == DATA_FORMAT_NCHW else -1
      if not param_initializers:
        param_initializers = {}
      beta_initializer = param_initializers.get('beta',
                                                init_ops.zeros_initializer())
      gamma_initializer = param_initializers.get('gamma',
                                                 init_ops.ones_initializer())
      moving_mean_initializer = param_initializers.get(
          'moving_mean', init_ops.zeros_initializer())
      moving_variance_initializer = param_initializers.get(
          'moving_variance', init_ops.ones_initializer())
      if not param_regularizers:
        param_regularizers = {}
      beta_regularizer = param_regularizers.get('beta')
      gamma_regularizer = param_regularizers.get('gamma')

      # This call is mainly used by the slim models.
      layer = QBatchNormalization(  # normalization_layers.BatchNormalization(
          axis=axis,
          momentum=decay,
          epsilon=epsilon,
          center=center,
          scale=scale,
          beta_initializer=beta_initializer,
          gamma_initializer=gamma_initializer,
          moving_mean_initializer=moving_mean_initializer,
          moving_variance_initializer=moving_variance_initializer,
          beta_regularizer=beta_regularizer,
          gamma_regularizer=gamma_regularizer,
          renorm=renorm,
          renorm_clipping=renorm_clipping,
          renorm_momentum=renorm_decay,
          # fused=fused,
          trainable=trainable,
          name=sc.name,
          quantizer=quantizer,
          use_quantized_weights=use_quantized_weights,
          _scope=sc,
          _reuse=reuse)
      outputs = layer.apply(inputs, training=is_training)

      # Add variables to collections.
      slim_layers._add_variable_to_collections(layer.moving_mean,
                                               variables_collections,
                                               'moving_mean')
      slim_layers._add_variable_to_collections(layer.moving_variance,
                                               variables_collections,
                                               'moving_variance')
      if layer.beta is not None:
        slim_layers._add_variable_to_collections(layer.beta,
                                                 variables_collections, 'beta')
      if layer.gamma is not None:
        slim_layers._add_variable_to_collections(layer.gamma,
                                                 variables_collections,
                                                 'gamma')

      if activation_fn is not None:
        outputs = activation_fn(outputs)
      return utils.collect_named_outputs(outputs_collections,
                                         sc.original_name_scope, outputs)

    raise ValueError('Only core layer supported for quantized batch norm.')
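
# Illustrative usage sketch (not part of the original module): wiring the
# quantized batch_norm above in as the normalizer of the quantized conv2d
# defined earlier. `my_quantizer` is again any object exposing
# `quantize(tensor)`.
def _example_quantized_conv_bn(images, my_quantizer, is_training):
  return conv2d(images, num_outputs=64, kernel_size=3,
                normalizer_fn=batch_norm,
                normalizer_params={'is_training': is_training,
                                   'quantizer': my_quantizer},
                quantizer=my_quantizer,
                scope='qconv_bn')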
def batch_norm_backbone(inputs,
                        decay=0.999,
                        center=True,
                        scale=False,
                        epsilon=0.001,
                        activation_fn=None,
                        param_initializers=None,
                        param_regularizers=None,
                        updates_collections=ops.GraphKeys.UPDATE_OPS,
                        is_training=True,
                        reuse=None,
                        variables_collections=None,
                        outputs_collections=None,
                        trainable=True,
                        batch_weights=None,
                        fused=None,
                        data_format=DATA_FORMAT_NHWC,
                        zero_debias_moving_mean=False,
                        scope=None,
                        renorm=False,
                        renorm_clipping=None,
                        renorm_decay=0.99,
                        adjustment=None,
                        tower_config=None):
  """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

    "Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift"

    Sergey Ioffe, Christian Szegedy

  Can be used as a normalizer function for conv2d and fully_connected. The
  normalization is over all but the last dimension if `data_format` is `NHWC`
  and all but the second dimension if `data_format` is `NCHW`. In case of a 2D
  tensor this corresponds to the batch dimension, while in case of a 4D tensor
  this corresponds to the batch and space dimensions.

  Note: when training, the moving_mean and moving_variance need to be updated.
  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
  need to be added as a dependency to the `train_op`. For example:

  ```python
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss)
  ```

  One can set updates_collections=None to force the updates in place, but that
  can have a speed penalty, especially in distributed settings.

  Args:
    inputs: A tensor with 2 or more dimensions, where the first dimension has
      `batch_size`. The normalization is over all but the last dimension if
      `data_format` is `NHWC` and the second dimension if `data_format` is
      `NCHW`.
    decay: Decay for the moving average. Reasonable values for `decay` are
      close to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9,
      etc. Lower the `decay` value (recommend trying `decay`=0.9) if the model
      experiences reasonably good training performance but poor validation
      and/or test performance. Try zero_debias_moving_mean=True for improved
      stability.
    center: If True, add offset of `beta` to normalized tensor. If False,
      `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is not used. When
      the next layer is linear (also e.g. `nn.relu`), this can be disabled
      since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    param_initializers: Optional initializers for beta, gamma, moving mean and
      moving variance.
    param_regularizers: Optional regularizer for beta and gamma.
    updates_collections: Collections to collect the update ops for
      computation. The update ops need to be executed with the train_op. If
      None, a control dependency would be added to make sure the updates are
      computed in place.
    is_training: Whether or not the layer is in training mode. In training
      mode it would accumulate the statistics of the moments into
      `moving_mean` and `moving_variance` using an exponential moving average
      with the given `decay`. When it is not in training mode then it would
      use the values of the `moving_mean` and the `moving_variance`.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    batch_weights: An optional tensor of shape `[batch_size]`, containing a
      frequency weight for each batch item. If present, then the batch
      normalization uses weighted mean and variance. (This can be used to
      correct for bias in training example selection.)
    fused: if `None` or `True`, use a faster, fused implementation if
      possible. If `False`, use the system recommended implementation. (The
      fused path is currently disabled in this backbone variant; see the
      comments below.)
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
      pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
    scope: Optional scope for `variable_scope`.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax,
      rmin, dmax are set to inf, 0, inf, respectively.
    renorm_decay: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training and
      should be neither too small (which would add noise) nor too large (which
      would give stale estimates). Note that `decay` is still applied to get
      the means and variances for inference.
    adjustment: A function taking the `Tensor` containing the (dynamic) shape
      of the input tensor and returning a pair (scale, bias) to apply to the
      normalized values (before gamma and beta), only during training. For
      example,
        `adjustment = lambda shape: (
          tf.random_uniform(shape[-1:], 0.93, 1.07),
          tf.random_uniform(shape[-1:], -0.1, 0.1))`
      will scale the normalized value by up to 7% up or down, then shift the
      result by up to 0.1 (with independent scaling and bias for each feature
      but shared across all examples), and finally apply gamma and/or beta. If
      `None`, no adjustment is applied.
    tower_config: Configuration object forwarded to the cross-tower `moments`
      / `weighted_moments` helpers used below.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If rank or channels dimension of `inputs` is undefined.
  """
  # if fused is None:
  #   fused = True

  # Only use _fused_batch_norm if all of the following three
  # conditions are true:
  # (1) fused is set True;
  # (2) it is possible to use (currently it doesn't support batch weights,
  #     renorm, and the case when rank is neither 2 nor 4);
  # (3) it is used with zero_debias_moving_mean, or an input shape of rank 2,
  #     or non-default updates_collections (not implemented in
  #     normalization_layers.BatchNormalization yet); otherwise use the fused
  #     implementation in normalization_layers.BatchNormalization.
  # The upstream fused (_fused_batch_norm) and core-layer
  # (normalization_layers.BatchNormalization) code paths from the original
  # slim implementation are commented out / disabled here: this backbone
  # always uses the legacy implementation below, so that the moments can be
  # computed with the tower-aware `moments` / `weighted_moments` helpers that
  # accept `tower_config`.

  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')

  layer_variable_getter = _build_variable_getter()
  with variable_scope.variable_scope(
      scope, 'BatchNorm', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)

    # Not supported by layer class: batch_weights argument,
    # and custom updates_collections. In that case, use the legacy BN
    # implementation.
    # Custom updates collections are not supported because the update logic
    # is different in this case, in particular w.r.t. "forced updates" and
    # update op reuse.
    if renorm:
      raise ValueError('renorm is not supported with batch_weights, '
                       'updates_collections or zero_debias_moving_mean')
    inputs_shape = inputs.get_shape()
    inputs_rank = inputs_shape.ndims
    if inputs_rank is None:
      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    dtype = inputs.dtype.base_dtype
    if batch_weights is not None:
      batch_weights = ops.convert_to_tensor(batch_weights)
      inputs_shape[0:1].assert_is_compatible_with(batch_weights.get_shape())
      # Reshape batch weight values so they broadcast across inputs.
      nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
      batch_weights = array_ops.reshape(batch_weights, nshape)

    if data_format == DATA_FORMAT_NCHW:
      moments_axes = [0] + list(range(2, inputs_rank))
      params_shape = inputs_shape[1:2]
      # For NCHW format, rather than relying on implicit broadcasting, we
      # explicitly reshape the params to params_shape_broadcast when computing
      # the moments and the batch normalization.
      params_shape_broadcast = list(
          [1, inputs_shape[1].value] + [1 for _ in range(2, inputs_rank)])
    else:
      moments_axes = list(range(inputs_rank - 1))
      params_shape = inputs_shape[-1:]
      params_shape_broadcast = None
    if not params_shape.is_fully_defined():
      raise ValueError('Inputs %s has undefined channels dimension %s.' %
                       (inputs.name, params_shape))

    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    if not param_initializers:
      param_initializers = {}
    if center:
      beta_collections = utils.get_variable_collections(
          variables_collections, 'beta')
      beta_initializer = param_initializers.get('beta',
                                                init_ops.zeros_initializer())
      beta = variables.model_variable(
          'beta',
          shape=params_shape,
          dtype=dtype,
          initializer=beta_initializer,
          collections=beta_collections,
          trainable=trainable)
    if scale:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma_initializer = param_initializers.get('gamma',
                                                 init_ops.ones_initializer())
      gamma = variables.model_variable(
          'gamma',
          shape=params_shape,
          dtype=dtype,
          initializer=gamma_initializer,
          collections=gamma_collections,
          trainable=trainable)

    # Create moving_mean and moving_variance variables and add them to the
    # appropriate collections. We disable variable partitioning while creating
    # them, because assign_moving_average is not yet supported for partitioned
    # variables (this needs to be handled carefully, as it may break
    # the checkpoint backward compatibility).
    with variable_scope.variable_scope(
        variable_scope.get_variable_scope()) as local_scope:
      local_scope.set_partitioner(None)
      moving_mean_collections = utils.get_variable_collections(
          variables_collections, 'moving_mean')
      moving_mean_initializer = param_initializers.get(
          'moving_mean', init_ops.zeros_initializer())
      moving_mean = variables.model_variable(
          'moving_mean',
          shape=params_shape,
          dtype=dtype,
          initializer=moving_mean_initializer,
          trainable=False,
          collections=moving_mean_collections)
      moving_variance_collections = utils.get_variable_collections(
          variables_collections, 'moving_variance')
      moving_variance_initializer = param_initializers.get(
          'moving_variance', init_ops.ones_initializer())
      moving_variance = variables.model_variable(
          'moving_variance',
          shape=params_shape,
          dtype=dtype,
          initializer=moving_variance_initializer,
          trainable=False,
          collections=moving_variance_collections)

    # If `is_training` doesn't have a constant value, because it is a
    # `Tensor`, a `Variable` or `Placeholder` then is_training_value will be
    # None and `need_moments` will be true.
    is_training_value = utils.constant_value(is_training)
    need_moments = is_training_value is None or is_training_value
    if need_moments:
      # Calculate the moments based on the individual batch.
      if batch_weights is None:
        if data_format == DATA_FORMAT_NCHW:
          mean, variance = moments(inputs, moments_axes,
                                   tower_config=tower_config, keep_dims=True)
          mean = array_ops.reshape(mean, [-1])
          variance = array_ops.reshape(variance, [-1])
        else:
          mean, variance = moments(inputs, moments_axes,
                                   tower_config=tower_config)
      else:
        if data_format == DATA_FORMAT_NCHW:
          mean, variance = weighted_moments(inputs, moments_axes,
                                            batch_weights, tower_config,
                                            keep_dims=True)
          mean = array_ops.reshape(mean, [-1])
          variance = array_ops.reshape(variance, [-1])
        else:
          mean, variance = weighted_moments(inputs, moments_axes,
                                            batch_weights,
                                            tower_config=tower_config)

      moving_vars_fn = lambda: (moving_mean, moving_variance)
      if updates_collections is None:

        def _force_updates():
          """Internal function forces updates moving_vars if is_training."""
          update_moving_mean = moving_averages.assign_moving_average(
              moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
          update_moving_variance = moving_averages.assign_moving_average(
              moving_variance, variance, decay, zero_debias=False)
          with ops.control_dependencies(
              [update_moving_mean, update_moving_variance]):
            return array_ops.identity(mean), array_ops.identity(variance)

        mean, variance = utils.smart_cond(is_training, _force_updates,
                                          moving_vars_fn)
      else:

        def _delay_updates():
          """Internal function that delays updates of moving_vars if is_training."""
          update_moving_mean = moving_averages.assign_moving_average(
              moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
          update_moving_variance = moving_averages.assign_moving_average(
              moving_variance, variance, decay, zero_debias=False)
          return update_moving_mean, update_moving_variance

        update_mean, update_variance = utils.smart_cond(
            is_training, _delay_updates, moving_vars_fn)
        ops.add_to_collections(updates_collections, update_mean)
        ops.add_to_collections(updates_collections, update_variance)
        # Use computed moments during training and moving_vars otherwise.
        vars_fn = lambda: (mean, variance)
        mean, variance = utils.smart_cond(is_training, vars_fn, moving_vars_fn)
    else:
      mean, variance = moving_mean, moving_variance

    if data_format == DATA_FORMAT_NCHW:
      mean = array_ops.reshape(mean, params_shape_broadcast)
      variance = array_ops.reshape(variance, params_shape_broadcast)
      if beta is not None:
        beta = array_ops.reshape(beta, params_shape_broadcast)
      if gamma is not None:
        gamma = array_ops.reshape(gamma, params_shape_broadcast)

    # Compute batch_normalization.
    outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma,
                                     epsilon)
    outputs.set_shape(inputs_shape)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
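
# Illustrative usage sketch (not part of the original module). Because
# `batch_norm_backbone` keeps the slim keyword interface, it can be bound with
# `functools.partial` and passed as a `normalizer_fn`; `tower_config` here is
# a placeholder for whatever configuration object the cross-tower `moments` /
# `weighted_moments` helpers expect. The update ops still have to be attached
# to the train op as shown in the docstring above.
def _example_backbone_normalizer(tower_config, is_training):
  import functools
  return functools.partial(batch_norm_backbone, is_training=is_training,
                           tower_config=tower_config)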
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=tf.nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=tf.zeros_initializer(),
                    biases_regularizer=None,
                    do_spec_norm=False,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None):
  """Adds support for spectral normalization following
  https://arxiv.org/abs/1802.05957.

  For the non-spectral-normed fully connected layer, see
  tensorflow.contrib.layers.python.layers.fully_connected for documentation.
  """
  # ***Added section***
  layer_class = layers.core_layers.Dense
  if do_spec_norm:
    layer_class = SpectralNormedDense
  # ***Added section ends***

  if not isinstance(num_outputs, layers.six.integer_types):
    raise ValueError('num_outputs should be int or long, got %s.' %
                     (num_outputs,))

  layer_variable_getter = layers._build_variable_getter({
      'bias': 'biases',
      'kernel': 'weights'
  })

  with tf.variable_scope(scope, 'fully_connected', [inputs], reuse=reuse,
                         custom_getter=layer_variable_getter) as sc:
    inputs = tf.convert_to_tensor(inputs)
    layer = layer_class(units=num_outputs,
                        activation=None,
                        use_bias=not normalizer_fn and biases_initializer,
                        kernel_initializer=weights_initializer,
                        bias_initializer=biases_initializer,
                        kernel_regularizer=weights_regularizer,
                        bias_regularizer=biases_regularizer,
                        activity_regularizer=None,
                        trainable=trainable,
                        name=sc.name,
                        dtype=inputs.dtype.base_dtype,
                        _scope=sc,
                        _reuse=reuse)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    layers._add_variable_to_collections(layer.kernel, variables_collections,
                                        'weights')
    if layer.bias is not None:
      layers._add_variable_to_collections(layer.bias, variables_collections,
                                          'biases')

    # Apply normalizer function / layer.
    if normalizer_fn is not None:
      if not normalizer_params:
        normalizer_params = {}
      outputs = normalizer_fn(outputs, **normalizer_params)

    if activation_fn is not None:
      outputs = activation_fn(outputs)

    return layer_utils.collect_named_outputs(outputs_collections, sc.name,
                                             outputs)
def convolution(inputs,
                num_outputs,
                kernel_size,
                stride=1,
                padding='SAME',
                data_format=None,
                rate=1,
                activation_fn=tf.nn.relu,
                normalizer_fn=None,
                normalizer_params=None,
                weights_initializer=initializers.xavier_initializer(),
                weights_regularizer=None,
                biases_initializer=tf.zeros_initializer(),
                biases_regularizer=None,
                do_spec_norm=False,
                reuse=None,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
                scope=None):
  """Adds support for spectral normalization following
  https://arxiv.org/abs/1802.05957.

  For the non-spectral-normed convolution, see
  tensorflow.contrib.layers.python.layers.convolution for documentation.
  """
  if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
    raise ValueError('Invalid data_format: %r' % (data_format,))

  layer_variable_getter = layers._build_variable_getter({
      'bias': 'biases',
      'kernel': 'weights'
  })

  with tf.variable_scope(scope, 'Conv', [inputs], reuse=reuse,
                         custom_getter=layer_variable_getter) as sc:
    inputs = tf.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    # ***Modified section***
    if input_rank == 3:
      layer_class = convolutional_layers.Convolution1D
      if do_spec_norm:
        raise NotImplementedError('only supports 2d conv for spectral norm.')
    elif input_rank == 4:
      layer_class = convolutional_layers.Convolution2D
      if do_spec_norm:
        layer_class = SpecNormConv2d
    elif input_rank == 5:
      layer_class = convolutional_layers.Convolution3D
      if do_spec_norm:
        raise NotImplementedError('only supports 2d conv for spectral norm.')
    else:
      raise ValueError('Convolution not supported for input with rank',
                       input_rank)
    # ***Modified section ends***

    df = ('channels_first'
          if data_format and data_format.startswith('NC') else 'channels_last')
    layer = layer_class(filters=num_outputs,
                        kernel_size=kernel_size,
                        strides=stride,
                        padding=padding,
                        data_format=df,
                        dilation_rate=rate,
                        activation=None,
                        use_bias=not normalizer_fn and biases_initializer,
                        kernel_initializer=weights_initializer,
                        bias_initializer=biases_initializer,
                        kernel_regularizer=weights_regularizer,
                        bias_regularizer=biases_regularizer,
                        activity_regularizer=None,
                        trainable=trainable,
                        name=sc.name,
                        dtype=inputs.dtype.base_dtype,
                        _scope=sc,
                        _reuse=reuse)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    layers._add_variable_to_collections(layer.kernel, variables_collections,
                                        'weights')
    if layer.use_bias:
      layers._add_variable_to_collections(layer.bias, variables_collections,
                                          'biases')

    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return layer_utils.collect_named_outputs(outputs_collections, sc.name,
                                             outputs)
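
# Illustrative usage sketch (not part of the original module): a tiny
# discriminator-style head combining the two spectral-norm wrappers above.
# Shapes and layer sizes are arbitrary; assumes TF 1.x.
def _example_spectral_norm_head(images, reuse=None):
  import tensorflow as tf
  net = convolution(images, 64, kernel_size=4, stride=2,
                    do_spec_norm=True, scope='d_conv1', reuse=reuse)
  net = tf.layers.flatten(net)
  return fully_connected(net, 1, activation_fn=None,
                         do_spec_norm=True, scope='d_fc', reuse=reuse)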