def _apply_noisy_update(self, mom, grad):
  # Compute and apply the gradient update following
  # preconditioned Langevin dynamics.
  stddev = array_ops.where(
      array_ops.squeeze(self._counter > self._burnin),
      math_ops.cast(math_ops.rsqrt(self._learning_rate), grad.dtype),
      array_ops.zeros([], grad.dtype))

  preconditioner = math_ops.rsqrt(
      mom + math_ops.cast(self._diagonal_bias, grad.dtype))
  # The injected Langevin noise must be zero-mean; its scale is carried by
  # `stddev` and the preconditioner, so the raw sample is a standard normal.
  return (
      0.5 * preconditioner * grad * math_ops.cast(self._num_pseudo_batches,
                                                  grad.dtype) +
      random_ops.random_normal(
          array_ops.shape(grad), mean=0.0, stddev=1.0, dtype=grad.dtype) *
      stddev * math_ops.sqrt(preconditioner))
def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon,
                  scale_after_normalization):
  y = (x - m) * math_ops.rsqrt(v + epsilon)
  if scale_after_normalization:
    y = gamma * y
  y += beta
  return y
def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
  """Compute the approximate sliced Wasserstein distance.

  Args:
    a: (matrix) Distribution "a" of samples (row, col).
    b: (matrix) Distribution "b" of samples (row, col).
    random_sampling_count: (int) Number of random projections to average.
    random_projection_dim: (int) Dimension of the random projection space.

  Returns:
    Float containing the approximate distance between "a" and "b".
  """
  s = array_ops.shape(a)
  means = []
  for _ in range(random_sampling_count):
    # Random projection matrix.
    proj = random_ops.random_normal(
        [array_ops.shape(a)[1], random_projection_dim])
    proj *= math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True))
    # Project both distributions and sort them.
    proj_a = math_ops.matmul(a, proj)
    proj_b = math_ops.matmul(b, proj)
    proj_a = _sort_rows(proj_a, s[0])
    proj_b = _sort_rows(proj_b, s[0])
    # Pairwise Wasserstein distance.
    wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
    means.append(wdist)
  return math_ops.reduce_mean(means)
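# A minimal NumPy sketch (illustrative, not part of the library code above)
# of the same algorithm as _sliced_wasserstein: project both sample sets onto
# random unit directions, sort the projections, and average the pairwise L1
# distances. Assumes both inputs have the same number of rows.
import numpy as np

def sliced_wasserstein_np(a, b, random_sampling_count=64,
                          random_projection_dim=8, seed=0):
  rng = np.random.RandomState(seed)
  means = []
  for _ in range(random_sampling_count):
    proj = rng.normal(size=(a.shape[1], random_projection_dim))
    # Normalize each projection direction to unit L2 norm.
    proj /= np.sqrt(np.sum(np.square(proj), 0, keepdims=True))
    # Sorting the 1-D projections turns each column into an order-statistic
    # matching, which solves the 1-D optimal transport problem exactly.
    proj_a = np.sort(a.dot(proj), axis=0)
    proj_b = np.sort(b.dot(proj), axis=0)
    means.append(np.mean(np.abs(proj_a - proj_b)))
  return np.mean(means)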
def _batch_norm(self, x, mean, var, offset, scale, epsilon):
  # We compute the batch norm manually in this function because
  # nn_impl.batch_normalization does not support float16 yet.
  # TODO(reedwm): Add float16 support to nn_impl.batch_normalization.
  inv = math_ops.rsqrt(var + epsilon) * scale
  y = math_ops.cast(x, scale.dtype) * inv + (offset - mean * inv)
  return math_ops.cast(y, x.dtype)
def l2_normalize(x, dim, epsilon=1e-12, name=None):
  """Normalizes along dimension `dim` using an L2 norm.

  For a 1-D tensor with `dim = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `dim`.

  Args:
    x: A `Tensor`.
    dim: Dimension along which to normalize.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  with ops.op_scope([x], name, "l2_normalize") as name:
    x = ops.convert_to_tensor(x, name="x")
    square_sum = math_ops.reduce_sum(math_ops.square(x), [dim], keep_dims=True)
    x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
    return math_ops.mul(x, x_inv_norm, name=name)
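# Illustrative NumPy cross-check (not part of the library code above): the
# rsqrt(maximum(sum(x**2), epsilon)) form never divides by zero, and for a
# non-degenerate vector it reproduces x / ||x||_2.
import numpy as np

x = np.array([3.0, 4.0])
square_sum = np.sum(np.square(x))                  # 25.0
x_normed = x / np.sqrt(np.maximum(square_sum, 1e-12))
print(x_normed)                                    # [0.6 0.8]
print(np.linalg.norm(x_normed))                    # 1.0, i.e. unit L2 norm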
def clip_by_norm(t, clip_norm, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`.
  Specifically, if the L2-norm is already less than or equal to `clip_norm`,
  then `t` is not modified. If the L2-norm is greater than `clip_norm`, then
  this operation returns a tensor of the same type and shape as `t` with its
  values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.op_scope([t, clip_norm], name, "clip_by_norm") as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm.
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
    tclip = array_ops.identity(t * clip_norm * math_ops.minimum(
        l2norm_inv, constant_op.constant(1.0 / clip_norm)), name=name)

  return tclip
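# Worked NumPy example (illustrative): with clip_norm = 5 and ||t|| = 10 the
# min() picks 1/||t||, so the result is t * clip_norm / ||t||, with norm 5;
# when ||t|| <= clip_norm the min() picks 1/clip_norm and t is unchanged.
import numpy as np

t = np.array([6.0, 8.0])                           # ||t|| = 10
clip_norm = 5.0
l2norm_inv = 1.0 / np.sqrt(np.sum(t * t))          # 0.1
tclip = t * clip_norm * np.minimum(l2norm_inv, 1.0 / clip_norm)
print(tclip, np.linalg.norm(tclip))                # [3. 4.] 5.0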
def _FoldFusedBatchNorms(graph):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  Args:
    graph: Graph to walk and modify.

  Raises:
    ValueError: When batch norm folding fails.
  """
  for match in _FindFusedBatchNorms(graph):
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts with
    # `scope`.
    with graph.as_default(), graph.name_scope(scope + sep), ops.device(
        match.bn_op.device):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        # The shape of depthwise weights is different, so we need to reshape
        # the multiplier_tensor to ensure that the scaled_weight_tensor has
        # the expected shape.
        if match.layer_op.type == 'DepthwiseConv2dNative':
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')

      # TODO(suharshs): This naming of the following ops needs to carefully
      # follow the naming expected by quantize.py. Generalize the quantize code
      # to not require these delicate naming conventions.
      scaled_weight_tensor = math_ops.multiply(
          match.weight_tensor, multiplier_tensor, name='mul_fold')
      new_layer_tensor = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor)

      bias_add_tensor = math_ops.add(
          new_layer_tensor, bias_tensor, name='add_fold')

      nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                     match.output_tensor)
      if nodes_modified_count != 1:
        raise ValueError(
            'Unexpected inputs to op: %s' % match.output_tensor.name)
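# Sanity sketch (NumPy, illustrative, under a simplifying assumption: the
# 1-D "convolution" is just an elementwise scale): folding
# gamma / sqrt(var + eps) into the weights and
# -mean * gamma / sqrt(var + eps) + beta into the bias reproduces batch norm
# applied after the layer, which is the algebra the function above relies on.
import numpy as np

rng = np.random.RandomState(0)
x, w = rng.randn(5), rng.randn(5)
gamma, beta, mean, var, eps = 1.5, 0.2, 0.1, 0.9, 1e-3

# Batch norm applied to the layer output.
y_unfolded = gamma * (x * w - mean) / np.sqrt(var + eps) + beta

# Folded form: scale the weights, absorb the rest into the bias.
multiplier = gamma / np.sqrt(var + eps)
w_fold = w * multiplier
bias_fold = -mean * multiplier + beta
y_folded = x * w_fold + bias_fold

print(np.allclose(y_unfolded, y_folded))           # True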
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

  Currently only Type II is supported. Implemented using a length `2N` padded
  @{tf.spectral.rfft}, as described here: https://dsp.stackexchange.com/a/10606

  @compatibility(scipy)
  Equivalent to scipy.fftpack.dct for the Type-II DCT.
  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
  @end_compatibility

  Args:
    input: A `[..., samples]` `float32` `Tensor` containing the signals to
      take the DCT of.
    type: The DCT type to perform. Must be 2.
    n: For future expansion. The length of the transform. Must be `None`.
    axis: For future expansion. The axis to compute the DCT along. Must be
      `-1`.
    norm: The normalization to apply. `None` for no normalization or `'ortho'`
      for orthonormal normalization.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.

  Raises:
    ValueError: If `type` is not `2`, `n` is not `None`, `axis` is not `-1`,
      or `norm` is not `None` or `'ortho'`.

  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
  """
  _validate_dct_arguments(type, n, axis, norm)
  with _ops.name_scope(name, "dct", [input]):
    # We use the RFFT to compute the DCT and TensorFlow only supports float32
    # for FFTs at the moment.
    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)

    axis_dim = input.shape[-1].value or _array_ops.shape(input)[-1]
    axis_dim_float = _math_ops.to_float(axis_dim)
    scale = 2.0 * _math_ops.exp(_math_ops.complex(
        0.0, -_math.pi * _math_ops.range(axis_dim_float) /
        (2.0 * axis_dim_float)))

    # TODO(rjryan): Benchmark performance and memory usage of the various
    # approaches to computing a DCT via the RFFT.
    dct2 = _math_ops.real(
        rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)

    if norm == "ortho":
      n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
      n2 = n1 * _math_ops.sqrt(2.0)
      # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
      weights = _array_ops.pad(
          _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
          constant_values=n2)
      dct2 *= weights

    return dct2
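# Illustrative NumPy check of the identity the function above relies on
# (https://dsp.stackexchange.com/a/10606): the unnormalized DCT-II equals the
# real part of the first N bins of a length-2N RFFT of the zero-padded signal,
# scaled by 2 * exp(-i * pi * k / (2N)).
import numpy as np

n = 8
x = np.random.RandomState(0).randn(n)

# Direct O(N^2) DCT-II definition (scipy.fftpack.dct type 2 convention).
k = np.arange(n)[:, None]
m = np.arange(n)[None, :]
dct_direct = 2.0 * np.sum(x[None, :] * np.cos(np.pi * (m + 0.5) * k / n),
                          axis=1)

# RFFT-based computation, mirroring the function above.
scale = 2.0 * np.exp(-1j * np.pi * np.arange(n) / (2.0 * n))
dct_rfft = np.real(np.fft.rfft(x, 2 * n)[:n] * scale)

print(np.allclose(dct_direct, dct_rfft))           # True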
def _FusedBatchNormGrad(op, *grad):
  """Return the gradients for the 3 inputs of BatchNorm.

  Args:
    op: The BatchNormOp for which we need to compute gradients.
    *grad: An argument list for tensors of gradients wrt the outputs
      with grad[0] as grad_y.

  Returns:
    grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
            [grad_y - mean(grad_y) - (x - mean(x)) *
            mean(grad_y * (x - mean(x))) / (variance + epsilon)]
            in training mode; grad_y * scale * rsqrt(pop_variance + epsilon)
            in freeze mode.

    grad_scale: gradient for scale, which is sum(grad_y * (x - mean(x)) *
                rsqrt(variance + epsilon)) in training mode;
                sum(grad_y * (x - pop_mean) * rsqrt(pop_variance + epsilon))
                in freeze mode.

    grad_offset: gradient for offset, which is sum(grad_y) in training mode;
                 sum(grad_y) in freeze mode.
  """
  x = op.inputs[0]
  grad_y = grad[0]
  scale = op.inputs[1]
  epsilon = op.get_attr("epsilon")
  data_format = op.get_attr("data_format")
  is_training = op.get_attr("is_training")
  if is_training:
    return gen_nn_ops.fused_batch_norm_grad(
        grad_y,
        x,
        scale,
        op.outputs[3],
        op.outputs[4],
        epsilon=epsilon,
        data_format=data_format,
        is_training=is_training)
  else:
    pop_mean = op.inputs[3]
    pop_var = op.inputs[4]
    if data_format == b"NHWC":
      reduce_axis = [0, 1, 2]
    else:
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(pop_mean), 1, 1]
      pop_mean = array_ops.reshape(pop_mean, shape)
      pop_var = array_ops.reshape(pop_var, shape)
      scale = array_ops.reshape(scale, shape)

    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
    grad_scale = math_ops.reduce_sum(
        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
    grad_x = grad_y * scale * var_rsqrt
    return grad_x, grad_scale, grad_offset, None, None
def batch_normalization(x,
                        mean,
                        variance,
                        offset,
                        scale,
                        variance_epsilon,
                        name=None):
  r"""Batch normalization.

  As described in http://arxiv.org/abs/1502.03167.
  Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
  `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):

  \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

  `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
  shapes:

    * In all generality, they can have the same number of dimensions as the
      input `x`, with identical sizes as `x` for the dimensions that are not
      normalized over (the 'depth' dimension(s)), and dimension 1 for the
      others which are being normalized over.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keep_dims=True)` during training, or running averages
      thereof during inference.
    * In the common case where the 'depth' dimension is the last dimension in
      the input tensor `x`, they may be one dimensional tensors of the same
      size as the 'depth' dimension. This is the case for example for the
      common `[batch, depth]` layout of fully-connected layers, and
      `[batch, height, width, depth]` for convolutions.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keep_dims=False)` during training, or running
      averages thereof during inference.

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    name: A name for this operation (optional).

  Returns:
    the normalized, scaled, offset tensor.
  """
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    inv = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      inv *= scale
    return x * inv + (offset - mean * inv
                      if offset is not None else -mean * inv)
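# Small numeric check (NumPy, illustrative): folding the scale into the
# reciprocal standard deviation, y = x * inv + (offset - mean * inv), is
# algebraically the textbook form gamma * (x - mean) / sqrt(var + eps) + beta.
import numpy as np

x = np.array([0.5, 1.0, 2.0])
mean, var, eps = 1.0, 4.0, 1e-3
gamma, beta = 2.0, 0.1

inv = gamma / np.sqrt(var + eps)
y_fused = x * inv + (beta - mean * inv)
y_textbook = gamma * (x - mean) / np.sqrt(var + eps) + beta
print(np.allclose(y_fused, y_textbook))            # True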
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
  """Returns the gradients for the 3 inputs of BatchNorm.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW".

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset.
  """
  if data_format == b"NHWC":
    keep_dims = False
    reduce_axis = [0, 1, 2]
  else:
    keep_dims = True
    reduce_axis = [0, 2, 3]
    shape = [1, array_ops.size(scale), 1, 1]
    scale = array_ops.reshape(scale, shape)
  mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
  mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
  var_x = math_ops.reduce_mean(
      math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
      reduce_axis,
      keep_dims=keep_dims)
  grad_y_offset = grad_y - mean_grad_y
  x_offset = x - mean_x
  mean = math_ops.reduce_mean(
      grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
  grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
      grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
  grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
      grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
  if data_format == b"NCHW":
    grad_scale = array_ops.squeeze(grad_scale)
  grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
  return grad_x, grad_scale, grad_offset
def _bahdanau_score(processed_query, keys, normalize):
  """Implements Bahdanau-style (additive) scoring function.

  This attention has two forms. The first is Bahdanau attention,
  as described in:

  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
  "Neural Machine Translation by Jointly Learning to Align and Translate."
  ICLR 2015. https://arxiv.org/abs/1409.0473

  The second is the normalized form. This form is inspired by the
  weight normalization article:

  Tim Salimans, Diederik P. Kingma.
  "Weight Normalization: A Simple Reparameterization to Accelerate
   Training of Deep Neural Networks."
  https://arxiv.org/abs/1602.07868

  To enable the second form, set `normalize=True`.

  Args:
    processed_query: Tensor, shape `[batch_size, num_units]` to compare to
      keys.
    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
    normalize: Whether to normalize the score function.

  Returns:
    A `[batch_size, max_time]` tensor of unnormalized score values.
  """
  dtype = processed_query.dtype
  # Get the number of hidden units from the trailing dimension of keys.
  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
  # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
  processed_query = array_ops.expand_dims(processed_query, 1)
  v = variable_scope.get_variable(
      "attention_v", [num_units], dtype=dtype)
  if normalize:
    # Scalar used in weight normalization.
    g = variable_scope.get_variable(
        "attention_g", dtype=dtype,
        initializer=math.sqrt((1. / num_units)))
    # Bias added prior to the nonlinearity.
    b = variable_scope.get_variable(
        "attention_b", [num_units], dtype=dtype,
        initializer=init_ops.zeros_initializer())
    # normed_v = g * v / ||v||
    normed_v = g * v * math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(v)))
    return math_ops.reduce_sum(
        normed_v * math_ops.tanh(keys + processed_query + b), [2])
  else:
    return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
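# Hedged NumPy illustration (not part of the library code above) of the
# weight-normalization step used in the normalized form: normed_v = g * v /
# ||v|| reparameterizes v so that its direction and its norm (the scalar g)
# are learned separately; the result always has norm |g|.
import numpy as np

v = np.array([1.0, 2.0, 2.0])                      # ||v|| = 3
g = np.sqrt(1.0 / 3.0)                             # the initializer used above
normed_v = g * v / np.sqrt(np.sum(np.square(v)))
print(np.linalg.norm(normed_v), g)                 # both 0.577...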
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row of
  the output will have L2-norm equal to `clip_norm`. If `axes == [0]` instead,
  each column of the output will be clipped.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm.
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, axes, keep_dims=True))
    intermediate = t * clip_norm
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    _ = t.shape.merge_with(intermediate.shape)
    tclip = array_ops.identity(
        intermediate * math_ops.minimum(
            l2norm_inv,
            constant_op.constant(1.0, dtype=t.dtype) / clip_norm),
        name=name)

  return tclip
def batch_normalization(x, mean, variance, offset, scale, variance_epsilon,
                        data_format, name=None):
  """Data-format-aware version of tf.nn.batch_normalization."""
  with ops.name_scope(name, 'batchnorm', [x, mean, variance, scale, offset]):
    inv = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      inv *= scale
    a = math_ops.cast(inv, x.dtype)
    b = math_ops.cast(
        offset - mean * inv if offset is not None else -mean * inv, x.dtype)
    # Return a * x + b with a customized data_format. Currently TF doesn't
    # have bias_scale, and TensorRT has a bug converting tf.nn.bias_add, so we
    # reimplemented both to make the model work with TensorRT. See
    # https://github.com/tensorlayer/openpose-plus/issues/75 for more details.
    df = {'channels_first': 'NCHW', 'channels_last': 'NHWC'}
    return _bias_add(_bias_scale(x, a, df[data_format]), b, df[data_format])
def _sample_n(self, n, seed=None):
  # The sampling method comes from the fact that if:
  #   X ~ Normal(0, 1)
  #   Z ~ Chi2(df)
  #   Y = X / sqrt(Z / df)
  # then:
  #   Y ~ StudentT(df).
  shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
  normal_sample = random_ops.random_normal(shape, dtype=self.dtype, seed=seed)
  df = self.df * array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)
  gamma_sample = random_ops.random_gamma(
      [n],
      0.5 * df,
      beta=0.5,
      dtype=self.dtype,
      seed=distribution_util.gen_new_seed(seed, salt="student_t"))
  samples = normal_sample * math_ops.rsqrt(gamma_sample / df)
  return samples * self.scale + self.loc  # Abs(scale) not wanted.
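# Monte Carlo sketch (NumPy, illustrative) of the sampling identity in the
# comment above: X / sqrt(Z / df) with X ~ Normal(0, 1) and Z ~ Chi2(df) is
# Student-t. Chi2(df) is Gamma(shape=df/2, rate=1/2), which is what the
# random_gamma(0.5 * df, beta=0.5) call draws. For df > 2 the variance of a
# Student-t is df / (df - 2).
import numpy as np

rng = np.random.RandomState(0)
df, n = 5.0, 200000
normal_sample = rng.standard_normal(n)
# numpy's gamma takes a scale parameter; scale = 1 / rate = 1 / 0.5.
gamma_sample = rng.gamma(shape=0.5 * df, scale=1.0 / 0.5, size=n)
samples = normal_sample / np.sqrt(gamma_sample / df)
print(np.var(samples), df / (df - 2.0))            # both close to 1.666...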
def per_image_whitening(image):
  """Linearly scales `image` to have zero mean and unit norm.

  This op computes `(x - mean) / adjusted_stddev`, where `mean` is the average
  of all values in image, and
  `adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))`.

  `stddev` is the standard deviation of all values in `image`. It is capped
  away from zero to protect against division by 0 when handling uniform
  images.

  Note that this implementation is limited:
    * It only whitens based on the statistics of an individual image.
    * It does not take into account the covariance structure.

  Args:
    image: 3-D tensor of shape `[height, width, channels]`.

  Returns:
    The whitened image with same shape as `image`.

  Raises:
    ValueError: if the shape of 'image' is incompatible with this function.
  """
  image = ops.convert_to_tensor(image, name='image')
  _Check3DImage(image, require_static=False)
  num_pixels = math_ops.reduce_prod(array_ops.shape(image))

  image = math_ops.cast(image, dtype=dtypes.float32)
  image_mean = math_ops.reduce_mean(image)

  variance = (math_ops.reduce_mean(math_ops.square(image)) -
              math_ops.square(image_mean))
  variance = gen_nn_ops.relu(variance)
  stddev = math_ops.sqrt(variance)

  # Apply a minimum normalization that protects us against uniform images.
  min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
  pixel_value_scale = math_ops.maximum(stddev, min_stddev)
  pixel_value_offset = image_mean

  image = math_ops.sub(image, pixel_value_offset)
  image = math_ops.div(image, pixel_value_scale)
  return image
def __call__(self, query, previous_alignments):
  """Score the query based on the keys and values.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.
    previous_alignments: Tensor of dtype matching `self.values` and shape
      `[batch_size, alignments_size]`
      (`alignments_size` is memory's `max_time`).

  Returns:
    alignments: Tensor of dtype matching `self.values` and shape
      `[batch_size, alignments_size]` (`alignments_size` is memory's
      `max_time`), together with the masked score.
  """
  with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
    processed_query = self.query_layer(query) if self.query_layer else query
    # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
    processed_query = array_ops.expand_dims(processed_query, 1)
    keys = self._keys
    dtype = query.dtype
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)
    if self._normalize:
      # Scalar used in weight normalization.
      g = variable_scope.get_variable(
          "attention_g",
          dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added prior to the nonlinearity. (The original referenced `b`
      # without defining it; this mirrors the sibling implementations.)
      b = variable_scope.get_variable(
          "attention_b", [self._num_units],
          dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # normed_v = g * v / ||v||
      normed_v = g * v * math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(keys + processed_query + b), [2])
    else:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(keys + processed_query), [2])

  alignments = self._probability_fn(score, previous_alignments)
  return alignments, self.mask_func(score)
def _apply_dense(self, grad, var):
  # Calculates the preconditioner statistics for each tensor.
  partitioned_grads = TensorPartitioner.partition_tensor(
      grad, self._partition_info)
  shape = var.get_shape()
  fallback_to_diagonal = self._fallback_to_diagonal_for_shape(shape)

  precond_statistics_update = []
  if not fallback_to_diagonal:
    precond_statistics_update = self._updated_statistics(
        var, partitioned_grads)

  accumulator = self.get_slot(var, "accumulator")
  accumulator_updated = state_ops.assign_add(accumulator, grad * grad)
  accumulator_inv_sqrt = math_ops.rsqrt(accumulator_updated + 1e-30)
  if self._momentum > 0.0:
    scaled_g = (1.0 - self._momentum_tensor) * (grad * accumulator_inv_sqrt)
    gbar = self.get_slot(var, "momentum")
    gbar_updated = state_ops.assign_add(
        gbar, gbar * (self._momentum_tensor - 1.0) + scaled_g)
  else:
    gbar_updated = (grad * accumulator_inv_sqrt)

  if not fallback_to_diagonal:
    # Update the preconditioner statistics followed by computing the
    # preconditioned gradient.
    with ops.control_dependencies(precond_statistics_update):
      s = tf.cast(self._run_nondiagonal_update, tf.float32)
      preconditioned_grad = self._preconditioned_update(
          var, partitioned_grads, gbar_updated)
      # Slowly adapt from the diagonal to the preconditioned gradient.
      w = self._run_nondiagonal_update_warmup
      warmup_update = s * self._learning_rate_tensor * (
          w * preconditioned_grad + (1.0 - w) * gbar_updated)
      fallback_update = (1 - s) * (self._learning_rate_tensor * gbar_updated)
      return state_ops.assign_sub(var, warmup_update + fallback_update)
  else:
    return state_ops.assign_sub(var,
                                self._learning_rate_tensor * gbar_updated)
def __call__(self, query, previous_alignments):
  """Score the query based on the keys and values.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.
    previous_alignments: Tensor of dtype matching `self.values` and shape
      `[batch_size, alignments_size]`
      (`alignments_size` is memory's `max_time`).

  Returns:
    alignments: Tensor of dtype matching `self.values` and shape
      `[batch_size, alignments_size]` (`alignments_size` is memory's
      `max_time`).
  """
  with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
    processed_query = self.query_layer(query) if self.query_layer else query
    dtype = processed_query.dtype
    # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
    processed_query = array_ops.expand_dims(processed_query, 1)
    keys = self._keys
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)
    if self._normalize:
      # Scalar used in weight normalization.
      g = variable_scope.get_variable(
          "attention_g",
          dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added prior to the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units],
          dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # normed_v = g * v / ||v||
      normed_v = g * v * math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(keys + processed_query + b), [2])
    else:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(keys + processed_query), [2])

  alignments = self._probability_fn(score, previous_alignments)
  return alignments
def __call__(self, query):
  """Score the query based on the keys and values.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.

  Returns:
    score: Tensor of dtype matching `self.values` and shape
      `[batch_size, max_time]` (`max_time` is memory's `max_time`).
  """
  with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
    processed_query = self.query_layer(query) if self.query_layer else query
    dtype = processed_query.dtype
    # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
    processed_query = array_ops.expand_dims(processed_query, 1)
    keys = self._keys
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)
    if self._normalize:
      # Scalar used in weight normalization.
      g = variable_scope.get_variable(
          "attention_g",
          dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added prior to the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units],
          dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # normed_v = g * v / ||v||
      normed_v = g * v * math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(keys + processed_query + b), [2])
    else:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(keys + processed_query), [2])

  return score
def _sample_n(self, n, seed=None):
  # The sampling method comes from the fact that if:
  #   X ~ Normal(0, 1)
  #   Z ~ Chi2(df)
  #   Y = X / sqrt(Z / df)
  # then:
  #   Y ~ StudentT(df).
  shape = array_ops.concat([[n], self.batch_shape()], 0)
  normal_sample = random_ops.random_normal(shape, dtype=self.dtype, seed=seed)
  df = self.df * array_ops.ones(self.batch_shape(), dtype=self.dtype)
  gamma_sample = random_ops.random_gamma(
      [n],
      0.5 * df,
      beta=0.5,
      dtype=self.dtype,
      seed=distribution_util.gen_new_seed(seed, salt="student_t"))
  samples = normal_sample * math_ops.rsqrt(gamma_sample / df)
  return samples * self.scale + self.loc  # Abs(scale) not wanted.
def call(self, inputs, **kwargs):
  inputs = ops.convert_to_tensor(inputs)
  original_shape = inputs.get_shape()

  # Reshape the input by the group within the channel dimension.
  inputs_shape = (self.axes_before_channels +
                  [self.groups, self.channels // self.groups] +
                  self.axes_after_channels)
  inputs = array_ops.reshape(inputs, inputs_shape)

  # Calculate the moments.
  if self.mean_close_to_zero:
    # One pass algorithm returns better result when mean is close to zero.
    counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
        inputs, self.moments_axes, keep_dims=True)
    mean, variance = tf.nn.normalize_moments(
        counts, means_ss, variance_ss, shift=None)
  else:
    mean, variance = tf.nn.moments(inputs, self.moments_axes, keep_dims=True)

  # Compute normalization.
  gain = math_ops.rsqrt(variance + self.epsilon)
  offset = -mean * gain
  if self.gamma is not None:
    gamma = array_ops.reshape(self.gamma, self.params_shape_broadcast)
    gain *= gamma
    offset *= gamma
  if self.beta is not None:
    beta = array_ops.reshape(self.beta, self.params_shape_broadcast)
    offset += beta
  outputs = inputs * gain + offset

  # Collapse the groups into the channel dimension.
  outputs = array_ops.reshape(outputs, original_shape)
  return outputs
def get_folded_weights(self):
  """Function to get the batchnorm folded weights.

  This function converts the weights by folding batchnorm parameters into
  the weight of QDepthwiseConv2d. The high-level equations:

      W_fold = gamma * W / sqrt(variance + epsilon)
      bias_fold = gamma * (bias - moving_mean) / sqrt(variance + epsilon) + beta
  """
  depthwise_kernel = self.depthwise_kernel
  if self.use_bias:
    bias = self.bias
  else:
    bias = 0

  # Get batchnorm stats.
  gamma = self.batchnorm.gamma
  beta = self.batchnorm.beta
  moving_mean = self.batchnorm.moving_mean
  moving_variance = self.batchnorm.moving_variance

  # Get the inversion factor so that we replace division by multiplication.
  inv = math_ops.rsqrt(moving_variance + self.batchnorm.epsilon)
  if gamma is not None:
    inv *= gamma

  # Fold bias with bn stats.
  folded_bias = inv * (bias - moving_mean) + beta

  # For DepthwiseConv2D, inv needs to be broadcast to the last 2 dimensions
  # of the kernels.
  depthwise_weights_shape = [
      depthwise_kernel.get_shape().as_list()[2],
      depthwise_kernel.get_shape().as_list()[3]
  ]
  inv = array_ops.reshape(inv, depthwise_weights_shape)

  # Wrap conv kernel with bn parameters.
  folded_depthwise_kernel = inv * depthwise_kernel

  return [folded_depthwise_kernel, folded_bias]
def __call__(self, query):
  """Score the query based on the keys and values.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.

  Returns:
    score: Tensor of dtype matching `self.values` and shape
      `[batch_size, max_time]` (`max_time` is memory's `max_time`).
  """
  with ops.name_scope(None, "BahndahauAttentionCall", [query]):
    processed_query = self.query_layer(query) if self.query_layer else query
    dtype = processed_query.dtype
    # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
    processed_query = array_ops.expand_dims(processed_query, 1)
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)
    if self._normalize:
      # Scalar used in weight normalization.
      g = variable_scope.get_variable(
          "attention_g",
          dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added prior to the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units],
          dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # Scalar bias added to attention scores.
      r = variable_scope.get_variable(
          "attention_r",
          dtype=dtype,
          initializer=self._attention_r_initializer)
      # normed_v = g * v / ||v||
      normed_v = g * v * math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(self.keys + processed_query + b), [2]) + r
    else:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(self.keys + processed_query), [2])

  return score
def bn(x, name='batchnorm'):
  with tf.variable_scope(name):
    epsilon = 1e-3
    size = int(x.shape.as_list()[-1])
    beta = tf.get_variable('beta', [size],
                           initializer=tf.zeros_initializer())
    scale = tf.get_variable('scale', [size],
                            initializer=tf.ones_initializer())
    moving_mean = tf.get_variable('mean', [size],
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('variance', [size],
                                      initializer=tf.ones_initializer(),
                                      trainable=False)
    inv = math_ops.rsqrt(moving_variance + epsilon)
    inv *= scale
    return x * inv + (beta - moving_mean * inv)
def clip_by_average_norm(t, clip_norm, name=None):
  """Clips tensor values to a maximum average L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its average L2-norm is less than or equal to
  `clip_norm`. Specifically, if the average L2-norm is already less than or
  equal to `clip_norm`, then `t` is not modified. If the average L2-norm is
  greater than `clip_norm`, then this operation returns a tensor of the same
  type and shape as `t` with its values set to:

  `t * clip_norm / l2norm_avg(t)`

  In this case, the average L2-norm of the output tensor is `clip_norm`.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_average_norm", [t, clip_norm]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm per element, clip elements by ratio of clip_norm to
    # L2-norm per element.
    n_element = math_ops.cast(array_ops.size(t), dtypes.float32)
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
    tclip = array_ops.identity(
        t * clip_norm * math_ops.minimum(
            l2norm_inv * n_element,
            constant_op.constant(1.0) / clip_norm),
        name=name)

  return tclip
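# Worked NumPy example (illustrative): the average L2-norm is
# l2norm(t) / size(t). With 2 elements, t = [6, 8] has average norm 5, so
# clip_norm = 1 rescales t by 1/5.
import numpy as np

t = np.array([6.0, 8.0])
clip_norm = 1.0
n_element = t.size                                 # 2
l2norm_inv = 1.0 / np.sqrt(np.sum(t * t))          # 0.1
tclip = t * clip_norm * np.minimum(l2norm_inv * n_element, 1.0 / clip_norm)
print(tclip)                                       # [1.2 1.6], average norm 1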
def __call__(self, query, tiling_factor=1):
  """Score the query based on the keys and values.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.
    tiling_factor: An integer factor for which to tile the batch dimension.
      Used with BeamSearchDecoder.

  Returns:
    score: Tensor of dtype matching `self.values` and shape
      `[batch_size, max_time]` (`max_time` is memory's `max_time`).
  """
  with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
    processed_query = self.query_layer(query) if self.query_layer else query
    dtype = processed_query.dtype
    # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
    processed_query = array_ops.expand_dims(processed_query, 1)
    keys = _maybe_tile_batch(self.keys, tiling_factor)
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)
    if self._normalize:
      # Scalar used in weight normalization.
      g = variable_scope.get_variable(
          "attention_g",
          dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added prior to the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units],
          dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # normed_v = g * v / ||v||
      normed_v = g * v * math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(keys + processed_query + b), [2])
    else:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(keys + processed_query), [2])

  return score
def get_context_additive_null(self, query, top_states_4,
                              top_states_transform_4, encoder_raws_matrix):
  query_transform_2 = tf.add(tf.matmul(query, self.a_w_target),
                             self.a_b)  # [batch_size, hidden_size]
  query_transform_4 = tf.reshape(
      query_transform_2,
      [-1, 1, 1, self.model.size])  # [batch_size, 1, 1, hidden_size]

  if self.model.attention_scale:
    # normed_v = g * v / ||v||
    normed_v = self.attention_g * self.a_v * math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(self.a_v)))
  else:
    normed_v = self.a_v

  attention_null_vector_transform = tf.matmul(self.null_attention_vector,
                                              self.a_w_source)
  attention_null_score = tf.reduce_sum(
      normed_v * tf.tanh(attention_null_vector_transform + query_transform_2),
      [1])  # [batch_size]
  attention_null_score = tf.reshape(attention_null_score, [-1, 1])

  # a = softmax(a_v * tanh(...))
  s = tf.reduce_sum(
      normed_v * tf.tanh(top_states_transform_4 + query_transform_4),
      [2, 3])  # [batch_size, source_length]
  s = self.mask_score(s, encoder_raws_matrix)
  s_with_null = tf.concat([attention_null_score, s], 1)
  a_with_null = tf.nn.softmax(s_with_null)  # [batch_size, 1 + source_length]
  a = tf.slice(a_with_null, [0, 1], [-1, -1])  # [batch_size, source_length]

  # context = a * h_source
  context = tf.reduce_sum(
      tf.reshape(a, [self.model.batch_size, -1, 1, 1]) * top_states_4, [1, 2])

  return context, a
def batch_normalization(x, mean, variance, offset, scale, variance_epsilon,
                        name=None):
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    inv = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      inv *= scale
    # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
    # the precise order of ops that are generated by the expression below.
    out = x * math_ops.cast(inv, x.dtype) + math_ops.cast(
        offset - mean * inv if offset is not None else -mean * inv, x.dtype)
    special = tf.transpose(x, [1, 0, 2])
    # special = tf.transpose(inv, [1, 0, 2])
    return out, special
def batch_normalization(x, mean, variance, offset, scale, variance_epsilon,
                        data_format, name=None):
  """Data-format-aware version of tf.nn.batch_normalization."""
  if data_format == 'channels_last':
    mean = tf.reshape(mean, [1] * (len(x.shape) - 1) + [-1])
    variance = tf.reshape(variance, [1] * (len(x.shape) - 1) + [-1])
    offset = tf.reshape(offset, [1] * (len(x.shape) - 1) + [-1])
    scale = tf.reshape(scale, [1] * (len(x.shape) - 1) + [-1])
  elif data_format == 'channels_first':
    mean = tf.reshape(mean, [1] + [-1] + [1] * (len(x.shape) - 2))
    variance = tf.reshape(variance, [1] + [-1] + [1] * (len(x.shape) - 2))
    offset = tf.reshape(offset, [1] + [-1] + [1] * (len(x.shape) - 2))
    scale = tf.reshape(scale, [1] + [-1] + [1] * (len(x.shape) - 2))
  else:
    raise ValueError('invalid data_format: %s' % data_format)
  with ops.name_scope(name, 'batchnorm', [x, mean, variance, scale, offset]):
    inv = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      inv *= scale
    a = math_ops.cast(inv, x.dtype)
    b = math_ops.cast(
        offset - mean * inv if offset is not None else -mean * inv, x.dtype)
    # Return a * x + b with a customized data_format. Currently TF doesn't
    # have bias_scale, and TensorRT has a bug converting tf.nn.bias_add, so we
    # reimplemented both to make the model work with TensorRT. See
    # https://github.com/tensorlayer/openpose-plus/issues/75 for more details.
    df = {'channels_first': 'NCHW', 'channels_last': 'NHWC'}
    return _bias_add(_bias_scale(x, a, df[data_format]), b, df[data_format])
def bn(x, is_training, name='batchnorm'):
  with tf.variable_scope(name):
    decay = 0.99
    epsilon = 1e-3
    size = x.shape.as_list()[-1]
    beta = tf.get_variable('beta', [size],
                           initializer=tf.zeros_initializer())
    scale = tf.get_variable('scale', [size],
                            initializer=tf.ones_initializer())
    moving_mean = tf.get_variable('mean', [size],
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('variance', [size],
                                      initializer=tf.ones_initializer(),
                                      trainable=False)

    def train():
      mean, variance = tf.nn.moments(x, [0, 1, 2])
      update_moving_mean = moving_averages.assign_moving_average(
          moving_mean, mean, decay)
      update_moving_variance = moving_averages.assign_moving_average(
          moving_variance, variance, decay)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_mean)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_variance)
      return mean, variance

    mean, variance = tf.cond(
        tf.convert_to_tensor(is_training, dtype=tf.bool),
        lambda: train(),
        lambda: (moving_mean, moving_variance))

    inv = math_ops.rsqrt(variance + epsilon)
    inv *= scale
    return x * inv + (beta - mean * inv)
def batch_normalization_my(x, mean, variance, offset, scale, variance_epsilon,
                           name=None):
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    inv = math_ops.rsqrt(variance + variance_epsilon)
    scale = tf.cast(scale, tf.float32)
    scale = tf.expand_dims(scale, 1)
    scale = tf.tile(scale, [1, x.get_shape()[1], x.get_shape()[2], 1])
    offset = tf.cast(offset, tf.float32)
    offset = tf.expand_dims(offset, 1)
    offset = tf.tile(offset, [1, x.get_shape()[1], x.get_shape()[2], 1])
    if scale is not None:
      inv *= scale
    # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
    # the precise order of ops that are generated by the expression below.
    return x * math_ops.cast(inv, x.dtype) + math_ops.cast(
        offset - mean * inv if offset is not None else -mean * inv, x.dtype)
def _get_folded_kernel_bias(conv_type, kernel, bias, mu, var, gamma, beta,
                            epsilon):
  """Get the folded kernel and bias.

      folded_kernel = kernel * multiplier
                    = kernel * gamma / sigma

      folded_bias = beta - (mu - bias) * multiplier
                  = beta - (mu - bias) * gamma / sigma
  """
  # Note: rsqrt gives the *reciprocal* of the standard deviation, i.e.
  # 1 / sigma, so multiplying by it implements the division by sigma above.
  sigma = math_ops.rsqrt(var + epsilon)
  if gamma is not None:
    multiplier = math_ops.mul(gamma, sigma)
  else:
    multiplier = sigma
  if conv_type == 'DepthwiseConv2D':
    new_shape = [kernel.shape[2], kernel.shape[3]]
    depthwise_multiplier = array_ops.reshape(multiplier, new_shape)
    folded_kernel = math_ops.mul(
        depthwise_multiplier, kernel, name='depthwise_kernel')
  else:
    folded_kernel = math_ops.mul(multiplier, kernel, name='kernel')

  folded_bias = math_ops.subtract(beta, (mu - bias) * multiplier, name='bias')
  return folded_kernel, folded_bias
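# NumPy sanity check (illustrative, treating the convolution as an elementwise
# product): with multiplier = gamma / sqrt(var + eps), the folded form
# conv(x, k * multiplier) + beta - (mu - bias) * multiplier matches batch norm
# applied to conv(x, k) + bias, which is the identity in the docstring above.
import numpy as np

rng = np.random.RandomState(1)
x, k = rng.randn(4), rng.randn(4)
bias, mu, var, gamma, beta, eps = 0.3, 0.1, 0.8, 1.2, -0.5, 1e-3

y_bn = gamma * (x * k + bias - mu) / np.sqrt(var + eps) + beta

multiplier = gamma / np.sqrt(var + eps)
y_fold = x * (k * multiplier) + (beta - (mu - bias) * multiplier)
print(np.allclose(y_bn, y_fold))                   # True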
def call(self, inputs):
  inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
  ndim = self._input_rank

  if self.rectify:
    inputs = nn.relu(inputs)

  # Compute normalization pool.
  if ndim == 2:
    norm_pool = math_ops.matmul(math_ops.square(inputs), self.gamma)
    norm_pool = nn.bias_add(norm_pool, self.beta)
  elif self.data_format == "channels_last" and ndim <= 5:
    shape = self.gamma.shape.as_list()
    gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + shape)
    norm_pool = nn.convolution(math_ops.square(inputs), gamma, "VALID")
    norm_pool = nn.bias_add(norm_pool, self.beta)
  else:  # generic implementation
    # This puts channels in the last dimension regardless of input.
    norm_pool = math_ops.tensordot(
        math_ops.square(inputs), self.gamma, [[self._channel_axis()], [0]])
    norm_pool += self.beta
    if self.data_format == "channels_first":
      # Return to channels_first format if necessary. list() is required
      # here: a bare range has no insert() method in Python 3.
      axes = list(range(ndim - 1))
      axes.insert(1, ndim - 1)
      norm_pool = array_ops.transpose(norm_pool, axes)

  if self.inverse:
    norm_pool = math_ops.sqrt(norm_pool)
  else:
    norm_pool = math_ops.rsqrt(norm_pool)
  outputs = inputs * norm_pool

  if not context.executing_eagerly():
    outputs.set_shape(self.compute_output_shape(inputs.shape))
  return outputs
def per_image_standardization(image):
  """Linearly scales `image` to have zero mean and unit variance.

  This op computes `(x - mean) / adjusted_stddev`, where `mean` is the average
  of all values in image, and
  `adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))`.

  `stddev` is the standard deviation of all values in `image`. It is capped
  away from zero to protect against division by 0 when handling uniform
  images.

  Args:
    image: An n-D Tensor where the last 3 dimensions are
      `[height, width, channels]`.

  Returns:
    The standardized image with same shape as `image`.

  Raises:
    ValueError: if the shape of 'image' is incompatible with this function.
  """
  with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
    image = ops.convert_to_tensor(image, name='image')
    num_pixels = math_ops.reduce_prod(array_ops.shape(image)[1:4])

    image = math_ops.cast(image, dtype=dtypes.float32)
    image_mean = math_ops.reduce_mean(image, axis=[-1, -2, -3], keepdims=True)

    variance = (math_ops.reduce_mean(
        math_ops.square(image), axis=[-1, -2, -3], keepdims=True) -
                math_ops.square(image_mean))
    variance = gen_nn_ops.relu(variance)
    stddev = math_ops.sqrt(variance)

    # Apply a minimum normalization that protects us against uniform images.
    min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
    pixel_value_scale = math_ops.maximum(stddev, min_stddev)
    pixel_value_offset = image_mean

    image = math_ops.subtract(image, pixel_value_offset)
    image = math_ops.div(image, pixel_value_scale, name=scope)
    return image
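# Minimal NumPy equivalent (illustrative) of the standardization above for a
# single image: the stddev floor 1/sqrt(num_pixels) keeps a uniform image from
# causing a divide-by-zero; otherwise the output has mean 0 and stddev 1.
import numpy as np

image = np.random.RandomState(0).rand(4, 4, 3).astype(np.float32)
num_pixels = image.size
mean = image.mean()
stddev = image.std()
adjusted_stddev = np.maximum(stddev, 1.0 / np.sqrt(num_pixels))
standardized = (image - mean) / adjusted_stddev
print(standardized.mean(), standardized.std())     # ~0.0, ~1.0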
def _bahdanau_coverage_mul_score(processed_query, keys, coverage_features,
                                 normalize):
  dtype = processed_query.dtype
  # Get the number of hidden units from the trailing dimension of keys.
  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
  # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
  processed_query = array_ops.expand_dims(processed_query, 1)
  v = variable_scope.get_variable(
      "attention_v", [num_units], dtype=dtype)
  if normalize:
    # Scalar used in weight normalization.
    g = variable_scope.get_variable(
        "attention_g", dtype=dtype,
        initializer=math.sqrt((1. / num_units)))
    # Bias added prior to the nonlinearity.
    b = variable_scope.get_variable(
        "attention_b", [num_units], dtype=dtype,
        initializer=init_ops.zeros_initializer())
    # normed_v = g * v / ||v||
    normed_v = g * v * math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(v)))
    return math_ops.reduce_sum(
        normed_v * math_ops.tanh(keys + processed_query + coverage_features
                                 + b), [2])
  else:
    return math_ops.reduce_sum(
        v * math_ops.tanh(keys + processed_query + coverage_features), [2])
def call(self, inputs, training=None):
  # numpy value, marks whether the layer is in training
  training = self.batchnorm._get_training_value(training)  # pylint: disable=protected-access

  # Check whether to update the batchnorm params.
  if (self.ema_freeze_delay is None) or (self.ema_freeze_delay < 0):
    # If ema_freeze_delay is None or negative, never freeze the bn stats.
    bn_training = tf.cast(training, dtype=bool)
  else:
    bn_training = tf.math.logical_and(
        training, tf.math.less_equal(self._iteration, self.ema_freeze_delay))

  depthwise_kernel = self.depthwise_kernel

  # Run depthwise_conv2d to produce output for the following batchnorm.
  conv_outputs = tf.keras.backend.depthwise_conv2d(
      inputs,
      depthwise_kernel,
      strides=self.strides,
      padding=self.padding,
      dilation_rate=self.dilation_rate,
      data_format=self.data_format)
  if self.use_bias:
    bias = self.bias
    conv_outputs = tf.keras.backend.bias_add(
        conv_outputs, bias, data_format=self.data_format)
  else:
    bias = 0

  _ = self.batchnorm(conv_outputs, training=bn_training)

  self._iteration.assign_add(
      tf_utils.smart_cond(training, lambda: tf.constant(1, tf.int64),
                          lambda: tf.constant(0, tf.int64)))

  # Calculate mean and variance from the current batch.
  bn_shape = conv_outputs.shape
  ndims = len(bn_shape)
  reduction_axes = [i for i in range(ndims) if i not in self.batchnorm.axis]
  keep_dims = len(self.batchnorm.axis) > 1
  mean, variance = self.batchnorm._moments(  # pylint: disable=protected-access
      math_ops.cast(conv_outputs, self.batchnorm._param_dtype),  # pylint: disable=protected-access
      reduction_axes,
      keep_dims=keep_dims)

  gamma = self.batchnorm.gamma
  beta = self.batchnorm.beta
  moving_mean = self.batchnorm.moving_mean
  moving_variance = self.batchnorm.moving_variance

  if self.folding_mode not in ["batch_stats_folding", "ema_stats_folding"]:
    raise ValueError("mode {} not supported!".format(self.folding_mode))

  mv_inv = math_ops.rsqrt(moving_variance + self.batchnorm.epsilon)
  batch_inv = math_ops.rsqrt(variance + self.batchnorm.epsilon)
  if gamma is not None:
    mv_inv *= gamma
    batch_inv *= gamma

  folded_bias = tf_utils.smart_cond(
      bn_training,
      lambda: batch_inv * (bias - mean) + beta,
      lambda: mv_inv * (bias - moving_mean) + beta)

  if self.folding_mode == "batch_stats_folding":
    # Use batch mean and variance in the initial training stage; after
    # sufficient training, switch to moving mean and variance.
    inv = tf_utils.smart_cond(bn_training, lambda: batch_inv, lambda: mv_inv)
  elif self.folding_mode == "ema_stats_folding":
    # We always scale the weights with a correction factor to the long term
    # statistics prior to quantization. This ensures that there is no jitter
    # in the quantized weights due to batch to batch variation. During the
    # initial phase of training, we undo the scaling of the weights so that
    # outputs are identical to regular batch normalization. We also modify
    # the bias terms correspondingly. After sufficient training, switch from
    # using batch statistics to long term moving averages for batch
    # normalization.
    #
    # Use batch stats for calculating the bias before the bn freeze, and
    # moving stats afterwards. Moving stats are always used to fold the
    # kernel in tflite; before the bn freeze, an additional correction factor
    # is applied to the depthwise_conv2d output.
    inv = mv_inv

  # For DepthwiseConv2D, inv needs to be broadcast to the last 2 dimensions
  # of the kernels.
  depthwise_weights_shape = [
      depthwise_kernel.get_shape().as_list()[2],
      depthwise_kernel.get_shape().as_list()[3]
  ]
  inv = array_ops.reshape(inv, depthwise_weights_shape)

  # Wrap conv kernel with bn parameters.
  folded_depthwise_kernel = inv * depthwise_kernel

  # Quantize the folded kernel.
  if self.depthwise_quantizer is not None:
    q_folded_depthwise_kernel = self.depthwise_quantizer_internal(
        folded_depthwise_kernel)
  else:
    q_folded_depthwise_kernel = folded_depthwise_kernel

  # If loaded from a ckpt, bias_quantizer is the ckpt value. Else if
  # bias_quantizer is not specified, it is None and we need to calculate the
  # bias quantizer type according to the accumulator type. The user can call
  # bn_folding_utils.populate_bias_quantizer_for_folded_layers(
  #     model, input_quantizer_list]) to populate such a bias quantizer.
  if self.bias_quantizer is not None:
    q_folded_bias = self.bias_quantizer_internal(folded_bias)
  else:
    q_folded_bias = folded_bias

  applied_kernel = q_folded_depthwise_kernel
  applied_bias = q_folded_bias

  # Calculate the depthwise_conv2d output using the quantized folded kernel.
  folded_outputs = tf.keras.backend.depthwise_conv2d(
      inputs,
      applied_kernel,
      strides=self.strides,
      padding=self.padding,
      dilation_rate=self.dilation_rate,
      data_format=self.data_format)

  if training is True and self.folding_mode == "ema_stats_folding":
    batch_inv = math_ops.rsqrt(variance + self.batchnorm.epsilon)
    y_corr = tf_utils.smart_cond(
        bn_training,
        lambda: (math_ops.sqrt(moving_variance + self.batchnorm.epsilon) *
                 math_ops.rsqrt(variance + self.batchnorm.epsilon)),
        lambda: tf.constant(1.0, shape=moving_variance.shape))
    folded_outputs = math_ops.mul(folded_outputs, y_corr)

  folded_outputs = tf.keras.backend.bias_add(
      folded_outputs, applied_bias, data_format=self.data_format)
  if self.activation is not None:
    return self.activation(folded_outputs)

  return folded_outputs
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, true if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails.
  """
  for match in _FindFusedBatchNorms(graph):
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts with
    # `scope`.
    with graph.as_default(), graph.name_scope(scope + sep):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        correction_scale, correction_recip, correction_offset = None, None, None
        if is_training:
          correction_scale, correction_recip, correction_offset = (
              _ComputeBatchNormCorrections(
                  context='',
                  match=match,
                  freeze_batch_norm_delay=freeze_batch_norm_delay,
                  fused_batch_norm=True))
        # The shape of depthwise weights is different, so we need to reshape
        # the multiplier_tensor to ensure that the scaled_weight_tensor has
        # the expected shape.
        weights = match.weight_tensor
        if match.layer_op.type == 'DepthwiseConv2dNative':
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')

          if correction_scale is not None:
            correction_scale = array_ops.reshape(
                correction_scale, new_shape, name='correction_reshape')

      if correction_scale is not None:
        weights = math_ops.multiply(
            correction_scale, weights, name='correction_mult')

      scaled_weight_tensor = math_ops.multiply(
          weights, multiplier_tensor, name='mul_fold')
      new_layer_tensor = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor)

      if correction_recip is not None:
        new_layer_tensor = math_ops.multiply(
            correction_recip, new_layer_tensor, name='post_conv_mul')
        new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset),
                                        'correction_add')

      bias_add_tensor = math_ops.add(
          new_layer_tensor, bias_tensor, name='add_fold')

      nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                     match.output_tensor)
      if nodes_modified_count == 0:
        raise ValueError('Folding batch norms failed, %s had no outputs.' %
                         match.output_tensor.name)
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
  """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.

  Implemented with GPU-compatible ops and supports gradients.

  [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of
  taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s
  MFCCs use a particular scaling of the DCT-II which is almost orthogonal
  normalization. We follow this convention.

  All `num_mel_bins` MFCCs are returned and it is up to the caller to select
  a subset of the MFCCs based on their application. For example, it is typical
  to only use the first few for speech recognition, as this results in an
  approximately pitch-invariant representation of the signal.

  For example:

  ```python
  sample_rate = 16000.0
  # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
  pcm = tf.placeholder(tf.float32, [None, None])

  # A 1024-point STFT with frames of 64 ms and 75% overlap.
  stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
                                 fft_length=1024)
  spectrograms = tf.abs(stfts)

  # Warp the linear scale spectrograms into the mel-scale.
  num_spectrogram_bins = stfts.shape[-1].value
  lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
  linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
    num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
    upper_edge_hertz)
  mel_spectrograms = tf.tensordot(
    spectrograms, linear_to_mel_weight_matrix, 1)
  mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
    linear_to_mel_weight_matrix.shape[-1:]))

  # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
  log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)

  # Compute MFCCs from log_mel_spectrograms and take the first 13.
  mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
    log_mel_spectrograms)[..., :13]
  ```

  Args:
    log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of
      log-magnitude mel-scale spectrograms.
    name: An optional name for the operation.

  Returns:
    A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of
    `log_mel_spectrograms`.

  Raises:
    ValueError: If `num_mel_bins` is not positive.

  [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
  [htk]: https://en.wikipedia.org/wiki/HTK_(software)
  """
  with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
                      [log_mel_spectrograms]):
    # Compute the DCT-II of the resulting log-magnitude mel-scale spectrogram.
    # The DCT used in HTK scales every basis vector by sqrt(2/N), which is the
    # scaling required for an "orthogonal" DCT-II *except* in the 0th bin,
    # where the true orthogonal DCT (as implemented by scipy) scales by
    # sqrt(1/N). For this reason, we don't apply orthogonal normalization and
    # scale the DCT by `0.5 * sqrt(2/N)` manually.
    log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
                                                 dtype=dtypes.float32)
    if (log_mel_spectrograms.shape.ndims and
        log_mel_spectrograms.shape.dims[-1].value is not None):
      num_mel_bins = log_mel_spectrograms.shape.dims[-1].value
      if num_mel_bins == 0:
        raise ValueError('num_mel_bins must be positive. Got: %s' %
                         log_mel_spectrograms)
    else:
      num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]

    dct2 = spectral_ops.dct(log_mel_spectrograms)
    return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
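# Illustrative NumPy check of the scaling note in the docstring above:
# multiplying an unnormalized DCT-II by rsqrt(2N) matches the orthogonally
# normalized DCT-II in every bin except bin 0, which comes out sqrt(2) larger
# (the HTK convention this function follows).
import numpy as np

n = 8
x = np.random.RandomState(0).randn(n)
k = np.arange(n)[:, None]
m = np.arange(n)[None, :]
dct2 = 2.0 * np.sum(x[None, :] * np.cos(np.pi * (m + 0.5) * k / n), axis=1)

htk = dct2 / np.sqrt(2.0 * n)
ortho = dct2 * np.where(np.arange(n) == 0,
                        np.sqrt(1.0 / (4 * n)),
                        np.sqrt(1.0 / (2 * n)))
print(np.allclose(htk[1:], ortho[1:]), htk[0] / ortho[0])  # True 1.414...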
def group_norm(inputs, groups=32, channels_axis=-1, reduction_axes=(-3, -2), center=True, scale=True, epsilon=1e-6, activation_fn=None, param_initializers=None, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None, mean_close_to_zero=False): """Functional interface for the group normalization layer. Reference: https://arxiv.org/abs/1803.08494. "Group Normalization", Yuxin Wu, Kaiming He Args: inputs: A Tensor with at least 2 dimensions one which is channels. All shape dimensions must be fully defined. groups: Integer. Divide the channels into this number of groups over which normalization statistics are computed. This number must be commensurate with the number of channels in `inputs`. channels_axis: An integer. Specifies index of channels axis which will be broken into `groups`, each of which whose statistics will be computed across. Must be mutually exclusive with `reduction_axes`. Preferred usage is to specify negative integers to be agnostic as to whether a batch dimension is included. reduction_axes: Tuple of integers. Specifies dimensions over which statistics will be accumulated. Must be mutually exclusive with `channels_axis`. Statistics will not be accumulated across axes not specified in `reduction_axes` nor `channel_axis`. Preferred usage is to specify negative integers to be agnostic to whether a batch dimension is included. Some sample usage cases: NHWC format: channels_axis=-1, reduction_axes=[-3, -2] NCHW format: channels_axis=-3, reduction_axes=[-2, -1] center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. epsilon: Small float added to variance to avoid dividing by zero. activation_fn: Activation function, default set to None to skip it and maintain a linear activation. param_initializers: Optional initializers for beta, gamma, moving mean and moving variance. reuse: Whether or not the layer and its variables should be reused. To be able to reuse the layer scope must be given. variables_collections: Optional collections for the variables. outputs_collections: Collections to add the outputs. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). scope: Optional scope for `variable_scope`. mean_close_to_zero: The mean of `input` before ReLU will be close to zero when batch size >= 4k for Resnet-50 on TPU. If `True`, use `nn.sufficient_statistics` and `nn.normalize_moments` to calculate the variance. This is the same behavior as `fused` equals `True` in batch normalization. If `False`, use `nn.moments` to calculate the variance. When `mean` is close to zero, like 1e-4, use `mean` to calculate the variance may have poor result due to repeated roundoff error and denormalization in `mean`. When `mean` is large, like 1e2, sum(`input`^2) is so large that only the high-order digits of the elements are being accumulated. Thus, use sum(`input` - `mean`)^2/n to calculate the variance has better accuracy compared to (sum(`input`^2)/n - `mean`^2) when `mean` is large. Returns: A `Tensor` representing the output of the operation. Raises: ValueError: If the rank of `inputs` is undefined. ValueError: If rank or channels dimension of `inputs` is undefined. ValueError: If number of groups is not commensurate with number of channels. 
    ValueError: If reduction_axes or channels_axis are out of bounds.
    ValueError: If reduction_axes are not mutually exclusive with
      channels_axis.
  """
  # TODO(shlens): Support partially defined shapes for the inputs.
  inputs = ops.convert_to_tensor(inputs)
  original_shape = inputs.shape

  if inputs.shape.ndims is None:
    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
  if channels_axis > (inputs.shape.ndims - 1):
    raise ValueError('Axis is out of bounds.')

  # Standardize the channels_axis to be positive and identify # of channels.
  if channels_axis < 0:
    channels_axis = inputs.shape.ndims + channels_axis
  channels = inputs.shape[channels_axis].value

  if channels is None:
    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
        inputs.name, channels_axis))

  # Standardize the reduction_axes to be positive.
  reduction_axes = list(reduction_axes)
  for i in range(len(reduction_axes)):
    if reduction_axes[i] < 0:
      reduction_axes[i] += inputs.shape.ndims

  for a in reduction_axes:
    # Valid axes are 0 .. ndims - 1, so an axis equal to ndims is out of
    # bounds as well.
    if a >= inputs.shape.ndims:
      raise ValueError('Axis is out of bounds.')
    if inputs.shape[a].value is None:
      raise ValueError('Inputs %s has undefined dimensions %d.' % (
          inputs.name, a))
    if channels_axis == a:
      raise ValueError('reduction_axes must be mutually exclusive '
                       'with channels_axis')
  if groups > channels:
    raise ValueError('Invalid groups %d for %d channels.' %
                     (groups, channels))
  if channels % groups != 0:
    raise ValueError('%d channels is not commensurate with %d groups.' %
                     (channels, groups))

  # Determine axes before channels. Some examples of common image formats:
  #  'NCHW': before = [N], after = [HW]
  #  'NHWC': before = [NHW], after = []
  axes_before_channels = inputs.shape.as_list()[:channels_axis]
  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]

  # Manually broadcast the parameters to conform to the number of groups.
  params_shape_broadcast = ([1] * len(axes_before_channels) +
                            [groups, channels // groups] +
                            [1] * len(axes_after_channels))

  # Reshape the input by the group within the channel dimension.
  inputs_shape = (axes_before_channels + [groups, channels // groups] +
                  axes_after_channels)
  inputs = array_ops.reshape(inputs, inputs_shape)

  # Determine the dimensions across which moments are calculated.
  moments_axes = [channels_axis + 1]
  for a in reduction_axes:
    if a > channels_axis:
      moments_axes.append(a + 1)
    else:
      moments_axes.append(a)

  with variable_scope.variable_scope(
      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
    # Note that the params_shape is always the number of channels.
    params_shape = [channels]

    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    dtype = inputs.dtype.base_dtype
    if param_initializers is None:
      param_initializers = {}
    if center:
      beta_collections = utils.get_variable_collections(
          variables_collections, 'beta')
      beta_initializer = param_initializers.get(
          'beta', init_ops.zeros_initializer())
      beta = variables.model_variable('beta',
                                      shape=params_shape,
                                      dtype=dtype,
                                      initializer=beta_initializer,
                                      collections=beta_collections,
                                      trainable=trainable)
      beta = array_ops.reshape(beta, params_shape_broadcast)

    if scale:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma_initializer = param_initializers.get(
          'gamma', init_ops.ones_initializer())
      gamma = variables.model_variable('gamma',
                                       shape=params_shape,
                                       dtype=dtype,
                                       initializer=gamma_initializer,
                                       collections=gamma_collections,
                                       trainable=trainable)
      gamma = array_ops.reshape(gamma, params_shape_broadcast)

    # Calculate the moments.
if mean_close_to_zero: # One pass algorithm returns better result when mean is close to zero. counts, means_ss, variance_ss, _ = nn.sufficient_statistics( inputs, moments_axes, keep_dims=True) mean, variance = nn.normalize_moments( counts, means_ss, variance_ss, shift=None) else: mean, variance = nn.moments(inputs, moments_axes, keep_dims=True) # Compute normalization. # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor # appropriately so that this operation may be faster. gain = math_ops.rsqrt(variance + epsilon) offset = -mean * gain if gamma is not None: gain *= gamma offset *= gamma if beta is not None: offset += beta outputs = inputs * gain + offset # Collapse the groups into the channel dimension. outputs = array_ops.reshape(outputs, original_shape) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
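# A minimal NumPy sketch of the reshape-and-normalize logic above, assuming
# NHWC inputs (channels_axis=-1, reduction_axes=(-3, -2)); the shapes are
# hypothetical.
import numpy as np

x = np.random.randn(2, 4, 4, 8).astype(np.float32)  # [N, H, W, C]
groups, eps = 2, 1e-6
n, h, w, c = x.shape
g = x.reshape(n, h, w, groups, c // groups)  # split C into [groups, C//groups]
# Moments over H, W and the within-group channel axis (cf. moments_axes above).
mean = g.mean(axis=(1, 2, 4), keepdims=True)
var = g.var(axis=(1, 2, 4), keepdims=True)
y = ((g - mean) / np.sqrt(var + eps)).reshape(n, h, w, c)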
def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon, scale_after_normalization, shift_after_normalization): y = (x - m) * math_ops.rsqrt(v + epsilon) if scale_after_normalization: y = gamma * y return y + beta if shift_after_normalization else y
def call(self, inputs, training=None):
  if training is None:
    training = K.learning_phase()

  conv_out = super(_ConvBatchNorm2D, self).call(inputs)

  # Not all the computations in the batchnorm need to happen,
  # but this avoids duplicating code (e.g. moving_average).
  self.batchnorm.call(conv_out)

  folded_conv_kernel_multiplier = self.batchnorm.gamma * math_ops.rsqrt(
      self.batchnorm.moving_variance + self.batchnorm.epsilon)
  folded_conv_kernel = math_ops.mul(
      folded_conv_kernel_multiplier, self.kernel, name='folded_conv_kernel')

  folded_conv_bias = math_ops.subtract(
      self.batchnorm.beta,
      self.batchnorm.moving_mean * folded_conv_kernel_multiplier,
      name='folded_conv_bias')

  if self.is_quantized:
    def make_quantizer_fn(training):
      """Return quantizer conditioned on whether training or not."""
      def quantizer_fn():
        return self.weight_quantizer(
            folded_conv_kernel,
            self.optimizer_step,
            training,
            min_var=self._weight_min_var,
            max_var=self._weight_max_var)
      return quantizer_fn

    folded_conv_kernel = tf_utils.smart_cond(
        training, make_quantizer_fn(True), make_quantizer_fn(False))

  # Second convolution doesn't need new trainable weights, so we
  # cannot reuse Conv2D layer.
  # TODO(alanchiao):
  # 1. See if we can at least reuse the bias logic.
  # 2. See if we need to fork between conv2d and conv2d_v2 for
  # TensorFlow 1.XX and 2.XX.

  # Taken from keras/layers/convolutional.py:183
  if self.padding == 'causal':
    op_padding = 'valid'
  else:
    op_padding = self.padding
  if not isinstance(op_padding, (list, tuple)):
    op_padding = op_padding.upper()

  folded_conv_out = nn_ops.conv2d(
      inputs,
      folded_conv_kernel,
      strides=self.strides,
      padding=op_padding,
      data_format=conv_utils.convert_data_format(self.data_format,
                                                 self.rank + 2),
      dilations=self.dilation_rate,
      name='folded_conv_out',
  )

  # Taken from keras/layers/convolutional.py:200
  if self.data_format == 'channels_first':
    if self.rank == 1:
      # nn.bias_add does not accept a 1D input tensor, so add the reshaped
      # bias manually; assign to `outputs` so the code below sees it.
      bias = array_ops.reshape(folded_conv_bias, (1, self.filters, 1))
      outputs = folded_conv_out + bias
    else:
      outputs = nn.bias_add(
          folded_conv_out, folded_conv_bias, data_format='NCHW')
  else:
    outputs = nn.bias_add(
        folded_conv_out, folded_conv_bias, data_format='NHWC')

  if self.is_quantized:
    self.post_activation.training = training

  if self.post_activation is not None:
    return self.post_activation(outputs)
  return outputs
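# A scalar sketch of the fold identity used above, with hypothetical values:
# applying batch norm (with frozen moving statistics) after a convolution is
# the same as convolving with a kernel scaled by gamma/sqrt(var + eps) and
# adding the folded bias beta - mean * gamma/sqrt(var + eps).
import numpy as np

gamma, beta, mean, var, eps = 1.5, 0.2, 0.3, 4.0, 1e-3
conv_out = 2.7  # stand-in for a single convolution activation
multiplier = gamma / np.sqrt(var + eps)  # folded_conv_kernel_multiplier
bn_out = (conv_out - mean) * multiplier + beta  # batch norm after the conv
folded_out = conv_out * multiplier + (beta - mean * multiplier)  # folded form
assert np.isclose(bn_out, folded_out)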
def _variance_scale_term(self): """Helper to `_covariance` and `_variance` which computes a shared scale.""" return math_ops.rsqrt(1. + self.total_concentration[..., None])
def _variance_scale_term(self): """Helper to `_covariance` and `_variance` which computes a shared scale.""" return math_ops.rsqrt(1. + self.total_concentration[..., array_ops.newaxis])
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

  Types I, II, III and IV are supported.
  Type I is implemented using a length `2N` padded `tf.signal.rfft`.
  Type II is implemented using a length `2N` padded `tf.signal.rfft`, as
  described here: [Type 2 DCT using 2N FFT padded (Makhoul)]
  (https://dsp.stackexchange.com/a/10606).
  Type III is a fairly straightforward inverse of Type II
  (i.e. using a length `2N` padded `tf.signal.irfft`).
  Type IV is calculated through a 2N length DCT-II of the padded signal,
  picking the odd indices.

  @compatibility(scipy)
  Equivalent to [scipy.fftpack.dct]
  (https://docs.scipy.org/doc/scipy-1.4.0/reference/generated/scipy.fftpack.dct.html)
  for Type-I, Type-II, Type-III and Type-IV DCT.
  @end_compatibility

  Args:
    input: A `[..., samples]` `float32`/`float64` `Tensor` containing the
      signals to take the DCT of.
    type: The DCT type to perform. Must be 1, 2, 3 or 4.
    n: The length of the transform. If `n` is less than the sequence length,
      only the first `n` elements of the sequence are considered for the DCT.
      If `n` is greater than the sequence length, the sequence is zero-padded
      and the DCT is then computed as usual.
    axis: For future expansion. The axis to compute the DCT along. Must be
      `-1`.
    norm: The normalization to apply. `None` for no normalization or `'ortho'`
      for orthonormal normalization.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `float32`/`float64` `Tensor` containing the DCT of
    `input`.

  Raises:
    ValueError: If `type` is not `1`, `2`, `3` or `4`, `axis` is not `-1`,
      `n` is neither `None` nor greater than 0, or `norm` is not `None` or
      `'ortho'`.
    ValueError: If `type` is `1` and `norm` is `'ortho'`.

  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
  """
  _validate_dct_arguments(input, type, n, axis, norm)
  with _ops.name_scope(name, "dct", [input]):
    input = _ops.convert_to_tensor(input)
    zero = _ops.convert_to_tensor(0.0, dtype=input.dtype)

    seq_len = (
        tensor_shape.dimension_value(input.shape[-1]) or
        _array_ops.shape(input)[-1])
    if n is not None:
      if n <= seq_len:
        input = input[..., 0:n]
      else:
        rank = len(input.shape)
        padding = [[0, 0] for _ in range(rank)]
        padding[rank - 1][1] = n - seq_len
        padding = _ops.convert_to_tensor(padding, dtype=_dtypes.int32)
        input = _array_ops.pad(input, paddings=padding)

    axis_dim = (
        tensor_shape.dimension_value(input.shape[-1]) or
        _array_ops.shape(input)[-1])
    axis_dim_float = _math_ops.cast(axis_dim, input.dtype)

    if type == 1:
      dct1_input = _array_ops.concat([input, input[..., -2:0:-1]], axis=-1)
      dct1 = _math_ops.real(fft_ops.rfft(dct1_input))
      return dct1

    if type == 2:
      scale = 2.0 * _math_ops.exp(
          _math_ops.complex(
              zero, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
              axis_dim_float))

      # TODO(rjryan): Benchmark performance and memory usage of the various
      # approaches to computing a DCT via the RFFT.
      dct2 = _math_ops.real(
          fft_ops.rfft(
              input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)

      if norm == "ortho":
        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
        n2 = n1 * _math.sqrt(2.0)
        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
        weights = _array_ops.pad(
            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
            constant_values=n2)
        dct2 *= weights

      return dct2

    elif type == 3:
      if norm == "ortho":
        n1 = _math_ops.sqrt(axis_dim_float)
        n2 = n1 * _math.sqrt(0.5)
        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
weights = _array_ops.pad(_array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]], constant_values=n2) input *= weights else: input *= axis_dim_float scale = 2.0 * _math_ops.exp( _math_ops.complex( zero, _math_ops.range(axis_dim_float) * _math.pi * 0.5 / axis_dim_float)) dct3 = _math_ops.real( fft_ops.irfft(scale * _math_ops.complex(input, zero), fft_length=[2 * axis_dim]))[..., :axis_dim] return dct3 elif type == 4: # DCT-2 of 2N length zero-padded signal, unnormalized. dct2 = dct(input, type=2, n=2 * axis_dim, axis=axis, norm=None) # Get odd indices of DCT-2 of zero padded 2N signal to obtain # DCT-4 of the original N length signal. dct4 = dct2[..., 1::2] if norm == "ortho": dct4 *= _math.sqrt(0.5) * _math_ops.rsqrt(axis_dim_float) return dct4
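# A round-trip sketch for the transform above, assuming the TF 2.x packaging
# where this function ships as tf.signal.dct: with orthonormal scaling, the
# DCT-III is the inverse of the DCT-II.
import numpy as np
import tensorflow as tf

x = tf.constant(np.random.randn(8).astype(np.float32))
y = tf.signal.dct(x, type=2, norm='ortho')
x_back = tf.signal.dct(y, type=3, norm='ortho')
np.testing.assert_allclose(x.numpy(), x_back.numpy(), atol=1e-5)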
def _BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training=True): """Returns the gradients for the 3 inputs of BatchNorm. Args: grad_y: A `Tensor` of 4 dimensions for gradient for y. x: A `Tensor` of 4 dimensions for x. scale: A `Tensor` of 1 dimension for scaling. pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when is_training=False. pop_var: A `Tensor` of 1 dimension for the population variance. Only used when is_training=False. epsilon: A small float number added to the variance of x. data_format: The data format for input. Either b"NHWC" or b"NCHW". is_training: A bool value to indicate the operation is for training (default) or inference. Returns: A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient for x, grad_scale the gradient for scale, and grad_offset the gradient for offset. """ x_dtype = x.dtype.base_dtype if x_dtype == dtypes.float16: # float16 math is too imprecise, so we do the batch norm gradient # computations in float32. x = math_ops.cast(x, dtypes.float32) grad_y = math_ops.cast(grad_y, dtypes.float32) if is_training: if data_format == b"NHWC": keepdims = False reduce_axis = [0, 1, 2] else: keepdims = True reduce_axis = [0, 2, 3] shape = [1, array_ops.size(scale), 1, 1] scale = array_ops.reshape(scale, shape) mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims) mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims) var_x = math_ops.reduce_mean( math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)), reduce_axis, keepdims=keepdims) grad_y_offset = grad_y - mean_grad_y x_offset = x - mean_x mean = math_ops.reduce_mean( grad_y * x_offset, axis=reduce_axis, keepdims=keepdims) grad_x = scale * math_ops.rsqrt(var_x + epsilon) * ( grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset) grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum( grad_y * x_offset, axis=reduce_axis, keepdims=keepdims) if data_format == b"NCHW": grad_scale = array_ops.squeeze(grad_scale) grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis) return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset else: if data_format == b"NHWC": reduce_axis = [0, 1, 2] else: reduce_axis = [0, 2, 3] shape = [1, array_ops.size(pop_mean), 1, 1] pop_mean = array_ops.reshape(pop_mean, shape) pop_var = array_ops.reshape(pop_var, shape) scale = array_ops.reshape(scale, shape) grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis) var_rsqrt = math_ops.rsqrt(pop_var + epsilon) grad_scale = math_ops.reduce_sum( grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis) grad_x = grad_y * scale * var_rsqrt return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
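# A sketch cross-checking the training-mode grad_x expression above against
# autodiff; TF 2.x eager mode is assumed, and a [N, C] layout stands in for
# the 4-D NHWC case with reduce_axis = [0].
import numpy as np
import tensorflow as tf

x = tf.constant(np.random.randn(8, 3).astype(np.float32))
scale = tf.constant([1.5, 0.5, 2.0])
eps = 1e-3
with tf.GradientTape() as tape:
  tape.watch(x)
  mean_x = tf.reduce_mean(x, axis=0)
  var_x = tf.reduce_mean(
      tf.math.squared_difference(x, tf.stop_gradient(mean_x)), axis=0)
  y = scale * (x - mean_x) * tf.math.rsqrt(var_x + eps)
  loss = tf.reduce_sum(0.5 * y * y)  # arbitrary scalar loss; grad_y == y
autodiff_grad_x = tape.gradient(loss, x)

grad_y = y
mean_grad_y = tf.reduce_mean(grad_y, axis=0)
x_offset = x - mean_x
m = tf.reduce_mean(grad_y * x_offset, axis=0)
manual_grad_x = scale * tf.math.rsqrt(var_x + eps) * (
    grad_y - mean_grad_y - m * x_offset / (var_x + eps))
np.testing.assert_allclose(
    autodiff_grad_x.numpy(), manual_grad_x.numpy(), atol=1e-4)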
def BN0(x):
  mean = math_ops.reduce_mean(x, [0])
  # Biased variance, reduced over *all* axes, so `var` is a scalar.
  var = math_ops.reduce_mean(math_ops.square(x - mean))
  rstd = math_ops.rsqrt(var + 1e-8)
  return (x - mean) * rstd
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

  Currently only Types II and III are supported. Type II is implemented using a
  length `2N` padded `tf.spectral.rfft`, as described here:
  https://dsp.stackexchange.com/a/10606. Type III is a fairly straightforward
  inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`).

  @compatibility(scipy)
  Equivalent to scipy.fftpack.dct for Type-II and Type-III DCT.
  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
  @end_compatibility

  Args:
    input: A `[..., samples]` `float32` `Tensor` containing the signals to
      take the DCT of.
    type: The DCT type to perform. Must be 2 or 3.
    n: For future expansion. The length of the transform. Must be `None`.
    axis: For future expansion. The axis to compute the DCT along. Must be
      `-1`.
    norm: The normalization to apply. `None` for no normalization or `'ortho'`
      for orthonormal normalization.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.

  Raises:
    ValueError: If `type` is not `2` or `3`, `n` is not `None`, `axis` is not
      `-1`, or `norm` is not `None` or `'ortho'`.

  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
  """
  _validate_dct_arguments(type, n, axis, norm)
  with _ops.name_scope(name, "dct", [input]):
    # We use the RFFT to compute the DCT and TensorFlow only supports float32
    # for FFTs at the moment.
    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)

    axis_dim = (tensor_shape.dimension_value(input.shape[-1])
                or _array_ops.shape(input)[-1])
    axis_dim_float = _math_ops.to_float(axis_dim)
    if type == 2:
      scale = 2.0 * _math_ops.exp(
          _math_ops.complex(
              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
              axis_dim_float))

      # TODO(rjryan): Benchmark performance and memory usage of the various
      # approaches to computing a DCT via the RFFT.
      dct2 = _math_ops.real(
          rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)

      if norm == "ortho":
        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
        n2 = n1 * _math_ops.sqrt(2.0)
        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
        weights = _array_ops.pad(
            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
            constant_values=n2)
        dct2 *= weights

      return dct2

    elif type == 3:
      if norm == "ortho":
        n1 = _math_ops.sqrt(axis_dim_float)
        n2 = n1 * _math_ops.sqrt(0.5)
        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
        weights = _array_ops.pad(
            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
            constant_values=n2)
        input *= weights
      else:
        input *= axis_dim_float
      scale = 2.0 * _math_ops.exp(
          _math_ops.complex(
              0.0, _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
              axis_dim_float))
      dct3 = _math_ops.real(
          irfft(scale * _math_ops.complex(input, 0.0),
                fft_length=[2 * axis_dim]))[..., :axis_dim]

      return dct3
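# A sketch of the tf.pad trick used above to build the [n1, n2, n2, ...]
# normalization weights without a concat; the values are hypothetical and the
# TF 2.x public API is assumed.
import tensorflow as tf

axis_dim = 5
n1 = tf.constant(0.25)
n2 = tf.constant(0.5)
weights = tf.pad(
    tf.expand_dims(n1, 0), [[0, axis_dim - 1]], constant_values=n2)
# weights is now [0.25, 0.5, 0.5, 0.5, 0.5].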
def b_n(value, mean, variance, beta, gamma, epsilon):
  # beta is the batch-norm offset and gamma the scale;
  # rsqrt(x) computes 1 / sqrt(x).
  inv = math_ops.rsqrt(variance + epsilon)
  inv *= gamma
  return value * inv + (beta - mean * inv)
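# An equivalence sketch for the helper above, assuming the TF 2.x public API:
# the inv/offset formulation matches tf.nn.batch_normalization.
import numpy as np
import tensorflow as tf

value = tf.constant(np.random.randn(4, 3).astype(np.float32))
mean, variance = tf.nn.moments(value, axes=[0])
beta = tf.constant([0.1, 0.2, 0.3])   # offset
gamma = tf.constant([1.5, 0.5, 2.0])  # scale
eps = 1e-3
inv = tf.math.rsqrt(variance + eps) * gamma
out = value * inv + (beta - mean * inv)
ref = tf.nn.batch_normalization(value, mean, variance, beta, gamma, eps)
np.testing.assert_allclose(out.numpy(), ref.numpy(), atol=1e-5)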
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
                                 fused_batch_norm):
  """Computes batch norm correction params.

     Before batch normalization is frozen:
     We use batch statistics for batch norm.
       correction_scale = sigma_b/sigma_mv
       correction_recip = 1/correction_scale
       correction_offset = 0

     After batch normalization is frozen:
       correction_scale = sigma_b/sigma_mv
       correction_recip = 1
       correction_offset = gamma*(mu_b/sigma_b-mu_mv/sigma_mv).

     Batch norm is frozen if global_step > freeze_batch_norm_delay.
     The corrections ensure that:
     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
     smoother training as the scaling on the weights changes slowly, rather
     than jumping across mini-batches.
     b) Changing the values of the corrections allows one to switch from using
     batch statistics to using the moving mean and variance, without requiring
     changes to batch_norm.

  Args:
    context: The scope under which we look for batch norm params.
    match: Object containing required batch norm tensors for correction
      computation.
    freeze_batch_norm_delay: Delay in steps at which computation switches
      from regular batch norm to frozen mean and variance.
    fused_batch_norm: Bool, true if fused batch norm is used.

  Returns:
    A tuple of correction_scale, correction_recip, correction_offset
  """

  g = ops.get_default_graph()
  prefix = '' if not context else context + '/'
  with g.name_scope(prefix + 'batch_norm_correction'):
    recip_sigma_mv = math_ops.rsqrt(
        match.moving_variance_tensor + match.batch_epsilon)
    recip_sigma = math_ops.rsqrt(match.variance_tensor + match.batch_epsilon)
    correction_scale = math_ops.divide(
        recip_sigma_mv, recip_sigma, name='scale_compute')
    correction_scale = array_ops.identity(
        correction_scale, name='correction_scale')
    correction_recip = math_ops.reciprocal(
        correction_scale, name='reciprocal_compute')
    correction_offset = math_ops.multiply(
        match.gamma_tensor,
        match.mean_tensor * recip_sigma -
        match.moving_mean_tensor * recip_sigma_mv,
        name='offset_compute')

    if freeze_batch_norm_delay is not None:
      use_mv_avg = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          freeze_batch_norm_delay,
          name='use_moving_average')
    else:
      use_mv_avg = False

    bn_decay_zero = 0.0
    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())

    bn_decay_mean_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_mean_tensor,
        name='freeze_moving_mean')
    graph_editor.reroute_ts(
        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
        can_modify=bn_decay_mean_consumers)

    if not fused_batch_norm:
      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
      bn_decay_var_out = utils.smart_cond(
          use_mv_avg,
          lambda: bn_decay_zero,
          lambda: match.bn_decay_var_tensor,
          name='freeze_moving_var')
      graph_editor.reroute_ts(
          [bn_decay_var_out], [match.bn_decay_var_tensor],
          can_modify=bn_decay_var_consumers)

    correction_recip = utils.smart_cond(
        use_mv_avg,
        lambda: array_ops.ones(correction_scale.shape),
        lambda: correction_recip,
        name='correction_recip')

    correction_offset = utils.smart_cond(
        use_mv_avg,
        lambda: correction_offset,
        lambda: array_ops.zeros(correction_offset.shape),
        name='correction_offset')
  return correction_scale, correction_recip, correction_offset
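# A numeric sketch of the correction identities above (hypothetical values):
# before freezing, the weights seen by the quantizer carry the stable
# gamma/sigma_mv scale, and correction_recip restores the batch-statistics
# scale after the convolution.
import numpy as np

gamma, sigma_b, sigma_mv, w = 1.3, 2.0, 3.0, 0.7
multiplier = gamma / sigma_b            # fold multiplier from batch statistics
correction_scale = sigma_b / sigma_mv
quantizer_input = w * correction_scale * multiplier
assert np.isclose(quantizer_input, w * gamma / sigma_mv)  # stable across batches
restored = quantizer_input * (1.0 / correction_scale)     # correction_recip
assert np.isclose(restored, w * gamma / sigma_b)          # what batch norm applies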
def batch_norm_slow(tensor, mean, variance, beta, gamma, scale): batch_norm = (tensor - mean) * math_ops.rsqrt(variance + 0.001) if scale: batch_norm *= gamma return batch_norm + beta
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): """Computes [MFCCs][mfcc] of `log_mel_spectrograms`. Implemented with GPU-compatible ops and supports gradients. [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s MFCCs use a particular scaling of the DCT-II which is almost orthogonal normalization. We follow this convention. All `num_mel_bins` MFCCs are returned and it is up to the caller to select a subset of the MFCCs based on their application. For example, it is typical to only use the first few for speech recognition, as this results in an approximately pitch-invariant representation of the signal. For example: ```python sample_rate = 16000.0 # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1]. pcm = tf.compat.v1.placeholder(tf.float32, [None, None]) # A 1024-point STFT with frames of 64 ms and 75% overlap. stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256, fft_length=1024) spectrograms = tf.abs(stfts) # Warp the linear scale spectrograms into the mel-scale. num_spectrogram_bins = stfts.shape[-1].value lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80 linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix( num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz, upper_edge_hertz) mel_spectrograms = tf.tensordot( spectrograms, linear_to_mel_weight_matrix, 1) mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate( linear_to_mel_weight_matrix.shape[-1:])) # Compute a stabilized log to get log-magnitude mel-scale spectrograms. log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6) # Compute MFCCs from log_mel_spectrograms and take the first 13. mfccs = tf.signal.mfccs_from_log_mel_spectrograms( log_mel_spectrograms)[..., :13] ``` Args: log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of log-magnitude mel-scale spectrograms. name: An optional name for the operation. Returns: A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of `log_mel_spectrograms`. Raises: ValueError: If `num_mel_bins` is not positive. [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum [htk]: https://en.wikipedia.org/wiki/HTK_(software) """ with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms', [log_mel_spectrograms]): # Compute the DCT-II of the resulting log-magnitude mel-scale spectrogram. # The DCT used in HTK scales every basis vector by sqrt(2/N), which is the # scaling required for an "orthogonal" DCT-II *except* in the 0th bin, where # the true orthogonal DCT (as implemented by scipy) scales by sqrt(1/N). For # this reason, we don't apply orthogonal normalization and scale the DCT by # `0.5 * sqrt(2/N)` manually. log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms, dtype=dtypes.float32) if (log_mel_spectrograms.shape.ndims and log_mel_spectrograms.shape.dims[-1].value is not None): num_mel_bins = log_mel_spectrograms.shape.dims[-1].value if num_mel_bins == 0: raise ValueError('num_mel_bins must be positive. Got: %s' % log_mel_spectrograms) else: num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1] dct2 = dct_ops.dct(log_mel_spectrograms, type=2) return dct2 * math_ops.rsqrt( math_ops.cast(num_mel_bins, dtypes.float32) * 2.0)
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, true if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails.
  """
  for match in _FindFusedBatchNorms(graph):
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts
    # with `scope`.
    with graph.as_default(), graph.name_scope(scope + sep):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        correction_scale, correction_recip, correction_offset = None, None, None
        if is_training:
          # This is the fused-batch-norm path, so pass fused_batch_norm=True
          # to match the signature of _ComputeBatchNormCorrections above.
          correction_scale, correction_recip, correction_offset = (
              _ComputeBatchNormCorrections(
                  context='',
                  match=match,
                  freeze_batch_norm_delay=freeze_batch_norm_delay,
                  fused_batch_norm=True))
        # The shape of depthwise weights is different, so we need to reshape
        # the multiplier_tensor to ensure that the scaled_weight_tensor has
        # the expected shape.
        weights = match.weight_tensor
        if match.layer_op.type == 'DepthwiseConv2dNative':
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')

          if correction_scale is not None:
            correction_scale = array_ops.reshape(
                correction_scale, new_shape, name='correction_reshape')

        if correction_scale is not None:
          weights = math_ops.multiply(
              correction_scale, weights, name='correction_mult')

        scaled_weight_tensor = math_ops.multiply(
            weights, multiplier_tensor, name='mul_fold')

      new_layer_tensor = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor,
          match.batch_to_space_op)

      if correction_recip is not None:
        new_layer_tensor = math_ops.multiply(
            correction_recip, new_layer_tensor, name='post_conv_mul')
        new_layer_tensor = math_ops.add(
            new_layer_tensor, correction_offset, name='correction_add')

      bias_add_tensor = math_ops.add(
          new_layer_tensor, bias_tensor, name='add_fold')

      nodes_modified_count = common.RerouteTensor(
          bias_add_tensor, match.output_tensor)
      if nodes_modified_count == 0:
        raise ValueError('Folding batch norms failed, %s had no outputs.' %
                         match.output_tensor.name)
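# A shape sketch of the depthwise reshape above, with hypothetical sizes:
# depthwise kernels are [H, W, in_channels, channel_multiplier], while the
# batch-norm multiplier has one entry per output channel
# (in_channels * channel_multiplier), so it is reshaped to
# [in_channels, channel_multiplier] to broadcast against the kernel.
import numpy as np

h, w, in_ch, mult = 3, 3, 8, 2
weights = np.random.randn(h, w, in_ch, mult).astype(np.float32)
multiplier = np.random.randn(in_ch * mult).astype(np.float32)
scaled = weights * multiplier.reshape(in_ch, mult)  # broadcasts over [H, W]
assert scaled.shape == weights.shape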