예제 #1
0
  def _apply_noisy_update(self, mom, grad):
    """Builds the noisy update of preconditioned Langevin dynamics.

    Args:
      mom: Tensor combined with `self._diagonal_bias` to form the diagonal
        preconditioner `rsqrt(mom + diagonal_bias)`.
      grad: Gradient tensor. Its dtype drives every cast below and its shape
        is the shape of the injected noise.

    Returns:
      A tensor shaped like `grad`: the preconditioned gradient term scaled by
      `self._num_pseudo_batches`, plus zero-mean Gaussian noise. The noise is
      disabled (scale 0) until `self._counter` exceeds `self._burnin`.
    """
    # Noise scale is rsqrt(learning_rate) after burn-in and zero before it.
    noise_scale = array_ops.where(
        array_ops.squeeze(self._counter > self._burnin),
        math_ops.cast(math_ops.rsqrt(self._learning_rate), grad.dtype),
        array_ops.zeros([], grad.dtype))

    preconditioner = math_ops.rsqrt(
        mom + math_ops.cast(self._diagonal_bias, grad.dtype))

    drift = 0.5 * preconditioner * grad * math_ops.cast(
        self._num_pseudo_batches, grad.dtype)

    # The second positional argument of random_normal is `mean`, not
    # `stddev`. Passing a bare 1.0 there produced noise centred at 1, which
    # biases every step. Langevin noise must be standard normal, so the
    # moments are spelled out by keyword.
    noise = random_ops.random_normal(
        array_ops.shape(grad), mean=0.0, stddev=1.0, dtype=grad.dtype)

    return drift + noise * noise_scale * math_ops.sqrt(preconditioner)
예제 #2
0
 def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon,
                   scale_after_normalization):
   """Batch normalization assembled from elementwise math ops.

   Computes `(x - m) * rsqrt(v + epsilon)`, multiplies the result by `gamma`
   when `scale_after_normalization` is truthy, and finally adds `beta`.
   """
   normalized = (x - m) * math_ops.rsqrt(v + epsilon)
   if scale_after_normalization:
     normalized = gamma * normalized
   return normalized + beta
def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
  """Estimates the sliced Wasserstein distance between two sample matrices.

  For every sampling round a Gaussian projection matrix is drawn and each of
  its columns is rescaled to unit L2 norm. Both inputs are projected, passed
  through `_sort_rows`, and compared by mean absolute difference. The
  per-round values are then averaged.

  Args:
      a: (matrix) Samples of distribution "a", laid out as (row, col).
      b: (matrix) Samples of distribution "b", laid out as (row, col).
        NOTE(review): the row count of `a` is also used when sorting `b`, so
        both presumably need the same number of rows -- confirm with callers.
      random_sampling_count: (int) How many random projections to average.
      random_projection_dim: (int) Number of columns of each projection.
  Returns:
      The mean over all rounds of the per-round distance between "a" and "b".
  """
  shape_a = array_ops.shape(a)
  per_round = []
  for _ in range(random_sampling_count):
    # Gaussian directions, one per column, normalised to unit length.
    directions = random_ops.random_normal(
        [array_ops.shape(a)[1], random_projection_dim])
    column_sq_norms = math_ops.reduce_sum(
        math_ops.square(directions), 0, keepdims=True)
    directions = directions * math_ops.rsqrt(column_sq_norms)
    # Project first, then sort each projected distribution.
    projected_a = math_ops.matmul(a, directions)
    projected_b = math_ops.matmul(b, directions)
    sorted_a = _sort_rows(projected_a, shape_a[0])
    sorted_b = _sort_rows(projected_b, shape_a[0])
    # Mean absolute gap between the sorted projections.
    per_round.append(math_ops.reduce_mean(math_ops.abs(sorted_a - sorted_b)))
  return math_ops.reduce_mean(per_round)
예제 #4
0
 def _batch_norm(self, x, mean, var, offset, scale, epsilon):
   """Applies batch normalization by hand and returns it in `x`'s dtype.

   `x` is cast to `scale.dtype` for the arithmetic and the result is cast
   back to `x.dtype` before returning.
   """
   # Done manually because nn_impl.batch_normalization does not support
   # float16 yet.
   # TODO(reedwm): Add float16 support to nn_impl.batch_normalization.
   multiplier = math_ops.rsqrt(var + epsilon) * scale
   scaled = math_ops.cast(x, scale.dtype) * multiplier
   shifted = scaled + (offset - mean * multiplier)
   return math_ops.cast(shifted, x.dtype)
예제 #5
0
파일: nn.py 프로젝트: BersaKAIN/tensorflow
def l2_normalize(x, dim, epsilon=1e-12, name=None):
  """Scales `x` to unit L2 norm along dimension `dim`.

  The result is

      output = x * rsqrt(max(sum(x**2 along dim), epsilon))

  so every 1-D slice along `dim` is normalized independently, and slices
  whose squared norm falls below `epsilon` are divided by `sqrt(epsilon)`
  instead of their own norm.

  Args:
    x: A `Tensor` (or a value convertible to one).
    dim: Dimension along which to normalize.
    epsilon: Lower bound applied to the squared norm before the inverse
      square root is taken.
    name: A name for this operation (optional).

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  with ops.op_scope([x], name, "l2_normalize") as scope:
    x = ops.convert_to_tensor(x, name="x")
    squared = math_ops.square(x)
    sum_of_squares = math_ops.reduce_sum(squared, [dim], keep_dims=True)
    clamped = math_ops.maximum(sum_of_squares, epsilon)
    inverse_norm = math_ops.rsqrt(clamped)
    return math_ops.mul(x, inverse_norm, name=scope)
예제 #6
0
def clip_by_norm(t, clip_norm, name=None):
  """Rescales `t` so that its L2-norm does not exceed `clip_norm`.

  The norm is taken over all dimensions of `t`. When it is already at most
  `clip_norm` the values are multiplied by `clip_norm * (1 / clip_norm)`,
  i.e. left unchanged up to rounding. Otherwise the output is

  `t * clip_norm / l2norm(t)`

  whose L2-norm equals `clip_norm`.

  Typical use is clipping gradients before handing them to an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.op_scope([t, clip_norm], name, "clip_by_norm") as scope:
    t = ops.convert_to_tensor(t, name="t")

    # Inverse L2-norm over every axis of `t`.
    squared = t * t
    every_axis = math_ops.range(array_ops.rank(t))
    inverse_l2norm = math_ops.rsqrt(math_ops.reduce_sum(squared, every_axis))

    # Multiply by clip_norm * min(1 / l2norm, 1 / clip_norm).
    rescaled = t * clip_norm
    ratio = math_ops.minimum(
        inverse_l2norm, constant_op.constant(1.0 / clip_norm))
    clipped = array_ops.identity(rescaled * ratio, name=scope)

  return clipped
예제 #7
0
def _FoldFusedBatchNorms(graph):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  For every match reported by `_FindFusedBatchNorms`, the batch norm is
  replaced by an equivalent scale on the layer weights followed by a bias add:
  the weights are multiplied by `gamma * rsqrt(variance + epsilon)`, the layer
  op is cloned with the scaled weights, `beta - mean * multiplier` is added to
  its output, and the match's output tensor is rerouted to that sum.

  Args:
    graph: Graph to walk and modify.

  Raises:
    ValueError: When batch norm folding fails, i.e. when rerouting the output
      tensor does not report exactly one modified node.
  """
  for match in _FindFusedBatchNorms(graph):
    # Split the layer op name into its enclosing scope and the separator;
    # the final name component is not needed.
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts with
    # `scope`.
    with graph.as_default(), graph.name_scope(scope + sep), ops.device(
        match.bn_op.device):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        # The shape of depthwise weights is different, so we need to reshape the
        # multiplier_tensor to ensure that the scaled_weight_tensor has the
        # expected shape.
        if match.layer_op.type == 'DepthwiseConv2dNative':
          # Use the static sizes of weight dimensions 2 and 3 as the target
          # shape of the multiplier.
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')

      # TODO(suharshs): This naming of the following ops needs to carefully
      # follow the naming expected by quantize.py. Generalize the quantize code
      # to not require these delicate naming conventions.
      scaled_weight_tensor = math_ops.multiply(
          match.weight_tensor, multiplier_tensor, name='mul_fold')

      # Rebuild the layer op on the original input with the scaled weights.
      new_layer_tensor = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor)

      bias_add_tensor = math_ops.add(
          new_layer_tensor, bias_tensor, name='add_fold')

      # Redirect the batch norm output to the folded result. The code expects
      # exactly one node to be modified; anything else is treated as an
      # unexpected graph structure.
      nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                     match.output_tensor)
      if nodes_modified_count != 1:
        raise ValueError(
            'Unexpected inputs to op: %s' % match.output_tensor.name)
예제 #8
0
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

  Only the Type-II DCT is available. It is obtained from an RFFT of length
  `2N` (see `tf.spectral.rfft`) whose first `N` bins are multiplied by a
  complex phase factor, following https://dsp.stackexchange.com/a/10606

  @compatibility(scipy)
  Equivalent to scipy.fftpack.dct for the Type-II DCT.
  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
  @end_compatibility

  Args:
    input: A `[..., samples]` `float32` `Tensor` holding the signals to
      transform.
    type: The DCT type to perform. Must be 2.
    n: For future expansion. The length of the transform. Must be `None`.
    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
    norm: The normalization to apply. `None` for no normalization or `'ortho'`
      for orthonormal normalization.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.

  Raises:
    ValueError: If `type` is not `2`, `n` is not `None`, `axis` is not `-1`, or
      `norm` is not `None` or `'ortho'`.

  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
  """
  _validate_dct_arguments(type, n, axis, norm)
  with _ops.name_scope(name, "dct", [input]):
    # The DCT goes through the RFFT, and TensorFlow FFTs are float32-only for
    # now.
    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)

    # Prefer the static size of the last axis; fall back to the dynamic one.
    num_samples = input.shape[-1].value or _array_ops.shape(input)[-1]
    num_samples_float = _math_ops.to_float(num_samples)
    phase = (-_math.pi * _math_ops.range(num_samples_float) /
             (2.0 * num_samples_float))
    twiddle = 2.0 * _math_ops.exp(_math_ops.complex(0.0, phase))

    # TODO(rjryan): Benchmark performance and memory usage of the various
    # approaches to computing a DCT via the RFFT.
    spectrum = rfft(input, fft_length=[2 * num_samples])[..., :num_samples]
    result = _math_ops.real(spectrum * twiddle)

    if norm == "ortho":
      first_weight = 0.5 * _math_ops.rsqrt(num_samples_float)
      other_weight = first_weight * _math_ops.sqrt(2.0)
      # Pad [first_weight] on the right with other_weight to get the vector
      # [first, other, other, other, ...].
      weights = _array_ops.pad(
          _array_ops.expand_dims(first_weight, 0), [[0, num_samples - 1]],
          constant_values=other_weight)
      result = result * weights

    return result
예제 #9
0
파일: nn_grad.py 프로젝트: Dr4KK/tensorflow
def _FusedBatchNormGrad(op, *grad):
  """Computes the gradients for the inputs of a fused BatchNorm op.

  In training mode the work is delegated to
  `gen_nn_ops.fused_batch_norm_grad`, fed with `op.outputs[3]` and
  `op.outputs[4]`. In freeze mode the gradients are built explicitly from the
  population statistics `op.inputs[3]` and `op.inputs[4]`.

  Args:
    op: The BatchNormOp for which we need to compute gradients.
    *grad: An argument list for tensors of gradients wrt the outputs
          with grad[0] as grad_y.

  Returns:
    In training mode, whatever `gen_nn_ops.fused_batch_norm_grad` returns.

    In freeze mode, the tuple `(grad_x, grad_scale, grad_offset, None, None)`:

    grad_x: grad_y * scale * rsqrt(pop_variance + epsilon).

    grad_scale: sum(grad_y * (x - pop_mean) * rsqrt(pop_variance + epsilon))
                over the non-channel axes.

    grad_offset: sum(grad_y) over the non-channel axes.
  """
  x = op.inputs[0]
  grad_y = grad[0]
  scale = op.inputs[1]
  epsilon = op.get_attr("epsilon")
  data_format = op.get_attr("data_format")
  is_training = op.get_attr("is_training")

  if is_training:
    return gen_nn_ops.fused_batch_norm_grad(
        grad_y, x, scale, op.outputs[3], op.outputs[4],
        epsilon=epsilon, data_format=data_format, is_training=is_training)

  # Freeze mode: differentiate through the population statistics directly.
  pop_mean = op.inputs[3]
  pop_var = op.inputs[4]
  reduce_axis = [0, 1, 2]
  if data_format != b"NHWC":
    # Channels sit on axis 1 here, so the per-channel vectors are reshaped
    # to [1, C, 1, 1] to broadcast against `x`.
    reduce_axis = [0, 2, 3]
    channel_shape = [1, array_ops.size(pop_mean), 1, 1]
    pop_mean = array_ops.reshape(pop_mean, channel_shape)
    pop_var = array_ops.reshape(pop_var, channel_shape)
    scale = array_ops.reshape(scale, channel_shape)

  grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
  inv_stddev = math_ops.rsqrt(pop_var + epsilon)
  grad_scale = math_ops.reduce_sum(
      grad_y * (x - pop_mean) * inv_stddev, axis=reduce_axis)
  grad_x = grad_y * scale * inv_stddev
  return grad_x, grad_scale, grad_offset, None, None
예제 #10
0
def batch_normalization(x,
                        mean,
                        variance,
                        offset,
                        scale,
                        variance_epsilon,
                        name=None):
  r"""Batch normalization.

  See http://arxiv.org/abs/1502.03167. The tensor `x` is normalized with
  `mean` and `variance`; an optional `scale` \\(\gamma\\) and an optional
  `offset` \\(\beta\\) are then applied:

  \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

  `mean`, `variance`, `offset` and `scale` are expected to share one of two
  layouts:

    * General case: the same rank as `x`, with size 1 on every dimension that
      is normalized over and the size of `x` on the remaining ('depth')
      dimension(s). This is what `tf.nn.moments(..., keep_dims=True)` yields
      during training; at inference it would be running averages thereof.
    * Common case, 'depth' is the last dimension of `x`: 1-D tensors whose
      size equals the depth. This fits the `[batch, depth]` layout of
      fully-connected layers and `[batch, height, width, depth]` of
      convolutions, and matches `tf.nn.moments(..., keep_dims=False)` during
      training or running averages thereof at inference.

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    name: A name for this operation (optional).

  Returns:
    the normalized, scaled, offset tensor.
  """
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    multiplier = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      multiplier = multiplier * scale
    # Scale `x` first and build the shift afterwards, so the ops are created
    # in a fixed order inside the name scope.
    scaled_x = x * multiplier
    if offset is None:
      shift = -mean * multiplier
    else:
      shift = offset - mean * multiplier
    return scaled_x + shift
예제 #11
0
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
  """Computes the gradients for the 3 inputs of BatchNorm.

  The batch mean and variance of `x` are recomputed here over the
  non-channel axes rather than taken from the forward op.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW".

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset.
  """
  channels_last = data_format == b"NHWC"
  # With channels on axis 1 the reduced statistics keep their dims so they
  # broadcast against `x`; with channels last plain broadcasting suffices.
  keep_dims = not channels_last
  reduce_axis = [0, 1, 2] if channels_last else [0, 2, 3]
  if not channels_last:
    scale = array_ops.reshape(scale, [1, array_ops.size(scale), 1, 1])

  batch_mean_grad_y = math_ops.reduce_mean(
      grad_y, reduce_axis, keep_dims=keep_dims)
  batch_mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
  batch_var_x = math_ops.reduce_mean(
      math_ops.squared_difference(x, array_ops.stop_gradient(batch_mean_x)),
      reduce_axis,
      keep_dims=keep_dims)

  centered_grad_y = grad_y - batch_mean_grad_y
  centered_x = x - batch_mean_x
  mean_grad_times_x = math_ops.reduce_mean(
      grad_y * centered_x, axis=reduce_axis, keep_dims=keep_dims)

  scaled_inv_stddev = scale * math_ops.rsqrt(batch_var_x + epsilon)
  correction = (
      math_ops.reciprocal(batch_var_x + epsilon) * mean_grad_times_x *
      centered_x)
  grad_x = scaled_inv_stddev * (centered_grad_y - correction)

  inv_stddev = math_ops.rsqrt(batch_var_x + epsilon)
  sum_grad_times_x = math_ops.reduce_sum(
      grad_y * centered_x, axis=reduce_axis, keep_dims=keep_dims)
  grad_scale = inv_stddev * sum_grad_times_x
  if data_format == b"NCHW":
    # Drop the size-1 dims that keep_dims left behind.
    grad_scale = array_ops.squeeze(grad_scale)

  grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
  return grad_x, grad_scale, grad_offset
예제 #12
0
def _bahdanau_score(processed_query, keys, normalize):
  """Computes the Bahdanau-style (additive) attention score.

  Two variants are available. The plain one is Bahdanau attention, from:

  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
  "Neural Machine Translation by Jointly Learning to Align and Translate."
  ICLR 2015. https://arxiv.org/abs/1409.0473

  The normalized one follows the weight normalization article:

  Tim Salimans, Diederik P. Kingma.
  "Weight Normalization: A Simple Reparameterization to Accelerate
   Training of Deep Neural Networks."
  https://arxiv.org/abs/1602.07868

  Pass `normalize=True` to select the normalized variant.

  Args:
    processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys.
    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
    normalize: Whether to normalize the score function.

  Returns:
    A `[batch_size, max_time]` tensor of unnormalized score values.
  """
  dtype = processed_query.dtype
  # Hidden size: static trailing dimension of `keys`, else the dynamic one.
  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
  # [batch_size, ...] -> [batch_size, 1, ...] so the query broadcasts over
  # the time axis of `keys`.
  processed_query = array_ops.expand_dims(processed_query, 1)
  v = variable_scope.get_variable(
      "attention_v", [num_units], dtype=dtype)

  if not normalize:
    return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])

  # Scalar gain of the weight normalization.
  g = variable_scope.get_variable(
      "attention_g", dtype=dtype,
      initializer=math.sqrt((1. / num_units)))
  # Bias that enters before the tanh.
  b = variable_scope.get_variable(
      "attention_b", [num_units], dtype=dtype,
      initializer=init_ops.zeros_initializer())
  # normed_v = g * v / ||v||
  gained_v = g * v
  normed_v = gained_v * math_ops.rsqrt(
      math_ops.reduce_sum(math_ops.square(v)))
  activation = math_ops.tanh(keys + processed_query + b)
  return math_ops.reduce_sum(normed_v * activation, [2])
예제 #13
0
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Rescales `t` so that its L2-norm does not exceed `clip_norm`.

  The norm is computed over the dimensions listed in `axes`. In the default
  case, where every dimension is used, a `t` whose L2-norm is already at most
  `clip_norm` comes back unchanged; otherwise the result has the same type
  and shape as `t` and the values

  `t * clip_norm / l2norm(t)`

  so that its L2-norm equals `clip_norm`.

  With a matrix `t` and `axes == [1]` every row is clipped on its own to an
  L2-norm of at most `clip_norm`; with `axes == [0]` the same happens per
  column.

  Typical use is clipping gradients before handing them to an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as scope:
    t = ops.convert_to_tensor(t, name="t")

    # Inverse L2-norm over `axes`, kept broadcastable against `t`.
    sum_of_squares = math_ops.reduce_sum(t * t, axes, keep_dims=True)
    inverse_l2norm = math_ops.rsqrt(sum_of_squares)

    scaled = t * clip_norm
    # Guard against accidental broadcasting: the product must still be
    # shape-compatible with `t`.
    t.shape.merge_with(scaled.shape)

    inverse_clip = constant_op.constant(1.0, dtype=t.dtype) / clip_norm
    ratio = math_ops.minimum(inverse_l2norm, inverse_clip)
    clipped = array_ops.identity(scaled * ratio, name=scope)

  return clipped
예제 #14
0
def batch_normalization(x, mean, variance, offset, scale, variance_epsilon, data_format, name=None):
    """Batch normalization that honours `data_format`.

    Evaluates `a * x + b` through `_bias_scale` and `_bias_add`, where
    `a = rsqrt(variance + variance_epsilon)` (times `scale` when it is not
    `None`) and `b = offset - mean * a`, or `-mean * a` when `offset` is
    `None`. Both `a` and `b` are cast to `x.dtype`.

    `data_format` must be 'channels_first' or 'channels_last'; any other
    value raises `KeyError`.
    """
    with ops.name_scope(name, 'batchnorm', [x, mean, variance, scale, offset]):
        multiplier = math_ops.rsqrt(variance + variance_epsilon)
        if scale is not None:
            multiplier = multiplier * scale

        a = math_ops.cast(multiplier, x.dtype)
        if offset is None:
            shift = -mean * multiplier
        else:
            shift = offset - mean * multiplier
        b = math_ops.cast(shift, x.dtype)

        # a * x + b is evaluated with layout-aware helpers: TF currently has
        # no bias_scale, and TensorRT has a bug converting tf.nn.bias_add, so
        # both were reimplemented to keep the model usable with TensorRT.
        # See https://github.com/tensorlayer/openpose-plus/issues/75 for more details.
        layout = {'channels_first': 'NCHW', 'channels_last': 'NHWC'}[data_format]
        return _bias_add(_bias_scale(x, a, layout), b, layout)
예제 #15
0
 def _sample_n(self, n, seed=None):
   """Draws `n` samples, each shaped like the batch shape."""
   # Sampling relies on the identity:
   #   X ~ Normal(0, 1), Z ~ Chi2(df), Y = X / sqrt(Z / df)
   #   =>  Y ~ StudentT(df).
   sample_shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
   standard_normal = random_ops.random_normal(
       sample_shape, dtype=self.dtype, seed=seed)
   # Expand `df` to the full batch shape.
   full_df = self.df * array_ops.ones(
       self.batch_shape_tensor(), dtype=self.dtype)
   # Gamma(0.5 * df, beta=0.5) plays the role of the Chi2(df) draw.
   chi2_draws = random_ops.random_gamma(
       [n],
       0.5 * full_df,
       beta=0.5,
       dtype=self.dtype,
       seed=distribution_util.gen_new_seed(seed, salt="student_t"))
   standard_t = standard_normal * math_ops.rsqrt(chi2_draws / full_df)
   # `scale` is applied with its sign; taking Abs(scale) is not wanted.
   return standard_t * self.scale + self.loc
예제 #16
0
def per_image_whitening(image):
  """Shifts and scales `image` to zero mean and unit norm.

  The result is `(x - mean) / adjusted_stddev`, with `mean` the average over
  every value of the image and
  `adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))`.

  `stddev` is the standard deviation over every value of `image`. The lower
  bound keeps the division well defined for uniform images.

  Limitations of this implementation:

  *  Only the statistics of the single image are used.
  *  The covariance structure is ignored.

  Args:
    image: 3-D tensor of shape `[height, width, channels]`.

  Returns:
    The whitened image with same shape as `image`.

  Raises:
    ValueError: if the shape of 'image' is incompatible with this function.
  """
  image = ops.convert_to_tensor(image, name='image')
  _Check3DImage(image, require_static=False)
  element_count = math_ops.reduce_prod(array_ops.shape(image))

  pixels = math_ops.cast(image, dtype=dtypes.float32)
  mean = math_ops.reduce_mean(pixels)

  # Var = E[x^2] - E[x]^2, clamped at zero by the relu.
  mean_of_squares = math_ops.reduce_mean(math_ops.square(pixels))
  variance = gen_nn_ops.relu(mean_of_squares - math_ops.square(mean))
  stddev = math_ops.sqrt(variance)

  # Lower bound on the divisor, protecting against uniform images.
  stddev_floor = math_ops.rsqrt(math_ops.cast(element_count, dtypes.float32))
  divisor = math_ops.maximum(stddev, stddev_floor)

  centered = math_ops.sub(pixels, mean)
  return math_ops.div(centered, divisor)
예제 #17
0
def _bahdanau_score(processed_query, keys, normalize):
  """Scores keys against a query with additive (Bahdanau-style) attention.
  The unnormalized variant is Bahdanau attention, described in:
  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
  "Neural Machine Translation by Jointly Learning to Align and Translate."
  ICLR 2015. https://arxiv.org/abs/1409.0473
  The normalized variant (`normalize=True`) borrows from the weight
  normalization article:
  Tim Salimans, Diederik P. Kingma.
  "Weight Normalization: A Simple Reparameterization to Accelerate
   Training of Deep Neural Networks."
  https://arxiv.org/abs/1602.07868
  Args:
    processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys.
    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
    normalize: Whether to normalize the score function.
  Returns:
    A `[batch_size, max_time]` tensor of unnormalized score values.
  """
  dtype = processed_query.dtype
  # Number of hidden units: trailing dimension of keys, static if known.
  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
  # Insert a time axis: [batch_size, ...] becomes [batch_size, 1, ...].
  processed_query = array_ops.expand_dims(processed_query, 1)
  v = variable_scope.get_variable(
      "attention_v", [num_units], dtype=dtype)
  if normalize:
    # Weight-normalization gain (a scalar).
    g = variable_scope.get_variable(
        "attention_g", dtype=dtype,
        initializer=math.sqrt((1. / num_units)))
    # Bias applied ahead of the nonlinearity.
    b = variable_scope.get_variable(
        "attention_b", [num_units], dtype=dtype,
        initializer=init_ops.zeros_initializer())
    # weights = g * v / ||v||
    weights = g * v * math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(v)))
    hidden = math_ops.tanh(keys + processed_query + b)
  else:
    weights = v
    hidden = math_ops.tanh(keys + processed_query)
  return math_ops.reduce_sum(weights * hidden, [2])
    def __call__(self, query, previous_alignments):
        """Score the query based on the keys and values.

        Args:
          query: Tensor of dtype matching `self.values` and shape
            `[batch_size, query_depth]`.
          previous_alignments: Tensor of dtype matching `self.values` and
            shape `[batch_size, alignments_size]`
            (`alignments_size` is memory's `max_time`).

        Returns:
          A pair `(alignments, masked_score)`. `alignments` is the result of
          `self._probability_fn(score, previous_alignments)` and
          `masked_score` is `self.mask_func(score)`, where `score` is the
          additive attention score computed here.
        """
        with variable_scope.variable_scope(None, "bahdanau_attention",
                                           [query]):
            processed_query = self.query_layer(
                query) if self.query_layer else query
            # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
            processed_query = array_ops.expand_dims(processed_query, 1)
            keys = self._keys
            dtype = query.dtype
            v = variable_scope.get_variable("attention_v", [self._num_units],
                                            dtype=dtype)

            if self._normalize:
                # Scalar used in weight normalization
                g = variable_scope.get_variable("attention_g",
                                                dtype=dtype,
                                                initializer=math.sqrt(
                                                    (1. / self._num_units)))
                # Bias added prior to the nonlinearity. The score below reads
                # `b`; without this variable the normalized branch raised
                # NameError. Defined the same way as in `_bahdanau_score`.
                b = variable_scope.get_variable(
                    "attention_b", [self._num_units],
                    dtype=dtype,
                    initializer=init_ops.zeros_initializer())

                # normed_v = g * v / ||v||
                normed_v = g * v * math_ops.rsqrt(
                    math_ops.reduce_sum(math_ops.square(v)))
                score = math_ops.reduce_sum(
                    normed_v * math_ops.tanh(keys + processed_query + b), [2])
            else:
                score = math_ops.reduce_sum(
                    v * math_ops.tanh(keys + processed_query), [2])

        alignments = self._probability_fn(score, previous_alignments)
        return alignments, self.mask_func(score)
예제 #19
0
def per_image_whitening(image):
    """Shift and scale `image` so it has zero mean and unit norm.

    The result is `(x - mean) / adjusted_stddev`, where `mean` is the average
    over every value in `image` and
    `adjusted_stddev = max(stddev, 1.0 / sqrt(number_of_elements))`.

    The lower bound on the divisor keeps a uniform image (zero standard
    deviation) from triggering a division by zero.

    Limitations:

    *  Statistics come from the single image only.
    *  Covariance structure is ignored.

    Args:
      image: 3-D tensor of shape `[height, width, channels]`.

    Returns:
      The whitened image with the same shape as `image`.

    Raises:
      ValueError: if the shape of 'image' is incompatible with this function.
    """
    image = ops.convert_to_tensor(image, name='image')
    _Check3DImage(image, require_static=False)
    element_count = math_ops.reduce_prod(array_ops.shape(image))

    image = math_ops.cast(image, dtype=dtypes.float32)
    mean = math_ops.reduce_mean(image)

    # Var[x] = E[x^2] - E[x]^2; relu clamps any negative value to zero before
    # the square root.
    mean_of_squares = math_ops.reduce_mean(math_ops.square(image))
    variance = gen_nn_ops.relu(mean_of_squares - math_ops.square(mean))
    stddev = math_ops.sqrt(variance)

    # Smallest divisor allowed: 1 / sqrt(number of elements).
    divisor_floor = math_ops.rsqrt(
        math_ops.cast(element_count, dtypes.float32))
    divisor = math_ops.maximum(stddev, divisor_floor)

    centered = math_ops.sub(image, mean)
    return math_ops.div(centered, divisor)
예제 #20
0
    def _apply_dense(self, grad, var):
        """Build the op that applies a dense gradient `grad` to `var`.

        The gradient is first rescaled element-wise by the inverse square root
        of the running sum of squared gradients kept in the "accumulator"
        slot, and optionally smoothed with the "momentum" slot. For shapes
        where `_fallback_to_diagonal_for_shape` is true that rescaled gradient
        is applied directly; otherwise it is blended with the result of
        `_preconditioned_update`.

        Args:
          grad: Gradient for `var`.
          var: Variable to update.

        Returns:
          The `state_ops.assign_sub` op that subtracts the update from `var`.
        """
        # Calculates the preconditioner statistics for each tensor.
        partitioned_grads = TensorPartitioner.partition_tensor(
            grad, self._partition_info)
        shape = var.get_shape()
        fallback_to_diagonal = self._fallback_to_diagonal_for_shape(shape)

        # Stays empty on the diagonal-only path, so the control dependency
        # below is only ever entered with real statistics updates.
        precond_statistics_update = []
        if not fallback_to_diagonal:
            precond_statistics_update = self._updated_statistics(
                var, partitioned_grads)

        # accumulator += grad^2, then scale by 1 / sqrt(accumulator).
        accumulator = self.get_slot(var, "accumulator")
        accumulator_updated = state_ops.assign_add(accumulator, grad * grad)
        # NOTE(review): the 1e-30 term presumably guards against rsqrt(0) -
        # confirm.
        accumulator_inv_sqrt = math_ops.rsqrt(accumulator_updated + 1e-30)
        if self._momentum > 0.0:
            scaled_g = (1.0 - self._momentum_tensor) * (grad *
                                                        accumulator_inv_sqrt)
            gbar = self.get_slot(var, "momentum")
            # Adding gbar * (m - 1) + scaled_g to gbar yields
            # m * gbar + (1 - m) * (grad * accumulator_inv_sqrt).
            gbar_updated = state_ops.assign_add(
                gbar,
                gbar * (self._momentum_tensor - 1.0) + scaled_g)
        else:
            gbar_updated = (grad * accumulator_inv_sqrt)

        if not fallback_to_diagonal:
            # Update the preconditioner statistics followed by computing the
            # preconditioned gradient.
            with ops.control_dependencies(precond_statistics_update):
                # `s` gates the two terms below: the warmup blend is weighted
                # by s and the plain diagonal update by (1 - s).
                s = tf.cast(self._run_nondiagonal_update, tf.float32)
                preconditioned_grad = self._preconditioned_update(
                    var, partitioned_grads, gbar_updated)
                # slowly adapt from diagonal to preconditioned gradient.
                w = self._run_nondiagonal_update_warmup
                warmup_update = s * self._learning_rate_tensor * (
                    w * preconditioned_grad + (1.0 - w) * gbar_updated)
                fallback_update = (1 - s) * (self._learning_rate_tensor *
                                             gbar_updated)
                return state_ops.assign_sub(var,
                                            warmup_update + fallback_update)
        else:
            return state_ops.assign_sub(
                var, self._learning_rate_tensor * gbar_updated)
예제 #21
0
  def __call__(self, query, previous_alignments):
    """Compute additive-attention alignments for `query`.

    Args:
      query: Tensor of dtype matching `self.values` and shape
        `[batch_size, query_depth]`.
      previous_alignments: Tensor of dtype matching `self.values` and shape
        `[batch_size, alignments_size]`
        (`alignments_size` is memory's `max_time`).

    Returns:
      alignments: Tensor of dtype matching `self.values` and shape
        `[batch_size, alignments_size]` (`alignments_size` is memory's
        `max_time`).
    """
    with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
      if self.query_layer:
        projected_query = self.query_layer(query)
      else:
        projected_query = query
      dtype = projected_query.dtype
      # Add an axis at position 1 so the query broadcasts against the keys.
      projected_query = array_ops.expand_dims(projected_query, 1)
      memory_keys = self._keys
      v = variable_scope.get_variable(
          "attention_v", [self._num_units], dtype=dtype)
      if not self._normalize:
        activation = math_ops.tanh(memory_keys + projected_query)
        score = math_ops.reduce_sum(v * activation, [2])
      else:
        # Weight-normalization gain, initialised to sqrt(1 / num_units).
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=math.sqrt(1. / self._num_units))
        # Zero-initialised bias applied inside the tanh.
        b = variable_scope.get_variable(
            "attention_b", [self._num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # normed_v = g * v / ||v||
        scaled_v = g * v
        squared_norm = math_ops.reduce_sum(math_ops.square(v))
        normed_v = scaled_v * math_ops.rsqrt(squared_norm)
        activation = math_ops.tanh(memory_keys + projected_query + b)
        score = math_ops.reduce_sum(normed_v * activation, [2])

    return self._probability_fn(score, previous_alignments)
예제 #22
0
    def __call__(self, query):
        """Compute the additive-attention score of `query` against the keys.

        Args:
          query: Tensor of dtype matching `self.values` and shape
            `[batch_size, query_depth]`.

        Returns:
          score: Tensor of dtype matching `self.values` and shape
            `[batch_size, max_time]` (`max_time` is memory's `max_time`).
        """
        scope = variable_scope.variable_scope(None, "bahdanau_attention",
                                              [query])
        with scope:
            layer = self.query_layer
            query_features = layer(query) if layer else query
            dtype = query_features.dtype
            # [batch_size, ...] -> [batch_size, 1, ...] so it broadcasts
            # against the keys.
            query_features = array_ops.expand_dims(query_features, 1)
            keys = self._keys
            v = variable_scope.get_variable("attention_v", [self._num_units],
                                            dtype=dtype)
            if not self._normalize:
                hidden = math_ops.tanh(keys + query_features)
                return math_ops.reduce_sum(v * hidden, [2])

            # Scalar gain for weight normalization.
            initial_gain = math.sqrt(1. / self._num_units)
            g = variable_scope.get_variable("attention_g",
                                            dtype=dtype,
                                            initializer=initial_gain)
            # Bias applied before the tanh nonlinearity.
            b = variable_scope.get_variable(
                "attention_b", [self._num_units],
                dtype=dtype,
                initializer=init_ops.zeros_initializer())
            # normed_v = g * v / ||v||
            gained_v = g * v
            v_sq_sum = math_ops.reduce_sum(math_ops.square(v))
            normed_v = gained_v * math_ops.rsqrt(v_sq_sum)
            hidden = math_ops.tanh(keys + query_features + b)
            return math_ops.reduce_sum(normed_v * hidden, [2])
예제 #23
0
 def _sample_n(self, n, seed=None):
     """Draw `n` samples, shifted by `self.loc` and scaled by `self.scale`.

     Sampling relies on the identity: if
       X ~ Normal(0, 1), Z ~ Chi2(df), Y = X / sqrt(Z / df)
     then
       Y ~ StudentT(df).

     Args:
       n: Number of samples; used as the leading sample dimension.
       seed: Optional seed, used directly for the normal draw and salted
         with "student_t" for the gamma draw.

     Returns:
       The samples, computed as `Y * self.scale + self.loc`.
     """
     sample_shape = array_ops.concat([[n], self.batch_shape()], 0)
     standard_normal = random_ops.random_normal(
         sample_shape, dtype=self.dtype, seed=seed)
     # Broadcast df to the full batch shape.
     df = self.df * array_ops.ones(self.batch_shape(), dtype=self.dtype)
     half_df = 0.5 * df
     gamma_seed = distribution_util.gen_new_seed(seed, salt="student_t")
     # Gamma(df / 2, beta=0.5) is the Chi2(df) draw from the identity above.
     chi2_sample = random_ops.random_gamma(
         [n], half_df, beta=0.5, dtype=self.dtype, seed=gamma_seed)
     student_t = standard_normal * math_ops.rsqrt(chi2_sample / df)
     return student_t * self.scale + self.loc  # Abs(scale) not wanted.
예제 #24
0
파일: utils.py 프로젝트: pmh47/o3v
    def call(self, inputs, **kwargs):
        """Normalize `inputs` per channel group and restore the input shape.

        The input is reshaped so the channel axis is split into
        `[groups, channels // groups]`, moments are taken over
        `self.moments_axes`, and the normalized values are optionally scaled
        by `self.gamma` and shifted by `self.beta`.

        Args:
          inputs: Value convertible to a tensor.
          **kwargs: Accepted for interface compatibility; not used here.

        Returns:
          The normalized tensor, reshaped to the static shape of `inputs`.
        """
        inputs = ops.convert_to_tensor(inputs)
        original_shape = inputs.get_shape()

        # Split the channel axis into (groups, channels_per_group).
        grouped_shape = (self.axes_before_channels +
                         [self.groups, self.channels // self.groups] +
                         self.axes_after_channels)
        inputs = array_ops.reshape(inputs, grouped_shape)

        if not self.mean_close_to_zero:
            mean, variance = tf.nn.moments(inputs,
                                           self.moments_axes,
                                           keep_dims=True)
        else:
            # One pass algorithm returns better result when mean is close
            # to zero.
            counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
                inputs, self.moments_axes, keep_dims=True)
            mean, variance = tf.nn.normalize_moments(counts,
                                                     means_ss,
                                                     variance_ss,
                                                     shift=None)

        # outputs = (inputs - mean) * gain, expressed as inputs * gain + offset.
        gain = math_ops.rsqrt(variance + self.epsilon)
        offset = -mean * gain
        if self.gamma is not None:
            gamma = array_ops.reshape(self.gamma, self.params_shape_broadcast)
            gain = gain * gamma
            offset = offset * gamma
        if self.beta is not None:
            beta = array_ops.reshape(self.beta, self.params_shape_broadcast)
            offset = offset + beta
        normalized = inputs * gain + offset

        # Collapse the groups back into the channel dimension.
        return array_ops.reshape(normalized, original_shape)
    def get_folded_weights(self):
        """Return the depthwise kernel and bias with batchnorm folded in.

        The folding follows:

          W_fold = gamma * W / sqrt(variance + epsilon)
          bias_fold = gamma * (bias - moving_mean) / sqrt(variance + epsilon)
                      + beta

        Returns:
          A two-element list `[folded_depthwise_kernel, folded_bias]`.
        """
        kernel = self.depthwise_kernel
        bias = self.bias if self.use_bias else 0

        # Batchnorm parameters and moving statistics.
        gamma = self.batchnorm.gamma
        beta = self.batchnorm.beta
        moving_mean = self.batchnorm.moving_mean
        moving_variance = self.batchnorm.moving_variance

        # Multiply by the reciprocal square root instead of dividing.
        inv = math_ops.rsqrt(moving_variance + self.batchnorm.epsilon)
        if gamma is not None:
            inv = inv * gamma

        centered_bias = bias - moving_mean
        folded_bias = inv * centered_bias + beta

        # The multiplier is reshaped to the kernel's dimensions 2 and 3 so it
        # broadcasts over the leading kernel dimensions.
        broadcast_shape = [
            kernel.get_shape().as_list()[2],
            kernel.get_shape().as_list()[3],
        ]
        inv = array_ops.reshape(inv, broadcast_shape)
        folded_depthwise_kernel = inv * kernel

        return [folded_depthwise_kernel, folded_bias]
  def __call__(self, query):
    """Compute the additive-attention score of `query` against the keys.

    Args:
      query: Tensor of dtype matching `self.values` and shape
        `[batch_size, query_depth]`.

    Returns:
      score: Tensor of dtype matching `self.values` and shape
        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
    """
    with ops.name_scope(None, "BahndahauAttentionCall", [query]):
      if self.query_layer:
        query_proj = self.query_layer(query)
      else:
        query_proj = query
      dtype = query_proj.dtype
      # [batch_size, ...] -> [batch_size, 1, ...] so it broadcasts over keys.
      query_proj = array_ops.expand_dims(query_proj, 1)
      v = variable_scope.get_variable(
          "attention_v", [self._num_units], dtype=dtype)
      if not self._normalize:
        pre_activation = self.keys + query_proj
        score = math_ops.reduce_sum(v * math_ops.tanh(pre_activation), [2])
      else:
        # Weight-normalization gain.
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=math.sqrt(1. / self._num_units))
        # Bias applied inside the tanh.
        b = variable_scope.get_variable(
            "attention_b", [self._num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # Scalar bias added to the reduced attention scores.
        r = variable_scope.get_variable(
            "attention_r", dtype=dtype,
            initializer=self._attention_r_initializer)
        # normed_v = g * v / ||v||
        gained_v = g * v
        v_norm_sq = math_ops.reduce_sum(math_ops.square(v))
        normed_v = gained_v * math_ops.rsqrt(v_norm_sq)
        pre_activation = self.keys + query_proj + b
        weighted = normed_v * math_ops.tanh(pre_activation)
        score = math_ops.reduce_sum(weighted, [2]) + r

    return score
예제 #27
0
def bn(x, name='batchnorm'):
    """Normalize `x` using stored moving statistics.

    Creates (or reuses) per-channel variables `beta`, `scale`, `mean` and
    `variance` under the variable scope `name`, sized by the last static
    dimension of `x`.

    Args:
      x: Tensor whose last dimension is statically known.
      name: Variable scope name.

    Returns:
      `(x - mean) * scale / sqrt(variance + 1e-3) + beta`.
    """
    with tf.variable_scope(name):
        epsilon = 1e-3
        channels = int(x.shape.as_list()[-1])
        param_shape = [channels]

        beta = tf.get_variable(
            'beta', param_shape, initializer=tf.zeros_initializer())
        scale = tf.get_variable(
            'scale', param_shape, initializer=tf.ones_initializer())

        # Moving statistics are not trainable.
        moving_mean = tf.get_variable(
            'mean', param_shape,
            initializer=tf.zeros_initializer(),
            trainable=False)
        moving_variance = tf.get_variable(
            'variance', param_shape,
            initializer=tf.ones_initializer(),
            trainable=False)

        multiplier = math_ops.rsqrt(moving_variance + epsilon)
        multiplier = multiplier * scale
        scaled = x * multiplier
        shift = beta - moving_mean * multiplier
        return scaled + shift
예제 #28
0
파일: clip_ops.py 프로젝트: qwerzou1/shibie
def clip_by_average_norm(t, clip_norm, name=None):
    """Clips tensor values to a maximum average L2-norm.

    The average L2-norm of `t` is its L2-norm divided by its number of
    elements. When that value is at most `clip_norm`, `t` is returned
    unchanged; otherwise every value is rescaled to

    `t * clip_norm / l2norm_avg(t)`

    so that the average L2-norm of the result equals `clip_norm`.

    This operation is typically used to clip gradients before applying them
    with an optimizer.

    Args:
      t: A `Tensor`.
      clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
      name: A name for the operation (optional).

    Returns:
      A clipped `Tensor`.
    """
    with ops.name_scope(name, "clip_by_average_norm", [t, clip_norm]) as scope:
        t = ops.convert_to_tensor(t, name="t")

        element_count = math_ops.cast(array_ops.size(t), dtypes.float32)
        # 1 / ||t||_2, reducing over every axis of t.
        squares = t * t
        every_axis = math_ops.range(array_ops.rank(t))
        inverse_l2norm = math_ops.rsqrt(
            math_ops.reduce_sum(squares, every_axis))

        # t * clip_norm * min(1 / avg_norm, 1 / clip_norm): the factor is
        # 1 / clip_norm (identity overall) when the average norm is within
        # bounds, and 1 / avg_norm otherwise.
        scaled = t * clip_norm
        inverse_avg_norm = inverse_l2norm * element_count
        inverse_clip = constant_op.constant(1.0) / clip_norm
        factor = math_ops.minimum(inverse_avg_norm, inverse_clip)
        clipped = array_ops.identity(scaled * factor, name=scope)

    return clipped
예제 #29
0
  def __call__(self, query, tiling_factor=1):
    """Compute the additive-attention score of `query` against the keys.

    Args:
      query: Tensor of dtype matching `self.values` and shape
        `[batch_size, query_depth]`.
      tiling_factor: An integer factor for which to tile the batch dimension.
        Used with BeamSearchDecoder.

    Returns:
      score: Tensor of dtype matching `self.values` and shape
        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
    """
    with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
      query_repr = self.query_layer(query) if self.query_layer else query
      dtype = query_repr.dtype
      # [batch_size, ...] -> [batch_size, 1, ...] so it broadcasts over keys.
      query_repr = array_ops.expand_dims(query_repr, 1)
      tiled_keys = _maybe_tile_batch(self.keys, tiling_factor)
      v = variable_scope.get_variable(
          "attention_v", [self._num_units], dtype=dtype)

      if not self._normalize:
        summed = tiled_keys + query_repr
        score = math_ops.reduce_sum(v * math_ops.tanh(summed), [2])
      else:
        # Weight-normalization gain.
        gain_init = math.sqrt(1. / self._num_units)
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype, initializer=gain_init)
        # Bias applied inside the tanh.
        b = variable_scope.get_variable(
            "attention_b", [self._num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # normed_v = g * v / ||v||
        gained_v = g * v
        inv_norm = math_ops.rsqrt(math_ops.reduce_sum(math_ops.square(v)))
        normed_v = gained_v * inv_norm
        summed = tiled_keys + query_repr + b
        score = math_ops.reduce_sum(normed_v * math_ops.tanh(summed), [2])

    return score
예제 #30
0
    def get_context_additive_null(self, query, top_states_4,
                                  top_states_transform_4, encoder_raws_matrix):
        """Additive attention with an extra "null" slot in the softmax.

        A score for `self.null_attention_vector` is prepended to the masked
        source scores before the softmax and sliced off again afterwards, so
        the returned weights cover the source positions only.

        Shape notes below are carried over from the original inline comments
        and are not checked here.

        Args:
          query: Decoder query, multiplied by `self.a_w_target`.
          top_states_4: Source states that are weighted and summed.
          top_states_transform_4: Transformed source states used for scoring.
          encoder_raws_matrix: Passed to `self.mask_score` with the scores.

        Returns:
          A tuple `(context, a)` of the attention context and the attention
          weights over source positions.
        """
        # [batch_size, hidden_size]
        query_proj = tf.add(tf.matmul(query, self.a_w_target), self.a_b)
        # [batch_size, 1, 1, hidden_size]
        query_proj_4d = tf.reshape(query_proj, [-1, 1, 1, self.model.size])

        if not self.model.attention_scale:
            normed_v = self.a_v
        else:
            # normed_v = g * v / |v|
            v_sq_sum = math_ops.reduce_sum(math_ops.square(self.a_v))
            normed_v = (self.attention_g * self.a_v *
                        math_ops.rsqrt(v_sq_sum))

        null_proj = tf.matmul(self.null_attention_vector, self.a_w_source)
        null_hidden = tf.tanh(null_proj + query_proj)
        # [batch_size] -> [batch_size, 1]
        null_score = tf.reduce_sum(normed_v * null_hidden, [1])
        null_score = tf.reshape(null_score, [-1, 1])

        # a = softmax(a_v * tanh(...))
        source_hidden = tf.tanh(top_states_transform_4 + query_proj_4d)
        # [batch_size, source_length]
        scores = tf.reduce_sum(normed_v * source_hidden, [2, 3])
        scores = self.mask_score(scores, encoder_raws_matrix)

        # [batch_size, 1 + source_length]
        weights_with_null = tf.nn.softmax(
            tf.concat([null_score, scores], 1))
        # Drop column 0 (the null slot): [batch_size, source_length]
        a = tf.slice(weights_with_null, [0, 1], [-1, -1])

        # context = a * h_source
        weights_4d = tf.reshape(a, [self.model.batch_size, -1, 1, 1])
        context = tf.reduce_sum(weights_4d * top_states_4, [1, 2])

        return context, a
예제 #31
0
파일: modeling.py 프로젝트: thomwolf/xlnet
def batch_normalization(x,
                        mean,
                        variance,
                        offset,
                        scale,
                        variance_epsilon,
                        name=None):
  """Batch-normalize `x` and also return `x` with its first two axes swapped.

  Args:
    x: Input tensor; it is transposed with permutation `[1, 0, 2]`.
    mean: Mean used for normalization.
    variance: Variance used for normalization.
    offset: Optional shift added after normalization, or None.
    scale: Optional multiplier applied after normalization, or None.
    variance_epsilon: Value added to `variance` before the reciprocal square
      root.
    name: Optional name scope (defaults to "batchnorm").

  Returns:
    A tuple `(out, special)`: `out` is the normalized tensor and `special` is
    `tf.transpose(x, [1, 0, 2])`.
  """
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    multiplier = math_ops.rsqrt(variance + variance_epsilon)
    if scale is not None:
      multiplier = multiplier * scale

    # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
    # the precise order of ops that are generated below, so the scaled input
    # is built before the shift term.
    scaled = x * math_ops.cast(multiplier, x.dtype)
    if offset is None:
      shift = -mean * multiplier
    else:
      shift = offset - mean * multiplier
    out = scaled + math_ops.cast(shift, x.dtype)

    special = tf.transpose(x, [1, 0, 2])

    return out, special
예제 #32
0
def batch_normalization(x,
                        mean,
                        variance,
                        offset,
                        scale,
                        variance_epsilon,
                        data_format,
                        name=None):
    """Data Format aware version of tf.nn.batch_normalization.

    The per-channel parameters are reshaped so that their only non-unit axis
    lines up with the channel axis of `x` (last axis for 'channels_last',
    axis 1 for 'channels_first').

    Args:
      x: Input tensor; its rank is read from `len(x.shape)`.
      mean: Per-channel mean.
      variance: Per-channel variance.
      offset: Per-channel shift, or None to skip the shift.
      scale: Per-channel multiplier, or None to skip the scaling.
      variance_epsilon: Value added to `variance` before the reciprocal square
        root.
      data_format: Either 'channels_last' or 'channels_first'.
      name: Optional name scope (defaults to 'batchnorm').

    Returns:
      The result of `_bias_add(_bias_scale(x, a, fmt), b, fmt)` where
      `a = scale / sqrt(variance + variance_epsilon)` and
      `b = offset - mean * a`.

    Raises:
      ValueError: if `data_format` is not one of the two supported values.
    """
    if data_format == 'channels_last':
        param_shape = [1] * (len(x.shape) - 1) + [-1]
    elif data_format == 'channels_first':
        param_shape = [1] + [-1] + [1] * (len(x.shape) - 2)
    else:
        raise ValueError('invalid data_format: %s' % data_format)

    mean = tf.reshape(mean, param_shape)
    variance = tf.reshape(variance, param_shape)
    # `offset` and `scale` are optional: the code below already handles None,
    # so they must not be reshaped unconditionally (that fails on None).
    if offset is not None:
        offset = tf.reshape(offset, param_shape)
    if scale is not None:
        scale = tf.reshape(scale, param_shape)

    with ops.name_scope(name, 'batchnorm', [x, mean, variance, scale, offset]):
        inv = math_ops.rsqrt(variance + variance_epsilon)
        if scale is not None:
            inv *= scale

        a = math_ops.cast(inv, x.dtype)
        b = math_ops.cast(
            offset - mean * inv if offset is not None else -mean * inv,
            x.dtype)

        # Return a * x + b with customized data_format.
        # Currently TF doesn't have bias_scale, and TensorRT has a bug in
        # converting tf.nn.bias_add, so both were reimplemented to make the
        # model work with TensorRT.
        # See https://github.com/tensorlayer/openpose-plus/issues/75 for more details.
        df = {'channels_first': 'NCHW', 'channels_last': 'NHWC'}
        return _bias_add(_bias_scale(x, a, df[data_format]), b,
                         df[data_format])
예제 #33
0
def bn(x, is_training, name='batchnorm'):
    """Batch-normalize `x`, choosing batch or moving statistics at run time.

    When `is_training` is true, moments are computed over axes `[0, 1, 2]` of
    `x` and moving-average update ops (decay 0.99) are added to
    `tf.GraphKeys.UPDATE_OPS`; otherwise the stored moving statistics are
    used.

    Args:
      x: Input tensor whose last dimension is statically known.
      is_training: Value convertible to a boolean tensor.
      name: Variable scope name.

    Returns:
      `(x - mean) * scale / sqrt(variance + 1e-3) + beta`.
    """
    with tf.variable_scope(name):
        decay = 0.99
        epsilon = 1e-3

        channels = x.shape.as_list()[-1]
        param_shape = [channels]

        beta = tf.get_variable(
            'beta', param_shape, initializer=tf.zeros_initializer())
        scale = tf.get_variable(
            'scale', param_shape, initializer=tf.ones_initializer())

        # Moving statistics are not trainable.
        moving_mean = tf.get_variable(
            'mean', param_shape,
            initializer=tf.zeros_initializer(),
            trainable=False)
        moving_variance = tf.get_variable(
            'variance', param_shape,
            initializer=tf.ones_initializer(),
            trainable=False)

        def batch_statistics():
            # Moments of the current batch, plus moving-average updates.
            batch_mean, batch_variance = tf.nn.moments(x, [0, 1, 2])
            mean_update = moving_averages.assign_moving_average(
                moving_mean, batch_mean, decay)
            variance_update = moving_averages.assign_moving_average(
                moving_variance, batch_variance, decay)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
            return batch_mean, batch_variance

        def stored_statistics():
            return (moving_mean, moving_variance)

        training_flag = tf.convert_to_tensor(is_training, dtype=tf.bool)
        mean, variance = tf.cond(training_flag, batch_statistics,
                                 stored_statistics)

        multiplier = math_ops.rsqrt(variance + epsilon)
        multiplier = multiplier * scale
        scaled = x * multiplier
        shift = beta - mean * multiplier
        return scaled + shift
예제 #34
0
파일: tf_util2.py 프로젝트: colzycat/LG-GAN
def batch_normalization_my(x,
                        mean,
                        variance,
                        offset,
                        scale,
                        variance_epsilon,
                        name=None):
  """Batch normalization with `scale`/`offset` tiled over axes 1 and 2 of `x`.

  When given, `scale` and `offset` are cast to float32, expanded with a new
  axis at position 1 and tiled by `[1, x.shape[1], x.shape[2], 1]` before
  being applied.
  NOTE(review): the four tile multiples imply rank-3 `scale`/`offset` and a
  rank-4 `x` - confirm against the caller.

  Args:
    x: Input tensor.
    mean: Mean used for normalization.
    variance: Variance used for normalization.
    offset: Shift term, or None to skip the shift.
    scale: Multiplier, or None to skip the scaling.
    variance_epsilon: Value added to `variance` before the reciprocal square
      root.
    name: Optional name scope (defaults to "batchnorm").

  Returns:
    `x * inv + (offset - mean * inv)` with
    `inv = scale / sqrt(variance + variance_epsilon)`.
  """
  with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
    inv = math_ops.rsqrt(variance + variance_epsilon)

    tile_multiples = [1, x.get_shape()[1], x.get_shape()[2], 1]
    # The None checks further down were unreachable because both parameters
    # were cast and tiled unconditionally; only transform what was provided.
    if scale is not None:
      scale = tf.cast(scale, tf.float32)
      scale = tf.expand_dims(scale, 1)
      scale = tf.tile(scale, tile_multiples)
    if offset is not None:
      offset = tf.cast(offset, tf.float32)
      offset = tf.expand_dims(offset, 1)
      offset = tf.tile(offset, tile_multiples)

    if scale is not None:
      inv *= scale
    # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
    # the precise order of ops that are generated by the expression below.
    return x * math_ops.cast(inv, x.dtype) + math_ops.cast(
        offset - mean * inv if offset is not None else -mean * inv, x.dtype)
예제 #35
0
def _get_folded_kernel_bias(conv_type, kernel, bias, mu, var, gamma, beta,
                            epsilon):
  """Fold batchnorm statistics into a convolution kernel and bias.

      multiplier    = gamma / sqrt(var + epsilon)   (gamma omitted if None)
      folded_kernel = kernel * multiplier
      folded_bias   = beta - (mu - bias) * multiplier

  For `conv_type == 'DepthwiseConv2D'` the multiplier is first reshaped to
  `[kernel.shape[2], kernel.shape[3]]`.

  Returns:
    A tuple `(folded_kernel, folded_bias)`.
  """
  inv_sigma = math_ops.rsqrt(var + epsilon)
  multiplier = inv_sigma if gamma is None else math_ops.mul(gamma, inv_sigma)

  if conv_type != 'DepthwiseConv2D':
    folded_kernel = math_ops.mul(multiplier, kernel, name='kernel')
  else:
    depthwise_shape = [kernel.shape[2], kernel.shape[3]]
    reshaped_multiplier = array_ops.reshape(multiplier, depthwise_shape)
    folded_kernel = math_ops.mul(
        reshaped_multiplier, kernel, name='depthwise_kernel')

  shifted_mean = (mu - bias) * multiplier
  folded_bias = math_ops.subtract(beta, shifted_mean, name='bias')
  return folded_kernel, folded_bias
예제 #36
0
    def call(self, inputs):
        """Scale `inputs` by a pooled function of their squared values.

        A normalization pool is built as `square(inputs)` combined with
        `self.gamma` across channels plus `self.beta`. The inputs are then
        multiplied by `sqrt(pool)` when `self.inverse` is set, or by
        `rsqrt(pool)` otherwise.

        Args:
          inputs: Value convertible to a tensor of dtype `self.dtype`.

        Returns:
          Tensor holding `inputs * norm_pool`; outside eager execution its
          static shape is set from `self.compute_output_shape`.
        """
        inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
        ndim = self._input_rank

        if self.rectify:
            inputs = nn.relu(inputs)

        # Compute normalization pool.
        if ndim == 2:
            norm_pool = math_ops.matmul(math_ops.square(inputs), self.gamma)
            norm_pool = nn.bias_add(norm_pool, self.beta)
        elif self.data_format == "channels_last" and ndim <= 5:
            shape = self.gamma.shape.as_list()
            gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + shape)
            norm_pool = nn.convolution(math_ops.square(inputs), gamma, "VALID")
            norm_pool = nn.bias_add(norm_pool, self.beta)
        else:  # generic implementation
            # This puts channels in the last dimension regardless of input.
            norm_pool = math_ops.tensordot(math_ops.square(inputs), self.gamma,
                                           [[self._channel_axis()], [0]])
            norm_pool += self.beta
            if self.data_format == "channels_first":
                # Return to channels_first format if necessary.
                # `range` is immutable on Python 3, so materialize a list
                # before moving the last axis to position 1.
                axes = list(range(ndim - 1))
                axes.insert(1, ndim - 1)
                norm_pool = array_ops.transpose(norm_pool, axes)

        if self.inverse:
            norm_pool = math_ops.sqrt(norm_pool)
        else:
            norm_pool = math_ops.rsqrt(norm_pool)
        outputs = inputs * norm_pool

        if not context.executing_eagerly():
            outputs.set_shape(self.compute_output_shape(inputs.shape))
        return outputs
def per_image_standardization(image):
    """Linearly scales `image` to have zero mean and unit variance.

    This op computes `(x - mean) / adjusted_stddev` for each image, where
    `mean` and `stddev` are taken over the last three axes and
    `adjusted_stddev = max(stddev, 1.0 / sqrt(num_elements))`, with
    `num_elements` being the product of the last three dimension sizes.
    The lower bound protects against division by 0 on uniform images.

    No explicit shape validation is performed in this function.

    Args:
      image: An n-D Tensor where the last 3 dimensions are
        `[height, width, channels]`.

    Returns:
      The standardized image, cast to float32, with the same shape as `image`.
    """
    with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
        image = ops.convert_to_tensor(image, name='image')
        # Count elements over the same trailing three axes that the mean and
        # variance reduce over. Slicing `[1:4]` was only correct for 4-D
        # input and miscounted 3-D images and inputs of rank above 4.
        num_pixels = math_ops.reduce_prod(array_ops.shape(image)[-3:])
        image = math_ops.cast(image, dtype=dtypes.float32)
        image_mean = math_ops.reduce_mean(image,
                                          axis=[-1, -2, -3],
                                          keepdims=True)
        # Var[x] = E[x^2] - E[x]^2; relu clamps any negative value to zero
        # before the square root.
        variance = (math_ops.reduce_mean(
            math_ops.square(image), axis=[-1, -2, -3], keepdims=True) -
                    math_ops.square(image_mean))
        variance = gen_nn_ops.relu(variance)
        stddev = math_ops.sqrt(variance)

        # Apply a minimum normalization that protects us against uniform images.
        min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
        pixel_value_scale = math_ops.maximum(stddev, min_stddev)
        pixel_value_offset = image_mean

        image = math_ops.subtract(image, pixel_value_offset)
        image = math_ops.div(image, pixel_value_scale, name=scope)
        return image
def _bahdanau_coverage_mul_score(processed_query, keys, coverage_features, normalize):
    """Scores `keys` against a query with an extra coverage term.

    The score is the sum over dimension 2 of
    `v * tanh(keys + query + coverage_features)`, where `v` is the
    "attention_v" variable. With `normalize` set, `v` is replaced by
    `g * v / ||v||` and the bias "attention_b" is added inside the `tanh`.

    Args:
      processed_query: Query tensor; a broadcast dimension is inserted at
        axis 1 before it is added to `keys`.
      keys: Tensor whose dimension 2 gives the number of hidden units
        (presumably `[batch_size, max_time, num_units]` - confirm with caller).
      coverage_features: Tensor added to `keys` and the query.
      normalize: Whether to apply weight normalization to `v`.

    Returns:
      The score tensor, reduced over dimension 2.
    """
    dtype = processed_query.dtype
    # Prefer the static size of the trailing dimension; fall back to the
    # dynamic shape when it is not known.
    depth = keys.shape[2].value
    if not depth:
        depth = array_ops.shape(keys)[2]
    # [batch_size, ...] -> [batch_size, 1, ...] so the query broadcasts.
    query = array_ops.expand_dims(processed_query, 1)
    v = variable_scope.get_variable(
        "attention_v", [depth], dtype=dtype)

    if not normalize:
        return math_ops.reduce_sum(
            v * math_ops.tanh(keys + query + coverage_features), [2])

    # Scalar gain used by weight normalization.
    g = variable_scope.get_variable(
        "attention_g", dtype=dtype,
        initializer=math.sqrt((1. / depth)))
    # Bias added before the nonlinearity.
    b = variable_scope.get_variable(
        "attention_b", [depth], dtype=dtype,
        initializer=init_ops.zeros_initializer())
    # normed_v = g * v / ||v||
    scaled_v = g * v
    normed_v = scaled_v * math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(v)))
    activations = math_ops.tanh(keys + query + coverage_features + b)
    return math_ops.reduce_sum(normed_v * activations, [2])
    def call(self, inputs, training=None):
        """Applies the depthwise convolution with batch norm folded in.

        The plain depthwise convolution is run first so that the wrapped
        batchnorm layer is called on its output. The kernel and bias are then
        rescaled with the batchnorm parameters, optionally quantized, and used
        in a second depthwise convolution that produces the returned tensor.

        Args:
            inputs: Input passed to `tf.keras.backend.depthwise_conv2d`.
            training: Training flag; resolved through the batchnorm layer's
                `_get_training_value`.

        Returns:
            The folded convolution output with the folded bias added, passed
            through `self.activation` when one is set.

        Raises:
            ValueError: If `self.folding_mode` is neither
                "batch_stats_folding" nor "ema_stats_folding".
        """

        # resolve the training flag through the batchnorm layer
        training = self.batchnorm._get_training_value(training)  # pylint: disable=protected-access

        # checking if to update batchnorm params
        if (self.ema_freeze_delay is None) or (self.ema_freeze_delay < 0):
            # if ema_freeze_delay is None or a negative value, do not freeze bn stats
            bn_training = tf.cast(training, dtype=bool)
        else:
            bn_training = tf.math.logical_and(
                training,
                tf.math.less_equal(self._iteration, self.ema_freeze_delay))

        depthwise_kernel = self.depthwise_kernel

        # run depthwise_conv2d to produce output for the following batchnorm
        conv_outputs = tf.keras.backend.depthwise_conv2d(
            inputs,
            depthwise_kernel,
            strides=self.strides,
            padding=self.padding,
            dilation_rate=self.dilation_rate,
            data_format=self.data_format)

        if self.use_bias:
            bias = self.bias
            conv_outputs = tf.keras.backend.bias_add(
                conv_outputs, bias, data_format=self.data_format)
        else:
            bias = 0

        # the return value is discarded; the layer is called on the unfolded
        # convolution output
        _ = self.batchnorm(conv_outputs, training=bn_training)

        # count one more iteration only while training
        self._iteration.assign_add(
            tf_utils.smart_cond(training, lambda: tf.constant(1, tf.int64),
                                lambda: tf.constant(0, tf.int64)))

        # calculate mean and variance from current batch
        bn_shape = conv_outputs.shape
        ndims = len(bn_shape)
        reduction_axes = [
            i for i in range(ndims) if i not in self.batchnorm.axis
        ]
        keep_dims = len(self.batchnorm.axis) > 1
        mean, variance = self.batchnorm._moments(  # pylint: disable=protected-access
            math_ops.cast(conv_outputs, self.batchnorm._param_dtype),  # pylint: disable=protected-access
            reduction_axes,
            keep_dims=keep_dims)
        gamma = self.batchnorm.gamma
        beta = self.batchnorm.beta
        moving_mean = self.batchnorm.moving_mean
        moving_variance = self.batchnorm.moving_variance

        # Fail loudly on an unknown mode: `inv` below is only defined for the
        # two supported folding modes.
        if self.folding_mode not in [
                "batch_stats_folding", "ema_stats_folding"
        ]:
            raise ValueError("mode {} not supported!".format(
                self.folding_mode))

        mv_inv = math_ops.rsqrt(moving_variance + self.batchnorm.epsilon)
        batch_inv = math_ops.rsqrt(variance + self.batchnorm.epsilon)

        if gamma is not None:
            mv_inv *= gamma
            batch_inv *= gamma

        folded_bias = tf_utils.smart_cond(
            bn_training, lambda: batch_inv * (bias - mean) + beta,
            lambda: mv_inv * (bias - moving_mean) + beta)

        if self.folding_mode == "batch_stats_folding":
            # using batch mean and variance in the initial training stage
            # after sufficient training, switch to moving mean and variance
            inv = tf_utils.smart_cond(bn_training, lambda: batch_inv,
                                      lambda: mv_inv)

        elif self.folding_mode == "ema_stats_folding":
            # We always scale the weights with a correction factor to the long term
            # statistics prior to quantization. This ensures that there is no jitter
            # in the quantized weights due to batch to batch variation. During the
            # initial phase of training, we undo the scaling of the weights so that
            # outputs are identical to regular batch normalization. We also modify
            # the bias terms correspondingly. After sufficient training, switch from
            # using batch statistics to long term moving averages for batch
            # normalization.

            # use batch stats for calculating bias before bn freeze, and use moving
            # stats after bn freeze

            # moving stats is always used to fold kernel in tflite; before bn freeze
            # an additional correction factor will be applied to the depthwiseconv2d
            # output
            inv = mv_inv

        # for DepthwiseConv2D inv needs to be broadcasted to the last 2 dimensions
        # of the kernels
        depthwise_weights_shape = [
            depthwise_kernel.get_shape().as_list()[2],
            depthwise_kernel.get_shape().as_list()[3]
        ]
        inv = array_ops.reshape(inv, depthwise_weights_shape)
        # wrap conv kernel with bn parameters
        folded_depthwise_kernel = inv * depthwise_kernel
        # quantize the folded kernel
        if self.depthwise_quantizer is not None:
            q_folded_depthwise_kernel = self.depthwise_quantizer_internal(
                folded_depthwise_kernel)
        else:
            q_folded_depthwise_kernel = folded_depthwise_kernel

        # If loaded from a ckpt, bias_quantizer is the ckpt value
        # Else if bias_quantizer not specified, bias
        #   quantizer is None and we need to calculate bias quantizer
        #   type according to accumulator type. User can call
        #   bn_folding_utils.populate_bias_quantizer_for_folded_layers(
        #      model, input_quantizer_list]) to populate such bias quantizer.
        if self.bias_quantizer is not None:
            q_folded_bias = self.bias_quantizer_internal(folded_bias)
        else:
            q_folded_bias = folded_bias

        applied_kernel = q_folded_depthwise_kernel
        applied_bias = q_folded_bias

        # calculate depthwise_conv2d output using the quantized folded kernel
        folded_outputs = tf.keras.backend.depthwise_conv2d(
            inputs,
            applied_kernel,
            strides=self.strides,
            padding=self.padding,
            dilation_rate=self.dilation_rate,
            data_format=self.data_format)

        if training is True and self.folding_mode == "ema_stats_folding":
            # undo the moving-statistics scaling of the kernel while batch
            # statistics are still in use
            y_corr = tf_utils.smart_cond(
                bn_training, lambda:
                (math_ops.sqrt(moving_variance + self.batchnorm.epsilon) *
                 math_ops.rsqrt(variance + self.batchnorm.epsilon)),
                lambda: tf.constant(1.0, shape=moving_variance.shape))
            folded_outputs = math_ops.mul(folded_outputs, y_corr)

        folded_outputs = tf.keras.backend.bias_add(
            folded_outputs, applied_bias, data_format=self.data_format)

        if self.activation is not None:
            return self.activation(folded_outputs)

        return folded_outputs
예제 #40
0
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Folds each matched fused batch norm into the layer feeding it.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, true if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails.
  """
  for match in _FindFusedBatchNorms(graph):
    layer_scope, slash, _ = match.layer_op.name.rpartition('/')
    # The trailing '/' (i.e. `slash`) makes TF reuse the existing scope instead
    # of creating a fresh, uniquified one that merely starts with the same
    # name. New ops are added to `graph` under that scope.
    parent_scope = layer_scope + slash
    fold_scope = layer_scope + slash + 'BatchNorm_Fold' + slash
    with graph.as_default(), graph.name_scope(parent_scope):
      with graph.name_scope(fold_scope):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        if is_training:
          corrections = _ComputeBatchNormCorrections(
              context='',
              match=match,
              freeze_batch_norm_delay=freeze_batch_norm_delay,
              fused_batch_norm=True)
        else:
          corrections = (None, None, None)
        correction_scale, correction_recip, correction_offset = corrections

        # Depthwise weights have a different layout, so the multiplier (and the
        # correction scale, if any) is reshaped to the last two weight
        # dimensions to keep the scaled weights in the expected shape.
        weights = match.weight_tensor
        if match.layer_op.type == 'DepthwiseConv2dNative':
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')

          if correction_scale is not None:
            correction_scale = array_ops.reshape(
                correction_scale, new_shape, name='correction_reshape')

      has_scale_correction = correction_scale is not None
      if has_scale_correction:
        weights = math_ops.multiply(
            correction_scale, weights, name='correction_mult')

      scaled_weight_tensor = math_ops.multiply(
          weights, multiplier_tensor, name='mul_fold')
      folded_layer = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor)

      if correction_recip is not None:
        folded_layer = math_ops.multiply(
            correction_recip, folded_layer, name='post_conv_mul')
        folded_layer = math_ops.add(folded_layer, (correction_offset),
                                    'correction_add')

      folded_output = math_ops.add(
          folded_layer, bias_tensor, name='add_fold')

      # Point every consumer of the original batch norm output at the folded
      # result; zero rerouted consumers means there was nothing to fold into.
      rerouted = graph_editor.reroute_ts(folded_output, match.output_tensor)
      if rerouted == 0:
        raise ValueError('Folding batch norms failed, %s had no outputs.' %
                         match.output_tensor.name)
예제 #41
0
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
    """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.

    Implemented with GPU-compatible ops and supports gradients.

    [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of
    taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s
    MFCCs use a particular scaling of the DCT-II which is almost orthogonal
    normalization. We follow this convention.

    All `num_mel_bins` MFCCs are returned and it is up to the caller to select
    a subset of the MFCCs based on their application. For example, it is typical
    to only use the first few for speech recognition, as this results in
    an approximately pitch-invariant representation of the signal.

    For example:

    ```python
    sample_rate = 16000.0
    # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
    pcm = tf.placeholder(tf.float32, [None, None])

    # A 1024-point STFT with frames of 64 ms and 75% overlap.
    stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
                                   fft_length=1024)
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = stfts.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
    linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
      num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
      upper_edge_hertz)
    mel_spectrograms = tf.tensordot(
      spectrograms, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)

    # Compute MFCCs from log_mel_spectrograms and take the first 13.
    mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
      log_mel_spectrograms)[..., :13]
    ```

    Args:
      log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of
        log-magnitude mel-scale spectrograms.
      name: An optional name for the operation.

    Returns:
      A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of
      `log_mel_spectrograms`.

    Raises:
      ValueError: If `num_mel_bins` is not positive.

    [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
    [htk]: https://en.wikipedia.org/wiki/HTK_(software)
    """
    with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
                        [log_mel_spectrograms]):
        # Compute the DCT-II of the resulting log-magnitude mel-scale spectrogram.
        # The DCT used in HTK scales every basis vector by sqrt(2/N), which is the
        # scaling required for an "orthogonal" DCT-II *except* in the 0th bin, where
        # the true orthogonal DCT (as implemented by scipy) scales by sqrt(1/N). For
        # this reason, we don't apply orthogonal normalization and scale the DCT by
        # `0.5 * sqrt(2/N)` manually.
        log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
                                                     dtype=dtypes.float32)
        # Use the static size of the last dimension when it is known (so that an
        # empty bin axis can be rejected eagerly); otherwise read it at run time.
        if (log_mel_spectrograms.shape.ndims
                and log_mel_spectrograms.shape.dims[-1].value is not None):
            num_mel_bins = log_mel_spectrograms.shape.dims[-1].value
            if num_mel_bins == 0:
                raise ValueError('num_mel_bins must be positive. Got: %s' %
                                 log_mel_spectrograms)
        else:
            num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]

        dct2 = spectral_ops.dct(log_mel_spectrograms)
        # rsqrt(2N) == 0.5 * sqrt(2/N). An explicit cast replaces the
        # deprecated `to_float` helper.
        num_mel_bins_float = math_ops.cast(num_mel_bins, dtypes.float32)
        return dct2 * math_ops.rsqrt(num_mel_bins_float * 2.0)
예제 #42
0
def group_norm(inputs,
               groups=32,
               channels_axis=-1,
               reduction_axes=(-3, -2),
               center=True,
               scale=True,
               epsilon=1e-6,
               activation_fn=None,
               param_initializers=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None,
               mean_close_to_zero=False):
  """Functional interface for the group normalization layer.

  Reference: https://arxiv.org/abs/1803.08494.

    "Group Normalization", Yuxin Wu, Kaiming He

  Args:
    inputs: A Tensor with at least 2 dimensions, one of which is channels. All
     shape dimensions must be fully defined.
    groups: Integer. Divide the channels into this number of groups over which
      normalization statistics are computed. This number must be commensurate
      with the number of channels in `inputs`.
    channels_axis: An integer. Specifies index of channels axis which will be
      broken into `groups`, each of which whose statistics will be computed
      across. Must be mutually exclusive with `reduction_axes`. Preferred usage
      is to specify negative integers to be agnostic as to whether a batch
      dimension is included.
    reduction_axes: Tuple of integers. Specifies dimensions over which
       statistics will be accumulated. Must be mutually exclusive with
       `channels_axis`. Statistics will not be accumulated across axes not
       specified in `reduction_axes` nor `channel_axis`. Preferred usage is to
       specify negative integers to be agnostic to whether a batch dimension is
       included.

      Some sample usage cases:
        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]

    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    param_initializers: Optional initializers for beta, gamma, moving mean and
      moving variance.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    scope: Optional scope for `variable_scope`.
    mean_close_to_zero: The mean of `input` before ReLU will be close to zero
      when batch size >= 4k for Resnet-50 on TPU. If `True`, use
      `nn.sufficient_statistics` and `nn.normalize_moments` to calculate the
      variance. This is the same behavior as `fused` equals `True` in batch
      normalization. If `False`, use `nn.moments` to calculate the variance.
      When `mean` is close to zero, like 1e-4, use `mean` to calculate the
      variance may have poor result due to repeated roundoff error and
      denormalization in `mean`.  When `mean` is large, like 1e2,
      sum(`input`^2) is so large that only the high-order digits of the elements
      are being accumulated. Thus, use sum(`input` - `mean`)^2/n to calculate
      the variance has better accuracy compared to (sum(`input`^2)/n - `mean`^2)
      when `mean` is large.


  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If rank or channels dimension of `inputs` is undefined.
    ValueError: If number of groups is not commensurate with number of channels.
    ValueError: If reduction_axes or channels_axis are out of bounds.
    ValueError: If reduction_axes are not mutually exclusive with channels_axis.
  """
  # TODO(shlens): Support partially defined shapes for the inputs.
  inputs = ops.convert_to_tensor(inputs)
  original_shape = inputs.shape

  ndims = inputs.shape.ndims
  if ndims is None:
    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
  if channels_axis > (ndims - 1):
    raise ValueError('Axis is out of bounds.')

  # Standardize the channels_axis to be positive and identify # of channels.
  if channels_axis < 0:
    channels_axis = ndims + channels_axis
  # An axis that is still negative was below -ndims; indexing with it would
  # silently pick up an unrelated dimension.
  if channels_axis < 0:
    raise ValueError('Axis is out of bounds.')
  channels = inputs.shape[channels_axis].value

  if channels is None:
    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
        inputs.name, channels_axis))

  # Standardize the reduction_axes to be positive.
  reduction_axes = [a + ndims if a < 0 else a for a in reduction_axes]

  for a in reduction_axes:
    # Valid axes are 0 .. ndims - 1; reject anything else before indexing the
    # shape with it.
    if a < 0 or a >= ndims:
      raise ValueError('Axis is out of bounds.')
    if inputs.shape[a].value is None:
      raise ValueError('Inputs %s has undefined dimensions %d.' % (
          inputs.name, a))
    if channels_axis == a:
      raise ValueError('reduction_axis must be mutually exclusive '
                       'with channels_axis')
  if groups > channels:
    raise ValueError('Invalid groups %d for %d channels.' % (groups, channels))
  if channels % groups != 0:
    raise ValueError('%d channels is not commensurate with %d groups.' %
                     (channels, groups))

  # Determine axes before channels. Some examples of common image formats:
  #  'NCHW': before = [N], after = [HW]
  #  'NHWC': before = [NHW], after = []
  axes_before_channels = inputs.shape.as_list()[:channels_axis]
  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]

  # Manually broadcast the parameters to conform to the number of groups.
  params_shape_broadcast = ([1] * len(axes_before_channels) +
                            [groups, channels // groups] +
                            [1] * len(axes_after_channels))

  # Reshape the input by the group within the channel dimension.
  inputs_shape = (axes_before_channels + [groups, channels // groups] +
                  axes_after_channels)
  inputs = array_ops.reshape(inputs, inputs_shape)

  # Determine the dimensions across which moments are calculated. Splitting
  # the channel axis in two shifts every axis after it by one.
  moments_axes = [channels_axis + 1] + [
      a + 1 if a > channels_axis else a for a in reduction_axes
  ]

  with variable_scope.variable_scope(
      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
    # Note that the params_shape is the number of channels always.
    params_shape = [channels]

    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    dtype = inputs.dtype.base_dtype
    if param_initializers is None:
      param_initializers = {}
    if center:
      beta_collections = utils.get_variable_collections(
          variables_collections, 'beta')
      beta_initializer = param_initializers.get(
          'beta', init_ops.zeros_initializer())
      beta = variables.model_variable('beta',
                                      shape=params_shape,
                                      dtype=dtype,
                                      initializer=beta_initializer,
                                      collections=beta_collections,
                                      trainable=trainable)
      beta = array_ops.reshape(beta, params_shape_broadcast)

    if scale:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma_initializer = param_initializers.get(
          'gamma', init_ops.ones_initializer())
      gamma = variables.model_variable('gamma',
                                       shape=params_shape,
                                       dtype=dtype,
                                       initializer=gamma_initializer,
                                       collections=gamma_collections,
                                       trainable=trainable)
      gamma = array_ops.reshape(gamma, params_shape_broadcast)

    # Calculate the moments.
    if mean_close_to_zero:
      # One pass algorithm returns better result when mean is close to zero.
      counts, means_ss, variance_ss, _ = nn.sufficient_statistics(
          inputs, moments_axes, keep_dims=True)
      mean, variance = nn.normalize_moments(
          counts, means_ss, variance_ss, shift=None)
    else:
      mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

    # Compute normalization.
    # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
    # appropriately so that this operation may be faster.
    gain = math_ops.rsqrt(variance + epsilon)
    offset = -mean * gain
    if gamma is not None:
      gain *= gamma
      offset *= gamma
    if beta is not None:
      offset += beta
    outputs = inputs * gain + offset

    # Collapse the groups into the channel dimension.
    outputs = array_ops.reshape(outputs, original_shape)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
예제 #43
0
 def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon,
                   scale_after_normalization, shift_after_normalization):
     """Batch-normalizes `x` with elementwise ops.

     Computes `(x - m) / sqrt(v + epsilon)`, multiplies by `gamma` when
     `scale_after_normalization` is set and adds `beta` when
     `shift_after_normalization` is set.
     """
     centered = x - m
     normalized = centered * math_ops.rsqrt(v + epsilon)
     if scale_after_normalization:
         normalized = gamma * normalized
     if shift_after_normalization:
         return normalized + beta
     return normalized
예제 #44
0
    def call(self, inputs, training=None):
        """Runs the convolution with the batch norm folded into its weights.

        The parent convolution and the batchnorm layer are executed first, then
        the kernel is scaled by `gamma / sqrt(moving_variance + epsilon)`,
        optionally quantized, and used in a second convolution whose output
        gets the folded bias added.

        Args:
            inputs: Input tensor for the convolution.
            training: Training flag; defaults to `K.learning_phase()` when None.

        Returns:
            The folded convolution output, passed through
            `self.post_activation` when one is set.
        """
        if training is None:
            training = K.learning_phase()

        conv_out = super(_ConvBatchNorm2D, self).call(inputs)

        # Not all the computations in the batchnorm need to happen,
        # but this avoids duplicating code (e.g. moving_average).
        self.batchnorm.call(conv_out)

        folded_conv_kernel_multiplier = self.batchnorm.gamma * math_ops.rsqrt(
            self.batchnorm.moving_variance + self.batchnorm.epsilon)
        folded_conv_kernel = math_ops.mul(folded_conv_kernel_multiplier,
                                          self.kernel,
                                          name='folded_conv_kernel')

        folded_conv_bias = math_ops.subtract(self.batchnorm.beta,
                                             self.batchnorm.moving_mean *
                                             folded_conv_kernel_multiplier,
                                             name='folded_conv_bias')

        if self.is_quantized:

            def make_quantizer_fn(training):
                """Return quantizer conditioned on whether training or not."""
                def quantizer_fn():
                    return self.weight_quantizer(folded_conv_kernel,
                                                 self.optimizer_step,
                                                 training,
                                                 min_var=self._weight_min_var,
                                                 max_var=self._weight_max_var)

                return quantizer_fn

            folded_conv_kernel = tf_utils.smart_cond(training,
                                                     make_quantizer_fn(True),
                                                     make_quantizer_fn(False))

        # Second convolution doesn't need new trainable weights, so we
        # cannot reuse Conv2D layer.
        # TODO(alanchiao):
        # 1. See if we can at least reuse the bias logic.
        # 2. See if we need to fork between conv2d and conv2d_v2 for
        #    TensorFlow 1.XX and 2.XX.

        # Taken from keras/layers/convolutional.py:183
        if self.padding == 'causal':
            op_padding = 'valid'
        else:
            op_padding = self.padding
        if not isinstance(op_padding, (list, tuple)):
            op_padding = op_padding.upper()

        folded_conv_out = nn_ops.conv2d(
            inputs,
            folded_conv_kernel,
            strides=self.strides,
            padding=op_padding,
            data_format=conv_utils.convert_data_format(self.data_format,
                                                       self.rank + 2),
            dilations=self.dilation_rate,
            name='folded_conv_out',
        )

        # Taken from keras/layers/convolutional.py:200
        if self.data_format == 'channels_first':
            if self.rank == 1:
                # nn.bias_add does not accept a 1D input tensor.
                bias = array_ops.reshape(folded_conv_bias,
                                         (1, self.filters, 1))
                # Bind the result to `outputs` like the other branches do, so
                # the return below always has a value to work with.
                outputs = folded_conv_out + bias
            else:
                outputs = nn.bias_add(folded_conv_out,
                                      folded_conv_bias,
                                      data_format='NCHW')
        else:
            outputs = nn.bias_add(folded_conv_out,
                                  folded_conv_bias,
                                  data_format='NHWC')

        if self.is_quantized:
            self.post_activation.training = training

        if self.post_activation is not None:
            return self.post_activation(outputs)
        return outputs
예제 #45
0
 def _variance_scale_term(self):
   """Returns `1 / sqrt(1 + total_concentration)` with a trailing unit axis.

   Shared scale helper for `_covariance` and `_variance`.
   """
   expanded_concentration = self.total_concentration[..., None]
   return math_ops.rsqrt(1. + expanded_concentration)
예제 #46
0
 def _variance_scale_term(self):
   """Returns `1 / sqrt(1 + total_concentration)` with a trailing unit axis.

   Shared scale helper for `_covariance` and `_variance`.
   """
   expanded_concentration = self.total_concentration[..., array_ops.newaxis]
   return math_ops.rsqrt(1. + expanded_concentration)
예제 #47
0
파일: dct_ops.py 프로젝트: MFChunga/poo
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
    """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

    Types I, II, III and IV are supported.
    Type I is implemented using a length `2N` padded `tf.signal.rfft`.
    Type II is implemented using a length `2N` padded `tf.signal.rfft`, as
    described here: [Type 2 DCT using 2N FFT padded (Makhoul)]
    (https://dsp.stackexchange.com/a/10606).
    Type III is a fairly straightforward inverse of Type II
    (i.e. using a length `2N` padded `tf.signal.irfft`).
    Type IV is calculated through 2N length DCT2 of padded signal and
    picking the odd indices.

    @compatibility(scipy)
    Equivalent to [scipy.fftpack.dct]
    (https://docs.scipy.org/doc/scipy-1.4.0/reference/generated/scipy.fftpack.dct.html)
    for Type-I, Type-II, Type-III and Type-IV DCT.
    @end_compatibility

    Args:
      input: A `[..., samples]` `float32`/`float64` `Tensor` containing the
        signals to take the DCT of.
      type: The DCT type to perform. Must be 1, 2, 3 or 4.
      n: The length of the transform. If length is less than sequence length,
        only the first n elements of the sequence are considered for the DCT.
        If n is greater than the sequence length, zeros are padded and then
        the DCT is computed as usual.
      axis: For future expansion. The axis to compute the DCT along. Must be
        `-1`.
      norm: The normalization to apply. `None` for no normalization or
        `'ortho'` for orthonormal normalization.
      name: An optional name for the operation.

    Returns:
      A `[..., samples]` `float32`/`float64` `Tensor` containing the DCT of
      `input`.

    Raises:
      ValueError: If `type` is not `1`, `2`, `3` or `4`, `axis` is
        not `-1`, `n` is not `None` or greater than 0,
        or `norm` is not `None` or `'ortho'`.
      ValueError: If `type` is `1` and `norm` is `ortho`.

    [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
    """
    _validate_dct_arguments(input, type, n, axis, norm)
    with _ops.name_scope(name, "dct", [input]):
        signal = _ops.convert_to_tensor(input)
        zero = _ops.convert_to_tensor(0.0, dtype=signal.dtype)

        # Static length of the last axis when known, dynamic otherwise.
        original_len = (tensor_shape.dimension_value(signal.shape[-1])
                        or _array_ops.shape(signal)[-1])
        if n is not None:
            if n <= original_len:
                # Truncate to the first n samples.
                signal = signal[..., 0:n]
            else:
                # Zero-pad the end of the last axis up to length n.
                rank = len(signal.shape)
                paddings = [[0, 0] for _ in range(rank)]
                paddings[rank - 1][1] = n - original_len
                paddings = _ops.convert_to_tensor(paddings, dtype=_dtypes.int32)
                signal = _array_ops.pad(signal, paddings=paddings)

        # Length after truncation / padding, as an int (or tensor) and as a
        # value in the signal's dtype.
        num_samples = (tensor_shape.dimension_value(signal.shape[-1])
                       or _array_ops.shape(signal)[-1])
        num_samples_float = _math_ops.cast(num_samples, signal.dtype)

        if type == 1:
            mirrored = _array_ops.concat([signal, signal[..., -2:0:-1]],
                                         axis=-1)
            return _math_ops.real(fft_ops.rfft(mirrored))

        if type == 2:
            phase = (-_math_ops.range(num_samples_float) * _math.pi * 0.5 /
                     num_samples_float)
            twiddle = 2.0 * _math_ops.exp(_math_ops.complex(zero, phase))

            # TODO(rjryan): Benchmark performance and memory usage of the various
            # approaches to computing a DCT via the RFFT.
            spectrum = fft_ops.rfft(
                signal, fft_length=[2 * num_samples])[..., :num_samples]
            result = _math_ops.real(spectrum * twiddle)

            if norm == "ortho":
                first_weight = 0.5 * _math_ops.rsqrt(num_samples_float)
                other_weight = first_weight * _math.sqrt(2.0)
                # tf.pad builds the vector [first, other, other, other, ...].
                ortho_weights = _array_ops.pad(
                    _array_ops.expand_dims(first_weight, 0),
                    [[0, num_samples - 1]],
                    constant_values=other_weight)
                result = result * ortho_weights

            return result

        if type == 3:
            if norm == "ortho":
                first_weight = _math_ops.sqrt(num_samples_float)
                other_weight = first_weight * _math.sqrt(0.5)
                # tf.pad builds the vector [first, other, other, other, ...].
                ortho_weights = _array_ops.pad(
                    _array_ops.expand_dims(first_weight, 0),
                    [[0, num_samples - 1]],
                    constant_values=other_weight)
                signal = signal * ortho_weights
            else:
                signal = signal * num_samples_float
            phase = (_math_ops.range(num_samples_float) * _math.pi * 0.5 /
                     num_samples_float)
            twiddle = 2.0 * _math_ops.exp(_math_ops.complex(zero, phase))
            inverse = fft_ops.irfft(twiddle * _math_ops.complex(signal, zero),
                                    fft_length=[2 * num_samples])
            return _math_ops.real(inverse)[..., :num_samples]

        if type == 4:
            # DCT-2 of 2N length zero-padded signal, unnormalized.
            padded_dct2 = dct(signal, type=2, n=2 * num_samples, axis=axis,
                              norm=None)
            # The odd indices of that DCT-2 are the DCT-4 of the original
            # N length signal.
            result = padded_dct2[..., 1::2]
            if norm == "ortho":
                result = result * (_math.sqrt(0.5) *
                                   _math_ops.rsqrt(num_samples_float))

            return result
예제 #48
0
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
  """Returns the gradients for the 3 inputs of BatchNorm.

  In training mode the batch mean and variance are recomputed from `x`; in
  inference mode `pop_mean` and `pop_var` are used as given.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
      is_training=False.
    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
      when is_training=False.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW". Any
      value other than b"NHWC" is reduced over axes [0, 2, 3].
    is_training: A bool value to indicate the operation is for training
      (default) or inference.

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset. Only grad_x is cast back to the dtype of `x`.
  """
  x_dtype = x.dtype.base_dtype
  if x_dtype == dtypes.float16:
    # float16 math is too imprecise, so we do the batch norm gradient
    # computations in float32.
    x = math_ops.cast(x, dtypes.float32)
    grad_y = math_ops.cast(grad_y, dtypes.float32)
  if is_training:
    if data_format == b"NHWC":
      # Channels are last, so reduced statistics broadcast against x as-is.
      keepdims = False
      reduce_axis = [0, 1, 2]
    else:
      # Channels sit on axis 1: keep the reduced dims and reshape `scale` to
      # [1, C, 1, 1] so everything broadcasts against x.
      keepdims = True
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(scale), 1, 1]
      scale = array_ops.reshape(scale, shape)
    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
    var_x = math_ops.reduce_mean(
        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
        reduce_axis,
        keepdims=keepdims)
    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    # grad_x = scale / sqrt(var + eps) *
    #     (grad_y - mean(grad_y) - x_offset * mean(grad_y * x_offset) / (var + eps))
    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    if data_format == b"NCHW":
      # Drop only the reduced axes. A bare squeeze would also remove the
      # channel axis when there is a single channel, yielding a scalar instead
      # of a rank-1 tensor.
      grad_scale = array_ops.squeeze(grad_scale, axis=reduce_axis)
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
  else:
    if data_format == b"NHWC":
      reduce_axis = [0, 1, 2]
    else:
      reduce_axis = [0, 2, 3]
      # Reshape the per-channel vectors to [1, C, 1, 1] for broadcasting.
      shape = [1, array_ops.size(pop_mean), 1, 1]
      pop_mean = array_ops.reshape(pop_mean, shape)
      pop_var = array_ops.reshape(pop_var, shape)
      scale = array_ops.reshape(scale, shape)

    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
    grad_scale = math_ops.reduce_sum(
        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
    grad_x = grad_y * scale * var_rsqrt
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
예제 #49
0
 def BN0(x):
     """Centers `x` by its axis-0 mean and scales by a reciprocal std.

     The variance term averages the squared deviations over every element
     (no axis is passed to the second reduction), so it is a single value.
     """
     batch_mean = math_ops.reduce_mean(x, [0])
     squared_dev = math_ops.square(x - batch_mean)
     # Biased variance plus a small constant, inverted via rsqrt.
     inv_std = math_ops.rsqrt(math_ops.reduce_mean(squared_dev) + 1e-8)
     return (x - batch_mean) * inv_std
예제 #50
0
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
    """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

    This implementation handles Types II and III. Type II is computed from a
    length `2N` padded RFFT (see https://dsp.stackexchange.com/a/10606), and
    Type III is the matching inverse built on a length `2N` padded IRFFT.

    @compatibility(scipy)
    Equivalent to scipy.fftpack.dct for Type-II and Type-III DCT.
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
    @end_compatibility

    Args:
      input: A `[..., samples]` `float32` `Tensor` containing the signals to
        take the DCT of.
      type: The DCT type to perform. Must be 2 or 3.
      n: For future expansion. The length of the transform. Must be `None`.
      axis: For future expansion. The axis to compute the DCT along. Must be
        `-1`.
      norm: The normalization to apply. `None` for no normalization or
        `'ortho'` for orthonormal normalization.
      name: An optional name for the operation.

    Returns:
      A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.

    Raises:
      ValueError: If `type` is not `2` or `3`, `n` is not `None`, `axis` is not
        `-1`, or `norm` is not `None` or `'ortho'` (checked by
        `_validate_dct_arguments`).

    [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
    """
    _validate_dct_arguments(type, n, axis, norm)
    with _ops.name_scope(name, "dct", [input]):
        # The transform is built on the (I)RFFT, which TensorFlow only
        # supports for float32 at the moment.
        input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)

        # Prefer the statically known length; otherwise read it at run time.
        static_len = tensor_shape.dimension_value(input.shape[-1])
        axis_dim = static_len or _array_ops.shape(input)[-1]
        axis_dim_float = _math_ops.to_float(axis_dim)

        def _head_then_fill(head, fill):
            # Builds the vector [head, fill, fill, ...] of length `axis_dim`
            # by padding a one-element tensor with a constant.
            return _array_ops.pad(_array_ops.expand_dims(head, 0),
                                  [[0, axis_dim - 1]],
                                  constant_values=fill)

        if type == 2:
            phase = (-_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
                     axis_dim_float)
            twiddle = 2.0 * _math_ops.exp(_math_ops.complex(0.0, phase))

            # TODO(rjryan): Benchmark performance and memory usage of the
            # various approaches to computing a DCT via the RFFT.
            spectrum = rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim]
            result = _math_ops.real(spectrum * twiddle)

            if norm == "ortho":
                first_weight = 0.5 * _math_ops.rsqrt(axis_dim_float)
                other_weight = first_weight * _math_ops.sqrt(2.0)
                result *= _head_then_fill(first_weight, other_weight)

            return result

        if type == 3:
            if norm == "ortho":
                first_weight = _math_ops.sqrt(axis_dim_float)
                other_weight = first_weight * _math_ops.sqrt(0.5)
                scaled = input * _head_then_fill(first_weight, other_weight)
            else:
                scaled = input * axis_dim_float

            phase = (_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
                     axis_dim_float)
            twiddle = 2.0 * _math_ops.exp(_math_ops.complex(0.0, phase))
            signal = irfft(twiddle * _math_ops.complex(scaled, 0.0),
                           fft_length=[2 * axis_dim])

            return _math_ops.real(signal)[..., :axis_dim]
예제 #51
0
파일: gan5.py 프로젝트: abiaozsh/MyCode
def b_n(value, mean, variance, beta, gamma, epsilon):
    """Applies batch normalization with offset `beta` and scale `gamma`.

    Computes `value * s + (beta - mean * s)` where
    `s = gamma / sqrt(variance + epsilon)`.
    """
    # rsqrt(v) is 1 / sqrt(v); fold the scale into it once.
    scaled_inv_std = math_ops.rsqrt(variance + epsilon) * gamma
    scaled_value = value * scaled_inv_std
    shift = beta - mean * scaled_inv_std
    return scaled_value + shift
예제 #52
0
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
                                 fused_batch_norm):
  """Computes batch norm correction params.

     Before batch normalization is frozen:
     We use batch statistics for batch norm.
       correction_scale = sigma_b/sigma_mv
       correction_recip = 1/correction_scale
       correction_offset = 0

     After batch normalization is frozen:
      correction_scale = sigma_b/sigma_mv
      correction_recip = 1
      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).

     Batch norm is frozen once the quantization step is greater than or equal
     to freeze_batch_norm_delay. If freeze_batch_norm_delay is None it is never
     frozen.
     The corrections ensure that:
     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
     smoother training as the scaling on the weights changes slowly, rather than
     jump across mini-batches
     b) Changing the values of the corrections allows for one to switch between
     using batch statistics to using moving mean and average, without requiring
     changes to batch_norm

  As a side effect, the consumers of the batch norm decay tensor(s) are
  rerouted so that the decay becomes 0.0 once batch norm is frozen.

  Args:
    context: The scope under which we look for batch norm params
    match: Object containing required batch norm tensors for correction
      computation.
    freeze_batch_norm_delay: Delay in steps at which computation switches
      from regular batch norm to frozen mean and variance.
    fused_batch_norm: Bool, true if fused batch norm is used. The variance
      decay tensor is only rerouted when this is exactly `False`.

  Returns:
    A tuple of correction_scale, correction_recip, correction_offset
  """

  g = ops.get_default_graph()
  prefix = '' if not context else context + '/'
  with g.name_scope(prefix + 'batch_norm_correction'):
    # 1/sigma for the moving-average and the batch statistics respectively.
    recip_sigma_mv = math_ops.rsqrt(
        match.moving_variance_tensor + match.batch_epsilon)
    recip_sigma = math_ops.rsqrt(match.variance_tensor + match.batch_epsilon)
    # (1/sigma_mv) / (1/sigma_b) == sigma_b / sigma_mv.
    correction_scale = math_ops.divide(
        recip_sigma_mv, recip_sigma, name='scale_compute')
    correction_scale = array_ops.identity(
        correction_scale, name='correction_scale')
    correction_recip = math_ops.reciprocal(
        correction_scale, name='reciprocal_compute')
    correction_offset = math_ops.multiply(
        match.gamma_tensor,
        match.mean_tensor * recip_sigma -
        match.moving_mean_tensor * recip_sigma_mv,
        name='offset_compute')

    if freeze_batch_norm_delay is not None:
      use_mv_avg = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          freeze_batch_norm_delay,
          name='use_moving_average')
    else:
      use_mv_avg = False

    bn_decay_zero = 0.0
    # Snapshot the consumers BEFORE building the cond below, so that only the
    # pre-existing consumers are rerouted (presumably the cond itself must keep
    # reading the original decay tensor).
    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())

    bn_decay_mean_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_mean_tensor,
        name='freeze_moving_mean')
    graph_editor.reroute_ts(
        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
        can_modify=bn_decay_mean_consumers)

    if fused_batch_norm is False:
      # The variance decay tensor is handled the same way as the mean decay;
      # its consumers are likewise captured before the cond is created.
      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
      bn_decay_var_out = utils.smart_cond(
          use_mv_avg,
          lambda: bn_decay_zero,
          lambda: match.bn_decay_var_tensor,
          name='freeze_moving_var')
      graph_editor.reroute_ts(
          [bn_decay_var_out], [match.bn_decay_var_tensor],
          can_modify=bn_decay_var_consumers)

    # Frozen: recip becomes all ones. Not frozen: 1 / correction_scale.
    correction_recip = utils.smart_cond(
        use_mv_avg,
        lambda: array_ops.ones(correction_scale.shape),
        lambda: correction_recip,
        name='correction_recip')

    # Frozen: use the computed offset. Not frozen: offset is all zeros.
    correction_offset = utils.smart_cond(
        use_mv_avg,
        lambda: correction_offset,
        lambda: array_ops.zeros(correction_offset.shape),
        name='correction_offset')
  return correction_scale, correction_recip, correction_offset
예제 #53
0
def batch_norm_slow(tensor, mean, variance, beta, gamma, scale):
  """Reference batch norm built from elementwise ops.

  Normalizes `tensor` with `mean` and `variance` (epsilon fixed at 0.001),
  multiplies by `gamma` only when `scale` is truthy, then adds `beta`.
  """
  normalized = (tensor - mean) * math_ops.rsqrt(variance + 0.001)
  result = normalized * gamma if scale else normalized
  return result + beta
예제 #54
0
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
    """Returns the gradients for the 3 inputs of BatchNorm.

    Training mode derives the mean and variance from `x` itself; inference
    mode uses the supplied `pop_mean` and `pop_var`.

    Args:
      grad_y: A `Tensor` of 4 dimensions for gradient for y.
      x: A `Tensor` of 4 dimensions for x.
      scale: A `Tensor` of 1 dimension for scaling.
      pop_mean: A `Tensor` of 1 dimension for the population mean. Only used
        when is_training=False.
      pop_var: A `Tensor` of 1 dimension for the population variance. Only
        used when is_training=False.
      epsilon: A small float number added to the variance of x.
      data_format: The data format for input. Either b"NHWC" or b"NCHW". Any
        value other than b"NHWC" is reduced over axes [0, 2, 3].
      is_training: A bool value to indicate the operation is for training
        (default) or inference.

    Returns:
      A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
      for x, grad_scale the gradient for scale, and grad_offset the gradient
      for offset. Only grad_x is cast back to the dtype of `x`.
    """
    x_dtype = x.dtype.base_dtype
    if x_dtype == dtypes.float16:
        # float16 math is too imprecise, so we do the batch norm gradient
        # computations in float32.
        x = math_ops.cast(x, dtypes.float32)
        grad_y = math_ops.cast(grad_y, dtypes.float32)
    if is_training:
        if data_format == b"NHWC":
            # Channels-last: reduced statistics already broadcast against x.
            keepdims = False
            reduce_axis = [0, 1, 2]
        else:
            # Channels on axis 1: keep reduced dims and view `scale` as
            # [1, C, 1, 1] so it broadcasts against x.
            keepdims = True
            reduce_axis = [0, 2, 3]
            shape = [1, array_ops.size(scale), 1, 1]
            scale = array_ops.reshape(scale, shape)
        mean_grad_y = math_ops.reduce_mean(grad_y,
                                           reduce_axis,
                                           keepdims=keepdims)
        mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
        var_x = math_ops.reduce_mean(math_ops.squared_difference(
            x, array_ops.stop_gradient(mean_x)),
                                     reduce_axis,
                                     keepdims=keepdims)
        grad_y_offset = grad_y - mean_grad_y
        x_offset = x - mean_x
        mean = math_ops.reduce_mean(grad_y * x_offset,
                                    axis=reduce_axis,
                                    keepdims=keepdims)
        grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
            grad_y_offset -
            math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
        grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
            grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
        if data_format == b"NCHW":
            # Squeeze only the axes that were reduced. Squeezing every size-1
            # axis would also collapse the channel axis for single-channel
            # inputs and return a scalar instead of a rank-1 tensor.
            grad_scale = array_ops.squeeze(grad_scale, axis=reduce_axis)
        grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
        return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
    else:
        if data_format == b"NHWC":
            reduce_axis = [0, 1, 2]
        else:
            reduce_axis = [0, 2, 3]
            # View the per-channel vectors as [1, C, 1, 1] for broadcasting.
            shape = [1, array_ops.size(pop_mean), 1, 1]
            pop_mean = array_ops.reshape(pop_mean, shape)
            pop_var = array_ops.reshape(pop_var, shape)
            scale = array_ops.reshape(scale, shape)

        grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
        var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
        grad_scale = math_ops.reduce_sum(grad_y * (x - pop_mean) * var_rsqrt,
                                         axis=reduce_axis)
        grad_x = grad_y * scale * var_rsqrt
        return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
예제 #55
0
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
  """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.

  Implemented with GPU-compatible ops and supports gradients.

  A [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] is obtained by taking the
  DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk] scales that
  DCT-II in a way that is almost, but not exactly, orthogonal normalization;
  this function follows the HTK convention.

  Every one of the `num_mel_bins` coefficients is returned. Callers pick the
  subset they need; speech recognition typically keeps only the first few,
  which gives an approximately pitch-invariant representation of the signal.

  For example:

  ```python
  sample_rate = 16000.0
  # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
  pcm = tf.compat.v1.placeholder(tf.float32, [None, None])

  # A 1024-point STFT with frames of 64 ms and 75% overlap.
  stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
                         fft_length=1024)
  spectrograms = tf.abs(stfts)

  # Warp the linear scale spectrograms into the mel-scale.
  num_spectrogram_bins = stfts.shape[-1].value
  lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
    upper_edge_hertz)
  mel_spectrograms = tf.tensordot(
    spectrograms, linear_to_mel_weight_matrix, 1)
  mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
    linear_to_mel_weight_matrix.shape[-1:]))

  # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
  log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

  # Compute MFCCs from log_mel_spectrograms and take the first 13.
  mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
    log_mel_spectrograms)[..., :13]
  ```

  Args:
    log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of
      log-magnitude mel-scale spectrograms.
    name: An optional name for the operation.
  Returns:
    A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of
    `log_mel_spectrograms`.

  Raises:
    ValueError: If the innermost dimension is statically known to be 0.

  [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
  [htk]: https://en.wikipedia.org/wiki/HTK_(software)
  """
  with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
                      [log_mel_spectrograms]):
    log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
                                                 dtype=dtypes.float32)

    # Use the static innermost size when available (and validate it);
    # otherwise fall back to reading it from the tensor at run time.
    static_shape = log_mel_spectrograms.shape
    static_bins = None
    if static_shape.ndims:
      static_bins = static_shape.dims[-1].value

    if static_bins is None:
      num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
    elif static_bins == 0:
      raise ValueError('num_mel_bins must be positive. Got: %s' %
                       log_mel_spectrograms)
    else:
      num_mel_bins = static_bins

    # HTK scales every DCT-II basis vector by sqrt(2/N). That matches the
    # orthogonal DCT-II everywhere except bin 0, where the orthogonal form
    # (as in scipy) uses sqrt(1/N). So the unnormalized DCT is taken and
    # scaled by hand: rsqrt(2N) == 0.5 * sqrt(2/N).
    unnormalized = dct_ops.dct(log_mel_spectrograms, type=2)
    htk_scale = math_ops.rsqrt(
        math_ops.cast(num_mel_bins, dtypes.float32) * 2.0)
    return unnormalized * htk_scale
예제 #56
0
 def BN0(x):
   """Subtracts the axis-0 mean of `x` and multiplies by rsqrt(var + 1e-8).

   `var` is the mean of the squared deviations taken over all elements, since
   that reduction is called without an axis.
   """
   mu = math_ops.reduce_mean(x, [0])
   deviations_sq = math_ops.square(x - mu)
   biased_var = math_ops.reduce_mean(deviations_sq)
   return (x - mu) * math_ops.rsqrt(biased_var + 1e-8)
예제 #57
0
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
    """Finds fused batch norm layers and folds them into preceding layers.

    Folding only affects the following layers: Conv2D, fully connected,
    depthwise convolution.

    For each match from `_FindFusedBatchNorms`, the layer weights are
    multiplied by `gamma * rsqrt(variance + epsilon)`, the layer op is cloned
    with those scaled weights, a folded bias is added, and the original batch
    norm output is rerouted to the new tensor. When `is_training` is true,
    batch norm correction terms are applied around the cloned layer as well.

    Args:
      graph: Graph to walk and modify.
      is_training: Bool, true if training.
      freeze_batch_norm_delay: How many steps to wait before freezing moving
        mean and variance and using them for batch normalization.

    Raises:
      ValueError: When batch norm folding fails, i.e. rerouting the batch norm
        output tensor modified no nodes.
    """
    for match in _FindFusedBatchNorms(graph):
        scope, sep, _ = match.layer_op.name.rpartition('/')
        # Make sure new ops are added to `graph` and put on the same device as
        # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
        # named `scope`. Otherwise, TF creates a unique scope whose name starts with
        # `scope`.
        with graph.as_default(), graph.name_scope(scope + sep):
            with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
                # new weights = old weights * gamma / sqrt(variance + epsilon)
                # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
                multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
                    match.variance_tensor + match.bn_op.get_attr('epsilon'))
                bias_tensor = math_ops.subtract(match.beta_tensor,
                                                match.mean_tensor *
                                                multiplier_tensor,
                                                name='bias')

                # Corrections stay None outside of training, which disables the
                # correction multiply/add steps further down.
                correction_scale, correction_recip, correction_offset = None, None, None
                if is_training:
                    # NOTE(review): the `_ComputeBatchNormCorrections`
                    # definition visible in this file also takes a
                    # `fused_batch_norm` argument that is not passed here -
                    # confirm which signature this call targets.
                    correction_scale, correction_recip, correction_offset = (
                        _ComputeBatchNormCorrections(
                            context='',
                            match=match,
                            freeze_batch_norm_delay=freeze_batch_norm_delay))
                # The shape of depthwise weights is different, so we need to reshape the
                # multiplier_tensor to ensure that the scaled_weight_tensor has the
                # expected shape.
                weights = match.weight_tensor
                if match.layer_op.type == 'DepthwiseConv2dNative':
                    # Target shape is taken from dims 2 and 3 of the weight
                    # tensor's static shape.
                    new_shape = [
                        match.weight_tensor.get_shape().as_list()[2],
                        match.weight_tensor.get_shape().as_list()[3]
                    ]
                    multiplier_tensor = array_ops.reshape(multiplier_tensor,
                                                          new_shape,
                                                          name='scale_reshape')

                    if correction_scale is not None:
                        correction_scale = array_ops.reshape(
                            correction_scale,
                            new_shape,
                            name='correction_reshape')

            # From here on, ops are created in the outer `scope` name scope
            # rather than in the 'BatchNorm_Fold' sub-scope.
            if correction_scale is not None:
                weights = math_ops.multiply(correction_scale,
                                            weights,
                                            name='correction_mult')

            scaled_weight_tensor = math_ops.multiply(weights,
                                                     multiplier_tensor,
                                                     name='mul_fold')

            # Re-create the layer op on the same input with the folded weights.
            new_layer_tensor = _CloneWithNewOperands(match.layer_op,
                                                     match.input_tensor,
                                                     scaled_weight_tensor,
                                                     match.batch_to_space_op)

            if correction_recip is not None:
                new_layer_tensor = math_ops.multiply(correction_recip,
                                                     new_layer_tensor,
                                                     name='post_conv_mul')
                new_layer_tensor = math_ops.add(new_layer_tensor,
                                                (correction_offset),
                                                'correction_add')

            bias_add_tensor = math_ops.add(new_layer_tensor,
                                           bias_tensor,
                                           name='add_fold')

            # Point the consumers of the original batch norm output at the
            # folded result; zero modified nodes is treated as a failure.
            nodes_modified_count = common.RerouteTensor(
                bias_add_tensor, match.output_tensor)
            if nodes_modified_count == 0:
                raise ValueError(
                    'Folding batch norms failed, %s had no outputs.' %
                    match.output_tensor.name)