示例#1
0
 def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
   """Checks TF (I)DCT against NumPy and, when available, SciPy references.

   The forward and inverse transforms of `signals` are compared with the
   `NP_DCT` / `NP_IDCT` reference implementations and, when `fftpack` is
   truthy, with `fftpack.dct` / `fftpack.idct`. Finally both round trips
   (IDCT of the DCT and DCT of the IDCT) must reproduce `signals`.

   Args:
     signals: Input handed to every transform; `signals.shape[-1]` is used to
       rescale the round trips when `norm` is None.
     norm: Normalization mode forwarded to every implementation.
     dct_type: Key into `NP_DCT` / `NP_IDCT`, also forwarded as `type`.
     atol: Absolute tolerance used by every comparison.
     rtol: Relative tolerance used by every comparison.
   """
   def expect_close(expected, actual):
     self.assertAllClose(expected, actual, atol=atol, rtol=rtol)

   transform_args = {'type': dct_type, 'norm': norm}

   # Forward transform versus the NumPy reference.
   reference_dct = NP_DCT[dct_type](signals, norm)
   forward = dct_ops.dct(signals, **transform_args).eval()
   expect_close(reference_dct, forward)

   # Inverse transform versus the NumPy reference.
   reference_idct = NP_IDCT[dct_type](signals, norm)
   inverse = dct_ops.idct(signals, **transform_args).eval()
   expect_close(reference_idct, inverse)

   # SciPy is optional; only compare against it when it was importable.
   if fftpack:
     expect_close(fftpack.dct(signals, **transform_args), forward)
     expect_close(fftpack.idct(signals, **transform_args), inverse)

   # Round trips: inverse(forward(s)) and forward(inverse(s)) should both
   # give back s, up to a normalization factor.
   inverse_of_forward = dct_ops.idct(forward, **transform_args).eval()
   forward_of_inverse = dct_ops.dct(inverse, **transform_args).eval()
   if norm is None:
     # Without normalization the round trip is scaled; undo that scaling.
     # Type 1 uses (N - 1) in the divisor, every other type uses N.
     num_points = signals.shape[-1]
     divisor = num_points - 1 if dct_type == 1 else num_points
     rescale = 0.5 / divisor
     inverse_of_forward *= rescale
     forward_of_inverse *= rescale
   expect_close(signals, inverse_of_forward)
   expect_close(signals, forward_of_inverse)
示例#2
0
 def _compare(self, signals, n, norm, dct_type, atol, rtol):
     """Checks TF (I)DCT against NumPy and, when available, SciPy references.

     The DCT is evaluated with the requested length `n`, the IDCT without it.
     Both results must have the dtype of `signals` and match the `NP_DCT` /
     `NP_IDCT` references. The SciPy comparison runs only when `fftpack` is
     truthy and `dct_type` is not 4. Finally both round trips must reproduce
     `signals`.

     Args:
       signals: Input handed to every transform; its `dtype` and `shape[-1]`
         are used for the dtype check and the round-trip rescaling.
       n: Transform length forwarded to the forward DCT implementations.
       norm: Normalization mode forwarded to every implementation.
       dct_type: Key into `NP_DCT` / `NP_IDCT`, also forwarded as `type`.
       atol: Absolute tolerance used by every comparison.
       rtol: Relative tolerance used by every comparison.
     """
     def expect_close(expected, actual):
         self.assertAllClose(expected, actual, atol=atol, rtol=rtol)

     def expect_signal_dtype(tensor):
         self.assertEqual(tensor.dtype.as_numpy_dtype, signals.dtype)

     # Forward transform (with `n`) versus the NumPy reference.
     reference_dct = NP_DCT[dct_type](signals, n=n, norm=norm)
     forward = dct_ops.dct(signals, n=n, type=dct_type, norm=norm)
     expect_signal_dtype(forward)
     expect_close(reference_dct, forward)

     # Inverse transform (no `n`) versus the NumPy reference.
     reference_idct = NP_IDCT[dct_type](signals, n=None, norm=norm)
     inverse = dct_ops.idct(signals, type=dct_type, norm=norm)
     expect_signal_dtype(inverse)
     expect_close(reference_idct, inverse)

     # SciPy is optional, and the comparison is skipped for type 4.
     if fftpack and dct_type != 4:
         expect_close(
             fftpack.dct(signals, n=n, type=dct_type, norm=norm), forward)
         expect_close(
             fftpack.idct(signals, type=dct_type, norm=norm), inverse)

     # Round trips: inverse(forward(s)) and forward(inverse(s)) should both
     # give back s, up to a normalization factor. `n` is not implemented for
     # the IDCT, so the forward transform is recomputed without it here.
     full_forward = dct_ops.dct(signals, type=dct_type, norm=norm)
     inverse_of_forward = dct_ops.idct(full_forward, type=dct_type, norm=norm)
     forward_of_inverse = dct_ops.dct(inverse, type=dct_type, norm=norm)
     if norm is None:
         # Without normalization the round trip is scaled; undo that scaling.
         # Type 1 uses (N - 1) in the divisor, every other type uses N.
         num_points = signals.shape[-1]
         divisor = num_points - 1 if dct_type == 1 else num_points
         rescale = 0.5 / divisor
         inverse_of_forward *= rescale
         forward_of_inverse *= rescale
     expect_close(signals, inverse_of_forward)
     expect_close(signals, forward_of_inverse)
示例#3
0
 def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
     """Validates TF DCT/IDCT results against reference implementations.

     Compares `dct_ops.dct` / `dct_ops.idct` of `signals` with the `NP_DCT` /
     `NP_IDCT` references, and with `fftpack` when it is truthy. It then
     verifies that applying the inverse after the forward transform (and the
     reverse order) gives back `signals`.

     Args:
       signals: Input handed to every transform; `signals.shape[-1]` is used
         to rescale the round trips when `norm` is None.
       norm: Normalization mode forwarded to every implementation.
       dct_type: Key into `NP_DCT` / `NP_IDCT`, also forwarded as `type`.
       atol: Absolute tolerance used by every comparison.
       rtol: Relative tolerance used by every comparison.
     """
     tolerances = {'atol': atol, 'rtol': rtol}
     options = {'type': dct_type, 'norm': norm}

     # Forward transform versus NumPy.
     expected_forward = NP_DCT[dct_type](signals, norm)
     actual_forward = dct_ops.dct(signals, **options).eval()
     self.assertAllClose(expected_forward, actual_forward, **tolerances)

     # Inverse transform versus NumPy.
     expected_inverse = NP_IDCT[dct_type](signals, norm)
     actual_inverse = dct_ops.idct(signals, **options).eval()
     self.assertAllClose(expected_inverse, actual_inverse, **tolerances)

     # Optional SciPy cross-check.
     if fftpack:
         scipy_forward = fftpack.dct(signals, **options)
         self.assertAllClose(scipy_forward, actual_forward, **tolerances)
         scipy_inverse = fftpack.idct(signals, **options)
         self.assertAllClose(scipy_inverse, actual_inverse, **tolerances)

     # Verify inverse(forward(s)) == s and forward(inverse(s)) == s, up to a
     # normalization factor.
     round_trip_a = dct_ops.idct(actual_forward, **options).eval()
     round_trip_b = dct_ops.dct(actual_inverse, **options).eval()
     if norm is None:
         # Unnormalized transforms need rescaling: type 1 divides by
         # 2 * (N - 1), every other type by 2 * N.
         length = signals.shape[-1]
         if dct_type == 1:
             length = length - 1
         factor = 0.5 / length
         round_trip_a *= factor
         round_trip_b *= factor
     self.assertAllClose(signals, round_trip_a, **tolerances)
     self.assertAllClose(signals, round_trip_b, **tolerances)
示例#4
0
def stdct(signals, frame_length, frame_step, fft_length=None,
          window_fn=window_ops.hann_window,
          pad_end=False, name=None):
  """Computes a short-time discrete cosine transform of `signals`.

  The signal is cut into frames with `shape_ops.frame`, each frame is
  optionally multiplied by a window, and `dct_ops.dct` is applied to the
  frames with `n=fft_length` (using that function's default type and
  normalization).

  Args:
    signals: Value convertible to a `Tensor` of rank at least 1.
    frame_length: Value convertible to a scalar `Tensor`; the frame size
      passed to `shape_ops.frame` and to `window_fn`.
    frame_step: Value convertible to a scalar `Tensor`; the hop passed to
      `shape_ops.frame`.
    fft_length: Length handed to the DCT as `n`. When None it defaults to
      `_enclosing_power_of_two(frame_length)`.
    window_fn: Callable invoked as `window_fn(frame_length, dtype=...)` whose
      result multiplies the frames. If None, no windowing is applied.
    pad_end: Forwarded to `shape_ops.frame`.
    name: An optional name for the operation.

  Returns:
    The result of `dct_ops.dct` applied to the (optionally windowed) frames.
  """
  scope_inputs = [signals, frame_length, frame_step]
  with ops.name_scope(name, 'stdct', scope_inputs):
    signals = ops.convert_to_tensor(signals, name='signals')
    signals.shape.with_rank_at_least(1)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)
    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
    frame_step.shape.assert_has_rank(0)

    # Resolve the transform length: explicit value wins, otherwise derive it
    # from the frame length.
    # NOTE(review): the length may be a Tensor here -- confirm `dct_ops.dct`
    # accepts a Tensor for `n`.
    if fft_length is not None:
      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
    else:
      fft_length = _enclosing_power_of_two(frame_length)

    frames = shape_ops.frame(
        signals, frame_length, frame_step, pad_end=pad_end)

    # Windowing is optional.
    if window_fn is not None:
      frames = frames * window_fn(frame_length, dtype=frames.dtype)

    return dct_ops.dct(frames, n=fft_length)
示例#5
0
 def test_error(self):
   """Checks that `dct_ops.dct` rejects unsupported argument combinations."""
   signals = np.random.rand(10)

   # Each entry pairs the expected exception with a deferred call, so every
   # argument expression is still evaluated inside `assertRaises`.
   failing_calls = (
       # Unsupported type.
       (ValueError, lambda: dct_ops.dct(signals, type=5)),
       # DCT-I normalization not implemented.
       (ValueError, lambda: dct_ops.dct(signals, type=1, norm="ortho")),
       # DCT-I requires at least two inputs.
       (ValueError, lambda: dct_ops.dct(np.random.rand(1), type=1)),
       # Unknown normalization.
       (ValueError, lambda: dct_ops.dct(signals, norm="bad")),
       # Explicit `n` and `axis` are expected to be unimplemented.
       (NotImplementedError, lambda: dct_ops.dct(signals, n=10)),
       (NotImplementedError, lambda: dct_ops.dct(signals, axis=0)),
   )
   for expected_error, call in failing_calls:
     with self.assertRaises(expected_error):
       call()
示例#6
0
 def test_error(self):
     """Checks that `dct_ops.dct` rejects invalid or unsupported arguments."""
     def expect_dct_failure(error_type, *args, **kwargs):
         with self.assertRaises(error_type):
             dct_ops.dct(*args, **kwargs)

     signals = np.random.rand(10)

     # Unsupported type.
     expect_dct_failure(ValueError, signals, type=5)
     # Invalid n.
     expect_dct_failure(ValueError, signals, n=-2)
     # DCT-I normalization not implemented.
     expect_dct_failure(ValueError, signals, type=1, norm="ortho")
     # DCT-I requires at least two inputs.
     expect_dct_failure(ValueError, np.random.rand(1), type=1)
     # Unknown normalization.
     expect_dct_failure(ValueError, signals, norm="bad")
     # An explicit axis is expected to be unimplemented.
     expect_dct_failure(NotImplementedError, signals, axis=0)
示例#7
0
def inverse_mdct(mdcts,
                 window_fn=window_ops.vorbis_window,
                 norm=None,
                 name=None):
    """Computes the inverse modified DCT of `mdcts`.

    To reconstruct an original waveform, the same window function should
    be used with `mdct` and `inverse_mdct`.

    Example usage:

    >>> @tf.function
    ... def compare_round_trip():
    ...   samples = 1000
    ...   frame_length = 400
    ...   halflen = frame_length // 2
    ...   waveform = tf.random.normal(dtype=tf.float32, shape=[samples])
    ...   waveform_pad = tf.pad(waveform, [[halflen, 0],])
    ...   mdct = tf.signal.mdct(waveform_pad, frame_length, pad_end=True,
    ...                         window_fn=tf.signal.vorbis_window)
    ...   inverse_mdct = tf.signal.inverse_mdct(mdct,
    ...                                         window_fn=tf.signal.vorbis_window)
    ...   inverse_mdct = inverse_mdct[halflen: halflen + samples]
    ...   return waveform, inverse_mdct
    >>> waveform, inverse_mdct = compare_round_trip()
    >>> np.allclose(waveform.numpy(), inverse_mdct.numpy(), rtol=1e-3, atol=1e-4)
    True

    Implemented with TPU/GPU-compatible ops and supports gradients.

    Args:
      mdcts: A `float32`/`float64` `[..., frames, frame_length // 2]`
        `Tensor` of MDCT bins representing a batch of `frame_length // 2`-point
        MDCTs.
      window_fn: A callable that takes a window length and a `dtype` keyword
        argument and returns a `[window_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, no windowing is used and the
        frames are scaled by `1 / sqrt(2)` instead.
      norm: If "ortho", orthonormal inverse DCT4 is performed, if it is None,
        a regular dct4 followed by scaling of `1/frame_length` is performed.
      name: An optional name for the operation.

    Returns:
      A `[..., samples]` `Tensor` of `float32`/`float64` signals representing
      the inverse MDCT for each input MDCT in `mdcts` where `samples` is
      `(frames - 1) * (frame_length // 2) + frame_length`.

    Raises:
      ValueError: If `mdcts` is not at least rank 2, or if `norm` is neither
        `None` nor `'ortho'`.

    [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
    """
    with ops.name_scope(name, 'inverse_mdct', [mdcts]):
        mdcts = ops.convert_to_tensor(mdcts, name='mdcts')
        mdcts.shape.with_rank_at_least(2)
        # The last dimension holds frame_length // 2 bins per frame.
        half_len = math_ops.cast(mdcts.shape[-1], dtype=dtypes.int32)

        if norm is None:
            # Unnormalized DCT-IV scaled by 0.5 / half_len == 1 / frame_length.
            half_len_float = math_ops.cast(half_len, dtype=mdcts.dtype)
            result_idct4 = (0.5 / half_len_float) * dct_ops.dct(mdcts, type=4)
        elif norm == 'ortho':
            result_idct4 = dct_ops.dct(mdcts, type=4, norm='ortho')
        else:
            # Without this branch an unsupported `norm` would leave
            # `result_idct4` unbound and fail below with UnboundLocalError.
            raise ValueError(
                "Unknown normalization. Expected None or 'ortho', got: %r" %
                (norm,))

        # Expand every half_len-point result into a full 2 * half_len frame:
        # [second half, -reversed second half, -reversed first half,
        #  -first half].
        split_result = array_ops.split(result_idct4, 2, axis=-1)
        real_frames = array_ops.concat(
            (split_result[1], -array_ops.reverse(split_result[1], [-1]),
             -array_ops.reverse(split_result[0], [-1]), -split_result[0]),
            axis=-1)

        # Optionally window and overlap-add the inner 2 dimensions of real_frames
        # into a single [samples] dimension.
        if window_fn is not None:
            window = window_fn(2 * half_len, dtype=mdcts.dtype)
            real_frames *= window
        else:
            real_frames *= 1.0 / np.sqrt(2)
        return reconstruction_ops.overlap_and_add(real_frames, half_len)
示例#8
0
def mdct(signals,
         frame_length,
         window_fn=window_ops.vorbis_window,
         pad_end=False,
         norm=None,
         name=None):
    """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`.

    Implemented with TPU/GPU-compatible ops and supports gradients.

    Args:
      signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
        signals.
      frame_length: An integer scalar `Tensor`. The window length in samples
        which must be divisible by 4.
      window_fn: A callable that takes a window length and a `dtype` keyword
        argument and returns a `[window_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, no windowing is used and the
        frames are scaled by `1 / sqrt(2)` instead.
      pad_end: Whether to pad the end of `signals` with zeros when the provided
        frame length and step produces a frame that lies partially past its
        end.
      norm: If it is None, unnormalized dct4 is used, if it is "ortho"
        orthonormal dct4 is used.
      name: An optional name for the operation.

    Returns:
      A `[..., frames, frame_length // 2]` `Tensor` of `float32`/`float64`
      MDCT values where `frames` is roughly `samples // (frame_length // 2)`
      when `pad_end=False`.

    Raises:
      ValueError: If `signals` is not at least rank 1, `frame_length` is
        not scalar, or `frame_length` is not a multiple of `4`.

    [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
    """
    with ops.name_scope(name, 'mdct', [signals, frame_length]):
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)
        frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
        frame_length.shape.assert_has_rank(0)

        # Frames overlap by half: the hop is frame_length // 2. Divisibility
        # by 4 can only be validated when the length is statically known.
        static_length = tensor_util.constant_value(frame_length)
        if static_length is None:
            frame_step = frame_length // 2
        else:
            if static_length % 4 != 0:
                raise ValueError('The frame length must be a multiple of 4.')
            frame_step = ops.convert_to_tensor(static_length // 2,
                                               dtype=frame_length.dtype)

        frames = shape_ops.frame(
            signals, frame_length, frame_step, pad_end=pad_end)

        # Apply the window when one is given; otherwise scale by 1/sqrt(2).
        if window_fn is None:
            frames = frames * (1.0 / np.sqrt(2))
        else:
            window = window_fn(frame_length, dtype=frames.dtype)
            frames = frames * window

        # Fold the four quarters of every frame into frame_length // 2 values.
        quarter_a, quarter_b, quarter_c, quarter_d = array_ops.split(
            frames, 4, axis=-1)
        first_half = -array_ops.reverse(quarter_c, [-1]) - quarter_d
        second_half = quarter_a - array_ops.reverse(quarter_b, [-1])
        folded = array_ops.concat((first_half, second_half), axis=-1)

        # Below call produces the (frame_length // 2) unique components of the
        # type 4 DCT of the folded frames; `norm` decides whether it is the
        # unnormalized or the orthonormal variant.
        return dct_ops.dct(folded, type=4, norm=norm)
示例#9
0
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
  """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.

  Implemented with GPU-compatible ops and supports gradients.

  [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of
  taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s MFCCs
  use a particular scaling of the DCT-II which is almost orthogonal
  normalization. We follow this convention.

  All `num_mel_bins` MFCCs are returned and it is up to the caller to select
  a subset of the MFCCs based on their application. For example, it is typical
  to only use the first few for speech recognition, as this results in
  an approximately pitch-invariant representation of the signal.

  For example:

  ```python
  sample_rate = 16000.0
  # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
  pcm = tf.compat.v1.placeholder(tf.float32, [None, None])

  # A 1024-point STFT with frames of 64 ms and 75% overlap.
  stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
                         fft_length=1024)
  spectrograms = tf.abs(stfts)

  # Warp the linear scale spectrograms into the mel-scale.
  num_spectrogram_bins = stfts.shape[-1].value
  lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
    upper_edge_hertz)
  mel_spectrograms = tf.tensordot(
    spectrograms, linear_to_mel_weight_matrix, 1)
  mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
    linear_to_mel_weight_matrix.shape[-1:]))

  # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
  log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

  # Compute MFCCs from log_mel_spectrograms and take the first 13.
  mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
    log_mel_spectrograms)[..., :13]
  ```

  Args:
    log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of
      log-magnitude mel-scale spectrograms.
    name: An optional name for the operation.
  Returns:
    A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of
    `log_mel_spectrograms`.

  Raises:
    ValueError: If the statically known `num_mel_bins` is zero.

  [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
  [htk]: https://en.wikipedia.org/wiki/HTK_(software)
  """
  with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
                      [log_mel_spectrograms]):
    log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
                                                 dtype=dtypes.float32)

    # Prefer the static size of the last dimension; fall back to the dynamic
    # shape when the rank or that dimension is unknown.
    static_shape = log_mel_spectrograms.shape
    static_bins = None
    if static_shape.ndims:
      static_bins = static_shape.dims[-1].value

    if static_bins is None:
      num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
    elif static_bins == 0:
      raise ValueError('num_mel_bins must be positive. Got: %s' %
                       log_mel_spectrograms)
    else:
      num_mel_bins = static_bins

    # HTK scales every DCT-II basis vector by sqrt(2/N). That matches the
    # "orthogonal" DCT-II everywhere except the 0th bin, where the true
    # orthogonal DCT (as implemented by scipy) uses sqrt(1/N). Hence the DCT is
    # taken without normalization and scaled manually by
    # 1 / sqrt(2 * N) == 0.5 * sqrt(2/N).
    dct2 = dct_ops.dct(log_mel_spectrograms, type=2)
    htk_scale = math_ops.rsqrt(
        math_ops.cast(num_mel_bins, dtypes.float32) * 2.0)
    return dct2 * htk_scale