def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
    """Checks the TensorFlow (I)DCT against NumPy and, if present, SciPy.

    Args:
      signals: NumPy array of input signals; the transform is applied to
        these values and the round-trip result is compared against them.
      norm: Normalization mode forwarded to every implementation.
      dct_type: DCT type; also the key used to look up the NumPy reference
        implementations in `NP_DCT` / `NP_IDCT`.
      atol: Absolute tolerance used by every closeness assertion.
      rtol: Relative tolerance used by every closeness assertion.
    """
    tol = {"atol": atol, "rtol": rtol}

    # Forward transform versus the NumPy reference.
    expected_dct = NP_DCT[dct_type](signals, norm)
    actual_dct = dct_ops.dct(signals, type=dct_type, norm=norm).eval()
    self.assertAllClose(expected_dct, actual_dct, **tol)

    # Inverse transform versus the NumPy reference.
    expected_idct = NP_IDCT[dct_type](signals, norm)
    actual_idct = dct_ops.idct(signals, type=dct_type, norm=norm).eval()
    self.assertAllClose(expected_idct, actual_idct, **tol)

    # SciPy is optional; only cross-check when `fftpack` is truthy.
    if fftpack:
        self.assertAllClose(
            fftpack.dct(signals, type=dct_type, norm=norm), actual_dct, **tol)
        self.assertAllClose(
            fftpack.idct(signals, type=dct_type, norm=norm), actual_idct,
            **tol)

    # Verify inverse(forward(s)) == s, up to a normalization factor.
    idct_of_dct = dct_ops.idct(actual_dct, type=dct_type, norm=norm).eval()
    dct_of_idct = dct_ops.dct(actual_idct, type=dct_type, norm=norm).eval()
    if norm is None:
        # Without normalization the round trip is off by a constant factor
        # that depends on the signal length (length - 1 for type 1).
        length = signals.shape[-1]
        scale = 0.5 / (length - 1 if dct_type == 1 else length)
        idct_of_dct *= scale
        dct_of_idct *= scale
    self.assertAllClose(signals, idct_of_dct, **tol)
    self.assertAllClose(signals, dct_of_idct, **tol)
def _compare(self, signals, n, norm, dct_type, atol, rtol):
    """Checks the TensorFlow (I)DCT against NumPy and, if present, SciPy.

    Args:
      signals: NumPy array of input signals; its dtype is also the dtype the
        TensorFlow results are required to have.
      n: Transform length forwarded to the forward DCT implementations only.
      norm: Normalization mode forwarded to every implementation.
      dct_type: DCT type; also the key used to look up the NumPy reference
        implementations in `NP_DCT` / `NP_IDCT`.
      atol: Absolute tolerance used by every closeness assertion.
      rtol: Relative tolerance used by every closeness assertion.
    """
    tol = {"atol": atol, "rtol": rtol}
    transform = {"type": dct_type, "norm": norm}

    # Forward transform (with `n`) versus the NumPy reference.
    np_forward = NP_DCT[dct_type](signals, n=n, norm=norm)
    tf_forward = dct_ops.dct(signals, n=n, **transform)
    self.assertEqual(tf_forward.dtype.as_numpy_dtype, signals.dtype)
    self.assertAllClose(np_forward, tf_forward, **tol)

    # Inverse transform (always without `n`) versus the NumPy reference.
    np_inverse = NP_IDCT[dct_type](signals, n=None, norm=norm)
    tf_inverse = dct_ops.idct(signals, **transform)
    self.assertEqual(tf_inverse.dtype.as_numpy_dtype, signals.dtype)
    self.assertAllClose(np_inverse, tf_inverse, **tol)

    # SciPy is optional, and the SciPy cross-check is skipped for type 4.
    if fftpack and dct_type != 4:
        self.assertAllClose(
            fftpack.dct(signals, n=n, **transform), tf_forward, **tol)
        self.assertAllClose(
            fftpack.idct(signals, **transform), tf_inverse, **tol)

    # Verify inverse(forward(s)) == s, up to a normalization factor.
    # Since `n` is not implemented for the IDCT operation, the forward
    # transform is recomputed without `n` before the round trip.
    tf_forward = dct_ops.dct(signals, **transform)
    idct_of_dct = dct_ops.idct(tf_forward, **transform)
    dct_of_idct = dct_ops.dct(tf_inverse, **transform)
    if norm is None:
        # Without normalization the round trip is off by a constant factor
        # that depends on the signal length (length - 1 for type 1).
        length = signals.shape[-1]
        scale = 0.5 / (length - 1 if dct_type == 1 else length)
        idct_of_dct *= scale
        dct_of_idct *= scale
    self.assertAllClose(signals, idct_of_dct, **tol)
    self.assertAllClose(signals, dct_of_idct, **tol)
def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
    """Validates the (I)DCT ops against NumPy and optional SciPy references.

    Args:
      signals: NumPy array of input signals; also the expected result of the
        rescaled forward/inverse round trips.
      norm: Normalization mode forwarded to every implementation.
      dct_type: DCT type; also the key used to look up the NumPy reference
        implementations in `NP_DCT` / `NP_IDCT`.
      atol: Absolute tolerance used by every closeness assertion.
      rtol: Relative tolerance used by every closeness assertion.
    """

    def check_close(expected, actual):
        # Every comparison in this helper shares the same tolerances.
        self.assertAllClose(expected, actual, atol=atol, rtol=rtol)

    reference_dct = NP_DCT[dct_type](signals, norm)
    computed_dct = dct_ops.dct(signals, type=dct_type, norm=norm).eval()
    check_close(reference_dct, computed_dct)

    reference_idct = NP_IDCT[dct_type](signals, norm)
    computed_idct = dct_ops.idct(signals, type=dct_type, norm=norm).eval()
    check_close(reference_idct, computed_idct)

    # SciPy is optional; only cross-check when `fftpack` is truthy.
    if fftpack:
        check_close(
            fftpack.dct(signals, type=dct_type, norm=norm), computed_dct)
        check_close(
            fftpack.idct(signals, type=dct_type, norm=norm), computed_idct)

    # Verify inverse(forward(s)) == s, up to a normalization factor.
    round_trip_forward_first = dct_ops.idct(
        computed_dct, type=dct_type, norm=norm).eval()
    round_trip_inverse_first = dct_ops.dct(
        computed_idct, type=dct_type, norm=norm).eval()
    if norm is None:
        # The unnormalized round trip scales the signal by a constant factor
        # that depends on its length (length - 1 for type 1); undo it.
        if dct_type == 1:
            divisor = signals.shape[-1] - 1
        else:
            divisor = signals.shape[-1]
        round_trip_forward_first *= 0.5 / divisor
        round_trip_inverse_first *= 0.5 / divisor
    check_close(signals, round_trip_forward_first)
    check_close(signals, round_trip_inverse_first)
def stdct(signals, frame_length, frame_step, fft_length=None,
          window_fn=window_ops.hann_window, pad_end=False, name=None):
    """Computes a short-time discrete cosine transform of `signals`.

    The input is split into frames with `shape_ops.frame`, each frame is
    optionally multiplied by a window, and `dct_ops.dct` is applied to the
    framed signals with `n=fft_length` (using that function's default type
    and normalization).

    Args:
      signals: Value convertible to a `Tensor`; checked to be at least rank 1.
      frame_length: Scalar value convertible to a `Tensor`; passed to
        `shape_ops.frame` as the frame length and to `window_fn` as the
        window length.
      frame_step: Scalar value convertible to a `Tensor`; passed to
        `shape_ops.frame` as the frame step.
      fft_length: Length passed as `n` to `dct_ops.dct`. If `None`, it is
        derived from `frame_length` via `_enclosing_power_of_two`.
      window_fn: Callable accepting a window length and a `dtype` keyword
        argument and returning the window to multiply each frame by. If
        `None`, no windowing is applied.
      pad_end: Forwarded to `shape_ops.frame` as `pad_end`.
      name: An optional name for the operation.

    Returns:
      The result of `dct_ops.dct` applied to the (optionally windowed)
      framed signals.
    """
    with ops.name_scope(name, 'stdct', [signals, frame_length, frame_step]):
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)

        # The frame parameters must both be scalars.
        frame_length = ops.convert_to_tensor(frame_length,
                                             name='frame_length')
        frame_length.shape.assert_has_rank(0)
        frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
        frame_step.shape.assert_has_rank(0)

        # Resolve the transform length: explicit value, or one derived from
        # the frame length.
        if fft_length is not None:
            fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
        else:
            fft_length = _enclosing_power_of_two(frame_length)

        frames = shape_ops.frame(
            signals, frame_length, frame_step, pad_end=pad_end)

        # Optionally window the framed signals.
        if window_fn is not None:
            frames = frames * window_fn(frame_length, dtype=frames.dtype)

        return dct_ops.dct(frames, n=fft_length)
def test_error(self):
    """Checks that `dct_ops.dct` rejects invalid or unsupported arguments."""
    signals = np.random.rand(10)

    # Each entry is (expected exception, input, keyword arguments to `dct`).
    cases = [
        # Unsupported type.
        (ValueError, signals, {"type": 5}),
        # DCT-I normalization not implemented.
        (ValueError, signals, {"type": 1, "norm": "ortho"}),
        # DCT-I requires at least two inputs.
        (ValueError, np.random.rand(1), {"type": 1}),
        # Unknown normalization.
        (ValueError, signals, {"norm": "bad"}),
        # Passing `n` is expected to raise NotImplementedError.
        (NotImplementedError, signals, {"n": 10}),
        # Passing `axis` is expected to raise NotImplementedError.
        (NotImplementedError, signals, {"axis": 0}),
    ]
    for expected_error, inputs, kwargs in cases:
        with self.assertRaises(expected_error):
            dct_ops.dct(inputs, **kwargs)
def test_error(self):
    """Checks that `dct_ops.dct` rejects invalid or unsupported arguments."""
    signals = np.random.rand(10)

    # Each entry is (input, keyword arguments) expected to raise ValueError.
    value_error_cases = (
        # Unsupported type.
        (signals, {"type": 5}),
        # Invalid n.
        (signals, {"n": -2}),
        # DCT-I normalization not implemented.
        (signals, {"type": 1, "norm": "ortho"}),
        # DCT-I requires at least two inputs.
        (np.random.rand(1), {"type": 1}),
        # Unknown normalization.
        (signals, {"norm": "bad"}),
    )
    for inputs, kwargs in value_error_cases:
        with self.assertRaises(ValueError):
            dct_ops.dct(inputs, **kwargs)

    # Passing `axis` is expected to raise NotImplementedError.
    with self.assertRaises(NotImplementedError):
        dct_ops.dct(signals, axis=0)
def inverse_mdct(mdcts,
                 window_fn=window_ops.vorbis_window,
                 norm=None,
                 name=None):
    """Computes the inverse [modified DCT][mdct] of `mdcts`.

    To reconstruct an original waveform, the same window function should
    be used with `mdct` and `inverse_mdct`.

    Example usage:

    >>> @tf.function
    ... def compare_round_trip():
    ...   samples = 1000
    ...   frame_length = 400
    ...   halflen = frame_length // 2
    ...   waveform = tf.random.normal(dtype=tf.float32, shape=[samples])
    ...   waveform_pad = tf.pad(waveform, [[halflen, 0],])
    ...   mdct = tf.signal.mdct(waveform_pad, frame_length, pad_end=True,
    ...                         window_fn=tf.signal.vorbis_window)
    ...   inverse_mdct = tf.signal.inverse_mdct(mdct,
    ...                                         window_fn=tf.signal.vorbis_window)
    ...   inverse_mdct = inverse_mdct[halflen: halflen + samples]
    ...   return waveform, inverse_mdct
    >>> waveform, inverse_mdct = compare_round_trip()
    >>> np.allclose(waveform.numpy(), inverse_mdct.numpy(), rtol=1e-3, atol=1e-4)
    True

    Implemented with TPU/GPU-compatible ops and supports gradients.

    Args:
      mdcts: A `float32`/`float64` `[..., frames, frame_length // 2]`
        `Tensor` of MDCT bins representing a batch of `frame_length // 2`-point
        MDCTs.
      window_fn: A callable that takes a window length and a `dtype` keyword
        argument and returns a `[window_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, no windowing is used and the
        frames are scaled by `1 / sqrt(2)` instead.
      norm: If "ortho", orthonormal inverse DCT4 is performed, if it is None,
        a regular dct4 followed by scaling of `1/frame_length` is performed.
      name: An optional name for the operation.

    Returns:
      A `[..., samples]` `Tensor` of `float32`/`float64` signals representing
      the inverse MDCT for each input MDCT in `mdcts` where `samples` is
      `(frames - 1) * (frame_length // 2) + frame_length`.

    Raises:
      ValueError: If `mdcts` is not at least rank 2, or if `norm` is neither
        `None` nor "ortho".

    [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
    """
    # Reject unsupported normalizations up front. Without this check an
    # unknown `norm` would match neither branch below and surface as an
    # UnboundLocalError instead of a meaningful error.
    if norm is not None and norm != 'ortho':
        raise ValueError(
            "Unknown normalization. Expected None or 'ortho', got: %s" % norm)

    with ops.name_scope(name, 'inverse_mdct', [mdcts]):
        mdcts = ops.convert_to_tensor(mdcts, name='mdcts')
        mdcts.shape.with_rank_at_least(2)
        half_len = math_ops.cast(mdcts.shape[-1], dtype=dtypes.int32)

        if norm is None:
            # Unnormalized DCT-IV followed by a 1 / (2 * half_len) scaling,
            # i.e. 1 / frame_length.
            half_len_float = math_ops.cast(half_len, dtype=mdcts.dtype)
            result_idct4 = (0.5 / half_len_float) * dct_ops.dct(mdcts, type=4)
        else:
            result_idct4 = dct_ops.dct(mdcts, type=4, norm='ortho')

        # Expand each half-length DCT-IV output into a full-length frame
        # built from the two halves and their negated/reversed copies.
        split_result = array_ops.split(result_idct4, 2, axis=-1)
        real_frames = array_ops.concat(
            (split_result[1],
             -array_ops.reverse(split_result[1], [-1]),
             -array_ops.reverse(split_result[0], [-1]),
             -split_result[0]),
            axis=-1)

        # Optionally window and overlap-add the inner 2 dimensions of
        # real_frames into a single [samples] dimension.
        if window_fn is not None:
            window = window_fn(2 * half_len, dtype=mdcts.dtype)
            real_frames *= window
        else:
            real_frames *= 1.0 / np.sqrt(2)
        return reconstruction_ops.overlap_and_add(real_frames, half_len)
def mdct(signals, frame_length, window_fn=window_ops.vorbis_window,
         pad_end=False, norm=None, name=None):
    """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`.

    Implemented with TPU/GPU-compatible ops and supports gradients.

    Args:
      signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
        signals.
      frame_length: An integer scalar `Tensor`. The window length in samples
        which must be divisible by 4.
      window_fn: A callable that takes a window length and a `dtype` keyword
        argument and returns a `[window_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, no windowing is used and the
        frames are scaled by `1 / sqrt(2)` instead.
      pad_end: Whether to pad the end of `signals` with zeros when the provided
        frame length and step produces a frame that lies partially past its
        end.
      norm: If it is None, unnormalized dct4 is used, if it is "ortho"
        orthonormal dct4 is used.
      name: An optional name for the operation.

    Returns:
      A `[..., frames, frame_length // 2]` `Tensor` of `float32`/`float64`
      MDCT values where `frames` is roughly `samples // (frame_length // 2)`
      when `pad_end=False`.

    Raises:
      ValueError: If `signals` is not at least rank 1, `frame_length` is
        not scalar, or `frame_length` is statically known and is not a
        multiple of `4`.

    [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
    """
    with ops.name_scope(name, 'mdct', [signals, frame_length]):
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)
        frame_length = ops.convert_to_tensor(frame_length,
                                             name='frame_length')
        frame_length.shape.assert_has_rank(0)

        # The divisibility check is only possible when the frame length is
        # known statically; otherwise the hop size is computed as a tensor.
        static_frame_length = tensor_util.constant_value(frame_length)
        if static_frame_length is None:
            hop = frame_length // 2
        else:
            if static_frame_length % 4 != 0:
                raise ValueError('The frame length must be a multiple of 4.')
            hop = ops.convert_to_tensor(static_frame_length // 2,
                                        dtype=frame_length.dtype)

        frames = shape_ops.frame(signals, frame_length, hop, pad_end=pad_end)

        # Window the frames, or apply a fixed 1/sqrt(2) scaling when no
        # window function is given.
        if window_fn is None:
            frames *= 1.0 / np.sqrt(2)
        else:
            window = window_fn(frame_length, dtype=frames.dtype)
            frames *= window

        # Fold each frame's four quarters into a half-length sequence.
        quarter_0, quarter_1, quarter_2, quarter_3 = array_ops.split(
            frames, 4, axis=-1)
        first_half = -array_ops.reverse(quarter_2, [-1]) - quarter_3
        second_half = quarter_0 - array_ops.reverse(quarter_1, [-1])
        folded = array_ops.concat((first_half, second_half), axis=-1)

        # The type 4 DCT of the folded frames yields the
        # (frame_length // 2) MDCT components; `norm` selects between the
        # unnormalized and orthonormal variants.
        return dct_ops.dct(folded, type=4, norm=norm)
def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
    """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.

    Implemented with GPU-compatible ops and supports gradients.

    [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of
    taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s
    MFCCs use a particular scaling of the DCT-II which is almost orthogonal
    normalization. We follow this convention.

    All `num_mel_bins` MFCCs are returned and it is up to the caller to select
    a subset of the MFCCs based on their application. For example, it is
    typical to only use the first few for speech recognition, as this results
    in an approximately pitch-invariant representation of the signal.

    For example:

    ```python
    sample_rate = 16000.0
    # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
    pcm = tf.compat.v1.placeholder(tf.float32, [None, None])

    # A 1024-point STFT with frames of 64 ms and 75% overlap.
    stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
                           fft_length=1024)
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = stfts.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
      upper_edge_hertz)
    mel_spectrograms = tf.tensordot(
      spectrograms, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

    # Compute MFCCs from log_mel_spectrograms and take the first 13.
    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
      log_mel_spectrograms)[..., :13]
    ```

    Args:
      log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of
        log-magnitude mel-scale spectrograms.
      name: An optional name for the operation.

    Returns:
      A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of
      `log_mel_spectrograms`.

    Raises:
      ValueError: If `num_mel_bins` is not positive.

    [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
    [htk]: https://en.wikipedia.org/wiki/HTK_(software)
    """
    with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
                        [log_mel_spectrograms]):
        log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
                                                     dtype=dtypes.float32)

        # Prefer the statically known size of the last dimension; it is
        # unavailable when the rank is unknown (or zero) or the dimension
        # itself is unknown.
        static_shape = log_mel_spectrograms.shape
        static_bins = (
            static_shape.dims[-1].value if static_shape.ndims else None)

        if static_bins is None:
            # Fall back to the dynamic shape.
            num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
        else:
            if static_bins == 0:
                raise ValueError('num_mel_bins must be positive. Got: %s' %
                                 log_mel_spectrograms)
            num_mel_bins = static_bins

        # Compute the DCT-II of the log-magnitude mel-scale spectrogram.
        # The DCT used in HTK scales every basis vector by sqrt(2/N), which
        # is the scaling required for an "orthogonal" DCT-II *except* in the
        # 0th bin, where the true orthogonal DCT (as implemented by scipy)
        # scales by sqrt(1/N). For this reason, orthogonal normalization is
        # not applied and the DCT is scaled by `0.5 * sqrt(2/N)`, i.e.
        # `1 / sqrt(2 * N)`, manually.
        dct2 = dct_ops.dct(log_mel_spectrograms, type=2)
        scale = math_ops.rsqrt(
            math_ops.cast(num_mel_bins, dtypes.float32) * 2.0)
        return dct2 * scale