def test_complex_shape(self):
    """Frames the last axis of a (2, 1, 3, 1, 6) signal, padded and unpadded."""
    rows = [np.arange(6) + offset for offset in (0, 10, 20, 30, 40, 50)]
    signal = np.reshape(np.vstack(rows), (2, 1, 3, 1, 6))
    frame_length, frame_step = 3, 2

    # Shape (2, 1, 3, 1, 3, 3): the trailing partial frame is completed
    # with pad_value.
    expected_padded = [
        [[[[[0, 1, 2], [2, 3, 4], [4, 5, 99]]],
          [[[10, 11, 12], [12, 13, 14], [14, 15, 99]]],
          [[[20, 21, 22], [22, 23, 24], [24, 25, 99]]]]],
        [[[[[30, 31, 32], [32, 33, 34], [34, 35, 99]]],
          [[[40, 41, 42], [42, 43, 44], [44, 45, 99]]],
          [[[50, 51, 52], [52, 53, 54], [54, 55, 99]]]]]]

    # Shape (2, 1, 3, 1, 2, 3): the trailing partial frame is dropped.
    expected_unpadded = [
        [[[[[0, 1, 2], [2, 3, 4]]],
          [[[10, 11, 12], [12, 13, 14]]],
          [[[20, 21, 22], [22, 23, 24]]]]],
        [[[[[30, 31, 32], [32, 33, 34]]],
          [[[40, 41, 42], [42, 43, 44]]],
          [[[50, 51, 52], [52, 53, 54]]]]]]

    with self.test_session(use_gpu=True):
        padded = shape_ops.frame(
            signal, frame_length, frame_step,
            pad_end=True, pad_value=99).eval()
        self.assertAllEqual(expected_padded, padded)

        unpadded = shape_ops.frame(
            signal, frame_length, frame_step, pad_end=False).eval()
        self.assertAllEqual(expected_unpadded, unpadded)
def test_axis(self):
    """Frames a (2, 4, 2) signal along axis=1 with end padding enabled."""
    signal = np.reshape(np.arange(16), (2, 4, 2))

    with self.test_session(use_gpu=True):
        # Step equal to length: the axis splits evenly, no padding appears.
        frames = shape_ops.frame(
            signal, frame_length=2, frame_step=2, pad_end=True, axis=1)
        self.assertAllEqual(
            np.reshape(np.arange(16), (2, 2, 2, 2)), frames.eval())

        # Step of 1 with length 2: the last frame gets one padded row.
        frames = shape_ops.frame(
            signal, frame_length=2, frame_step=1, pad_end=True, axis=1)
        expected_len2 = [
            [[[0, 1], [2, 3]],
             [[2, 3], [4, 5]],
             [[4, 5], [6, 7]],
             [[6, 7], [0, 0]]],
            [[[8, 9], [10, 11]],
             [[10, 11], [12, 13]],
             [[12, 13], [14, 15]],
             [[14, 15], [0, 0]]]]
        self.assertAllEqual(expected_len2, frames.eval())

        # Step of 1 with length 3: the last two frames contain padding.
        frames = shape_ops.frame(
            signal, frame_length=3, frame_step=1, pad_end=True, axis=1)
        expected_len3 = [
            [[[0, 1], [2, 3], [4, 5]],
             [[2, 3], [4, 5], [6, 7]],
             [[4, 5], [6, 7], [0, 0]],
             [[6, 7], [0, 0], [0, 0]]],
            [[[8, 9], [10, 11], [12, 13]],
             [[10, 11], [12, 13], [14, 15]],
             [[12, 13], [14, 15], [0, 0]],
             [[14, 15], [0, 0], [0, 0]]]]
        self.assertAllEqual(expected_len3, frames.eval())
def test_basic_stereo(self):
    """Frames a two-channel signal wrapped in 0 to 4 leading unit dimensions."""
    signal = np.vstack([np.arange(6), np.arange(6) + 10])
    frame_length = 3
    frame_step = 2

    # With padding, the last frame is completed with pad_value.
    padded_frames = np.array([
        [[0, 1, 2], [2, 3, 4], [4, 5, 99]],
        [[10, 11, 12], [12, 13, 14], [14, 15, 99]]])
    # Without padding, the last (partial) frame is dropped.
    unpadded_frames = np.array([
        [[0, 1, 2], [2, 3, 4]],
        [[10, 11, 12], [12, 13, 14]]])

    with self.test_session(use_gpu=True):
        for rank in range(5):
            leading = (1,) * rank
            nd_signal = np.reshape(signal, leading + signal.shape)

            result = shape_ops.frame(
                nd_signal, frame_length, frame_step,
                pad_end=True, pad_value=99).eval()
            expected = np.reshape(
                padded_frames, leading + padded_frames.shape)
            self.assertAllEqual(expected, result)

            expected = np.reshape(
                unpadded_frames, leading + unpadded_frames.shape)
            result = shape_ops.frame(
                nd_signal, frame_length, frame_step, pad_end=False).eval()
            self.assertAllEqual(expected, result)
def test_basic_stereo(self):
    """Checks framing of a stereo signal for ranks 2 through 6."""
    stereo = np.vstack([np.arange(6), np.arange(6) + 10])
    frame_length, frame_step = 3, 2

    with_pad = np.array([[[0, 1, 2], [2, 3, 4], [4, 5, 99]],
                         [[10, 11, 12], [12, 13, 14], [14, 15, 99]]])
    without_pad = np.array([[[0, 1, 2], [2, 3, 4]],
                            [[10, 11, 12], [12, 13, 14]]])

    with self.test_session(use_gpu=True):
        for rank in range(5):
            # Prepend `rank` singleton dimensions to input and expectations.
            prefix = (1,) * rank
            nd_signal = np.reshape(stereo, prefix + stereo.shape)

            # Padded: the trailing partial frame is filled with pad_value.
            got = shape_ops.frame(nd_signal, frame_length, frame_step,
                                  pad_end=True, pad_value=99).eval()
            want = np.reshape(with_pad, prefix + with_pad.shape)
            self.assertAllEqual(want, got)

            # Unpadded: the trailing partial frame is discarded.
            want = np.reshape(without_pad, prefix + without_pad.shape)
            got = shape_ops.frame(nd_signal, frame_length, frame_step,
                                  pad_end=False).eval()
            self.assertAllEqual(want, got)
def test_window_larger_than_signal(self):
    """Frames 2-sample signals with a 4-sample window, at steps 1 and 2."""
    signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
    frame_length = 4

    with self.test_session(use_gpu=True):
        # Step 1: padding yields two frames per row; without it, none.
        frame_step = 1
        padded = shape_ops.frame(signal, frame_length, frame_step,
                                 pad_end=True, pad_value=99).eval()
        self.assertAllClose(
            [[[1, 2, 99, 99], [2, 99, 99, 99]],
             [[11, 12, 99, 99], [12, 99, 99, 99]]],
            padded)

        unpadded = shape_ops.frame(signal, frame_length, frame_step,
                                   pad_end=False).eval()
        self.assertEqual((2, 0, 4), unpadded.shape)

        # Step 2: padding yields a single frame per row; without it, none.
        frame_step = 2
        padded = shape_ops.frame(signal, frame_length, frame_step,
                                 pad_end=True, pad_value=99).eval()
        self.assertAllClose(
            [[[1, 2, 99, 99]], [[11, 12, 99, 99]]], padded)

        unpadded = shape_ops.frame(signal, frame_length, frame_step,
                                   pad_end=False).eval()
        self.assertEqual((2, 0, 4), unpadded.shape)
def test_length_zero(self):
    """An empty signal produces a (0, frame_length) result either way."""
    signal = constant_op.constant([], dtype=dtypes.float32)
    frame_length, frame_step = 2, 1

    with self.test_session(use_gpu=True):
        padded = shape_ops.frame(signal, frame_length, frame_step,
                                 pad_end=True, pad_value=99).eval()
        self.assertEqual((0, 2), padded.shape)

        unpadded = shape_ops.frame(signal, frame_length, frame_step,
                                   pad_end=False).eval()
        self.assertEqual((0, 2), unpadded.shape)
def fast_time_stretch(signals, constant=False, extreme=False):
    """Randomly time-stretches each signal via framing and overlap-add.

    Each signal is cut into Hann-windowed frames with a fixed input step,
    then re-assembled with a per-signal output step scaled by a random
    speed factor. The re-assembled waveform is passed through
    `tf_get_word` before being returned.

    Args:
      signals: Batch of waveforms; the leading dimension is the batch size.
        Presumably `[batch, samples]` float32 -- confirm against callers.
      constant: Unused by this function.
      extreme: If true, speed factors are drawn uniformly from [0.7, 1.3)
        and a notice is printed; otherwise they come from a truncated
        normal with mean 1.0 and stddev 0.2.

    Returns:
      The float32 result of mapping the per-signal reconstruction over the
      batch with `tf.map_fn`.
    """
    frame_length = 300
    frame_step_in = int(300 * 0.25)

    def _reassemble(pair):
        # map_fn hands over one (frames, output step) pair per batch entry.
        frames, step_out = pair
        stretched = reconstruction_ops.overlap_and_add(frames, step_out)
        return tf_get_word(stretched)

    # The random op is created before the window/frame ops, as in the
    # original ordering, so graph-level seeding is unaffected.
    if extreme:
        print("extreme time warp activated")
        speedx = tf.random_uniform([tf.shape(signals)[0]], 0.7, 1.3)
    else:
        speedx = tf.truncated_normal([tf.shape(signals)[0]], 1.0, 0.2)

    # Casting to int32 truncates the scaled step towards zero.
    frame_step_out = tf.cast(speedx * frame_step_in, tf.int32)

    hann_window = window_ops.hann_window(frame_length)
    framed_signals = shape_ops.frame(
        signals, frame_length, frame_step_in, pad_end=False)
    framed_signals = framed_signals * hann_window

    return tf.map_fn(
        _reassemble,
        [framed_signals, frame_step_out],
        parallel_iterations=120,
        back_prop=False,
        dtype=tf.float32)
def dataset_fft_to_mel_multi_with_files(fn, tfrecord, num_timesteps,
                                        num_freqs,
                                        linear_to_mel_weight_matrix,
                                        all_labels):
    """Takes consecutive windows, a quarter-window apart, out of one record.

    Parses a serialized SequenceExample holding a string `label` context
    feature and a `spectrogram` sequence feature, converts the spectrogram
    to log-mel, and slices it into frames of `num_timesteps` steps with a
    hop of `num_timesteps // 4`.

    Args:
      fn: Value replicated once per frame in the third output (presumably
        the source file name -- confirm against callers).
      tfrecord: Serialized `SequenceExample` to parse.
      num_timesteps: Number of time steps per output frame.
      num_freqs: Width of each `spectrogram` sequence step.
      linear_to_mel_weight_matrix: Matrix contracted against the last axis
        of the spectrogram.
      all_labels: Python sequence of label strings; the position of the
        record's label in it becomes the integer label.

    Returns:
      A tuple `(spectrogram_frames, lblIndex, files)`. The label index and
      `fn` are repeated once per frame.
    """
    context_spec = {
        "label": tf.FixedLenFeature((1,), dtype=tf.string)
    }
    sequence_spec = {
        "spectrogram": tf.FixedLenSequenceFeature((num_freqs,), tf.float32),
    }
    context_out, feat_list_out = tf.parse_single_sequence_example(
        tfrecord,
        context_features=context_spec,
        sequence_features=sequence_spec)

    # Linear spectrogram -> log-mel; the small offset keeps log() finite.
    mel_spect = tf.tensordot(
        feat_list_out['spectrogram'], linear_to_mel_weight_matrix, 1)
    mel_spect = tf.math.log(mel_spect + 0.00001)
    mel_spect = tf.expand_dims(mel_spect, -1)

    # Frame along axis 0, then swap axes 1 and 2 of the framed result.
    framed = shape_ops.frame(
        mel_spect, num_timesteps, num_timesteps // 4, axis=0)
    spectrogram_frames = tf.transpose(framed, [0, 2, 1, 3])

    # Two set differences locate the label: first the positions of every
    # label that is NOT this record's, then the positions left over.
    lbl = context_out['label']
    _, other_positions = tf.setdiff1d(tf.constant(all_labels), lbl)
    matching, _ = tf.setdiff1d(tf.range(len(all_labels)), other_positions)

    lblIndex = tf.fill(
        [tf.shape(spectrogram_frames)[0]], tf.to_int32(matching[0]))
    files = tf.fill([tf.shape(spectrogram_frames)[0]], fn)
    return spectrogram_frames, lblIndex, files
def stdct(signals, frame_length, frame_step, fft_length=None,
          window_fn=functools.partial(window_ops.hann_window, periodic=True),
          pad_end=False, name=None):
    """Frames `signals`, optionally windows them, and applies a DCT.

    Args:
      signals: Tensor of rank at least 1 to be framed.
      frame_length: Scalar window length in samples.
      frame_step: Scalar number of samples between frame starts.
      fft_length: Optional size; when omitted it is derived from
        `frame_length` with `_enclosing_power_of_two`.
      window_fn: Callable taking a window length and a `dtype` keyword and
        returning the window. `None` disables windowing.
      pad_end: Forwarded to `shape_ops.frame`.
      name: Optional name for the operation scope.

    Returns:
      `spectral_ops.dct` applied to the (windowed) frames.
    """
    with ops.name_scope(name, 'stdct', [signals, frame_length, frame_step]):
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)

        frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
        frame_length.shape.assert_has_rank(0)

        frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
        frame_step.shape.assert_has_rank(0)

        # NOTE(review): fft_length is resolved here but never handed to the
        # DCT below -- confirm whether that is intended.
        fft_length = (
            _enclosing_power_of_two(frame_length)
            if fft_length is None
            else ops.convert_to_tensor(fft_length, name='fft_length'))

        framed_signals = shape_ops.frame(
            signals, frame_length, frame_step, pad_end=pad_end)

        if window_fn is not None:
            framed_signals = framed_signals * window_fn(
                frame_length, dtype=framed_signals.dtype)

        return spectral_ops.dct(framed_signals)
def test_preserves_type(self):
    """The framed output keeps the dtype of a float64 input."""
    signal = math_ops.range(10, dtype=dtypes.float64)
    frame_length, frame_step = 2, 3
    with self.test_session(use_gpu=True):
        framed = shape_ops.frame(signal, frame_length, frame_step)
        self.assertEqual(framed.dtype, signal.dtype)
def stft(signals, frame_length, frame_step, fft_length=None,
         window_fn=functools.partial(window_ops.hann_window, periodic=True),
         pad_end=False, name=None):
    """Computes the [Short-time Fourier Transform][stft] of `signals`.

    Implemented with GPU-compatible ops and supports gradients.

    Args:
      signals: A `[..., samples]` `float32` `Tensor` of real-valued signals.
      frame_length: An integer scalar `Tensor`. The window length in samples.
      frame_step: An integer scalar `Tensor`. The number of samples to step.
      fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
        If not provided, uses the smallest power of 2 enclosing
        `frame_length`.
      window_fn: A callable that takes a window length and a `dtype` keyword
        argument and returns a `[window_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, no windowing is used.
      pad_end: Whether to pad the end of `signals` with zeros when the
        provided frame length and step produces a frame that lies partially
        past its end.
      name: An optional name for the operation.

    Returns:
      A `[..., frames, fft_unique_bins]` `Tensor` of `complex64` STFT values
      where `fft_unique_bins` is `fft_length // 2 + 1` (the unique
      components of the FFT).

    Raises:
      ValueError: If `signals` is not at least rank 1, `frame_length` is
        not scalar, or `frame_step` is not scalar.

    [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
    """
    with ops.name_scope(name, 'stft', [signals, frame_length, frame_step]):
        # Normalise the inputs to tensors and check their ranks up front.
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)

        frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
        frame_length.shape.assert_has_rank(0)

        frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
        frame_step.shape.assert_has_rank(0)

        fft_length = (
            _enclosing_power_of_two(frame_length)
            if fft_length is None
            else ops.convert_to_tensor(fft_length, name='fft_length'))

        framed_signals = shape_ops.frame(
            signals, frame_length, frame_step, pad_end=pad_end)

        # Windowing is optional; the window is built in the frames' dtype.
        if window_fn is not None:
            framed_signals = framed_signals * window_fn(
                frame_length, dtype=framed_signals.dtype)

        # rfft yields only the (fft_length / 2 + 1) unique components of the
        # FFT of the real-valued frames.
        return spectral_ops.rfft(framed_signals, [fft_length])
def test_invalid_inputs(self):
    """Invalid ranks raise ValueError; an unknown-rank signal does not."""
    # A rank 0 signal cannot be framed.
    with self.assertRaises(ValueError):
        shape_ops.frame(1, 1, 1)

    # A signal whose rank is unknown must not raise.
    unknown_rank_signal = array_ops.placeholder(dtypes.float32)
    shape_ops.frame(unknown_rank_signal, 1, 1)

    # frame_length must be a scalar.
    with self.assertRaises(ValueError):
        shape_ops.frame([1], [1], 1)

    # frame_step must be a scalar.
    with self.assertRaises(ValueError):
        shape_ops.frame([1], 1, [1])

    # pad_value must be a scalar.
    with self.assertRaises(ValueError):
        shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
def test_gradient_numerical(self):
    """The gradient error reported for frame() stays below 2e-5."""
    with self.test_session(use_gpu=True):
        signal_shape = (2, 128)
        frame_length, frame_step = 33, 9
        signal = array_ops.ones(signal_shape)
        frames = shape_ops.frame(signal, frame_length, frame_step)
        frames_shape = frames.shape.as_list()
        error = test.compute_gradient_error(
            signal, signal_shape, frames, frames_shape)
        self.assertLess(error, 2e-5)
def test_gradient_numerical(self):
    """Checks frame()'s gradient through `test.compute_gradient_error`."""
    with self.test_session(use_gpu=True):
        signal_shape = (2, 128)
        signal = array_ops.ones(signal_shape)

        # 33-sample frames advancing by 9 samples, so the frames overlap.
        frame_length = 33
        frame_step = 9
        frames = shape_ops.frame(signal, frame_length, frame_step)

        gradient_error = test.compute_gradient_error(
            signal, signal_shape, frames, frames.shape.as_list())
        self.assertLess(gradient_error, 2e-5)
def tf_get_word(wav, size=16000, indices=False):
    """Selects the `size`-sample window of `wav` with the highest mean power.

    `wav` is cut into end-padded frames of `size` samples, 300 samples
    apart, and the frame with the largest mean squared amplitude wins.

    Args:
      wav: Waveform tensor. Assumed to be 1-D so that the frames form a
        `[num_frames, size]` matrix -- TODO confirm against callers.
      size: Window length in samples.
      indices: If false, return the winning frame itself. If true, return
        its sample range instead.

    Returns:
      The loudest frame when `indices` is false, otherwise a
      `(start_index, end_index)` pair with `end_index - start_index == size`.
    """
    frame_step = 300

    frames = shape_ops.frame(wav, size, frame_step, pad_end=True)
    frame_stack = tf.stack(frames)
    # Mean power per frame.
    frame_vols = tf.reduce_mean(tf.pow(frame_stack, 2), axis=1)
    max_frame_vol = tf.argmax(frame_vols)

    if not indices:
        return frame_stack[max_frame_vol, :]

    start_index = max_frame_vol * frame_step
    # The window length is `size`, not a fixed 16000; the old constant gave
    # a wrong end index for any non-default `size`. The cast lets `size` be
    # a Python int or an integer tensor of another dtype.
    end_index = start_index + tf.cast(size, start_index.dtype)
    return start_index, end_index
def test_complex_shape(self):
    """Checks framing of a rank-5 input along its innermost axis."""
    base = np.arange(6)
    signal = np.vstack(
        [base, base + 10, base + 20, base + 30, base + 40, base + 50])
    signal = np.reshape(signal, (2, 1, 3, 1, 6))
    frame_length = 3
    frame_step = 2

    with self.test_session(use_gpu=True):
        # Padded: the last frame is filled with pad_value, giving a
        # (2, 1, 3, 1, 3, 3) result.
        got = shape_ops.frame(signal, frame_length, frame_step,
                              pad_end=True, pad_value=99).eval()
        want = [[[[[[0, 1, 2], [2, 3, 4], [4, 5, 99]]],
                  [[[10, 11, 12], [12, 13, 14], [14, 15, 99]]],
                  [[[20, 21, 22], [22, 23, 24], [24, 25, 99]]]]],
                [[[[[30, 31, 32], [32, 33, 34], [34, 35, 99]]],
                  [[[40, 41, 42], [42, 43, 44], [44, 45, 99]]],
                  [[[50, 51, 52], [52, 53, 54], [54, 55, 99]]]]]]
        self.assertAllEqual(want, got)

        # Unpadded: the partial last frame is dropped, giving a
        # (2, 1, 3, 1, 2, 3) result.
        got = shape_ops.frame(signal, frame_length, frame_step,
                              pad_end=False).eval()
        want = [[[[[[0, 1, 2], [2, 3, 4]]],
                  [[[10, 11, 12], [12, 13, 14]]],
                  [[[20, 21, 22], [22, 23, 24]]]]],
                [[[[[30, 31, 32], [32, 33, 34]]],
                  [[[40, 41, 42], [42, 43, 44]]],
                  [[[50, 51, 52], [52, 53, 54]]]]]]
        self.assertAllEqual(want, got)
def test_constant_folding(self):
    """frame should be constant foldable for constant inputs."""
    for pad_end in [True, False]:
        graph = ops.Graph()
        with graph.as_default():
            signal_shape = (2, 128)
            frame_length = 32
            frame_step = 16
            signal = array_ops.ones(signal_shape)
            frames = shape_ops.frame(
                signal, frame_length, frame_step, pad_end=pad_end)
            optimized = test_util.grappler_optimize(graph, [frames])
            # A fully folded graph has exactly one node left.
            self.assertEqual(1, len(optimized.node))
def test_shape_inference(self):
    """Static shapes reported by frame() match the expected inference."""
    # Known frame_length: rank and the innermost dimension are inferred at
    # graph definition time.
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
    frame_length, frame_step = 2, 1
    framed = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99)
    self.assertEqual([1, 1, 2], framed.shape.as_list())
    framed = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False)
    self.assertEqual([1, 0, 2], framed.shape.as_list())

    # Unknown frame_length: rank plus the known outer and inner dimensions
    # are inferred.
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 2, 3, 4])
    frame_length = array_ops.placeholder(dtypes.int32, shape=[])
    frame_step = 1
    framed = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=1)
    self.assertEqual([1, None, None, 3, 4], framed.shape.as_list())
    framed = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=1)
    self.assertEqual([1, None, None, 3, 4], framed.shape.as_list())

    # Known frame_length and known framed dimension: rank, inner
    # dimensions and the known outer dimensions are inferred.
    signal = array_ops.placeholder(dtypes.int32,
                                   shape=[None, 5, None, 20, 5, 3])
    frame_length, frame_step = 4, 3
    framed = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=3)
    self.assertEqual([None, 5, None, 7, 4, 5, 3], framed.shape.as_list())
    framed = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=3)
    self.assertEqual([None, 5, None, 6, 4, 5, 3], framed.shape.as_list())

    # Inferred shapes must agree with the evaluated shapes for small values
    # of signal_length and frame_length, with and without padding.
    frame_step = 1
    for signal_length in range(2):
        signal = [0] * signal_length
        for frame_length in range(2):
            for pad_end in [False, True]:
                op = shape_ops.frame(signal, frame_length, frame_step,
                                     pad_end=pad_end, pad_value=99)
                with self.test_session(use_gpu=True):
                    evaluated = op.eval()
                self.assertEqual(op.shape.as_list(), list(evaluated.shape))
def test_mapping_of_indices_without_padding(self):
    """Each output element equals its source index when framing arange()."""
    with self.test_session(use_gpu=True):
        samples = constant_op.constant(np.arange(9152), dtypes.int32)
        samples = array_ops.expand_dims(samples, 0)

        framed = shape_ops.frame(samples, 512, 180, pad_end=False).eval()

        # Frame i, position j reads sample 180 * i + j; 49 frames in total.
        within_frame = np.tile(np.arange(512), (49, 1))
        frame_starts = np.tile(np.arange(49) * 180, (512, 1)).T
        expected = within_frame + frame_starts
        expected = np.expand_dims(expected, axis=0)
        expected = np.array(expected, dtype=np.int32)

        self.assertAllEqual(expected, framed)
def test_mapping_of_indices_with_padding(self):
    """With end padding, positions past the signal read back as 0."""
    with self.session(use_gpu=True):
        samples = constant_op.constant(np.arange(10000), dtypes.int32)
        samples = array_ops.expand_dims(samples, 0)

        framed = shape_ops.frame(samples, 512, 192, pad_end=True).eval()

        # Frame i, position j reads sample 192 * i + j; 53 frames in total.
        within_frame = np.tile(np.arange(512), (53, 1))
        frame_starts = np.tile(np.arange(53) * 192, (512, 1)).T
        expected = within_frame + frame_starts
        # Indices beyond the 10000 available samples fall in the padding.
        expected[expected >= 10000] = 0
        expected = np.expand_dims(expected, axis=0)
        expected = np.array(expected, dtype=np.int32)

        self.assertAllEqual(expected, framed)
def test_dynamic_tensor(self):
    """frame() works when the input shape is unknown at graph creation."""
    input_signal = np.vstack(
        [np.arange(4) + offset for offset in (0, 10, 20)])
    frame_length, frame_step = 2, 2

    with self.test_session(use_gpu=True) as sess:
        # Both dimensions are left open; the data arrives through feed_dict.
        signal_placeholder = array_ops.placeholder(
            shape=(None, None), dtype=dtypes.float32)
        framed = shape_ops.frame(
            signal_placeholder, frame_length, frame_step)
        result = sess.run(
            framed, feed_dict={signal_placeholder: input_signal})
        self.assertAllEqual(
            [[[0, 1], [2, 3]],
             [[10, 11], [12, 13]],
             [[20, 21], [22, 23]]],
            result)
def test_dynamic_tensor(self):
    """Frames a placeholder whose dimensions are only known at run time."""
    base = np.arange(4)
    input_signal = np.vstack([base, base + 10, base + 20])
    frame_length = 2
    frame_step = 2
    expected = [[[0, 1], [2, 3]],
                [[10, 11], [12, 13]],
                [[20, 21], [22, 23]]]

    with self.test_session(use_gpu=True) as sess:
        signal_placeholder = array_ops.placeholder(shape=(None, None),
                                                   dtype=dtypes.float32)
        frame_op = shape_ops.frame(signal_placeholder, frame_length,
                                   frame_step)
        result = sess.run(frame_op,
                          feed_dict={signal_placeholder: input_signal})
        self.assertAllEqual(expected, result)
def overlap_and_add(signal, frame_step, name=None):
    """Reconstructs a signal from a framed representation.

    Adds potentially overlapping frames of a signal with shape
    `[..., frames, frame_length]`, offsetting subsequent frames by
    `frame_step`. The resulting tensor has shape `[..., output_size]` where

        output_size = (frames - 1) * frame_step + frame_length

    Args:
      signal: A [..., frames, frame_length] `Tensor`. All dimensions may be
        unknown, and rank must be at least 2.
      frame_step: An integer or scalar `Tensor` denoting overlap offsets.
        Must be less than or equal to `frame_length`.
      name: An optional name for the operation.

    Returns:
      A `Tensor` with shape `[..., output_size]` containing the
      overlap-added frames of `signal`'s inner-most two dimensions.

    Raises:
      ValueError: If `signal`'s rank is less than 2, `frame_step` is not a
        scalar integer or `frame_step` is greater than `frame_length`.
    """
    with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
        signal = ops.convert_to_tensor(signal, name="signal")
        signal.shape.with_rank_at_least(2)
        frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
        frame_step.shape.assert_has_rank(0)
        if not frame_step.dtype.is_integer:
            raise ValueError("frame_step must be an integer. Got %s" %
                             frame_step.dtype)

        signal_shape = array_ops.shape(signal)

        # All dimensions that are not part of the overlap-and-add. Can be
        # empty for rank 2 inputs.
        outer_dimensions = signal_shape[:-2]

        # If frame_length and frame_step are known at graph construction
        # time, check frame_step is less than or equal to frame_length.
        # The comparison is only possible when all three static values
        # exist; otherwise it is skipped, not deferred to run time.
        frame_step_static = tensor_util.constant_value(frame_step)
        if (frame_step_static is not None and
                signal.shape.ndims is not None and
                signal.shape.dims[-1].value is not None):
            if frame_step_static > signal.shape.dims[-1].value:
                raise ValueError(
                    "frame_step (%d) must be less than or equal to "
                    "frame_length (%d)" % (
                        frame_step_static, signal.shape.dims[-1].value))
            # If frame_length is equal to frame_step, there's no overlap so
            # just reshape the tensor.
            if frame_step_static == signal.shape.dims[-1].value:
                return array_ops.reshape(signal, array_ops.concat(
                    [outer_dimensions, [-1]], 0))

        signal_rank = array_ops.rank(signal)
        frames = signal_shape[-2]
        frame_length = signal_shape[-1]

        # Everything below is expressed in subframes, the largest unit
        # that divides both the frame length and the frame step.
        subframe_length = util_ops.gcd(frame_length, frame_step)
        subframe_step = frame_step // subframe_length
        subframes_per_frame = frame_length // subframe_length
        output_size = frame_step * (frames - 1) + frame_length
        output_subframes = output_size // subframe_length

        # To avoid overlap-adding sample-by-sample, we overlap-add at the
        # "subframe" level, where a subframe is gcd(frame_length,
        # frame_step). Reshape signal from [..., frames, frame_length] into
        # [..., subframes, subframe_length].
        subframe_shape = array_ops.concat(
            [outer_dimensions, [-1, subframe_length]], 0)
        subframe_signal = array_ops.reshape(signal, subframe_shape)

        # Now we shuffle the last [subframes, subframe_length] dimensions to
        # the front.
        # TODO(rjryan): Add an axis argument to unsorted_segment_sum so we
        # can avoid this pair of transposes.
        subframe_signal = _shuffle_to_front(subframe_signal, 2)

        # Use unsorted_segment_sum to add overlapping subframes together.
        # Framing range(output_subframes) with the same geometry as the
        # input yields, for every input subframe, the output subframe it
        # has to be accumulated into.
        segment_ids = array_ops.reshape(shape_ops.frame(
            math_ops.range(output_subframes), subframes_per_frame,
            subframe_step, pad_end=False), [-1])
        result = math_ops.unsorted_segment_sum(subframe_signal, segment_ids,
                                               num_segments=output_subframes)

        # result is a [subframes, subframe_length, ...outer_dimensions]
        # tensor. We return a [...outer_dimensions, output_size] tensor with
        # a transpose and reshape.
        result_shape = array_ops.concat([outer_dimensions, [output_size]], 0)
        return array_ops.reshape(_shuffle_to_front(result, signal_rank - 2),
                                 result_shape)
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=8000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0,
                                dtype=dtypes.float32,
                                name=None):
    """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel].

    Returns a weight matrix that can be used to re-weight a `Tensor`
    containing `num_spectrogram_bins` linearly sampled frequency information
    from `[0, sample_rate / 2]` into `num_mel_bins` frequency information
    from `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel].

    For example, the returned matrix `A` can be used to right-multiply a
    spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear
    scale spectrum values (e.g. STFT magnitudes) to generate a
    "mel spectrogram" `M` of shape `[frames, num_mel_bins]`.

        # `S` has shape [frames, num_spectrogram_bins]
        # `M` has shape [frames, num_mel_bins]
        M = tf.matmul(S, A)

    The matrix can be used with `tf.tensordot` to convert an arbitrary rank
    `Tensor` of linear-scale spectral bins into the mel scale.

        # S has shape [..., num_spectrogram_bins].
        # M has shape [..., num_mel_bins].
        M = tf.tensordot(S, A, 1)
        # tf.tensordot does not support shape inference for this case yet.
        M.set_shape(S.shape[:-1].concatenate(A.shape[-1:]))

    Args:
      num_mel_bins: Python int. How many bands in the resulting mel
        spectrum.
      num_spectrogram_bins: An integer `Tensor`. How many bins there are in
        the source spectrogram data, which is understood to be
        `fft_size // 2 + 1`, i.e. the spectrogram only contains the
        nonredundant FFT bins.
      sample_rate: Python float. Samples per second of the input signal used
        to create the spectrogram. We need this to figure out the actual
        frequencies for each spectrogram bin, which dictates how they are
        mapped into the mel scale.
      lower_edge_hertz: Python float. Lower bound on the frequencies to be
        included in the mel spectrum. This corresponds to the lower edge of
        the lowest triangular band.
      upper_edge_hertz: Python float. The desired top edge of the highest
        frequency band.
      dtype: The `DType` of the result matrix. Must be a floating point
        type.
      name: An optional name for the operation.

    Returns:
      A `Tensor` of shape `[num_spectrogram_bins, num_mel_bins]`.

    Raises:
      ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not
        positive, lower_edge_hertz is negative, frequency edges are
        incorrectly ordered, or upper_edge_hertz is larger than the Nyquist
        frequency.

    [mel]: https://en.wikipedia.org/wiki/Mel_scale
    """
    with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
        # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
        # and the validation is already done in linspace (both in shape
        # function and in kernel), there is no need to validate
        # num_spectrogram_bins here.
        _validate_arguments(num_mel_bins, sample_rate,
                            lower_edge_hertz, upper_edge_hertz, dtype)

        # To preserve accuracy, we compute the matrix at float64 precision
        # and then cast to `dtype` at the end. This function can be constant
        # folded by graph optimization since there are no Tensor inputs.
        sample_rate = ops.convert_to_tensor(
            sample_rate, dtypes.float64, name='sample_rate')
        lower_edge_hertz = ops.convert_to_tensor(
            lower_edge_hertz, dtypes.float64, name='lower_edge_hertz')
        upper_edge_hertz = ops.convert_to_tensor(
            upper_edge_hertz, dtypes.float64, name='upper_edge_hertz')
        zero_float64 = ops.convert_to_tensor(0.0, dtypes.float64)

        # HTK excludes the spectrogram DC bin.
        bands_to_zero = 1
        nyquist_hertz = sample_rate / 2.0
        linear_frequencies = math_ops.linspace(
            zero_float64, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
        # Column vector, so it broadcasts against the [1, num_mel_bins]
        # edge rows built below.
        spectrogram_bins_mel = array_ops.expand_dims(
            _hertz_to_mel(linear_frequencies), 1)

        # Compute num_mel_bins triples of (lower_edge, center, upper_edge).
        # The center of each band is the lower and upper edge of the
        # adjacent bands. Accordingly, we divide [lower_edge_hertz,
        # upper_edge_hertz] into num_mel_bins + 2 pieces.
        band_edges_mel = shape_ops.frame(
            math_ops.linspace(_hertz_to_mel(lower_edge_hertz),
                              _hertz_to_mel(upper_edge_hertz),
                              num_mel_bins + 2), frame_length=3, frame_step=1)

        # Split the triples up and reshape them into [1, num_mel_bins]
        # tensors.
        lower_edge_mel, center_mel, upper_edge_mel = tuple(array_ops.reshape(
            t, [1, num_mel_bins]) for t in array_ops.split(
                band_edges_mel, 3, axis=1))

        # Calculate lower and upper slopes for every spectrogram bin.
        # Line segments are linear in the mel domain, not Hertz.
        lower_slopes = (spectrogram_bins_mel - lower_edge_mel) / (
            center_mel - lower_edge_mel)
        upper_slopes = (upper_edge_mel - spectrogram_bins_mel) / (
            upper_edge_mel - center_mel)

        # Intersect the line segments with each other and zero.
        mel_weights_matrix = math_ops.maximum(
            zero_float64, math_ops.minimum(lower_slopes, upper_slopes))

        # Re-add the zeroed lower bins we sliced out above.
        mel_weights_matrix = array_ops.pad(
            mel_weights_matrix, [[bands_to_zero, 0], [0, 0]])

        # Cast to the desired type.
        return math_ops.cast(mel_weights_matrix, dtype, name=name)
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=8000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0,
                                dtype=dtypes.float32,
                                name=None):
    """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel].

    Returns a weight matrix that can be used to re-weight a `Tensor`
    containing `num_spectrogram_bins` linearly sampled frequency information
    from `[0, sample_rate / 2]` into `num_mel_bins` frequency information
    from `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel].

    For example, the returned matrix `A` can be used to right-multiply a
    spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear
    scale spectrum values (e.g. STFT magnitudes) to generate a
    "mel spectrogram" `M` of shape `[frames, num_mel_bins]`.

        # `S` has shape [frames, num_spectrogram_bins]
        # `M` has shape [frames, num_mel_bins]
        M = tf.matmul(S, A)

    The matrix can be used with `tf.tensordot` to convert an arbitrary rank
    `Tensor` of linear-scale spectral bins into the mel scale.

        # S has shape [..., num_spectrogram_bins].
        # M has shape [..., num_mel_bins].
        M = tf.tensordot(S, A, 1)
        # tf.tensordot does not support shape inference for this case yet.
        M.set_shape(S.shape[:-1].concatenate(A.shape[-1:]))

    Args:
      num_mel_bins: Python int. How many bands in the resulting mel
        spectrum.
      num_spectrogram_bins: Python int. How many bins there are in the
        source spectrogram data, which is understood to be
        `fft_size // 2 + 1`, i.e. the spectrogram only contains the
        nonredundant FFT bins.
      sample_rate: Python float. Samples per second of the input signal used
        to create the spectrogram. We need this to figure out the actual
        frequencies for each spectrogram bin, which dictates how they are
        mapped into the mel scale.
      lower_edge_hertz: Python float. Lower bound on the frequencies to be
        included in the mel spectrum. This corresponds to the lower edge of
        the lowest triangular band.
      upper_edge_hertz: Python float. The desired top edge of the highest
        frequency band.
      dtype: The `DType` of the result matrix. Must be a floating point
        type.
      name: An optional name for the operation.

    Returns:
      A `Tensor` of shape `[num_spectrogram_bins, num_mel_bins]`.

    Raises:
      ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not
        positive, lower_edge_hertz is negative, frequency edges are
        incorrectly ordered, or upper_edge_hertz is larger than the Nyquist
        frequency.

    [mel]: https://en.wikipedia.org/wiki/Mel_scale
    """
    with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
        _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
                            lower_edge_hertz, upper_edge_hertz, dtype)

        # To preserve accuracy, we compute the matrix at float64 precision
        # and then cast to `dtype` at the end. This function can be constant
        # folded by graph optimization since there are no Tensor inputs.
        sample_rate = ops.convert_to_tensor(sample_rate,
                                            dtypes.float64,
                                            name='sample_rate')
        lower_edge_hertz = ops.convert_to_tensor(lower_edge_hertz,
                                                 dtypes.float64,
                                                 name='lower_edge_hertz')
        upper_edge_hertz = ops.convert_to_tensor(upper_edge_hertz,
                                                 dtypes.float64,
                                                 name='upper_edge_hertz')
        zero_float64 = ops.convert_to_tensor(0.0, dtypes.float64)

        # HTK excludes the spectrogram DC bin.
        bands_to_zero = 1
        nyquist_hertz = sample_rate / 2.0
        linear_frequencies = math_ops.linspace(
            zero_float64, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
        # Column vector, so it broadcasts against the [1, num_mel_bins]
        # edge rows built below.
        spectrogram_bins_mel = array_ops.expand_dims(
            _hertz_to_mel(linear_frequencies), 1)

        # Compute num_mel_bins triples of (lower_edge, center, upper_edge).
        # The center of each band is the lower and upper edge of the
        # adjacent bands. Accordingly, we divide [lower_edge_hertz,
        # upper_edge_hertz] into num_mel_bins + 2 pieces.
        band_edges_mel = shape_ops.frame(math_ops.linspace(
            _hertz_to_mel(lower_edge_hertz), _hertz_to_mel(upper_edge_hertz),
            num_mel_bins + 2),
                                         frame_length=3,
                                         frame_step=1)

        # Split the triples up and reshape them into [1, num_mel_bins]
        # tensors.
        lower_edge_mel, center_mel, upper_edge_mel = tuple(
            array_ops.reshape(t, [1, num_mel_bins])
            for t in array_ops.split(band_edges_mel, 3, axis=1))

        # Calculate lower and upper slopes for every spectrogram bin.
        # Line segments are linear in the mel domain, not Hertz.
        lower_slopes = (spectrogram_bins_mel -
                        lower_edge_mel) / (center_mel - lower_edge_mel)
        upper_slopes = (upper_edge_mel -
                        spectrogram_bins_mel) / (upper_edge_mel - center_mel)

        # Intersect the line segments with each other and zero.
        mel_weights_matrix = math_ops.maximum(
            zero_float64, math_ops.minimum(lower_slopes, upper_slopes))

        # Re-add the zeroed lower bins we sliced out above.
        mel_weights_matrix = array_ops.pad(mel_weights_matrix,
                                           [[bands_to_zero, 0], [0, 0]])

        # Cast to the desired type.
        return math_ops.cast(mel_weights_matrix, dtype, name=name)
def stft(signals, frame_length, frame_step, fft_length=None,
         window_fn=functools.partial(window_ops.hann_window, periodic=True),
         pad_end=False, name=None):
    """Computes the [Short-time Fourier Transform][stft] of `signals`.

    Implemented with GPU-compatible ops and supports gradients.

    Args:
      signals: A `[..., samples]` `float32` `Tensor` of real-valued signals.
      frame_length: An integer scalar `Tensor`. The window length in samples.
      frame_step: An integer scalar `Tensor`. The number of samples to step.
      fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
        If not provided, uses the smallest power of 2 enclosing
        `frame_length`.
      window_fn: A callable that takes a window length and a `dtype` keyword
        argument and returns a `[window_length]` `Tensor` of samples in the
        provided datatype. If set to `None`, no windowing is used.
      pad_end: Whether to pad the end of `signals` with zeros when the
        provided frame length and step produces a frame that lies partially
        past its end.
      name: An optional name for the operation.

    Returns:
      A `[..., frames, fft_unique_bins]` `Tensor` of `complex64` STFT values
      where `fft_unique_bins` is `fft_length // 2 + 1` (the unique
      components of the FFT).

    Raises:
      ValueError: If `signals` is not at least rank 1, `frame_length` is
        not scalar, or `frame_step` is not scalar.

    [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
    """
    with ops.name_scope(name, 'stft', [signals, frame_length, frame_step]):
        # Convert every input to a tensor, validating its rank right away.
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)
        frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
        frame_length.shape.assert_has_rank(0)
        frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
        frame_step.shape.assert_has_rank(0)

        # An explicit fft_length wins; otherwise derive it from the window.
        if fft_length is not None:
            fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
        else:
            fft_length = _enclosing_power_of_two(frame_length)

        framed_signals = shape_ops.frame(
            signals, frame_length, frame_step, pad_end=pad_end)

        # Optionally window the framed signals.
        if window_fn is not None:
            window = window_fn(frame_length, dtype=framed_signals.dtype)
            framed_signals = framed_signals * window

        # spectral_ops.rfft produces the (fft_length/2 + 1) unique
        # components of the FFT of the real windowed signals.
        return spectral_ops.rfft(framed_signals, [fft_length])
def test_shape_inference(self):
    """Compares frame()'s statically inferred shapes with expectations."""
    # With frame_length known at graph definition time, the rank and the
    # inner-most dimension are both inferred.
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
    frame_length = 2
    frame_step = 1
    padded = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99)
    self.assertEqual([1, 1, 2], padded.shape.as_list())
    unpadded = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False)
    self.assertEqual([1, 0, 2], unpadded.shape.as_list())

    # With frame_length unknown, the rank and the known outer and inner
    # dimensions are inferred.
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 2, 3, 4])
    frame_length = array_ops.placeholder(dtypes.int32, shape=[])
    frame_step = 1
    padded = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=1)
    self.assertEqual([1, 2, None, 3, 4], padded.shape.as_list())
    unpadded = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False, axis=1)
    self.assertEqual([1, None, None, 3, 4], unpadded.shape.as_list())

    # With frame_length and the framed dimension both known, the rank,
    # inner dimensions and known outer dimensions are inferred.
    signal = array_ops.placeholder(dtypes.int32,
                                   shape=[None, 5, None, 20, 5, 3])
    frame_length = 4
    frame_step = 3
    padded = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=3)
    self.assertEqual([None, 5, None, 7, 4, 5, 3], padded.shape.as_list())
    unpadded = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False, axis=3)
    self.assertEqual([None, 5, None, 6, 4, 5, 3], unpadded.shape.as_list())

    # Shape inference must be consistent with the shapes actually returned
    # for small signal_length and frame_length, padded or not.
    frame_step = 1
    for signal_length in range(2):
        signal = [0] * signal_length
        for frame_length in range(2):
            for pad_end in [False, True]:
                op = shape_ops.frame(signal, frame_length, frame_step,
                                     pad_end=pad_end, pad_value=99)
                with self.test_session(use_gpu=True):
                    actual = op.eval()
                self.assertEqual(op.shape.as_list(), list(actual.shape))