def _MakeLogMelFromTensorflowBuiltin(tf_wav_bytes):
  """Computes audio features from WAV bytes using the `audio_lib` helpers.

  The WAV is decoded with `audio_lib.DecodeWav`, and the feature computation
  is gated on an assertion that the decoded sample rate equals 16000.

  Args:
    tf_wav_bytes: WAV file contents, passed straight to `audio_lib.DecodeWav`.

  Returns:
    The result of `audio_lib.AudioToMfcc(16000, audio, 25, 25, 40)`.
  """
  expected_rate = 16000
  decoded_rate, samples = audio_lib.DecodeWav(tf_wav_bytes)
  rate_check = tf.assert_equal(decoded_rate, expected_rate)
  # The static rate is handed to AudioToMfcc; the control dependency makes
  # sure the decoded rate actually matches it.
  with tf.control_dependencies([rate_check]):
    return audio_lib.AudioToMfcc(expected_rate, samples, 25, 25, 40)
def FinalizeImage(self):
  """Finishes creation of the overall figure, returning the image tensor.

  Every subplot's tensors are concatenated into one flat list (the only form
  `tf.py_func` accepts), and the plotting callback slices that list back into
  per-subplot arguments. Rendering is gated on an assertion that all tensors
  share the same leading (batch) dimension.

  Returns:
    The `tf.uint8` output of the `tf.py_func` op wrapping
    `_RenderMatplotlibFigures`.

  Raises:
    ValueError: If no subplot tensors have been added, so there is nothing to
      render and no batch size to validate against.
  """
  subplot_grid_shape = self._subplot_grid_shape
  if subplot_grid_shape is None:
    # Default layout: one subplot per row in a single column.
    subplot_grid_shape = (len(self._subplots), 1)

  # AddMatplotlibFigureSummary (due to restrictions of py_func) only supports
  # a flattened list of tensors, so we keep, for each subplot, the
  # [start, end) slice of flattened_tensors that belongs to it.
  subplot_slices = []
  flattened_tensors = []
  for subplot in self._subplots:
    start = len(flattened_tensors)
    subplot_slices.append((start, start + len(subplot.tensor_list)))
    flattened_tensors.extend(subplot.tensor_list)

  if not flattened_tensors:
    # Without this guard the batch-size check below fails with an opaque
    # IndexError on `batch_sizes[0]`.
    raise ValueError(
        'FinalizeImage requires at least one subplot tensor, but none were '
        'added.')

  def PlotFunc(fig, *numpy_data_list):
    """Draws each subplot into its grid cell using its slice of the data."""
    gs = gridspec.GridSpec(*subplot_grid_shape, **self._gridspec_kwargs)
    for n, subplot in enumerate(self._subplots):
      axes = fig.add_subplot(gs[n])
      start, end = subplot_slices[n]
      subplot_data = numpy_data_list[start:end]
      subplot.plot_func(fig, axes, *subplot_data)

  func = functools.partial(_RenderMatplotlibFigures, self._figsize,
                           self._max_outputs, PlotFunc)
  batch_sizes = [tf.shape(t)[0] for t in flattened_tensors]
  num_tensors = len(flattened_tensors)
  # All tensors must agree on the batch dimension before rendering.
  batch_check = tf.assert_equal(
      batch_sizes, [batch_sizes[0]] * num_tensors, summarize=num_tensors)
  with tf.control_dependencies([batch_check]):
    return tf.py_func(
        func, flattened_tensors, tf.uint8, name='RenderMatplotlibFigures')
def _check_paddings(self, paddings):
  """Attaches validity assertions to `paddings` and returns it.

  Two conditions are asserted:
    * at least one element of `paddings` equals 0.0, and
    * every slice produced by `tf.unstack(paddings)` is non-decreasing.

  Args:
    paddings: The paddings tensor to validate.

  Returns:
    `paddings` wrapped by `py_utils.with_dependencies` on both assertions.
  """
  with tf.name_scope('check_paddings'):
    # One boolean per unstacked slice, re-assembled into a single tensor.
    slices_non_decreasing = tf.stack(
        [tf.is_non_decreasing(row) for row in tf.unstack(paddings)])

    # NOTE(review): this is a single check over the whole tensor, not one per
    # slice -- confirm that matches the intended contract.
    contains_zero = tf.reduce_any(tf.equal(paddings, 0.0))
    zero_check = tf.assert_equal(
        contains_zero, True, message='must have at least one zero value.')
    monotonic_check = tf.assert_equal(
        slices_non_decreasing, True, message='must be non-decreasing')

    return py_utils.with_dependencies([zero_check, monotonic_check], paddings)
def _ReshapeToMono2D(self, pcm_audio_data, paddings):
  """Collapses 2D, 3D or 4D audio input into a 2D mono layout.

  The input to FProp can be 3D or 4D (see class comments); internal
  processing works on a 2D [batch_size, timestep] signal, so the extra
  dimensions are folded away here.

  Args:
    pcm_audio_data: 2D, 3D or 4D audio input. See class comments. Must have a
      rank.
    paddings: Original paddings shaped to the first two dims of
      pcm_audio_data.

  Returns:
    Tuple of 2D [batch_size, timestep] mono audio data, new paddings.

  Raises:
    ValueError: If the rank of pcm_audio_data is not 2, 3 or 4.
  """
  shape = py_utils.GetShape(pcm_audio_data)
  rank = len(shape)
  if rank not in (2, 3, 4):
    raise ValueError('Illegal pcm_audio_data shape')

  if rank == 2:
    # Already [batch, time]; nothing to do.
    return pcm_audio_data, paddings

  if rank == 3:
    # [batch, time, channel]: drop the channel axis, which must be size 1.
    mono_check = tf.assert_equal(shape[2], 1)
    with tf.control_dependencies([mono_check]):
      return tf.squeeze(pcm_audio_data, axis=2), paddings

  # rank == 4: [batch, time, packet, channel]
  batch_size, num_packets, packet_size, channel = shape
  flat_time = num_packets * packet_size
  with tf.control_dependencies([tf.assert_equal(channel, 1)]):
    flat_audio = tf.reshape(pcm_audio_data, (batch_size, flat_time))
    # Move paddings to the new time base: one padding per time step rather
    # than per packet, obtained by repeating each packet's padding
    # packet_size times.
    per_packet = tf.expand_dims(paddings, axis=2)
    per_step = tf.tile(per_packet, [1, 1, packet_size])
    flat_paddings = tf.reshape(per_step, (batch_size, flat_time))
    return flat_audio, flat_paddings
def ExtractLogMelFeatures(wav_bytes_t):
  """Creates log-Mel filterbank features from raw WAV bytes.

  The WAV is decoded, rescaled by 32768, reshaped to a batch of one
  single-channel signal, and passed through a `MelAsrFrontend` configured
  with the fixed parameters below. The frontend call is gated on an assertion
  that the decoded sample rate is 16000.

  Args:
    wav_bytes_t: Tensor representing a raw wav file as a string of bytes. The
      file is assumed to be encoded at 16KHz; this is asserted on the decoded
      sample rate.

  Returns:
    The `src_inputs` field of the frontend's output.
    NOTE(review): earlier documentation described this as three stacked
    log-Mel filterbank energies sub-sampled every three frames; no stacking
    or stride is configured here, so confirm against the MelAsrFrontend
    defaults.
  """
  decoded_rate, samples = DecodeWav(wav_bytes_t)
  samples = samples * 32768
  # Remove channel dimension, since we have a single channel.
  samples = tf.squeeze(samples, axis=1)
  # TODO(drpng): make batches.
  samples = tf.expand_dims(samples, axis=0)
  expected_rate = 16000

  # We want to use these parameters exactly (they correspond to the default
  # ASR frontend).
  frontend_params = asr_frontend.MelAsrFrontend.Params()
  frontend_params.sample_rate = 16000.
  frontend_params.frame_size_ms = 25.
  frontend_params.frame_step_ms = 10.
  frontend_params.num_bins = 80
  frontend_params.lower_edge_hertz = 125.
  frontend_params.upper_edge_hertz = 7600.
  frontend_params.preemph = 0.97
  frontend_params.noise_scale = 0.
  frontend_params.pad_end = False
  frontend = frontend_params.Instantiate()

  rate_check = tf.assert_equal(decoded_rate, expected_rate)
  with tf.control_dependencies([rate_check]):
    # All-zero paddings: every sample is treated as real data.
    frontend_inputs = py_utils.NestedMap(
        src_inputs=samples, paddings=tf.zeros_like(samples))
    frontend_outputs = frontend.FPropDefaultTheta(frontend_inputs)
    return frontend_outputs.src_inputs