def read_and_preprocess_batched_audio(self, ks, exs):
  """Returns batched model input, list of audio samples, and sample rate."""
  # Read and resample each example's audio, and sanity-check its rank.
  audios = []
  for k, ex in zip(ks, exs):
    audio = self._read_audio_and_resample(k, ex)
    if audio.ndim > 1:
      raise ValueError(f'Audio had too many dims: {audio.ndim}')
    audios.append(audio)
  sr = self._target_sample_rate

  # Do some chunking: split each audio into fixed-length chunks, then flatten
  # all chunks into a single list.
  if self._chunk_len:
    logging.info('Chunk len: %s', self._chunk_len)
    chunked_audios = []
    for audio in audios:
      if audio.shape[0] >= self._chunk_len:
        chunk = utils.get_chunked_audio_fn(audio, self._chunk_len)
      else:
        # Audio shorter than a chunk stays whole, as a single
        # (1, num_samples) "chunk".
        chunk = np.expand_dims(audio, axis=0)
      chunked_audios.append(chunk)
    audios = np.concatenate(chunked_audios, axis=0)
    audios = [audios[i] for i in range(audios.shape[0])]

  # Convert audio to features, if required, and check that all model inputs
  # have the same shape before batching.
  model_inputs = [self._audio_to_features(a, sr) for a in audios]
  for model_input in model_inputs:
    if model_input.shape != model_inputs[0].shape:
      raise ValueError(
          f'Model input shapes differ: {[m.shape for m in model_inputs]}')
  logging.info('model_input shape: %s', model_inputs[0].shape)
  batched_model_input = np.stack(model_inputs, axis=0)

  return batched_model_input, audios, sr
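# `utils.get_chunked_audio_fn` is referenced above but not defined in this
# module. A minimal sketch of the assumed behavior (an assumption, not the
# actual implementation): drop trailing samples so the length is a multiple
# of `chunk_len`, then reshape to (num_chunks, chunk_len).
#
#   def get_chunked_audio_fn(audio, chunk_len):
#     num_chunks = audio.shape[0] // chunk_len  # Keep full chunks only.
#     return audio[:num_chunks * chunk_len].reshape(num_chunks, chunk_len)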
def tfex_to_chunked_audio(self, k, ex):
  """Returns chunked model input and sample rate from a tf.Example."""
  # Read audio from the tf.Example, get the sample rate, resample if
  # necessary, and convert to model inputs (if necessary).
  model_input, sample_rate = self.read_and_preprocess_audio(k, ex)

  # Do some chunking.
  if self._chunk_len:
    logging.info('Chunk len: %s', self._chunk_len)
    if model_input.shape[0] >= self._chunk_len:
      model_input = utils.get_chunked_audio_fn(model_input, self._chunk_len)
    logging.info('model_input shape after chunking: %s', model_input.shape)

  return model_input, sample_rate
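# Usage sketch (hypothetical; assumes a preprocessor instance `prep` with
# `_chunk_len` set, and a key/tf.Example pair `k`, `ex`):
#
#   model_input, sample_rate = prep.tfex_to_chunked_audio(k, ex)
#   # When the audio has at least `_chunk_len` samples, `model_input` has
#   # shape (num_chunks, chunk_len); otherwise it is the unchunked input.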