def read_frames(self, frame_size, hop_size, offset=0, duration=None, buffer_size=5760000):
    """
    Generator that reads and returns the samples of the track in frames.

    Args:
        frame_size (int): The number of samples per frame.
        hop_size (int): The number of samples between two frames.
        offset (float): The time in seconds, from where to start reading
            the samples (rel. to the track start).
        duration (float): The length of the samples to read in seconds.
        buffer_size (int): Number of samples to load into memory at once.

    Returns:
        Generator: A generator yielding a tuple for every frame. The first
        item is the frame, the second the sampling-rate and the third a
        boolean indicating if it is the last frame.
    """
    # Read either up to `offset + duration`, or to the end of the track
    # when no duration was requested.
    if duration is not None:
        end = offset + duration
    else:
        end = float('inf')

    return audio.read_frames(self.path, frame_size, hop_size,
                             start=offset, end=end, buffer_size=buffer_size)
def test_read_frames_matches_length(tmpdir):
    """Frames of an exactly-fitting file are read without padding and flagged correctly."""
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(10000)
    librosa.output.write_wav(wav_path, wav_content, 16000)

    data = list(audio.read_frames(wav_path, frame_size=400, hop_size=160))

    frames = np.array([x[0] for x in data])
    # read_frames yields (frame, sampling-rate, is-last) triples,
    # so the last-frame flag is at index 2, not index 1.
    last = [x[2] for x in data]

    # 10000 samples, frame 400, hop 160 -> (10000 - 400) / 160 + 1 = 61 frames.
    assert frames.shape == (61, 400)
    assert frames.dtype == np.float32
    assert np.allclose(frames[0], wav_content[:400], atol=0.0001)
    assert np.allclose(frames[60], wav_content[9600:], atol=0.0001)

    # Only the final frame is marked as last.
    assert last[:-1] == [False] * (len(data) - 1)
    assert last[-1]
def test_read_frames(tmpdir):
    """read_frames yields (frame, sr, is-last) triples and zero-pads the final frame."""
    path = os.path.join(tmpdir.strpath, 'file.wav')
    samples = np.random.random(10044)
    librosa.output.write_wav(path, samples, 16000)

    results = list(audio.read_frames(path, frame_size=400, hop_size=160))

    frames = np.array([item[0] for item in results])
    rates = [item[1] for item in results]
    flags = [item[2] for item in results]

    assert frames.shape == (62, 400)
    assert frames.dtype == np.float32

    # First frame matches the raw samples, last frame is padded out to 400.
    assert np.allclose(frames[0], samples[:400], atol=0.0001)
    expected_tail = np.pad(samples[9760:], (0, 116), mode='constant')
    assert np.allclose(frames[61], expected_tail, atol=0.0001)

    # Sampling rate is reported for every frame; only the final frame is last.
    assert rates == [16000] * len(results)
    assert flags[:-1] == [False] * (len(results) - 1)
    assert flags[-1]
def process_file_online(self, file_path, frame_size=400, hop_size=160, sr=None,
                        start=0, end=-1, utterance=None, corpus=None,
                        chunk_size=1, buffer_size=5760000):
    """
    Process the audio-file in **online** mode, chunk by chunk.
    The processed chunks are yielded one after another.

    Args:
        file_path (str): The audio file to process.
        frame_size (int): The number of samples per frame.
        hop_size (int): The number of samples between two frames.
        sr (int): Use the given sampling rate. If None uses the native
            sampling rate from the underlying data.
        start (float): The point within the file in seconds to start
            processing from.
        end (float): The point within the file in seconds to end processing.
        utterance (Utterance): The utterance that is associated with this
            file, if available.
        corpus (Corpus): The corpus this file is part of, if available.
        chunk_size (int): Number of frames to process per chunk.
        buffer_size (int): Number of samples to load into memory at once.
            The exact number of loaded samples depends on the block-size
            of the audioread library. So it can be of block-size higher,
            where the block-size is typically 1024 or 4096.

    Returns:
        Generator: A generator that yield processed chunks.
    """
    current_frame = 0
    frames = []

    # Buffer frames and emit a chunk whenever `chunk_size` frames are collected.
    for frame, output_sr, is_last in audio.read_frames(
            file_path, frame_size, hop_size, sr_target=sr,
            start=start, end=end, buffer_size=buffer_size):
        frames.append(frame)

        if len(frames) == chunk_size:
            processed = self.process_frames(np.array(frames), output_sr, current_frame,
                                            last=is_last, utterance=utterance, corpus=corpus)

            if processed is not None:
                yield processed

            current_frame += chunk_size
            frames = frames[chunk_size:]

    # Emit any remaining frames as a final, possibly shorter, chunk.
    if len(frames) > 0:
        processed = self.process_frames(np.array(frames), output_sr, current_frame,
                                        last=True, utterance=utterance, corpus=corpus)

        # Guard against None, consistent with the in-loop chunks — the
        # original yielded unconditionally here and could emit None.
        if processed is not None:
            yield processed