示例#1
0
文件: file.py 项目: xjc90s/audiomate
    def read_frames(self,
                    frame_size,
                    hop_size,
                    offset=0,
                    duration=None,
                    buffer_size=5760000):
        """
        Generator that reads and returns the samples of the track in frames.

        Args:
            frame_size (int): The number of samples per frame.
            hop_size (int): The number of samples between two frames.
            offset (float): The time in seconds, from where to start
                            reading the samples (rel. to the track start).
            duration (float): The length of the samples to read in seconds.
            buffer_size (int): Number of samples to load into memory at once.

        Returns:
            Generator: A generator yielding a tuple for every frame.
            The first item is the frame,
            the second the sampling-rate and
            the third a boolean indicating if it is the last frame.
        """
        # Without an explicit duration, read until the end of the track.
        end = float('inf') if duration is None else offset + duration

        return audio.read_frames(self.path,
                                 frame_size,
                                 hop_size,
                                 start=offset,
                                 end=end,
                                 buffer_size=buffer_size)
示例#2
0
def test_read_frames_matches_length(tmpdir):
    """A signal that divides evenly into frames needs no padding.

    With frame_size=400 and hop_size=160, a 10000-sample signal yields
    (10000 - 400) / 160 + 1 = 61 frames and the final frame ends exactly
    at sample 10000.
    """
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(10000)
    librosa.output.write_wav(wav_path, wav_content, 16000)

    data = list(audio.read_frames(wav_path, frame_size=400, hop_size=160))
    frames = np.array([x[0] for x in data])
    # read_frames yields (frame, sampling-rate, is-last) triples:
    # the last-frame flag is the THIRD item, not the second.
    last = [x[2] for x in data]

    assert frames.shape == (61, 400)
    assert frames.dtype == np.float32
    assert np.allclose(frames[0], wav_content[:400], atol=0.0001)
    assert np.allclose(frames[60], wav_content[9600:], atol=0.0001)

    assert last[:-1] == [False] * (len(data) - 1)
    assert last[-1]
示例#3
0
def test_read_frames(tmpdir):
    """The final frame is zero-padded when the signal length is not
    an exact multiple of the hop/frame layout."""
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(10044)
    librosa.output.write_wav(wav_path, wav_content, 16000)

    data = list(audio.read_frames(wav_path, frame_size=400, hop_size=160))

    # Transpose the (frame, sampling-rate, is-last) triples into columns.
    columns = list(zip(*data))
    frames = np.array(columns[0])
    sr = list(columns[1])
    last = list(columns[2])

    # 10044 samples -> 62 frames; the last starts at sample 9760 and is
    # zero-padded by 116 samples to reach the full frame size of 400.
    assert frames.shape == (62, 400)
    assert frames.dtype == np.float32
    assert np.allclose(frames[0], wav_content[:400], atol=0.0001)
    assert np.allclose(frames[61],
                       np.pad(wav_content[9760:], (0, 116), mode='constant'),
                       atol=0.0001)

    assert sr == [16000] * len(data)
    assert last[:-1] == [False] * (len(data) - 1)
    assert last[-1]
示例#4
0
    def process_file_online(self,
                            file_path,
                            frame_size=400,
                            hop_size=160,
                            sr=None,
                            start=0,
                            end=-1,
                            utterance=None,
                            corpus=None,
                            chunk_size=1,
                            buffer_size=5760000):
        """
        Process the audio-file in **online** mode, chunk by chunk.
        The processed chunks are yielded one after another.

        Args:
            file_path (str): The audio file to process.
            frame_size (int): The number of samples per frame.
            hop_size (int): The number of samples between two frames.
            sr (int): Use the given sampling rate. If None uses the native sampling rate from the underlying data.
            start (float): The point within the file in seconds to start processing from.
            end (float): The point within the file in seconds to end processing.
            utterance (Utterance): The utterance that is associated with this file, if available.
            corpus (Corpus): The corpus this file is part of, if available.
            chunk_size (int): Number of frames to process per chunk.
            buffer_size (int): Number of samples to load into memory at once.
                             The exact number of loaded samples depends on the block-size of the audioread library.
                             So it can be of block-size higher, where the block-size is typically 1024 or 4096.

        Returns:
            Generator: A generator that yield processed chunks.
        """
        current_frame = 0
        frames = []

        # Buffer frames until a full chunk is available, then process it.
        for frame, output_sr, is_last in audio.read_frames(
                file_path,
                frame_size,
                hop_size,
                sr_target=sr,
                start=start,
                end=end,
                buffer_size=buffer_size):
            frames.append(frame)

            if len(frames) == chunk_size:
                processed = self.process_frames(np.array(frames),
                                                output_sr,
                                                current_frame,
                                                last=is_last,
                                                utterance=utterance,
                                                corpus=corpus)
                if processed is not None:
                    yield processed
                current_frame += chunk_size
                frames = frames[chunk_size:]

        # Process any remaining frames as a final, smaller chunk.
        # This branch is only reached when the loop ran at least once,
        # so output_sr is guaranteed to be bound here.
        if len(frames) > 0:
            processed = self.process_frames(np.array(frames),
                                            output_sr,
                                            current_frame,
                                            last=True,
                                            utterance=utterance,
                                            corpus=corpus)
            # Mirror the in-loop behavior: do not yield chunks the
            # processor dropped (returned None for).
            if processed is not None:
                yield processed