Пример #1
0
def read_signal(filename, offset=0, nsamples=-1, nchannels=0, offset_is_samples=False):
    """Read a wavefile and return as numpy array of floats.

    Args:
        filename (string): Name of file to read
        offset (int, optional): Offset in samples or seconds (from start). Defaults to 0.
        nsamples (int, optional): Number of frames to read, passed unchanged to
            ``SoundFile.read`` as ``frames``. Defaults to -1.
        nchannels: expected number of channel (default: 0 = any number OK)
        offset_is_samples (bool): measurement units for offset (default: False)
    Returns:
        ndarray: audio signal
    Raises:
        Exception: if the file cannot be opened, if its channel count differs
            from a non-zero ``nchannels``, or if its sampling rate differs
            from ``CONFIG.fs``.
    """
    try:
        wave_file = SoundFile(filename)
    except Exception:
        # Ensure incorrect error (24 bit) is not generated: the underlying
        # message is deliberately replaced (and suppressed) by a generic one.
        raise Exception(f"Unable to read {filename}.") from None

    # The context manager closes the file on every exit path, including the
    # validation errors raised below.
    with wave_file:
        if nchannels != 0 and wave_file.channels != nchannels:
            raise Exception(
                f"Wav file ({filename}) was expected to have {nchannels} channels."
            )

        if wave_file.samplerate != CONFIG.fs:
            raise Exception(
                f"Sampling rate is not {CONFIG.fs} for filename {filename}."
            )

        if not offset_is_samples:  # Default behaviour: offset is in seconds
            offset = int(offset * wave_file.samplerate)

        if offset != 0:
            wave_file.seek(offset)

        return wave_file.read(frames=nsamples)
Пример #2
0
def readWave(audio_path,
             start_frame,
             end_frame,
             mono=True,
             sample_rate=None,
             clip=True):
    """Read the frames [start_frame, end_frame) of an audio file.

    :param audio_path: Path to the audio file
    :param start_frame: Frame index to seek to before reading
    :param end_frame: Frame index at which reading stops (exclusive)
    :param mono: Average all channels into one if the file has several
    :param sample_rate: Resample to this rate if given and different from the file's rate
    :param clip: Clip the samples to [-1, 1]
    :return: (audio, audio_sr) - the signal, transposed after reading, and its
             sample rate (``sample_rate`` if resampling took place)
    """
    # The context manager closes the file even if seek/read raises.
    with SoundFile(audio_path, mode='r') as snd_file:
        audio_sr = snd_file.samplerate
        snd_file.seek(start_frame)
        audio = snd_file.read(end_frame - start_frame, dtype='float32')

    # Transpose axes to (channels, frames); a no-op for a 1-D array
    audio = audio.T

    # Convert to mono if desired
    if mono and audio.ndim > 1 and audio.shape[0] > 1:
        audio = np.mean(audio, axis=0)

    # Resample if needed. Keyword arguments are accepted by old and new
    # librosa releases alike, whereas positional rates are rejected by
    # releases that made them keyword-only.
    if sample_rate is not None and sample_rate != audio_sr:
        audio = librosa.resample(audio,
                                 orig_sr=audio_sr,
                                 target_sr=sample_rate,
                                 res_type="kaiser_fast")
        audio_sr = sample_rate

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    return audio, audio_sr
Пример #3
0
def readWave(audio_path,
             start_frame,
             end_frame,
             mono=True,
             sample_rate=None,
             clip=True):
    """Read frames [start_frame, end_frame) of an audio file, zero-padding out-of-range parts.

    :param audio_path: Path to the audio file
    :param start_frame: First frame of the section; may be negative
    :param end_frame: End of the section (exclusive); may exceed the file length
    :param mono: Average all channels into a single channel (the channel axis is kept)
    :param sample_rate: Resample to this rate if given and different from the file's rate
    :param clip: Clip the samples to [-1, 1]
    :return: (audio, audio_sr) - audio is a 2-D (frames, channels) array;
             audio_sr is the sample rate of the file itself and is NOT
             replaced by ``sample_rate`` when resampling takes place
    """
    # The context manager closes the file even if seek/read raises.
    with SoundFile(audio_path, mode='r') as snd_file:
        inf = snd_file._info
        audio_sr = inf.samplerate

        # Restrict the read to what the file holds and remember how much
        # silence has to be added on either side.
        start_read = max(start_frame, 0)
        pad_front = -min(start_frame, 0)
        end_read = min(end_frame, inf.frames)
        pad_back = max(end_frame - inf.frames, 0)

        snd_file.seek(start_read)
        audio = snd_file.read(end_read - start_read,
                              dtype='float32',
                              always_2d=True)  # (num_frames, channels)

    # Pad if necessary (start_frame or end_frame out of bounds)
    audio = np.pad(audio, [(pad_front, pad_back), (0, 0)],
                   mode="constant",
                   constant_values=0.0)

    # Convert to mono if desired
    if mono:
        audio = np.mean(audio, axis=1, keepdims=True)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        # Target length of the section at the new rate
        res_length = int(
            np.ceil(
                float(audio.shape[0]) * float(sample_rate) / float(audio_sr)))
        # Pad audio first with one reflected frame per side; the surplus is
        # cut away symmetrically after resampling.
        audio = np.pad(audio, [(1, 1), (0, 0)], mode="reflect")
        # Keyword arguments work with old and new librosa releases alike;
        # positional rates are rejected once they became keyword-only.
        audio = librosa.resample(audio.T,
                                 orig_sr=audio_sr,
                                 target_sr=sample_rate,
                                 res_type="kaiser_fast").T
        skip = (audio.shape[0] - res_length) // 2
        audio = audio[skip:skip + res_length, :]

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    return audio, audio_sr
Пример #4
0
def batch_list(file_dir, list_name, data_path='data', make_new=False):
    """
    Places the file paths and wav lengths of an audio file into a dictionary, which
    is then appended to a list. 'glob' is used to support Unix style pathname
    pattern expansions. Checks if the list has already been saved, and loads it.
    A saved list is only reused when it is non-empty and its first entry's path
    contains 'file_dir'; otherwise the list is rebuilt and saved again.

    Argument/s:
        file_dir - directory containing the audio files.
        list_name - name for the list.
        data_path - path to store pickle files.
        make_new - re-create list.

    Returns:
        batch_list - list of dictionaries with keys 'file_path' and 'wav_len'.
    """
    extension = ['*.wav', '*.flac', '*.mp3']
    # The file name includes the host name, so each machine keeps its own list.
    list_path = os.path.join(data_path,
                             list_name + '_list_' + platform.node() + '.p')

    if not make_new and os.path.exists(list_path):
        print('Loading ' + list_name + ' list...')
        # NOTE: pickle.load can execute arbitrary code; only point data_path
        # at directories whose contents are trusted.
        with open(list_path, 'rb') as f:
            cached = pickle.load(f)
        # An empty cached list cannot be matched against file_dir; rebuild it
        # instead of failing on cached[0].
        if cached and cached[0]['file_path'].find(file_dir) != -1:
            print('The ' + list_name + ' list has a total of %i entries.' %
                  (len(cached)))
            return cached

    print('Creating ' + list_name + ' list...')
    entries = []
    for pattern in extension:
        for path in glob.glob(os.path.join(file_dir, pattern)):
            try:
                # Seeking to the end yields the length in frames without
                # decoding; 'with' closes the handle so that large
                # directories do not exhaust file descriptors.
                with SoundFile(path) as f:
                    wav_len = f.seek(0, SEEK_END)
                if wav_len == -1:
                    wav, _ = read_wav(path)
                    wav_len = len(wav)
            except NameError:
                # SoundFile/SEEK_END are not available: read the whole file.
                wav, _ = read_wav(path)
                wav_len = len(wav)
            entries.append({
                'file_path': path,
                'wav_len': wav_len
            })  # append dictionary.

    if not os.path.exists(data_path):
        os.makedirs(data_path)  # make directory.
    with open(list_path, 'wb') as f:
        pickle.dump(entries, f)
    print('The ' + list_name + ' list has a total of %i entries.' %
          (len(entries)))
    return entries
Пример #5
0
class Encoder(Sink):
    """This class is an interface to write data into an audio file"""
    def __init__(self,
                 file_name: str,
                 rate: int,
                 channels: int,
                 name: str = ""):
        """Creates an instance of an Encoder sink with the given configuration

        The file is opened for writing (mode 'w') right away.

        Args:
            file_name (str): Output file name
            rate (int): The sample rate of the file in Hz
            channels (int): The number of channels
            name (str): Name of the element
        """
        super().__init__(name)
        self.__instance = SoundFile(file=file_name,
                                    mode='w',
                                    samplerate=rate,
                                    channels=channels)

    @property
    def sample_rate(self) -> int:
        """Return the sampling rate in Hz."""
        return self.__instance.samplerate

    @property
    def channels(self) -> int:
        """Return the number of channels."""
        return self.__instance.channels

    @property
    def file_name(self) -> str:
        """Return the file name."""
        return self.__instance.name

    def seek(self, frames):
        """Set the write position.

        Args:
            frames: The frame index or offset to seek

        Returns:
            Whatever the underlying file's ``seek`` returns.
        """
        return self.__instance.seek(frames=frames)

    def process(self, data, extra=None):
        """Writes the buffer of data into the audio file.

        The data is flattened before it is written. Nothing is returned.

        Args:
            data: Array to write in the file
            extra: Any extra information previously computed. It is accepted
                for interface compatibility and not used.
        """
        # The original statement was ``write(...), extra`` - a tuple that was
        # built and thrown away. Only the write has an effect.
        self.__instance.write(data.flatten())
Пример #6
0
def Batch_list(file_dir, list_name, data_path=None, make_new=False):
    '''
    Places the file paths and wav lengths of an audio file into a dictionary, which
    is then appended to a list. SPHERE format cannot be used. 'glob' is used to
    support Unix style pathname pattern expansions. Checks if the list has
    already been pickled, and loads it. A pickled list is only reused when it is
    non-empty and its first entry's path contains 'file_dir'; otherwise the list
    is rebuilt and pickled again.

    Inputs:
        file_dir - directory containing the wavs.
        list_name - name for the list.
        data_path - path to store pickle files ('data' if None).
        make_new - re-create list.

    Outputs:
        batch_list - list of dictionaries with keys 'file_path' and 'seq_len'.
    '''
    file_name = ['*.wav', '*.flac', '*.mp3']
    if data_path is None:
        data_path = 'data'
    # The file name includes the host name, so each machine keeps its own list.
    list_path = os.path.join(data_path,
                             list_name + '_list_' + platform.node() + '.p')

    if not make_new and os.path.exists(list_path):
        print('Loading ' + list_name + ' list from pickle file...')
        # NOTE: pickle.load can execute arbitrary code; only point data_path
        # at directories whose contents are trusted.
        with open(list_path, 'rb') as f:
            cached = pickle.load(f)
        # An empty pickled list cannot be matched against file_dir; rebuild it
        # instead of failing on cached[0].
        if cached and cached[0]['file_path'].find(file_dir) != -1:
            print('The ' + list_name + ' list has a total of %i entries.' %
                  (len(cached)))
            return cached

    # soundfile is only needed when the list has to be built.
    from soundfile import SoundFile, SEEK_END

    print('Creating ' + list_name + ' list, as no pickle file exists...')
    entries = []  # list for wav paths and lengths.
    for pattern in file_name:
        for file_path in glob.glob(os.path.join(file_dir, pattern)):
            # Seeking to the end yields the length in frames without decoding;
            # 'with' closes the handle so that large directories do not
            # exhaust file descriptors.
            with SoundFile(file_path) as f:
                seq_len = f.seek(0, SEEK_END)
            entries.append({
                'file_path': file_path,
                'seq_len': seq_len
            })  # append dictionary.

    if not os.path.exists(data_path):
        os.makedirs(data_path)  # make directory.
    with open(list_path, 'wb') as f:
        pickle.dump(entries, f)
    print('The ' + list_name + ' list has a total of %i entries.' %
          (len(entries)))
    return entries
Пример #7
0
def read_audio_segment(file, pos, length):
    """Read ``length`` frames of ``file`` starting at frame ``pos``.

    Args:
        file: File to open with ``SoundFile``.
        pos: Frame position to seek to before reading.
        length: Number of frames handed to ``SoundFile.read``.

    Returns:
        The data returned by ``SoundFile.read``.
    """
    # The context manager closes the file even if seek/read raises.
    with SoundFile(file) as myfile:
        myfile.seek(pos)
        return myfile.read(length)
Пример #8
0
def readAudio(audio_path,
              offset=0.0,
              duration=None,
              mono=True,
              sample_rate=None,
              clip=True,
              padding_duration=0.0,
              metadata=None):
    '''
    Reads an audio file wholly or partly, and optionally converts it to mono and changes sampling rate.
    By default, it loads the whole audio file. If the offset is set to None, the duration HAS to be not None,
    and the offset is then randomly determined so that a random section of the audio is selected with the desired duration.
    Optionally, the file can be zero-padded by a certain amount of seconds at the start and end before selecting this random section.

    :param audio_path: Path to audio file
    :param offset: Position in audio file (s) where to start reading. If None, duration has to be not None, and position will be randomly determined.
    :param duration: How many seconds of audio to read
    :param mono: Convert to mono after reading
    :param sample_rate: Convert to given sampling rate if given
    :param clip: Clip the samples to [-1, 1]
    :param padding_duration: Amount of padding (s) on each side that needs to be filled up with silence if it isn't available
    :param metadata: (sample rate, channels, duration) of an MP3 file; accelerates reading since it does not need to be determined from file
    :return: For MP3 files: (audio, sample rate). For other files:
             (audio, sample rate, centre_start_frame, centre_end_frame).
             If a random section is requested from a file that is too short,
             the result of ``librosa.load`` for the whole file is returned instead.
    '''
    is_mp3 = os.path.splitext(audio_path)[1][1:].lower() == "mp3"

    if is_mp3:  # If its an MP3, call ffmpeg with offset and duration parameters
        # Get mp3 metadata information and duration
        if metadata is None:
            audio_sr, audio_channels, audio_duration = Metadata.get_mp3_metadata(
                audio_path)
        else:
            audio_sr = metadata[0]
            audio_channels = metadata[1]
            audio_duration = metadata[2]
        print(audio_duration)

        pad_front_duration = 0.0
        pad_back_duration = 0.0

        if offset is None:  # In this case, select random section of audio file
            assert (duration is not None)
            max_start_pos = audio_duration + 2 * padding_duration - duration
            if max_start_pos <= 0.0:
                # The (padded) file is not longer than the desired section:
                # give up on the random crop and return the whole file.
                print("WARNING: Audio file " + audio_path + " has length " +
                      str(audio_duration) +
                      " but is expected to be at least " + str(duration))
                # Keyword arguments work with old and new librosa releases.
                return librosa.load(audio_path,
                                    sr=sample_rate,
                                    mono=mono,
                                    res_type='kaiser_fast')
            # Otherwise randomly determine audio section, taking padding on both sides into account
            start_pos = np.random.uniform(0.0, max_start_pos)
            # Read from this position in audio file
            offset = max(start_pos - padding_duration, 0.0)
            pad_front_duration = max(padding_duration - start_pos, 0.0)

        if duration is not None:  # Adjust duration if it overlaps with end of track
            pad_back_duration = max(offset + duration - audio_duration, 0.0)
            # Subtract padding from the amount we have to read from file
            duration = duration - pad_front_duration - pad_back_duration
        else:  # None duration: Read from offset to end of file
            duration = audio_duration - offset

        pad_front_frames = int(pad_front_duration * float(audio_sr))
        pad_back_frames = int(pad_back_duration * float(audio_sr))

        args = [
            'ffmpeg', '-noaccurate_seek', '-ss',
            str(offset), '-t',
            str(duration), '-i', audio_path, '-f', 's16le', '-'
        ]

        # ffmpeg writes raw PCM to its stdout. The pipe yields bytes, so the
        # end of the stream is an empty bytes object (comparing against the
        # str '' never matches on Python 3 and the loop would never end).
        with open(os.devnull, 'wb') as devnull:
            process = subprocess.Popen(args,
                                       stdout=subprocess.PIPE,
                                       stderr=devnull)
            try:
                chunks = [
                    librosa.util.buf_to_float(chunk, dtype=np.float32)
                    for chunk in iter(lambda: process.stdout.read(4096), b'')
                ]
            finally:
                process.stdout.close()
                process.wait()

        audio = np.concatenate(chunks)
        if audio_channels > 1:
            audio = audio.reshape((-1, audio_channels)).T

    else:  # Not an MP3: Handle with PySoundFile
        # The context manager closes the file on every exit path, including
        # the early return below.
        with SoundFile(audio_path, mode='r') as snd_file:
            inf = snd_file._info
            audio_sr = inf.samplerate

            if duration is not None:
                num_frames = int(duration * float(audio_sr))
            pad_frames = int(padding_duration * float(audio_sr))
            pad_front_frames = 0
            pad_back_frames = 0

            if offset is None:  # In this case, select random section of audio file
                assert (duration is not None)
                max_start_pos = inf.frames + 2 * pad_frames - num_frames
                if max_start_pos <= 0:
                    # The (padded) file is not longer than the desired
                    # section: return the whole file instead.
                    print("WARNING: Audio file " + audio_path +
                          " has frames  " + str(inf.frames) +
                          " but is expected to be at least " +
                          str(num_frames))
                    return librosa.load(audio_path,
                                        sr=sample_rate,
                                        mono=mono,
                                        res_type='kaiser_fast')
                # Otherwise randomly determine audio section, taking padding on both sides into account
                start_pos = np.random.randint(0, max_start_pos)
                # Read from this position in audio file
                start_frame = max(start_pos - pad_frames, 0)
                pad_front_frames = max(pad_frames - start_pos, 0)
            else:
                start_frame = int(offset * float(audio_sr))

            if duration is not None:  # Adjust duration if it overlaps with end of track
                pad_back_frames = max(start_frame + num_frames - inf.frames, 0)
                num_frames = num_frames - pad_front_frames - pad_back_frames
            else:  # Duration is None => Read from start frame to end of track
                num_frames = inf.frames - start_frame

            snd_file.seek(start_frame)
            audio = snd_file.read(num_frames, dtype='float32')

        audio = audio.T  # Transpose axes to (channels, frames)

        centre_start_frame = start_frame - pad_front_frames + pad_frames
        centre_end_frame = start_frame + num_frames + pad_back_frames - pad_frames

    # AT THIS POINT WE HAVE A [N_CHANNELS, N_SAMPLES] (OR 1-D) NUMPY ARRAY FOR THE AUDIO
    # Pad as indicated at beginning and end
    if audio.ndim > 1:
        audio = np.pad(audio, [(0, 0), (pad_front_frames, pad_back_frames)],
                       mode="constant",
                       constant_values=0.0)
    else:
        audio = np.pad(audio, [(pad_front_frames, pad_back_frames)],
                       mode="constant",
                       constant_values=0.0)

    # Convert to mono if desired
    if mono and audio.ndim > 1 and audio.shape[0] > 1:
        audio = np.mean(audio, axis=0)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        audio = librosa.resample(audio,
                                 orig_sr=audio_sr,
                                 target_sr=sample_rate,
                                 res_type="kaiser_fast")
        audio_sr = sample_rate

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    # Report results shorter than one second. audio_sr is the rate of the
    # array at this point (sample_rate may be None), and the frame axis is the
    # last one for both the 1-D and the (channels, frames) layout.
    if float(audio.shape[-1]) / float(audio_sr) < 1.0:
        print("----------------------ERROR------------------")

    if is_mp3:
        return audio, audio_sr
    return audio, audio_sr, centre_start_frame, centre_end_frame
Пример #9
0
def readAudio(audio_path,
              offset=0.0,
              duration=None,
              mono=True,
              sample_rate=None,
              clip=True,
              pad_frames=0,
              metadata=None):
    '''
    Reads an audio file wholly or partly, and optionally converts it to mono and changes sampling rate.
    By default, it loads the whole audio file. If the offset is set to None, the duration HAS to be not None,
    and the offset is then randomly determined so that a random section of the audio is selected with the desired duration.
    Optionally, the random section can be extended by a number of frames on both sides; parts of it that lie outside the file are zero-padded.

    :param audio_path: Path to audio file
    :param offset: Position in audio file (s) where to start reading. If None, duration has to be not None, and position will be randomly determined.
    :param duration: How many seconds of audio to read
    :param mono: Convert to mono after reading
    :param sample_rate: Convert to given sampling rate if given
    :param clip: Clip the samples to [-1, 1]
    :param pad_frames: number of frames (at sample_rate, or at the file's rate if sample_rate is None) with which to pad the audio at most if the samples at the borders are not available
    :param metadata: (sample rate, channels, duration) of an MP3 file; accelerates reading since it does not need to be determined from file
    :return: For MP3 files: (audio, file sample rate, offset, offset + duration) with positions in seconds.
             For other files: (audio, file sample rate, centre_start_frame, centre_end_frame, start_mix_pos, end_mix_pos)
             with positions in frames of the file. audio is a (frames, channels) array.
             The returned sample rate is that of the file, also when the audio was resampled.
    :raises Exception: if a random section is requested from a non-MP3 file that is not longer than the duration
    :raises IOError: if the resulting audio is shorter than one second
    '''
    is_mp3 = os.path.splitext(audio_path)[1][1:].lower() == "mp3"

    if is_mp3:  # If its an MP3, call ffmpeg with offset and duration parameters
        # Get mp3 metadata information and duration
        if metadata is None:
            audio_sr, audio_channels, audio_duration = Metadata.get_mp3_metadata(
                audio_path)
        else:
            audio_sr = metadata[0]
            audio_channels = metadata[1]
            audio_duration = metadata[2]
        print(audio_duration)

        pad_front_duration = 0.0
        pad_back_duration = 0.0

        # pad_frames counts frames at the output rate; convert it to seconds
        ref_sr = sample_rate if sample_rate is not None else audio_sr
        padding_duration = float(pad_frames) / float(ref_sr)

        if offset is None:  # In this case, select random section of audio file
            assert (duration is not None)
            max_start_pos = audio_duration + 2 * padding_duration - duration
            if max_start_pos <= 0.0:
                # The (padded) file is not longer than the desired section:
                # give up on the random crop and return the whole file.
                print("WARNING: Audio file " + audio_path + " has length " +
                      str(audio_duration) +
                      " but is expected to be at least " + str(duration))
                return Utils.load(audio_path, sample_rate,
                                  mono)  # Return whole audio file
            # Otherwise randomly determine audio section, taking padding on both sides into account
            start_pos = np.random.uniform(0.0, max_start_pos)
            # Read from this position in audio file
            offset = max(start_pos - padding_duration, 0.0)
            pad_front_duration = max(padding_duration - start_pos, 0.0)

        if duration is not None:  # Adjust duration if it overlaps with end of track
            pad_back_duration = max(offset + duration - audio_duration, 0.0)
            # Subtract padding from the amount we have to read from file
            duration = duration - pad_front_duration - pad_back_duration
        else:  # None duration: Read from offset to end of file
            duration = audio_duration - offset

        pad_front_frames = int(pad_front_duration * float(audio_sr))
        pad_back_frames = int(pad_back_duration * float(audio_sr))

        args = [
            'ffmpeg', '-noaccurate_seek', '-ss',
            str(offset), '-t',
            str(duration), '-i', audio_path, '-f', 's16le', '-'
        ]

        # ffmpeg writes raw PCM to its stdout. The pipe yields bytes, so the
        # end of the stream is an empty bytes object (comparing against the
        # str '' never matches on Python 3 and the loop would never end).
        with open(os.devnull, 'wb') as devnull:
            process = subprocess.Popen(args,
                                       stdout=subprocess.PIPE,
                                       stderr=devnull)
            try:
                chunks = [
                    librosa.util.buf_to_float(chunk, dtype=np.float32)
                    for chunk in iter(lambda: process.stdout.read(4096), b'')
                ]
            finally:
                process.stdout.close()
                process.wait()

        # Everything below pads axis 0 and averages axis 1, i.e. it works on
        # (frames, channels); bring the interleaved samples into that layout
        # (the channel axis is kept for single-channel audio as well).
        audio = np.concatenate(chunks).reshape((-1, audio_channels))

    else:  # Not an MP3: Handle with PySoundFile
        # The context manager closes the file on every exit path, including
        # the exception raised below.
        with SoundFile(audio_path, mode='r') as snd_file:
            inf = snd_file._info
            audio_sr = inf.samplerate

            # pad_frames counts frames at the output rate; convert it to
            # frames of the file
            pad_orig_frames = pad_frames if sample_rate is None else int(
                np.ceil(
                    float(pad_frames) *
                    (float(audio_sr) / float(sample_rate))))

            pad_front_frames = 0
            pad_back_frames = 0

            if offset is not None:
                start_frame = int(offset * float(audio_sr))
                if duration is not None:
                    read_frames = int(duration * float(audio_sr))
                else:  # Read from start frame to end of track
                    read_frames = inf.frames - start_frame
                # No padding is applied for a fixed offset, so the centre
                # section and the read section coincide. These names used to
                # be unbound here, which made every call with an offset fail.
                num_frames = read_frames
                start_pos = start_frame
                start_mix_pos = start_frame
                end_mix_pos = start_frame + read_frames
            else:  # In this case, select random section of audio file
                assert (duration is not None)
                num_frames = int(duration * float(audio_sr))
                # Maximum start position when ignoring padding on both ends of the file
                max_start_pos = inf.frames - num_frames
                if max_start_pos <= 0:
                    # The file is not longer than the desired section
                    print("WARNING: Audio file " + audio_path +
                          " has frames  " + str(inf.frames) +
                          " but is expected to be at least " +
                          str(num_frames))
                    raise Exception(
                        "Could not read minimum required amount of audio data")
                # Otherwise randomly determine the start of the audio section
                start_pos = np.random.randint(0, max_start_pos)

                # Translate source position into mixture input positions (take into account padding)
                start_mix_pos = start_pos - pad_orig_frames
                num_mix_frames = num_frames + 2 * pad_orig_frames
                end_mix_pos = start_mix_pos + num_mix_frames

                # Now see how much of the mixture is available, pad the rest with zeros
                start_frame = max(start_mix_pos, 0)
                end_frame = min(end_mix_pos, inf.frames)
                read_frames = end_frame - start_frame
                pad_front_frames = -min(start_mix_pos, 0)
                pad_back_frames = max(end_mix_pos - inf.frames, 0)

            assert (num_frames > 0)
            snd_file.seek(start_frame)
            audio = snd_file.read(read_frames, dtype='float32', always_2d=True)

        centre_start_frame = start_pos
        centre_end_frame = start_pos + num_frames

    # Pad as indicated at beginning and end
    audio = np.pad(audio, [(pad_front_frames, pad_back_frames), (0, 0)],
                   mode="constant",
                   constant_values=0.0)

    # Convert to mono if desired
    if mono:
        audio = np.mean(audio, axis=1, keepdims=True)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        audio = Utils.resample(audio, audio_sr, sample_rate)

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    # The array is at sample_rate if one was requested, else at the file's
    # rate (sample_rate may be None, which float() cannot convert).
    effective_sr = sample_rate if sample_rate is not None else audio_sr
    if float(audio.shape[0]) / float(effective_sr) < 1.0:
        raise IOError("Error while reading " + audio_path +
                      " - ended up with audio shorter than one second!")

    if is_mp3:
        return audio, audio_sr, offset, offset + duration
    return audio, audio_sr, centre_start_frame, centre_end_frame, start_mix_pos, end_mix_pos
Пример #10
0
class Decoder:
    """This class is an interface to read data from an audio file"""
    def __init__(self, file: str):
        """Creates an instance of a Decoder source for the given file

        The file is opened for reading (mode 'r') right away.

        Args:
            file (str): Input file name
        """
        self.__instance = SoundFile(file=file, mode='r')

    @property
    def sample_rate(self) -> int:
        """Return the sampling rate in Hz."""
        return self.__instance.samplerate

    @property
    def channels(self) -> int:
        """Return the number of channels."""
        return self.__instance.channels

    @property
    def file_name(self) -> str:
        """Return the file name."""
        return self.__instance.name

    @property
    def frames(self) -> int:
        """ Number of available frames"""
        return self.__instance.frames

    def done(self) -> bool:
        """Checks if there still data to read from the audio file

        Returns True while the read position is before the last frame.
        """
        # NOTE(review): despite its name, this is True when data REMAINS and
        # False once the end is reached - confirm against the callers before
        # changing it.
        return self.__instance.tell() < self.__instance.frames

    def start(self):
        """Starts the streaming by seeking the file to the beginning"""
        self.__instance.seek(0)

    def stop(self):
        """Stops the streaming by seeking the file to the end"""
        self.__instance.seek(self.__instance.frames)

    def timestamp(self):
        """Returns the current streaming timestamp in seconds"""
        return self.__instance.tell() / self.__instance.samplerate

    def seek(self, frames: int):
        """Set the read position.

        Args:
            frames (int): The frame index or offset to seek

        Returns:
            The new absolute read/write position in frames
        """
        return self.__instance.seek(frames=frames)

    def read(self, frames_per_channel: int = -1):
        """Returns the buffer read from the audio file

        Args:
            frames_per_channel (int): Number of frames to request. With -1
                (the default) the total number of frames of the file is
                requested.

        Returns:
            The data read as 'float32'; a second axis is not forced
            (``always_2d=False``).
        """
        # Compare by value: 'is' tests object identity, which for integers is
        # an implementation detail and triggers a SyntaxWarning.
        if frames_per_channel == -1:
            frames_per_channel = self.__instance.frames

        return self.__instance.read(frames=frames_per_channel,
                                    dtype='float32',
                                    always_2d=False)