def read_signal(filename, offset=0, nsamples=-1, nchannels=0, offset_is_samples=False):
    """Read a wavefile and return as numpy array of floats.

    Args:
        filename (string): Name of file to read
        offset (int, optional): Offset in samples or seconds (from start). Defaults to 0.
        nsamples (int, optional): Number of frames to read, forwarded to
            ``SoundFile.read(frames=...)``. Defaults to -1.
        nchannels: expected number of channel (default: 0 = any number OK)
        offset_is_samples (bool): measurement units for offset (default: False,
            i.e. the offset is given in seconds)

    Returns:
        ndarray: audio signal

    Raises:
        Exception: if the file cannot be opened, if its channel count differs
            from a non-zero ``nchannels``, or if its sampling rate is not
            ``CONFIG.fs``.
    """
    try:
        wave_file = SoundFile(filename)
    except Exception as err:
        # Ensure incorrect error (24 bit) is not generated
        raise Exception(f"Unable to read {filename}.") from err

    # The context manager closes the file on every exit path, including the
    # validation errors raised below.
    with wave_file:
        if nchannels != 0 and wave_file.channels != nchannels:
            raise Exception(
                f"Wav file ({filename}) was expected to have {nchannels} channels."
            )

        if wave_file.samplerate != CONFIG.fs:
            raise Exception(f"Sampling rate is not {CONFIG.fs} for filename {filename}.")

        if not offset_is_samples:  # Default behaviour: offset is in seconds
            offset = int(offset * wave_file.samplerate)

        if offset != 0:
            wave_file.seek(offset)

        return wave_file.read(frames=nsamples)
def readWave(audio_path, start_frame, end_frame, mono=True, sample_rate=None, clip=True):
    """Read the frames [start_frame, end_frame) of an audio file.

    :param audio_path: Path to the audio file
    :param start_frame: First frame to read
    :param end_frame: Frame index at which reading stops (exclusive)
    :param mono: Average all channels into one if the file has several
    :param sample_rate: Resample to this rate if given and different from the file's rate
    :param clip: Clip the samples to [-1, 1]
    :return: (audio, sample rate). Multi-channel audio is laid out as
        (channels, frames); the rate is ``sample_rate`` when resampling happened,
        otherwise the file's own rate.
    """
    # Context manager guarantees the handle is released even if seek/read fails.
    with SoundFile(audio_path, mode='r') as snd_file:
        audio_sr = snd_file.samplerate
        snd_file.seek(start_frame)
        audio = snd_file.read(end_frame - start_frame, dtype='float32')

    audio = audio.T  # transpose axis to (channels, frames)

    # Convert to mono if desired
    if mono and len(audio.shape) > 1 and audio.shape[0] > 1:
        audio = np.mean(audio, axis=0)

    # Resample if needed (rates passed by keyword: required by librosa >= 0.10,
    # accepted by older releases as well)
    if sample_rate is not None and sample_rate != audio_sr:
        audio = librosa.resample(audio,
                                 orig_sr=audio_sr,
                                 target_sr=sample_rate,
                                 res_type="kaiser_fast")
        audio_sr = sample_rate

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    return audio, audio_sr
def readWave(audio_path, start_frame, end_frame, mono=True, sample_rate=None, clip=True):
    """Read the frames [start_frame, end_frame) of an audio file, zero-padding out-of-range parts.

    ``start_frame`` may be negative and ``end_frame`` may exceed the file
    length; the missing frames are filled with zeros.

    :param audio_path: Path to the audio file
    :param start_frame: First frame of the requested window (may be < 0)
    :param end_frame: End of the requested window, exclusive (may be > file length)
    :param mono: Average all channels into a single channel
    :param sample_rate: Resample to this rate if given and different from the file's rate
    :param clip: Clip the samples to [-1, 1]
    :return: (audio, audio_sr) where audio has shape (frames, channels).
        NOTE(review): audio_sr is the file's native rate even when the audio
        was resampled - kept as-is because callers may rely on it.
    """
    # Context manager guarantees the handle is released even if seek/read fails.
    with SoundFile(audio_path, mode='r') as snd_file:
        audio_sr = snd_file.samplerate
        total_frames = snd_file.frames

        # Part of the requested window that actually exists in the file ...
        start_read = max(start_frame, 0)
        end_read = min(end_frame, total_frames)
        # ... and how much silence is needed on each side to fill the rest.
        pad_front = -min(start_frame, 0)
        pad_back = max(end_frame - total_frames, 0)

        snd_file.seek(start_read)
        audio = snd_file.read(end_read - start_read,
                              dtype='float32',
                              always_2d=True)  # (num_frames, channels)

    # Pad if necessary (start_frame or end_frame out of bounds)
    audio = np.pad(audio, [(pad_front, pad_back), (0, 0)],
                   mode="constant",
                   constant_values=0.0)

    # Convert to mono if desired
    if mono:
        audio = np.mean(audio, axis=1, keepdims=True)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        res_length = int(
            np.ceil(
                float(audio.shape[0]) * float(sample_rate) / float(audio_sr)))
        # Reflect-pad one frame on each side before resampling, then cut the
        # result back to the expected length around its centre.
        audio = np.pad(audio, [(1, 1), (0, 0)], mode="reflect")
        # Rates passed by keyword: required by librosa >= 0.10, accepted by
        # older releases as well.
        audio = librosa.resample(audio.T,
                                 orig_sr=audio_sr,
                                 target_sr=sample_rate,
                                 res_type="kaiser_fast").T
        skip = (audio.shape[0] - res_length) // 2
        audio = audio[skip:skip + res_length, :]

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    return audio, audio_sr
def batch_list(file_dir, list_name, data_path='data', make_new=False):
    """
    Places the file paths and wav lengths of an audio file into a dictionary, which
    is then appended to a list. 'glob' is used to support Unix style pathname
    pattern expansions. Checks if the training list has already been saved, and
    loads it.

    The cached list is only reused when its first entry's path contains
    file_dir; otherwise (or when the cache is empty) the list is rebuilt.

    Argument/s:
        file_dir - directory containing the audio files.
        list_name - name for the list.
        data_path - path to store pickle files.
        make_new - re-create list.

    Returns:
        batch_list - list of dictionaries with keys 'file_path' and 'wav_len'.
    """
    extension = ['*.wav', '*.flac', '*.mp3']
    # The cache file name is host-specific (platform.node()).
    list_path = os.path.join(
        data_path, list_name + '_list_' + platform.node() + '.p')

    if not make_new and os.path.exists(list_path):
        print('Loading ' + list_name + ' list...')
        with open(list_path, 'rb') as f:
            # NOTE(review): pickle.load can execute arbitrary code; the cache
            # file must come from a trusted location.
            cached = pickle.load(f)
        # An empty cache cannot be validated against file_dir, so rebuild it.
        if cached and file_dir in cached[0]['file_path']:
            print(list_name + ' list has a total of %i entries.' %
                  (len(cached)))
            return cached

    print('Creating ' + list_name + ' list...')
    entries = []
    for pattern in extension:
        for path in glob.glob(os.path.join(file_dir, pattern)):
            try:
                # Close each file right away; the glob may match many files.
                with SoundFile(path) as f:
                    wav_len = f.seek(0, SEEK_END)
                if wav_len == -1:
                    wav, _ = read_wav(path)
                    wav_len = len(wav)
            except NameError:
                # SoundFile / SEEK_END are not available: fall back to read_wav.
                wav, _ = read_wav(path)
                wav_len = len(wav)
            entries.append({
                'file_path': path,
                'wav_len': wav_len
            })  # append dictionary.

    os.makedirs(data_path, exist_ok=True)  # make directory.
    with open(list_path, 'wb') as f:
        pickle.dump(entries, f)
    print('The ' + list_name + ' list has a total of %i entries.' %
          (len(entries)))
    return entries
class Encoder(Sink):
    """This class is an interface to write data into an audio file"""

    def __init__(self, file_name: str, rate: int, channels: int, name: str = ""):
        """Creates an instance of an Encoder sink with the given configuration

        Args:
            file_name (str): Output file name (opened in write mode)
            rate (int): The sample rate of the file in Hz
            channels (int): The number of channels
            name (str): Name of the element
        """
        super().__init__(name)
        # NOTE(review): the file is opened here and never explicitly closed by
        # this class - confirm whether Sink or the owner is expected to do it.
        self.__instance = SoundFile(file=file_name,
                                    mode='w',
                                    samplerate=rate,
                                    channels=channels)

    @property
    def sample_rate(self) -> int:
        """Return the sampling rate in Hz."""
        return self.__instance.samplerate

    @property
    def channels(self) -> int:
        """Return the number of channels."""
        return self.__instance.channels

    @property
    def file_name(self) -> str:
        """Return the file name."""
        return self.__instance.name

    def seek(self, frames):
        """Set the write position.

        Args:
            frames: The frame index or offset to seek

        Returns:
            Whatever the underlying SoundFile.seek returns.
        """
        return self.__instance.seek(frames=frames)

    def process(self, data, extra=None):
        """Writes the buffer of data into the audio file.

        Args:
            data: Array to write in the file (flattened before writing)
            extra: Any extra information previously computed. Unused.

        Returns:
            None
        """
        # The original statement built and discarded a `(result, extra)` tuple;
        # nothing was ever returned, so only the write is kept.
        self.__instance.write(data.flatten())
def Batch_list(file_dir, list_name, data_path=None, make_new=False):
    '''
    Places the file paths and wav lengths of an audio file into a dictionary, which
    is then appended to a list. SPHERE format cannot be used. 'glob' is used to
    support Unix style pathname pattern expansions. Checks if the training list
    has already been pickled, and loads it. If a different dataset is to be
    used, delete the pickle file.

    The pickled list is only reused when its first entry's path contains
    file_dir; otherwise (or when it is empty) the list is rebuilt.

    Inputs:
        file_dir - directory containing the wavs.
        list_name - name for the list.
        data_path - path to store pickle files (default: 'data').
        make_new - re-create list.

    Outputs:
        batch_list - list of dictionaries with keys 'file_path' and 'seq_len'.
    '''
    # Imported after the docstring so that the string above really is the
    # function's docstring.
    from soundfile import SoundFile, SEEK_END

    file_name = ['*.wav', '*.flac', '*.mp3']
    if data_path is None:
        data_path = 'data'
    # The pickle file name is host-specific (platform.node()).
    list_path = os.path.join(
        data_path, list_name + '_list_' + platform.node() + '.p')

    if not make_new and os.path.exists(list_path):
        print('Loading ' + list_name + ' list from pickle file...')
        with open(list_path, 'rb') as f:
            # NOTE(review): pickle.load can execute arbitrary code; the pickle
            # file must come from a trusted location.
            cached = pickle.load(f)
        # An empty cache cannot be validated against file_dir, so rebuild it.
        if cached and file_dir in cached[0]['file_path']:
            print('The ' + list_name + ' list has a total of %i entries.' %
                  (len(cached)))
            return cached

    print('Creating ' + list_name + ' list, as no pickle file exists...')
    entries = []  # list for wav paths and lengths.
    for fn in file_name:
        for file_path in glob.glob(os.path.join(file_dir, fn)):
            # Close each file right away; the glob may match many files.
            with SoundFile(file_path) as f:
                seq_len = f.seek(0, SEEK_END)
            entries.append({
                'file_path': file_path,
                'seq_len': seq_len
            })  # append dictionary.

    os.makedirs(data_path, exist_ok=True)  # make directory.
    with open(list_path, 'wb') as f:
        pickle.dump(entries, f)
    print('The ' + list_name + ' list has a total of %i entries.' %
          (len(entries)))
    return entries
def read_audio_segment(file, pos, length):
    """Read `length` frames from an audio file, starting at frame `pos`.

    Args:
        file: Path (or anything SoundFile accepts) of the audio file.
        pos: Frame index to seek to before reading.
        length: Number of frames to read.

    Returns:
        The data returned by SoundFile.read for that segment.
    """
    # The context manager closes the file even if seek/read raises.
    with SoundFile(file) as myfile:
        myfile.seek(pos)
        return myfile.read(length)
def readAudio(audio_path, offset=0.0, duration=None, mono=True, sample_rate=None, clip=True, padding_duration=0.0, metadata=None):
    '''
    Reads an audio file wholly or partly, and optionally converts it to mono and changes sampling rate.
    By default, it loads the whole audio file. If the offset is set to None, the duration HAS to be not None,
    and the offset is then randomly determined so that a random section of the audio is selected with the desired duration.
    Optionally, the file can be zero-padded by a certain amount of seconds at the start and end before selecting this random section.

    :param audio_path: Path to audio file
    :param offset: Position in audio file (s) where to start reading. If None, duration has to be not None, and position will be randomly determined.
    :param duration: How many seconds of audio to read
    :param mono: Convert to mono after reading
    :param sample_rate: Convert to given sampling rate if given
    :param clip: Clip the samples to [-1, 1]
    :param padding_duration: Amount of padding (s) on each side that needs to be filled up with silence if it isn't available
    :param metadata: metadata about audio file (sample rate, channels, duration), accelerates reading audio since duration does not need to be determined from file. Only used for MP3 files.
    :return: For MP3 files: (audio, sample rate). For other files: (audio, sample rate, centre_start_frame, centre_end_frame).
        If the file is too short for a randomly placed section, the result of librosa.load (audio, sample rate) is returned instead.
        Multi-channel audio is laid out as (channels, frames).
    '''
    is_mp3 = os.path.splitext(audio_path)[1][1:].lower() == "mp3"

    if is_mp3:  # If its an MP3, call ffmpeg with offset and duration parameters
        # Get mp3 metadata information and duration
        if metadata is None:
            audio_sr, audio_channels, audio_duration = Metadata.get_mp3_metadata(
                audio_path)
        else:
            audio_sr = metadata[0]
            audio_channels = metadata[1]
            audio_duration = metadata[2]
        print(audio_duration)

        pad_front_duration = 0.0
        pad_back_duration = 0.0

        if offset is None:  # In this case, select random section of audio file
            assert (duration is not None)
            max_start_pos = audio_duration + 2 * padding_duration - duration
            if max_start_pos <= 0.0:
                # Audio file (plus padding) is not longer than the desired section: take all of it
                print("WARNING: Audio file " + audio_path + " has length " +
                      str(audio_duration) + " but is expected to be at least " +
                      str(duration))
                # Keyword arguments: required by librosa >= 0.10, accepted by older releases
                return librosa.load(audio_path,
                                    sr=sample_rate,
                                    mono=mono,
                                    res_type='kaiser_fast')  # Return whole audio file
            # Otherwise randomly determine audio section, taking padding on both sides into account
            start_pos = np.random.uniform(0.0, max_start_pos)
            offset = max(start_pos - padding_duration,
                         0.0)  # Read from this position in audio file
            pad_front_duration = max(padding_duration - start_pos, 0.0)
        assert (offset is not None)

        if duration is not None:  # Adjust duration if it overlaps with end of track
            pad_back_duration = max(offset + duration - audio_duration, 0.0)
            # Subtract padding from the amount we have to read from file
            duration = duration - pad_front_duration - pad_back_duration
        else:  # None duration: Read from offset to end of file
            duration = audio_duration - offset

        pad_front_frames = int(pad_front_duration * float(audio_sr))
        pad_back_frames = int(pad_back_duration * float(audio_sr))

        args = [
            'ffmpeg', '-noaccurate_seek', '-ss',
            str(offset), '-t',
            str(duration), '-i', audio_path, '-f', 's16le', '-'
        ]

        chunks = []
        # Popen as a context manager closes the pipe and waits for ffmpeg;
        # DEVNULL avoids leaking a handle on os.devnull.
        with subprocess.Popen(args,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL) as process:
            # read() returns b'' (bytes, never '') at end of stream.
            for output in iter(lambda: process.stdout.read(4096), b''):
                chunks.append(
                    librosa.util.buf_to_float(output, dtype=np.float32))

        audio = np.concatenate(chunks)
        if audio_channels > 1:
            audio = audio.reshape((-1, audio_channels)).T
    else:  # Not an MP3: Handle with PySoundFile
        # The context manager closes the file on every exit path, including
        # the early return below.
        with SoundFile(audio_path, mode='r') as snd_file:
            audio_sr = snd_file.samplerate
            total_frames = snd_file.frames

            if duration is not None:
                num_frames = int(duration * float(audio_sr))
            pad_frames = int(padding_duration * float(audio_sr))
            pad_front_frames = 0
            pad_back_frames = 0

            if offset is None:  # In this case, select random section of audio file
                assert (duration is not None)
                max_start_pos = total_frames + 2 * pad_frames - num_frames
                if max_start_pos <= 0:
                    # Audio file (plus padding) is not longer than the desired section: take all of it
                    print("WARNING: Audio file " + audio_path + " has frames " +
                          str(total_frames) + " but is expected to be at least " +
                          str(num_frames))
                    return librosa.load(audio_path,
                                        sr=sample_rate,
                                        mono=mono,
                                        res_type='kaiser_fast')  # Return whole audio file
                # Otherwise randomly determine audio section, taking padding on both sides into account
                start_pos = np.random.randint(0, max_start_pos)
                start_frame = max(start_pos - pad_frames,
                                  0)  # Read from this position in audio file
                pad_front_frames = max(pad_frames - start_pos, 0)
            else:
                start_frame = int(offset * float(audio_sr))

            if duration is not None:  # Adjust duration if it overlaps with end of track
                pad_back_frames = max(start_frame + num_frames - total_frames, 0)
                num_frames = num_frames - pad_front_frames - pad_back_frames
            else:  # Duration is None => Read from start frame to end of track
                num_frames = total_frames - start_frame

            snd_file.seek(start_frame)
            audio = snd_file.read(num_frames, dtype='float32')

        audio = audio.T  # transpose axis to (channels, frames)

        centre_start_frame = start_frame - pad_front_frames + pad_frames
        centre_end_frame = start_frame + num_frames + pad_back_frames - pad_frames

    # AT THIS POINT WE HAVE A [N_CHANNELS, N_SAMPLES] NUMPY ARRAY FOR THE AUDIO
    # (or a 1-D array of samples for single-channel audio)
    # Pad as indicated at beginning and end
    if len(audio.shape) > 1:
        audio = np.pad(audio, [(0, 0), (pad_front_frames, pad_back_frames)],
                       mode="constant",
                       constant_values=0.0)
    else:
        audio = np.pad(audio, [(pad_front_frames, pad_back_frames)],
                       mode="constant",
                       constant_values=0.0)

    # Convert to mono if desired
    if mono and len(audio.shape) > 1 and audio.shape[0] > 1:
        audio = np.mean(audio, axis=0)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        audio = librosa.resample(audio,
                                 orig_sr=audio_sr,
                                 target_sr=sample_rate,
                                 res_type="kaiser_fast")
        audio_sr = sample_rate

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    # Warn when less than one second of audio came out. audio_sr is the rate of
    # the returned audio at this point (also valid when sample_rate is None),
    # and the sample axis is always the last one.
    if float(audio.shape[-1]) / float(audio_sr) < 1.0:
        print("----------------------ERROR------------------")

    if is_mp3:
        return audio, audio_sr
    else:
        return audio, audio_sr, centre_start_frame, centre_end_frame
def readAudio(audio_path, offset=0.0, duration=None, mono=True, sample_rate=None, clip=True, pad_frames=0, metadata=None):
    '''
    Reads an audio file wholly or partly, and optionally converts it to mono and changes sampling rate.
    By default, it loads the whole audio file. If the offset is set to None, the duration HAS to be not None,
    and the offset is then randomly determined so that a random section of the audio is selected with the desired duration.
    Optionally, the file can be zero-padded by a certain amount of frames at the start and end before selecting this random section.

    :param audio_path: Path to audio file
    :param offset: Position in audio file (s) where to start reading. If None, duration has to be not None, and position will be randomly determined.
    :param duration: How many seconds of audio to read
    :param mono: Convert to mono after reading
    :param sample_rate: Convert to given sampling rate if given
    :param clip: Clip the samples to [-1, 1]
    :param pad_frames: number of frames (at the output sampling rate) with which to pad the audio at most if the samples at the borders are not available
    :param metadata: metadata about audio file (sample rate, channels, duration), accelerates reading audio since duration does not need to be determined from file. Only used for MP3 files.
    :return: For MP3 files: (audio, audio_sr, offset, offset + duration) in seconds.
        For other files: (audio, audio_sr, centre_start_frame, centre_end_frame, start_mix_pos, end_mix_pos) in frames of the source file.
        audio has shape (frames, channels). audio_sr is the file's native rate, even when the audio was resampled.
    :raises IOError: if less than one second of audio was obtained
    :raises Exception: if a random section is requested from a file that is not longer than the section
    '''
    is_mp3 = os.path.splitext(audio_path)[1][1:].lower() == "mp3"

    if is_mp3:  # If its an MP3, call ffmpeg with offset and duration parameters
        # Get mp3 metadata information and duration
        if metadata is None:
            audio_sr, audio_channels, audio_duration = Metadata.get_mp3_metadata(
                audio_path)
        else:
            audio_sr = metadata[0]
            audio_channels = metadata[1]
            audio_duration = metadata[2]
        print(audio_duration)

        pad_front_duration = 0.0
        pad_back_duration = 0.0

        # pad_frames is counted at the output rate; convert it to seconds
        ref_sr = sample_rate if sample_rate is not None else audio_sr
        padding_duration = float(pad_frames) / float(ref_sr)

        if offset is None:  # In this case, select random section of audio file
            assert (duration is not None)
            max_start_pos = audio_duration + 2 * padding_duration - duration
            if max_start_pos <= 0.0:
                # Audio file (plus padding) is not longer than the desired section: take all of it
                print("WARNING: Audio file " + audio_path + " has length " +
                      str(audio_duration) + " but is expected to be at least " +
                      str(duration))
                return Utils.load(audio_path, sample_rate,
                                  mono)  # Return whole audio file
            # Otherwise randomly determine audio section, taking padding on both sides into account
            start_pos = np.random.uniform(0.0, max_start_pos)
            offset = max(start_pos - padding_duration,
                         0.0)  # Read from this position in audio file
            pad_front_duration = max(padding_duration - start_pos, 0.0)
        assert (offset is not None)

        if duration is not None:  # Adjust duration if it overlaps with end of track
            pad_back_duration = max(offset + duration - audio_duration, 0.0)
            # Subtract padding from the amount we have to read from file
            duration = duration - pad_front_duration - pad_back_duration
        else:  # None duration: Read from offset to end of file
            duration = audio_duration - offset

        pad_front_frames = int(pad_front_duration * float(audio_sr))
        pad_back_frames = int(pad_back_duration * float(audio_sr))

        args = [
            'ffmpeg', '-noaccurate_seek', '-ss',
            str(offset), '-t',
            str(duration), '-i', audio_path, '-f', 's16le', '-'
        ]

        chunks = []
        # Popen as a context manager closes the pipe and waits for ffmpeg;
        # DEVNULL avoids leaking a handle on os.devnull.
        with subprocess.Popen(args,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL) as process:
            # read() returns b'' (bytes, never '') at end of stream.
            for output in iter(lambda: process.stdout.read(4096), b''):
                chunks.append(
                    librosa.util.buf_to_float(output, dtype=np.float32))

        # Interleaved samples -> (frames, channels), the layout the padding,
        # mono conversion and length check below all operate on.
        audio = np.concatenate(chunks).reshape((-1, audio_channels))
    else:  # Not an MP3: Handle with PySoundFile
        # The context manager closes the file on every exit path, including
        # the exception raised below.
        with SoundFile(audio_path, mode='r') as snd_file:
            audio_sr = snd_file.samplerate
            total_frames = snd_file.frames

            # pad_frames is counted at the output rate; convert to source frames
            pad_orig_frames = pad_frames if sample_rate is None else int(
                np.ceil(
                    float(pad_frames) * (float(audio_sr) / float(sample_rate))))

            pad_front_frames = 0
            pad_back_frames = 0

            if offset is not None:
                start_frame = int(offset * float(audio_sr))
                if duration is not None:
                    read_frames = int(duration * float(audio_sr))
                else:
                    read_frames = total_frames - start_frame
                # NOTE(review): with an explicit offset no padding context is
                # read, so the centre and mixture windows are both reported as
                # the window that was read. These names were previously left
                # undefined on this path (NameError) - confirm the intended
                # values with the callers.
                num_frames = read_frames
                start_pos = start_frame
                start_mix_pos = start_frame
                end_mix_pos = start_frame + read_frames
            else:  # In this case, select random section of audio file
                assert (duration is not None)
                num_frames = int(duration * float(audio_sr))
                # Maximum start position when ignoring padding on both ends of the file
                max_start_pos = total_frames - num_frames
                if max_start_pos <= 0:
                    # Audio file is not longer than the desired section
                    print("WARNING: Audio file " + audio_path + " has frames " +
                          str(total_frames) + " but is expected to be at least " +
                          str(num_frames))
                    raise Exception(
                        "Could not read minimum required amount of audio data")
                # Otherwise randomly determine audio section
                start_pos = np.random.randint(0, max_start_pos)

                # Translate source position into mixture input positions (take into account padding)
                start_mix_pos = start_pos - pad_orig_frames
                end_mix_pos = start_mix_pos + num_frames + 2 * pad_orig_frames

                # Now see how much of the mixture is available, pad the rest with zeros
                start_frame = max(start_mix_pos, 0)
                end_frame = min(end_mix_pos, total_frames)
                read_frames = end_frame - start_frame
                pad_front_frames = -min(start_mix_pos, 0)
                pad_back_frames = max(end_mix_pos - total_frames, 0)

            assert (num_frames > 0)
            snd_file.seek(start_frame)
            audio = snd_file.read(read_frames, dtype='float32', always_2d=True)

        centre_start_frame = start_pos
        centre_end_frame = start_pos + num_frames

    # Pad as indicated at beginning and end
    audio = np.pad(audio, [(pad_front_frames, pad_back_frames), (0, 0)],
                   mode="constant",
                   constant_values=0.0)

    # Convert to mono if desired
    if mono:
        audio = np.mean(audio, axis=1, keepdims=True)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        audio = Utils.resample(audio, audio_sr, sample_rate)

    # Clip to [-1,1] if desired
    if clip:
        audio = np.clip(audio, -1.0, 1.0)

    # Rate of the audio being returned (audio_sr is not updated on resampling);
    # also valid when sample_rate is None.
    out_sr = sample_rate if sample_rate is not None else audio_sr
    if float(audio.shape[0]) / float(out_sr) < 1.0:
        raise IOError("Error while reading " + audio_path +
                      " - ended up with audio shorter than one second!")

    if is_mp3:
        return audio, audio_sr, offset, offset + duration
    else:
        return audio, audio_sr, centre_start_frame, centre_end_frame, start_mix_pos, end_mix_pos
class Decoder:
    """This class is an interface to read data from an audio file"""

    def __init__(self, file: str):
        """Opens the given audio file for reading.

        Args:
            file (str): Input file name
        """
        self.__instance = SoundFile(file=file, mode='r')

    @property
    def sample_rate(self) -> int:
        """Return the sampling rate in Hz."""
        return self.__instance.samplerate

    @property
    def channels(self) -> int:
        """Return the number of channels."""
        return self.__instance.channels

    @property
    def file_name(self) -> str:
        """Return the file name."""
        return self.__instance.name

    @property
    def frames(self) -> int:
        """Number of available frames"""
        return self.__instance.frames

    def done(self) -> bool:
        """Checks if there is still data to read from the audio file.

        NOTE(review): despite its name, this returns True while the read
        position is before the last frame and False once the end is reached.
        Kept as-is because callers may depend on it.
        """
        return self.__instance.tell() < self.__instance.frames

    def start(self):
        """Starts the streaming by seeking the file to the beginning"""
        self.__instance.seek(0)

    def stop(self):
        """Stops the streaming by seeking the file to the end"""
        self.__instance.seek(self.__instance.frames)

    def timestamp(self):
        """Returns the current streaming timestamp in seconds"""
        return self.__instance.tell() / self.__instance.samplerate

    def seek(self, frames: int):
        """Set the read position.

        Args:
            frames (int): The frame index or offset to seek

        Returns:
            The new absolute read/write position in frames
        """
        return self.__instance.seek(frames=frames)

    def read(self, frames_per_channel: int = -1):
        """Returns the buffer read from the audio file

        Args:
            frames_per_channel (int): Number of frames to read. -1 (default)
                requests as many frames as the file contains.

        Returns:
            The float32 data returned by the underlying file's read().
        """
        # Compare by value: `is -1` only worked through CPython's small-int
        # caching and triggers a SyntaxWarning.
        if frames_per_channel == -1:
            frames_per_channel = self.__instance.frames
        return self.__instance.read(frames=frames_per_channel,
                                    dtype='float32',
                                    always_2d=False)

    def close(self):
        """Closes the underlying audio file."""
        self.__instance.close()