def analyse(filename, resample_to=2756, bt_hop_length=128, chroma_hop_length=512, chroma_n_fft=1024):
    samples, sampleRate = librosa.load(filename)
    length = float(len(samples)) / sampleRate
    if resample_to:
        samples = librosa.resample(samples, sampleRate, resample_to)
        sampleRate = resample_to
    newSampleRate = 2756
    samples = librosa.resample(samples, sampleRate, newSampleRate)
    sampleRate = newSampleRate
    tempo, beats = librosa.beat.beat_track(samples, sampleRate, hop_length=bt_hop_length)
    beat_times = librosa.frames_to_time(beats, sampleRate, hop_length=bt_hop_length)
    chromagram = librosa.feature.chromagram(samples, sampleRate, hop_length=chroma_hop_length, n_fft=chroma_n_fft)
    chromagram = numpy.transpose(chromagram)
    distances = scipy.spatial.distance.cdist(chromagram, CHORDS, "cosine")
    chords = distances.argmin(axis=1)
    chords = scipy.signal.medfilt(chords, 11)
    chord_frames = numpy.array(numpy.where(numpy.diff(chords) != 0))
    chords = chords[chord_frames][0].astype(int)
    chord_times = librosa.frames_to_time(chord_frames, sampleRate, hop_length=chroma_hop_length, n_fft=chroma_n_fft)[0]
    chord_names = CHORD_NAMES[chords]
    return {"beats": list(beat_times),
            "chords": [{"chord": chord_name, "time": chord_time}
                       for chord_name, chord_time in zip(chord_names, chord_times)],
            "tempo": tempo}
def test_resample_scikitsamplerate():
    warnings.resetwarnings()
    warnings.simplefilter('always')
    with warnings.catch_warnings(record=True) as out:
        librosa.resample(np.zeros(1000), 1000, 500, res_type='sinc_best')

        assert len(out) > 0
        assert out[0].category is DeprecationWarning
        assert 'deprecated' in str(out[0].message).lower()
def lpc_formants(signal, sr, num_formants, max_freq, time_step, win_len, window_shape='gaussian'): output = {} new_sr = 2 * max_freq alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr)) proc = lfilter([1., -alpha], 1, signal) if sr > new_sr: proc = librosa.resample(proc, sr, new_sr) nperseg = int(win_len * new_sr) nperstep = int(time_step * new_sr) if window_shape == 'gaussian': window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1] else: window = np.hanning(nperseg + 2)[1:nperseg + 1] indices = np.arange(int(nperseg / 2), proc.shape[0] - int(nperseg / 2) + 1, nperstep) num_frames = len(indices) for i in range(num_frames): if nperseg % 2 != 0: X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2) + 1] else: X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2)] frqs, bw = process_frame(X, window, num_formants, new_sr) formants = [] for j, f in enumerate(frqs): if f < 50: continue if f > max_freq - 50: continue formants.append((np.asscalar(f), np.asscalar(bw[j]))) missing = num_formants - len(formants) if missing: formants += [(None, None)] * missing output[indices[i] / new_sr] = formants return output
def read_song(source_id):
    song_data_raw, source_sr = lr.load(data_source[source_id])
    song_data = lr.resample(song_data_raw, source_sr, target_sr, scale=True)
    # integer division; the original "/" produces a float index and fails on Python 3
    song_data = song_data[:song_data.shape[0] // 10]
    song_data, data_denom = norm(song_data)
    return song_data, source_sr, data_denom
def get_best_fs_ratio(a, b, max_drift, steps, max_offset, correlation_size, center=1): ''' Given two signals with components in common, tries to estimate the clock drift and offset of b vs a :parameters: - a : np.ndarray Some signal - b : np.ndarray Some other signal - max_drift : float max sample rate drift, in percent, e.g. .02 = 2% clock drift - steps : int Number of sample rates to consider, between -max_drift and max_drift - max_offset : int Maximum expected offset of the signals - correlation_size : int Number of samples to use in each correlate - center : float Ratio to deviate from - default 1 Output: fs_ratio - fs ratio to make b line up well with a ''' # Sample rate ratios to try fs_ratios = center + np.linspace(-max_drift, max_drift, steps + 1) # The max correlation value for each fs ratio corr_max = np.zeros(fs_ratios.shape) for n, ratio in enumerate(fs_ratios): # Resample b with this fs ratio b_resampled = librosa.resample(b, 1, ratio) # Compute the max correlation _, corr = align_over_window(a, b_resampled, max_offset, correlation_size) corr_max[n] = corr.max() # Choose ratio with the highest correlation value return fs_ratios[np.argmax(corr_max)]
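A hypothetical usage sketch for get_best_fs_ratio above (not from the original source; it assumes the function and its align_over_window helper are importable, and the drift and window values are made up):

import numpy as np
import librosa

# Simulate a copy of `a` whose clock runs about 1% fast
a = np.random.randn(10 * 44100).astype(np.float32)
b = librosa.resample(a, orig_sr=100, target_sr=101)

ratio = get_best_fs_ratio(a, b, max_drift=0.02, steps=40,
                          max_offset=2000, correlation_size=44100)
# Undo the drift with the estimated ratio (equivalent to the resample(b, 1, ratio) call inside)
b_aligned = librosa.resample(b, orig_sr=100, target_sr=100 * ratio)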
def read_song(source_id, target_sr):
    song_data_raw, source_sr = lr.load(data_source[source_id])
    song_data = lr.resample(song_data_raw, source_sr, target_sr, scale=True)
    song_data = song_data[1500:(1500 + 2 * seq_size)]  # song_data.shape[0]/10]
    song_data, data_denom = norm(song_data, return_denom=True)
    return song_data, source_sr, data_denom
def save_waveform_as(self, waveform, data_id, dst):
    source_sr, data_denom = self.get_data_info(data_id)
    waveform *= data_denom
    waveform_resampled = lr.resample(waveform, self.cfg.target_sr, source_sr, scale=True)
    logging.info("Saving waveform as {}".format(dst))
    lr.output.write_wav(dst, waveform_resampled, source_sr)
def compute_pcen(audio, sr): # Load settings. pcen_settings = get_pcen_settings() # Map to the range [-2**31, 2**31[ audio = (audio * (2**31)).astype('float32') # Resample to 22,050 kHz if not sr == pcen_settings["sr"]: audio = librosa.resample(audio, sr, pcen_settings["sr"]) sr = pcen_settings["sr"] # Compute Short-Term Fourier Transform (STFT). stft = librosa.stft( audio, n_fft=pcen_settings["n_fft"], win_length=pcen_settings["win_length"], hop_length=pcen_settings["hop_length"], window=pcen_settings["window"]) # Compute squared magnitude coefficients. abs2_stft = (stft.real*stft.real) + (stft.imag*stft.imag) # Gather frequency bins according to the Mel scale. # NB: as of librosa v0.6.2, melspectrogram is type-instable and thus # returns 64-bit output even with a 32-bit input. Therefore, we need # to convert PCEN to single precision eventually. This might not be # necessary in the future, if the whole PCEN pipeline is kept type-stable. melspec = librosa.feature.melspectrogram( y=None, S=abs2_stft, sr=pcen_settings["sr"], n_fft=pcen_settings["n_fft"], n_mels=pcen_settings["n_mels"], htk=True, fmin=pcen_settings["fmin"], fmax=pcen_settings["fmax"]) # Compute PCEN. pcen = librosa.pcen( melspec, sr=pcen_settings["sr"], hop_length=pcen_settings["hop_length"], gain=pcen_settings["pcen_norm_exponent"], bias=pcen_settings["pcen_delta"], power=pcen_settings["pcen_power"], time_constant=pcen_settings["pcen_time_constant"]) # Convert to single floating-point precision. pcen = pcen.astype('float32') # Truncate spectrum to range 2-10 kHz. pcen = pcen[:pcen_settings["top_freq_id"], :] # Return. return pcen
def slice_clip(filename, start, stop, n_samples, sr, mono=True): '''Slice a fragment of audio from a file. This uses pysoundfile to efficiently seek without loading the entire stream. Parameters ---------- filename : str Path to the input file start : int The sample index of `filename` at which the audio fragment should start stop : int The sample index of `filename` at which the audio fragment should stop (e.g. y = audio[start:stop]) n_samples : int > 0 The number of samples to load sr : int > 0 The target sampling rate mono : bool Ensure monophonic audio Returns ------- y : np.ndarray [shape=(n_samples,)] A fragment of audio sampled from `filename` Raises ------ ValueError If the source file is shorter than the requested length ''' with psf.SoundFile(str(filename), mode='r') as soundf: n_target = stop - start soundf.seek(start) y = soundf.read(n_target).T if mono: y = librosa.to_mono(y) # Resample to initial sr y = librosa.resample(y, soundf.samplerate, sr) # Clip to the target length exactly y = librosa.util.fix_length(y, n_samples) return y
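A hypothetical usage sketch for slice_clip above (the file name is a placeholder; it assumes pysoundfile and librosa are available, as the snippet's imports imply):

# Pull one second of audio starting at sample 44100 of a 44.1 kHz file,
# resampled to 22050 Hz and fixed to exactly 22050 samples.
y = slice_clip("long_recording.wav", start=44100, stop=2 * 44100,
               n_samples=22050, sr=22050, mono=True)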
def _read_song(self, fname, proportion=None):
    logging.info("Reading {}".format(fname))
    song_data_raw, source_sr = lr.load(fname)
    logging.info("Got sampling rate {}, resampling to {} ...".format(source_sr, self.cfg.target_sr))
    song_data = lr.resample(song_data_raw, source_sr, self.cfg.target_sr, scale=True)
    logging.info("Normalizing with l2 norm ...")
    if proportion:
        song_data = song_data[:int(proportion * len(song_data))]
    song_data, data_denom = norm(song_data)
    logging.info("Done")
    return song_data, source_sr, data_denom
def sample_clip(filename, n_samples, sr, mono=True): '''Sample a fragment of audio from a file. This uses pysoundfile to efficiently seek without loading the entire stream. Parameters ---------- filename : str Path to the input file n_samples : int > 0 The number of samples to load sr : int > 0 The target sampling rate mono : bool Ensure monophonic audio Returns ------- y : np.ndarray [shape=(n_samples,)] A fragment of audio sampled randomly from `filename` Raises ------ ValueError If the source file is shorter than the requested length ''' with psf.SoundFile(str(filename), mode='r') as soundf: n_target = int(np.ceil(n_samples * soundf.samplerate / sr)) # Draw a random clip start = np.random.randint(0, len(soundf) - n_target) soundf.seek(start) y = soundf.read(n_target).T if mono: y = librosa.to_mono(y) # Resample to initial sr y = librosa.resample(y, soundf.samplerate, sr) # Clip to the target length exactly y = librosa.util.fix_length(y, n_samples) return y
def activation_upsample(A_low, sr):
    n, k, _, sr_old = A_low.shape
    A = np.zeros((n, k, 1, sr))
    for i in range(n):
        for j in range(k):
            act_res = librosa.resample(A_low[i, j, :, :].squeeze(),
                                       orig_sr=sr_old, target_sr=sr)
            # Local-max filter act_res
            A[i, j, :, :min(sr, len(act_res))] = librosa.localmax(act_res) * act_res
    return A
def __test(sr_in, sr_out, res_type, y):
    y2 = librosa.resample(y, sr_in, sr_out, res_type=res_type, scale=True)

    # First, check that the audio is valid
    librosa.util.valid_audio(y2, mono=True)

    n_orig = np.sqrt(np.sum(np.abs(y)**2))
    n_res = np.sqrt(np.sum(np.abs(y2)**2))

    # With scale=True, resampling should preserve the signal's total energy
    assert np.allclose(n_orig, n_res, atol=1e-2), (n_orig, n_res)
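The assertion above relies on scale=True rescaling the output so its total energy roughly matches the input. A minimal stand-alone check of that behaviour (not part of the original test):

import numpy as np
import librosa

x = np.random.randn(22050).astype(np.float32)
y = librosa.resample(x, orig_sr=22050, target_sr=11025, scale=True)
# The two energies should agree to within a small tolerance
print(np.sqrt(np.sum(x ** 2)), np.sqrt(np.sum(y ** 2)))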
def analyse(filename, resample_to=2756, bt_hop_length=128, chroma_hop_length=512, chroma_n_fft=1024):
    # Load the audio file -> returns a floating-point time series (array) and a sampling rate (int > 0)
    samples, sampleRate = librosa.load(filename)
    length = float(len(samples)) / sampleRate
    if resample_to:
        # Resample the time series from sampleRate to the resample_to rate
        samples = librosa.resample(samples, sampleRate, resample_to)
        sampleRate = resample_to
    newSampleRate = 2756
    # Resample the time series from sampleRate to newSampleRate (2756)
    samples = librosa.resample(samples, sampleRate, newSampleRate)
    sampleRate = newSampleRate
    # Track the beats in the time series, stepping by hop_length audio samples,
    # returning tempo (estimated global tempo in BPM) and beats (frame numbers of estimated beat events)
    tempo, beats = librosa.beat.beat_track(samples, sampleRate, hop_length=bt_hop_length)
    # Convert the frame counts in 'beats' to times (seconds)
    beat_times = librosa.frames_to_time(beats, sampleRate, hop_length=bt_hop_length)
    # Compute a chromagram from samples and sampleRate, using hop_length and a window size of n_fft
    chromagram = librosa.feature.chromagram(samples, sampleRate, hop_length=chroma_hop_length, n_fft=chroma_n_fft)
    # Permute the dimensions of the chromagram into a new array (frames x pitch classes)
    chromagram = numpy.transpose(chromagram)
    # Compute the cosine distance between each chromagram frame and the CHORDS templates
    distances = scipy.spatial.distance.cdist(chromagram, CHORDS, "cosine")
    # Take the index of the minimum distance along axis=1, i.e. the best-matching chord template per frame
    chords = distances.argmin(axis=1)
    # Apply a median filter to the chord sequence, with a kernel of size 11
    chords = scipy.signal.medfilt(chords, 11)
    # Keep only the frames where the chord changes (discrete difference along the frame axis is non-zero)
    chord_frames = numpy.array(numpy.where(numpy.diff(chords) != 0))
    chords = chords[chord_frames][0].astype(int)
    # Convert the frame counts in 'chord_frames' to times (seconds)
    chord_times = librosa.frames_to_time(chord_frames, sampleRate, hop_length=chroma_hop_length, n_fft=chroma_n_fft)[0]
    chord_names = CHORD_NAMES[chords]
    return {"beats": list(beat_times),
            "chords": [{"chord": chord_name, "time": chord_time}
                       for chord_name, chord_time in zip(chord_names, chord_times)],
            "tempo": tempo}
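Most of the snippets in this collection pass the sample rates positionally (librosa.resample(y, sr, new_sr)), which matches older librosa releases; on newer releases (0.10 and later) the rates must be passed as keyword arguments. A minimal sketch of the equivalent resample step under that assumption, not taken from any of the original sources:

import librosa

def resample_keyword_style(samples, sample_rate, target_sr=2756):
    # librosa >= 0.10 takes the rates as keyword-only arguments
    resampled = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return resampled, target_sr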
def __init__(self, file_path=None, raw_samples=None, convert_to_mono=False, sample_rate=44100, analysis_sample_rate=22050): """ Audio constructor. Opens a file path, loads the audio with librosa, and prepares the features Parameters ---------- file_path: string path to the audio file to load raw_samples: np.array samples to use for audio output convert_to_mono: boolean (optional) converts the file to mono on loading sample_rate: number > 0 [scalar] (optional) sample rate to pass to librosa. Returns ------ An Audio object """ if file_path: y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate) elif raw_samples is not None: # This assumes that we're passing in raw_samples # directly from another Audio's raw_samples. y = raw_samples sr = sample_rate self.file_path = file_path self.sample_rate = float(sr) self.analysis_sample_rate = float(analysis_sample_rate) self.num_channels = y.ndim self.duration = librosa.get_duration(y=y, sr=sr) self.analysis_samples = librosa.resample(librosa.to_mono(y), sr, self.analysis_sample_rate, res_type='kaiser_best') self.raw_samples = np.atleast_2d(y) self.zero_indexes = self._create_zero_indexes() self.features = self._create_features() self.timings = self._create_timings()
def __test(res_type):
    y_native, sr = librosa.load(librosa.util.example_audio_file(),
                                sr=None,
                                offset=offset,
                                duration=duration,
                                res_type=res_type)

    y2 = librosa.resample(y_native, sr, sr_target, res_type=res_type)

    y, _ = librosa.load(librosa.util.example_audio_file(),
                        sr=sr_target,
                        offset=offset,
                        duration=duration,
                        res_type=res_type)

    assert np.allclose(y2, y)
def signal_to_formants(signal, sr, freq_lims, win_len, time_step, num_formants, window_shape = 'gaussian', begin = None, padding = None): rep = {} new_sr = 2 * freq_lims[1] alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr)) proc = lfilter([1., -alpha], 1, signal) proc = librosa.resample(proc, sr, new_sr) nperseg = int(win_len*new_sr) nperstep = int(time_step*new_sr) if window_shape == 'gaussian': window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1] else: window = hanning(nperseg + 2)[1:nperseg+1] indices = np.arange(int(nperseg / 2), proc.shape[0] - int(nperseg / 2) + 1, nperstep) num_frames = len(indices) for i in range(num_frames): if nperseg % 2 != 0: X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2) + 1] else: X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2)] frqs, bw = process_frame(X, window, num_formants, new_sr) formants = [] for j,f in enumerate(frqs): if f < 50: continue if f > freq_lims[1] - 50: continue formants.append((np.asscalar(f), np.asscalar(bw[j]))) missing = num_formants - len(formants) if missing: formants += [(None,None)] * missing rep[indices[i]/new_sr] = formants duration = signal.shape[0] / sr if begin is not None: if padding is not None: begin -= padding real_output = {} for k,v in rep.items(): if padding is not None and (k < padding or k > duration - padding): continue t = np.asscalar(k+begin) real_output[t] = v return real_output return rep
def get_enrate(signal = None, sr = None, filepath = None, downsample=100): """Computes the enrate as described in ARGS signal: audio signal <number array> sr: sampling rate <int> filepath: fullpath of audio file <str> downsample: sampling rate to downsample signal <int> RETURN enrate: proportional to speaking rate <float> """ if signal == None: signal, sr = load_signal(filepath) # FFT data n_fft = downsample hop_length = int(0.8*downsample) # Half-wave rectify the signal waveform signal[signal<0] = 0 # Low-pass filter numtaps=2 cutoff=16.0 nyq=sr/2.0 transfer_function = scipy.signal.firwin(numtaps=numtaps, cutoff=cutoff/nyq, nyq=nyq) signal = scipy.signal.lfilter(transfer_function, 1.0, signal) # Downsample to 100hz signal = librosa.resample(signal, sr, downsample) # Hamming window 1-2 seconds with > 75% overlap fft_window = scipy.signal.hamming(downsample, sym=False) # FFT, ignore values above 16 hz magnitudes = np.abs(librosa.stft(signal, n_fft, hop_length, window=fft_window)) bin_count, freq_res = get_bin_count_and_frequency_resolution(n_fft, downsample) lowest_fbin_idx = int(1/freq_res) highest_fbin_idx = int(16/freq_res) # Compute the spectral moment ( index weight each power spectral value and sum ) enrate = np.sum(magnitudes[lowest_fbin_idx:highest_fbin_idx].T * np.array(range(lowest_fbin_idx, highest_fbin_idx))) return enrate
def wav_data_to_samples(wav_data, sample_rate): """Read PCM-formatted WAV data and return a NumPy array of samples. Uses scipy to read and librosa to process WAV data. Audio will be converted to mono if necessary. Args: wav_data: WAV audio data to read. sample_rate: The number of samples per second at which the audio will be returned. Resampling will be performed if necessary. Returns: A numpy array of audio samples, single-channel (mono) and sampled at the specified rate, in float32 format. Raises: AudioIOReadError: If scipy is unable to read the WAV data. AudioIOError: If audio processing fails. """ try: # Read the wav file, converting sample rate & number of channels. native_sr, y = scipy.io.wavfile.read(six.BytesIO(wav_data)) except Exception as e: # pylint: disable=broad-except raise AudioIOReadError(e) if y.dtype == np.int16: # Convert to float32. y = int16_samples_to_float32(y) elif y.dtype == np.float32: # Already float32. pass else: raise AudioIOError( 'WAV file not 16-bit or 32-bit float PCM, unsupported') try: # Convert to mono and the desired sample rate. if y.ndim == 2 and y.shape[1] == 2: y = y.T y = librosa.to_mono(y) if native_sr != sample_rate: y = librosa.resample(y, native_sr, sample_rate) except Exception as e: # pylint: disable=broad-except raise AudioIOError(e) return y
def __test(y, sr_in, sr_out, res_type, fix):
    y2 = librosa.resample(y, sr_in, sr_out, res_type=res_type, fix=fix)

    # First, check that the audio is valid
    librosa.util.valid_audio(y2, mono=True)

    # If it's a no-op, make sure the signal is untouched
    if sr_out == sr_in:
        assert np.allclose(y, y2)

    # Check buffer contiguity
    assert y2.flags['C_CONTIGUOUS']

    # Check that we're within one sample of the target length
    target_length = y.shape[-1] * sr_out // sr_in
    assert np.abs(y2.shape[-1] - target_length) <= 1
def __test(infile):
    DATA = load(infile)

    # load the wav file
    (y_in, sr_in) = librosa.load(DATA['wavfile'][0], sr=None, mono=True)

    # Resample it to the target rate
    y_out = librosa.resample(y_in, DATA['sr_in'], DATA['sr_out'])

    # Are we the same length?
    if len(y_out) == len(DATA['y_out']):
        # Is the data close?
        assert np.allclose(y_out, DATA['y_out'])
    elif len(y_out) == len(DATA['y_out']) - 1:
        assert (np.allclose(y_out, DATA['y_out'][:-1, 0]) or
                np.allclose(y_out, DATA['y_out'][1:, 0]))
    elif len(y_out) == len(DATA['y_out']) + 1:
        assert (np.allclose(y_out[1:], DATA['y_out']) or
                np.allclose(y_out[:-2], DATA['y_out']))
    else:
        assert False

    pass
def resample_all():
    audio_folder = 'scenes_stereo/'
    subsamp_folder = 'scenes_mono_8k/'
    chdir(audio_folder)
    mkdir(subsamp_folder)
    for sub_folder in glob('*'):
        mkdir(subsamp_folder + sub_folder)
        for filename in glob(sub_folder + '/*.wav'):
            print(filename)
            [fs, sig] = read(filename)
            sig = to_mono(sig.T)
            sig = resample(sig, fs, 8000)
            write(subsamp_folder + filename, 8000, sig)
def apply_offsets_resample(b, offset_locations, offsets): ''' Adjust a signal b according to local offset estimations using resampling :parameters: - b : np.ndarray Some signal - offset_locations : np.ndarray locations, in samples, of each local offset estimation - offsets : np.ndarray local offset for the corresponding sample in offset_locations :returns: - b_aligned : np.ndarray b with offsets applied ''' assert offset_locations.shape[0] == offsets.shape[0] # Include signal boundaries in offset locations offset_locations = np.append(0, np.append( offset_locations, b.shape[0]-100 )) # Allocate output signal b_aligned = np.zeros(np.int(np.sum(np.diff(offset_locations)) + np.max(np.abs(offsets)))) # Set last offset to whatever the second to last one was offsets = np.append(offsets, offsets[-1]) current = 0 # !!!!!!!!!!!!!!!!!! # Should zip here # !!!!!!!!!!!!!!!!!! for n, offset in enumerate(offsets): start = offset_locations[n] end = offset_locations[n + 1] # Compute the necessary resampling ratio to compensate for this offset ratio = 1 + (-offset + start - current)/(end - start) # Resample this portion of the signal, with some padding at the end resampled = librosa.resample(b[start:end + 100], 1, ratio) # Compute length and place the signal length = int(end - current - offset) b_aligned[current:current + length] = resampled[:length] current += length return b_aligned
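A hypothetical usage sketch for apply_offsets_resample above (the signal and offset values are illustrative, not taken from the source):

import numpy as np

b = np.random.randn(3 * 8000).astype(np.float32)
offset_locations = np.array([8000, 16000])   # sample positions where drift was measured
offsets = np.array([12, 25])                 # measured lag, in samples, at each location
b_aligned = apply_offsets_resample(b, offset_locations, offsets)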
def __test(infile, scipy_resample): DATA = load(infile) # load the wav file (y_in, sr_in) = librosa.load(DATA['wavfile'][0], sr=None, mono=True) # Resample it to the target rate y_out = librosa.resample(y_in, DATA['sr_in'], DATA['sr_out'], scipy_resample=scipy_resample) # Are we the same length? if len(y_out) == len(DATA['y_out']): # Is the data close? assert np.allclose(y_out, DATA['y_out']) elif len(y_out) == len(DATA['y_out']) - 1: assert (np.allclose(y_out, DATA['y_out'][:-1, 0]) or np.allclose(y_out, DATA['y_out'][1:, 0])) elif len(y_out) == len(DATA['y_out']) + 1: assert (np.allclose(y_out[1:], DATA['y_out']) or np.allclose(y_out[:-2], DATA['y_out'])) else: assert False pass
def read_csv(self):
    waves = []
    labels = {}
    with open(self.dataset_path, "r") as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if row:
                label = row[-1]
                wav_path = f"{CURRENT_DIR}/{row[0]}" if row[0][0] != '/' else f"{CURRENT_DIR}{row[0]}"
                signal, sr = librosa.load(wav_path, sr=self.dim)
                signal = librosa.resample(signal, sr, self.dim)
                if signal.shape[0] != self.dim:
                    continue
                waves.append(wav_path)
                labels[wav_path] = label
    return waves, labels
def resample(y, src_sr, target_sr, mode='kaiser_fast'): if mode == 'kaiser_best': warnings.warn( f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \ we recommend the mode kaiser_fast in large scale audio trainning') assert type(y) == np.ndarray, 'currently only numpy data are supported' assert mode in __resample_mode__, f'resample mode must in {__resample_mode__}' assert type( src_sr ) == int and src_sr > 0 and src_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,' assert type( target_sr ) == int and target_sr > 0 and target_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,' if has_resampy: return resampy.resample(y, src_sr, target_sr, filter=mode) if has_librosa: return librosa.resample(y, src_sr, target_sr, res_type=mode) assert False, 'requires librosa or resampy to do resampling, pip install resampy'
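A hypothetical usage sketch for the resample wrapper above (the buffer is synthetic; it assumes either resampy or librosa is installed, as the wrapper requires):

import numpy as np

y_44k = np.random.randn(44100).astype(np.float32)   # one second at 44.1 kHz
y_16k = resample(y_44k, src_sr=44100, target_sr=16000, mode='kaiser_fast')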
def on_post(self, request, response): # NB: uses middleware to pull out data. form_data = request.params['audio_data'].file data, samplerate = soundfile.read(form_data) # For debugging browser input, uncomment the following line: # scipy.io.wavfile.write('browser_input_audio.wav', samplerate, data) # NB: Convert the input stereo signal into mono. # In the future the frontend should be responsible for sampling details. mono = data[:, 0] # NB: We must downsample to the rate that the network is trained on. downsampled = librosa.resample(mono, samplerate, 16000) # Evaluate the model print(">>> Converting...") results = converter.convert(downsampled, conversion_direction = 'A2B') temp_dir = tempfile.TemporaryDirectory(prefix='tmp_ml_audio') temp_file = tempfile.NamedTemporaryFile(suffix='.wav') temp_file.write(results.read()) out_file = temp_dir.name + '/output.ogg' # NB: Browsers have a great deal of trouble decoding WAV files unless they are in the # narrow slice of the WAV spec expected. None of the {librosa, scipy, soundfile} python # tools do a good job of this, so here we shell out to ffmpeg and generate OGG. # It's lazy and messy, but it works for now. # See https://github.com/librosa/librosa/issues/361 for a survey of the library landscape # See https://bugzilla.mozilla.org/show_bug.cgi?id=523837 for one of dozens of browser codec bugs _stdout = subprocess.check_output(['ffmpeg', '-i', temp_file.name, '-acodec', 'libvorbis', out_file]) response.content_type = 'audio/ogg' with open(out_file, mode='rb') as f: response.data = f.read()
def resample2(data, sr): '''Resample if required and drop to disc resampled data ''' # now = datetime.datetime.now() # time = now.strftime("%H:%M:%S") # print(time, ': Resample all recordings to target sampling rate if required') downsample = data.loc[data['sample_rate'] != sr] #print(file, ': Resampling records:', len(downsample.index)) resampled = [] indexes = [] length = [] srs = [] samples = [] for index, row in downsample.iterrows(): sample = row['raw_sounds'] sro = row['sample_rate'] y = librosa.resample(sample, sro, sr) l = len(y) / sr resampled.append(y) indexes.append(index) length.append(l) samples.append(len(y)) srs.append(sr) output = pd.DataFrame(list(zip(resampled, srs, indexes, length, samples)), columns=[ 'raw_sounds', 'sample_rate', 'index', 'length', 'sample_count' ]).set_index('index') data.update(output) # Join resampled recordings to raw frame # data.to_pickle(p / file.name) #file_list = [p/(file.name) for file in file_list] return data
def process_uid(uid): audio_path = os.path.join(audio_files_dir, uid) sound_class = uid.split('-')[-1].split('.wav')[0] if sound_class not in sound_classes: raise ValueError('Sound Class: {} must be in Classes: {}'.format( sound_class, sound_classes)) y, sr = librosa.load(audio_path, sr=44100) y_8k = librosa.resample(y, sr, 8000) if y_8k.shape[0] < min_wav_samples: return wav = torch.tensor(y_8k, dtype=torch.float32).unsqueeze(0) norm_wav = torch.tensor(normalize_wav(y_8k), dtype=torch.float32).unsqueeze(0) output_uid_folder = os.path.join(output_dirpath, sound_class, uid.split('.wav')[0]) data = { 'wav': wav, 'wav_norm': norm_wav, } # Append metadata to the written data for meta_label, meta_label_val in metadata_dict[uid].items(): if meta_label in data: raise IndexError('Trying to override essential ' 'information about files by ' 'assigning metalabel: {} in data ' 'dictionary: {}'.format(meta_label, data)) data[meta_label] = meta_label_val if not os.path.exists(output_uid_folder): os.makedirs(output_uid_folder) for k, v in data.items(): file_path = os.path.join(output_uid_folder, k) joblib.dump(v, file_path, compress=0)
def load_audio(self): """ Reads wav file based on csv values, resamples audio to 8000hz, fixes length to 1 second :return: numpy array of stereo audio, DOA from file """ df = pd.read_csv("{dir}/iteration_{iter}.csv".format(dir=self.directory, iter=self.iteration), usecols=[1, 2]) wav_name = df.iloc[0][0] filename = "{wav_name}".format(wav_name=wav_name) y, sr = librosa.load(filename, mono=False) y_8k = librosa.resample(y, sr, 8000) o_env = librosa.onset.onset_strength(y_8k[0], sr=8000) peaks = librosa.util.peak_pick(o_env, 3, 3, 3, 5, 0.25, 5) times = librosa.frames_to_time(np.arange(len(o_env)), sr=8000, hop_length=512) peak_times = times[peaks] time = 0 for i in range(1, len(peak_times) + 1): if 3 - peak_times[-i] >= 0.75: time = peak_times[-i] - 0.25 break sample = librosa.time_to_samples(np.array([time]), sr=8000) sliced_y = np.array([y_8k[0][sample[0]:], y_8k[1][sample[0]:]]) y_out = librosa.util.fix_length(sliced_y, 8000) return y_out
def import_to_mel(filepath, sample_rate):
    '''
    Import target audio file and pre-process.
    input: filepath to target audio
           sample_rate of the current model
    output: a mel spectrogram of the loaded audio file, normalised and limited to 4 seconds.
    '''
    # mel settings
    n_mels = 128
    n_fft = 2048
    hop_length = 512  # the paper says 2048, but then the output matrix is the wrong size

    # import, convert to mono, normalise
    waveform, sr = torchaudio.load(filepath)
    waveform = waveform.numpy()
    if (sr != sample_rate):
        waveform = librosa.resample(waveform, sr, sample_rate)
    if (waveform.shape[0] > 1):
        waveform = librosa.to_mono(waveform).reshape(1, len(waveform[0]))
    waveform[0] = waveform[0] * (1.0 / np.max(waveform[0]))  # normalise
    waveform = torch.from_numpy(waveform)

    # copy to a tensor of specific size
    waveform_4s = torch.zeros(1, sample_rate * 4)
    iter_len = min(sample_rate * 4, waveform.shape[1])
    for i in range(iter_len):
        waveform_4s[0][i] = waveform[0][i]

    # generate mel
    spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length)(waveform_4s)
    return spectrogram
def audio_strip(self, orig_x, orig_sr) -> np.array: """ Trim silence from both side """ # Noise reduction if self._do_noise_reduction: orig_x = nr.reduce_noise(audio_clip=orig_x, noise_clip=orig_x) # Resampling x = librosa.resample(orig_x, orig_sr=orig_sr, target_sr=self.SAMPLE_RATE) # Voice activity detection start_sample = 0 stop_sample = x.shape[0] # begin for i in range(x.shape[0] // self.VAD_CHUNK): frame_buf = self.float_to_pcm16(x[self.VAD_CHUNK * i:self.VAD_CHUNK * (i + 1)]) if self._vad.is_speech(frame_buf, self.SAMPLE_RATE): start_sample = (i + 1) * self.VAD_CHUNK break # end for i in range(x.shape[0] // self.VAD_CHUNK - 1, start_sample // self.VAD_CHUNK, -1): frame_buf = self.float_to_pcm16(x[self.VAD_CHUNK * i:self.VAD_CHUNK * (i + 1)]) if self._vad.is_speech(frame_buf, self.SAMPLE_RATE): stop_sample = i * self.VAD_CHUNK break # less than 120 ms if stop_sample - start_sample <= self.MINIMUM_DIFF * ( self.SAMPLE_RATE // 1000): return None return x[start_sample:stop_sample]
def preprocess(raw_path, clean_path): raw_path = os.path.join(BASE_DIR, raw_path) clean_path = os.path.join(BASE_DIR, clean_path) if not os.path.isdir(raw_path): print('Path to Raw dataset is invalid!') return if not os.path.isdir(clean_path): os.mkdir(clean_path) for cls in os.scandir(raw_path): if cls.is_dir: for item in tqdm(os.scandir(cls), total=len(os.listdir(cls))): if item.name.endswith('.wav'): audio = item.path sr, signal = wavfile.read(audio) signal = signal.astype(np.float32).T if signal.shape[0] == 2: signal = to_mono(signal) elif signal.shape[0] == 1: signal = to_mono(signal.reshape(-1)) signal = resample(signal, sr, SR) sr = SR signal = signal.astype(np.int16) mask = purifier(signal, sr, 100) signal = signal[mask] if signal.shape[ 0] < DURATION: #if the audio after purification is less than 2s, append zeros rectified_signal = np.zeros((DURATION, ), dtype=np.int16) rectified_signal[:signal.shape[0]] = signal save_file(signal, sr, cls.name, item.name, 0) else: trunc = signal.shape[0] % DURATION for i, j in enumerate( range(0, signal.shape[0] - trunc, DURATION)): strip = signal[j:j + DURATION] save_file(strip, sr, clean_path, cls.name, item.name, i)
def load_wav_file(file_path, sample_rate, mono=True, resample_type="kaiser_best"): """Load a wav audio file as a floating point time series. Significantly faster than load_sound_file.""" actual_sample_rate, samples = wavfile.read(file_path) if samples.dtype != np.float32: assert samples.dtype == np.int16 samples = np.true_divide( samples, 32768, dtype=np.float32) # ends up roughly between -1 and 1 if mono and len(samples.shape) > 1: if samples.shape[1] == 1: samples = samples[:, 0] else: samples = np.mean(samples, axis=1) if sample_rate is not None and actual_sample_rate != sample_rate: if resample_type == "auto": resample_type = ("kaiser_fast" if actual_sample_rate < sample_rate else "kaiser_best") samples = librosa.resample(samples, actual_sample_rate, sample_rate, res_type=resample_type) warnings.warn( "{} had to be resampled from {} hz to {} hz. This hurt execution time." .format(str(file_path), actual_sample_rate, sample_rate)) actual_sample_rate = actual_sample_rate if sample_rate is None else sample_rate return samples, actual_sample_rate
def generator(self, data_dir, tmp_dir, dataset, eos_list=None, start_from=0, how_many=0): del eos_list i = 0 data_tuples = _collect_data(tmp_dir) encoders = self.feature_encoders(data_dir) audio_encoder = encoders["waveforms"] text_encoder = encoders["targets"] for utt_id, media_file, text_data, speaker, utt_dataset in tqdm( sorted(data_tuples)[start_from:]): if dataset != utt_dataset: continue if how_many > 0 and i == how_many: return i += 1 try: wav_data = audio_encoder.encode(media_file) except AssertionError: audio, sr = librosa.load(media_file) data_resampled = librosa.resample(audio, sr, SAMPLE_RATE) with tempfile.NamedTemporaryFile(suffix='.wav') as fid: librosa.output.write_wav(fid.name, data_resampled, SAMPLE_RATE) wav_data = audio_encoder.encode(fid.name) yield { "waveforms": wav_data, "waveform_lens": [len(wav_data)], "targets": text_encoder.encode(text_data), "raw_transcript": [text_data], "utt_id": [utt_id], "spk_id": [speaker], }
def _get_spectrograms(fpath, require_sr, preemphasis, n_fft, hop_length, win_length, max_db, ref_db): '''Parse the wave file in `fpath` and Returns normalized melspectrogram and linear spectrogram. Args: fpath: A string. The full path of a sound file. Returns: mel: A 2d array of shape (T, n_mels) and dtype of float32. mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32. ''' # Loading sound file y, sr = librosa.load(fpath, sr=None) if sr != require_sr: y = librosa.resample(y, sr, require_sr) # Preemphasis y = np.append(y[0], y[1:] - preemphasis * y[:-1]) # stft linear = librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) # magnitude spectrogram mag = np.abs(linear) # (1+n_fft//2, T) # to decibel mag = 20 * np.log10(np.maximum(1e-5, mag)) # normalize mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1) # Transpose mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) return mag
def wav_data_to_samples(wav_data, sample_rate): """Read PCM-formatted WAV data and return a NumPy array of samples. Uses scipy to read and librosa to process WAV data. Audio will be converted to mono if necessary. Args: wav_data: WAV audio data to read. sample_rate: The number of samples per second at which the audio will be returned. Resampling will be performed if necessary. Returns: A numpy array of audio samples, single-channel (mono) and sampled at the specified rate, in float32 format. Raises: AudioIOReadException: If scipy is unable to read the WAV data. AudioIOException: If audio processing fails. """ try: # Read the wav file, converting sample rate & number of channels. native_sr, y = scipy.io.wavfile.read(six.BytesIO(wav_data)) except Exception as e: # pylint: disable=broad-except raise AudioIOReadException(e) if y.dtype != np.int16: raise AudioIOException('WAV file not 16-bit PCM, unsupported') try: # Convert to float, mono, and the desired sample rate. y = int16_samples_to_float32(y) if y.ndim == 2 and y.shape[1] == 2: y = y.T y = librosa.to_mono(y) if native_sr != sample_rate: y = librosa.resample(y, native_sr, sample_rate) except Exception as e: # pylint: disable=broad-except raise AudioIOException(e) return y
def asr_transcript(model, tokenizer, input_file):
    if not os.path.isfile(input_file):
        raise FileNotFoundError
    # tokenizer, model = load_model()
    speech, fs = sf.read(input_file)
    if len(speech.shape) > 1:
        speech = speech[:, 0] + speech[:, 1]
    if fs != 16000:
        speech = librosa.resample(speech, fs, 16000)
    input_values = tokenizer(speech, return_tensors="pt").input_values
    input_values = input_values.to(device)
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])
    return correct_sentence(transcription.lower())
def supercompression(filename, mono, resample): y_file, fs = librosa.load(filename, mono=mono) if resample: targetsample = 2000 y_file = librosa.resample(y_file, target_sr=targetsample, orig_sr=fs) D = librosa.stft(y_file) S_db = np.transpose( np.array(librosa.amplitude_to_db(np.abs(D), ref=np.max))).tolist() edited = [] average = 0 count = 0 for line in S_db: temp = [] for data in line: data += 80 if data > 0: average += (data) count += 1 average = average / count for line in S_db: temp = [] for data in line: data += 80 if data == 0: category = 0 elif data > 0 and data < average * 0.5: category = 1 elif data > average * 0.5 and data < average: category = 2 elif data > average and data < average * 1.5: category = 3 elif data > average * 1.5: category = 4 temp.append(category) edited.append(temp) return edited
def read_audio():
    # Record audio and run prediction on it
    audio = sd.rec(duration * fs, samplerate=fs, channels=4, dtype='float64')
    # print("Recording Audio")
    sd.wait()
    audio = np.multiply(audio, negative)
    audio = np.multiply(audio, mul)
    audio = np.multiply(audio, negative)
    # sf.write('./test audio file/False_alarm.wav', audio, fs)
    # print('recorded!')
    audio = audio.T
    y = librosa.resample(audio, fs, 16000)
    y = np.asfortranarray(y)
    fe1, fe2, fe3, fe4 = extract_feature(y, sr=16000)
    rr1, rr2 = prediction(fe1, fe2, fe3, fe4)
    result = ''
    if not rr2:
        result = rr1
    else:
        result = rr1 + ", " + rr2
    return result
def __getitem__(self, idx): hop_length = 1024 # open audio file_path = self.data[idx] signal, sampling_rate = open_audio(file_path) if len(signal.shape) > 1: signal = np.mean(signal, axis = 1) if sampling_rate != 44100: signal = librosa.resample(signal, sampling_rate, 44100) sampling_rate = 44100 # get 30 second chunk len_index_30_sec = int(30 / (1 / sampling_rate)) # trim first and last 30 seconds signal = signal[len_index_30_sec:-len_index_30_sec] # random start index start_index = np.random.randint(low = 0, high = len(signal) - len_index_30_sec) signal = signal[start_index:start_index + len_index_30_sec] # if training change pitch randomly if self.train: n_steps = np.random.randint(low = -4, high=4) signal = librosa.effects.pitch_shift(signal, sampling_rate, n_steps=n_steps) # extract harmonic data_h = librosa.effects.harmonic(signal) # cqt transform S = np.real(librosa.cqt(data_h, sr=sampling_rate, hop_length=hop_length)).astype(np.float32) d = torch.from_numpy(np.expand_dims(S, axis = 0)).type(torch.FloatTensor) # normalize d = F.normalize(d) l = torch.from_numpy(np.array(self.labels[idx])).type(torch.LongTensor) # print(d.shape, sampling_rate, file_path) return d,l
def generate_amplitude_envelopes(signal, sr, num_bands, min_frequency, max_frequency, mode='downsample'): signal = preemphasize(signal, 0.97) proc = signal / np.sqrt(np.mean(signal ** 2)) * 0.03 band_mins = [min_frequency * np.exp(np.log(max_frequency / min_frequency) / num_bands) ** x for x in range(num_bands)] band_maxes = [min_frequency * np.exp(np.log(max_frequency / min_frequency) / num_bands) ** (x + 1) for x in range(num_bands)] envs = [] for i in range(num_bands): b, a = butter(2, (band_mins[i] / (sr / 2), band_maxes[i] / (sr / 2)), btype='bandpass') env = filtfilt(b, a, proc) env = abs(hilbert(env)) if mode == 'downsample': env = resample(env, sr, 120) envs.append(env) envs = np.array(envs).T if mode == 'downsample': sr = 120 output = dict() for i in range(envs.shape[0]): output[i / sr] = envs[i, :] return output
def preprocess_wav(fpath_or_wav, source_sr = None): """ Applies the preprocessing operations used in training the Speaker Encoder to a waveform either on disk or in memory. The waveform will be resampled to match the data hyperparameters. :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not just .wav), either the waveform as a numpy array of floats. :param source_sr: if passing an audio waveform, the sampling rate of the waveform before preprocessing. After preprocessing, the waveform's sampling rate will match the data hyperparameters. If passing a filepath, the sampling rate will be automatically detected and this argument will be ignored. """ # Load the wav from disk if needed if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): wav, source_sr = librosa.load(fpath_or_wav, sr=None) else: wav = fpath_or_wav # Resample the wav if needed if source_sr is not None and source_sr != sampling_rate: wav = librosa.resample(wav, source_sr, sampling_rate) ## Apply the preprocessing: normalize volume and shorten long silences #wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) #wav = trim_long_silences(wav) return wav
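A hypothetical usage sketch for preprocess_wav above (the path is a placeholder, and `sampling_rate` is assumed to be the module-level hyperparameter the function references, e.g. 16000):

import numpy as np

wav_from_disk = preprocess_wav("speaker_utterance.flac")
wav_from_array = preprocess_wav(np.zeros(32000, dtype=np.float32), source_sr=32000)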
def pool_msr(path, dirname, mrs_dir, msf_dir, ftype = 3): dirs = os.listdir("%s/%s"%(path, dirname)) for file in dirs: if file.endswith('.wav'): print('%s/%s/%s' % (path, dirname, file)) if dirname == "": sample_rate, signal = scipy.io.wavfile.read(path, file) feat = logfbank(signal) mf = srmr_audio(path, file, pad) else: signal, fs = librosa.load("%s/%s/%s" % (path, dirname, file)) signal = librosa.resample(signal, fs, 16000) feat1 = logfbank(signal) feat2 = ssc(signal) mrs = feat1 write_file(mrs_dir, file, mrs, ftype) msf = feat2 write_file(msf_dir, file, msf, ftype)
def preview(filename, directory):
    print('...Previewing Accompaniment')
    FNULL = open(os.devnull, 'w')
    subprocess.call([
        'fluidsynth', '-T', 'wav', '-F', directory + "/" + filename[:-4] + '.raw',
        '-ni', directory[:-10] + 'lib/sf2/sf.sf2',
        directory + "/" + filename[:-4] + '.mid', '-g', '0.8', '-r', '22050'
    ], stdout=FNULL, stderr=subprocess.STDOUT)
    subprocess.call([
        'SoX', '-t', 'raw', '-r', '22050', '-e', 'signed', '-b', '16', '-c', '1',
        directory + "/" + filename[:-4] + '.raw',
        directory + "/" + filename[:-4] + '_midi.wav'
    ])
    y, sr = librosa.load(directory + "/" + filename)
    z, sr2 = librosa.load(directory + "/" + filename[:-4] + '_midi.wav')
    y = librosa.resample(y, sr, sr * 2)
    mix = np.zeros(max(len(y), len(z)), dtype=float)
    mix[:len(y)] += y / 2
    mix[:len(z)] += z / 2
    mix = np.int16(mix / np.max(np.abs(mix)) * 16383)
    write(directory + "/" + filename[:-4] + '_mix.wav', 44100, mix)
def process_wav_file(wav, orig_d_path, target_d_path, sample_duration, sample_rate, channels_to_extract): # read the orig wav file, and resample with given sample_rate wav_p = os.path.join(orig_d_path, wav) data, sr = sf.read(wav_p, dtype=np.float32) if data.shape[1] < channels_to_extract: raise ValueError('Not enough channels') # reduce it to a single channel and resample data = np.concatenate(data[:, :channels_to_extract]) data = librosa.resample(data, sr, sample_rate) # split it into the required duration frames_per_sample = int(sample_duration * sample_rate) # write the audio to the given directory idx = 0 start = 0 end = frames_per_sample while end < data.shape[0]: target_wav = wav[:-4] + '_' + str(idx) + '.wav' target_wav_p = os.path.join(target_d_path, target_wav) wavfile.write(target_wav_p, sample_rate, data[start:end]) idx += 1 start = end end += frames_per_sample
def get_spectrogram_feature(filepath):
    if filepath.split('/')[1] == 'TIMIT':
        sig = np.fromfile(filepath, dtype=np.int16)[512:].reshape((-1, 1))
    else:
        (rate, width, sig) = wavio.readwav(filepath)
    sig = sig.ravel().astype(np.float) / 32767
    sig = librosa.resample(sig, 16000, 8000) * 32767
    sig = sig.astype(np.int16)
    stft = torch.stft(torch.FloatTensor(sig),
                      N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=int(0.03 * SAMPLE_RATE),
                      window=torch.hamming_window(int(0.03 * SAMPLE_RATE)),
                      center=False,
                      normalized=False,
                      onesided=True)
    stft = (stft[:, :, 0].pow(2) + stft[:, :, 1].pow(2)).pow(0.5)
    amag = stft.numpy()
    feat = torch.FloatTensor(amag)
    feat = torch.FloatTensor(feat).transpose(0, 1)
    return feat
def get_x_y(i, fs, resample, directory, input_data_frame):
    """
    returns tuple of x, y data pair at sample rate fs
    written to ensure that parallel computations have the correct label
    """
    file_path = "{directory}/{filename}".format(
        directory=directory, filename=input_data_frame.iloc[i][0])
    y, sr = librosa.load(file_path, mono=False)
    if resample:
        y_8k = librosa.resample(y, sr, fs)
        result_x = librosa.util.fix_length(y_8k, fs)
    else:
        y_8k = y
        result_x = librosa.util.fix_length(y_8k, sr)
    result_y = get_y(i, input_data_frame)
    return result_x, result_y
def load_audio(audiofile):
    try:
        audio, sr = soundfile.read(audiofile)
        if audio.shape[1] != 1:
            audio = librosa.to_mono(audio.T)
        if sr != 16000:
            audio = librosa.resample(audio, sr, 16000)
    except:
        path_audio = Path(audiofile)
        filetype = path_audio.suffix
        assert filetype in ['.mp3', '.ogg', '.flac', '.wav', '.m4a', '.mp4'], filetype
        with tempfile.TemporaryDirectory() as tempdir:
            tempwav = Path(tempdir) / (path_audio.stem + '_temp' + '.flac')
            command = ['ffmpeg', '-i', audiofile, '-af', 'aformat=s16:16000', '-ac', '1', tempwav]
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = process.communicate()
            audio, sr = soundfile.read(tempwav)
    return audio
def spoil_audio(sample, sr=8000, rem_coef=0.1):
    """Spoil the wav audio file by removing measures.

    Inputs:
        sample (numpy.ndarray): wav file array;
        sr=8000 (int): incoming sample rate;
        rem_coef=0.1 (float): removing coefficient
    Return:
        numpy.ndarray: reprocessed sample wav file array"""
    new_sample_rate = sr + sr * rem_coef
    sample_filtered = librosa.resample(sample, orig_sr=sr, target_sr=new_sample_rate)
    # Build a pandas Series from the wav numpy.ndarray
    s1 = Series(sample_filtered)
    # Randomly choose indices
    rand_ind = np.random.choice(s1.index.values, sr, replace=False)
    # Keep only the randomly chosen indices from the original Series, in order
    return s1.loc[rand_ind].sort_index().values
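A hypothetical usage sketch for spoil_audio above (the clip is synthetic; it assumes pandas' Series is imported as in the snippet):

import numpy as np

clip = np.random.randn(3 * 8000).astype(np.float32)   # three seconds at 8 kHz
spoiled = spoil_audio(clip, sr=8000, rem_coef=0.1)     # sr randomly kept samples, in order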
def _resample_and_cut(cache_path, spec, example, label): """Resample and cut the audio files into snippets.""" def _new_path(i): # Note that `splitext` handles hidden files differently and here we just # assume that audio files are not hidden files. # os.path.splitext('/root/path/.wav') => ('/root/path/.wav', '') # instead of ('/root/path/', '.wav') as this code expects. filename_without_extension = os.path.splitext( os.path.basename(example))[0] new_filename = filename_without_extension + '_%d.wav' % i return os.path.join(cache_path, label, new_filename) sampling_rate, xs = wavfile.read(example) if xs.dtype != np.int16: raise ValueError( 'DataLoader expects 16 bit PCM encoded WAV files, but {} has type {}' .format(example, xs.dtype)) # Extract snippets. n_samples_per_snippet = int(spec.snippet_duration_sec * sampling_rate) begin_index = 0 count = 0 while begin_index + n_samples_per_snippet <= len(xs): snippet = xs[begin_index:begin_index + n_samples_per_snippet] if spec.target_sample_rate != sampling_rate: # Resample, librosa.resample only works with float32. # Ref: https://github.com/bmcfee/resampy/issues/44 snippet = snippet.astype(np.float32) snippet = librosa.resample( snippet, orig_sr=sampling_rate, target_sr=spec.target_sample_rate).astype(np.int16) wavfile.write(_new_path(count), spec.target_sample_rate, xs) begin_index += n_samples_per_snippet count += 1 return count
def get_audio(config, mp3_path=None, array=None, array_sr=None): if mp3_path: array, sr_in = librosa.core.load(mp3_path, sr=None, mono=False) elif array is not None: array = array.astype(np.float32) sr_in = array_sr array = librosa.core.to_mono(array) array = librosa.resample(array, sr_in, config.sr) array = librosa.core.power_to_db( librosa.feature.melspectrogram(array, config.sr, n_mels=config.n_mels)) array = array.astype(np.float32) # normalization mean, variance = tf.nn.moments(tf.constant(array), axes=[0, 1], keepdims=True) array = tf.nn.batch_normalization(array, mean, variance, offset=0, scale=1, variance_epsilon=.000001).numpy() return array
def __onset_time(song_path): ''' Loads the song at song_path, computes onsets, returns array of times and harmonic component of source separated audio ~~~~ ARGUMENTS ~~~~ - song_path (Path or str): path to audio file ~~~~ RETURNS ~~~~ - y_harmonic (1D numpy array): numpy representation of audio, sr=22050, with percussive components of audio removed - onset_times (list of float): list of onset times corresponding to audio in seconds ''' # Load the songs and the notes arrays one at a time # for idx in range (len(song_paths)): # Load the song y, sr = librosa.load(song_path) # resample the song if it isn't sr=22050 (for consistent sizing) if not sr == 22050: y = librosa.resample(y, sr, 22050) sr = 22050 #source seperation, margin can be tuned y_harmonic, _ = librosa.effects.hpss(y, margin=2.0) # Set Hop_len hop_len = 512 onset_frame_backtrack = librosa.onset.onset_detect(y_harmonic, sr=sr, hop_length=hop_len, backtrack=True) onset_times = librosa.frames_to_time(onset_frame_backtrack) return y_harmonic, onset_times
def resample(sample_rate=None, dir=None, csv_path=None): clips = [] start_time = time.time() # List all clips that appear on the csv (train, eval or test) if csv_path != 'test': with open(csv_path, 'r') as csvFile: reader = csv.reader(csvFile) for row in reader: clips.append(row[0]) csvFile.close() clips.remove('fname') else: clips = os.listdir(dir) if os.path.exists(dir+'/resampled/'): shutil.rmtree(dir+'/resampled', ignore_errors=True) # ignore errors whit read only files os.mkdir(dir+'/resampled') for clip in clips: # Audio clip is read data, sr = sf.read(dir+'/'+clip) data = data.T # Audio data is resampled to desired sample_rate if sr != sample_rate: data_resampled = librosa.resample(data, sr, sample_rate) # Processed data is saved into a directory under train_clip_dir sf.write(dir+'/resampled/'+clip, data_resampled, sample_rate, subtype='PCM_16') print('Audio data has been resampled successfully') elapsed_time = time.time() - start_time print('Elapsed time ' + str(elapsed_time) + ' seconds')
def k_filter(data, fs):
    """
    x_t: audio data in samples across time
    fs: sample rate of x_t

    TEMPORARY FUNCTION UNTIL THE LOUDNESS FUNCTION IS FIXED FOR ACTIVITY

    return: k-filtered data AND new 48khz fs
    """
    # Convert fs to 48khz to do K-Filtering
    if fs != 48000:
        data = librosa.resample(data, fs, 48000)
        fs = 48000
    # Hi-Shelf Boost of +4dB at 1681hz
    a1 = [1.0, -1.69065929318241, 0.73248077421585]
    b1 = [1.53512485958697, -2.69169618940638, 1.19839281085285]
    # Create High-Pass roll off at 38hz
    a2 = [1.0, -1.99004745483398, 0.99007225036621]
    b2 = [1.0, -2.0, 1.0]
    # Filter in succession
    return lfilter(b2, a2, lfilter(b1, a1, data)), fs
def analyze_sound(event, context): file_data = event file_name = file_data['name'] bucket_name = file_data['bucket'] blob = storage_client.bucket(bucket_name).get_blob(file_name) blob_name = blob.name _, temp_local_filename = tempfile.mkstemp() # Download file from bucket. blob.download_to_filename(temp_local_filename) y, sr = librosa.load(temp_local_filename) y_mono = librosa.to_mono(y) y_mono_22050 = librosa.resample(y_mono, sr, 22050) mfccs = librosa.feature.mfcc(y=y_mono_22050, sr=22050, n_mfcc=12) mean_mfccs = [np.mean(mfcc) for mfcc in mfccs] print(f'Audio file name: {file_name}') print(f'Audio file is {len(y)} samples long.') uid = uuid.uuid4() doc_ref = db.collection(u'sounds').document(str(uid)) doc_ref.set({ u'uid': str(uid), u'blob_name': blob_name, u'file_name': file_name, u'length': len(y), u'mean_mfccs': mean_mfccs })
filters_init = filters.copy() use_gpu = True filters_gpu = tf.placeholder(tf.float32, shape=filters.shape, name="Filters") Rn_gpu = tf.placeholder(tf.float32, shape=(batch_size, filter_size), name="Rn") prods_gpu = math_ops.matmul(filters_gpu, tf.transpose(Rn_gpu)) gpu_session = tf.Session() assert not use_gpu or jobs == 1, "Can't use gpu with multiple processes" data_source = [data_source[1]] for source_id, source_filename in enumerate(data_source): data, source_sr = lr.load(source_filename) data = data[:500000] data = lr.resample(data, source_sr, target_sr, scale=True) data_test = lr.resample(data, target_sr, source_sr, scale=True) lr.output.write_wav("/home/alexeyche/Music/ml/test.wav", data_test, source_sr) data_denom = np.sqrt(np.sum(data ** 2)) data = data/data_denom data = np.concatenate([data, np.zeros(filter_size)]) print "Source with id {} and file {}".format(source_id, source_filename) processes = [] records = [] def sync(wait_all=True): global dfilters, records, processes, filters while len(processes)>0:
def __init__(self, db, # data source name = '', # optional name selectors = dict(), partitioner = None, meta_sources = [], # optional sources other than 'features' and 'targets' from metadata channel_filter = NoChannelFilter(), # optional channel filter, default: keep all channel_names = None, # optional channel names (for metadata) label_attribute = 'label', # metadata attribute to be used as label label_map = None, # optional conversion of labels use_targets = True, # use targets if provides, otherwise labels are used remove_dc_offset = False, # optional subtraction of channel mean, usually done already earlier resample = None, # optional down-sampling normalize = True, # normalize to max=1 # optional sub-sequences selection start_sample = 0, stop_sample = None, # optional for selection of sub-sequences zero_padding = True, # if True (default) trials that are too short will be padded with # otherwise they will rejected. # optional signal filter to by applied before splitting the signal signal_filter = None, trial_processors = [], # optional processing of the trials target_processor = None, # optional processing of the targets, e.g. zero-padding transformers = [], # optional transformations of the dataset layout='tf', # (0,1)-axes layout tf=time x features or ft=features x time debug=False, ): ''' Constructor ''' # save params self.params = locals().copy() del self.params['self'] # print self.params self.name = name self.debug = debug metadb = DatasetMetaDB(db.metadata, selectors.keys()) if partitioner is not None: pass # FIXME selected_trial_ids = metadb.select(selectors) log.info('selectors: {}'.format(selectors)) log.info('selected trials: {}'.format(selected_trial_ids)) if normalize: log.info('Data will be normalized to max amplitude 1 per channel (normalize=True).') trials = list() labels = list() targets = list() meta = list() if stop_sample == 'auto-min': stop_sample = np.min([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids]) log.info('Using minimum trial length. stop_sample={}'.format(stop_sample)) elif stop_sample == 'auto-max': stop_sample = np.max([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids]) log.info('Using maximum trial length. 
stop_sample={}'.format(stop_sample)) for trial_i in selected_trial_ids: trial_meta = db.metadata[trial_i] if use_targets: if targets is None: target = None else: target = db.targets[trial_i] assert not np.isnan(np.sum(target)) if target_processor is not None: target = target_processor.process(target, trial_meta) assert not np.isnan(np.sum(target)) else: # get and process label label = db.metadata[trial_i][label_attribute] if label_map is not None: label = label_map[label] processed_trial = [] trial = db.data[trial_i] if np.isnan(np.sum(trial)): print trial_i, trial assert not np.isnan(np.sum(trial)) rejected = False # flag for trial rejection trial = np.atleast_2d(trial) # process 1 channel at a time for channel in xrange(trial.shape[0]): # filter channels if not channel_filter.keep_channel(channel): continue samples = trial[channel, :] # subtract channel mean if remove_dc_offset: samples -= samples.mean() # down-sample if requested if resample is not None and resample[0] != resample[1]: samples = librosa.resample(samples, resample[0], resample[1], res_type='sinc_best') # apply optional signal filter after down-sampling -> requires lower order if signal_filter is not None: samples = signal_filter.process(samples) # get sub-sequence in resampled space # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape)) if stop_sample is not None and stop_sample > len(samples): if zero_padding: tmp = np.zeros(stop_sample) tmp[:len(samples)] = samples samples = tmp else: rejected = True break # stop processing this trial s = samples[start_sample:stop_sample] # TODO optional channel processing # normalize to max amplitude 1 if normalize: s = librosa.util.normalize(s) # add 2nd data dimension s = s.reshape(s.shape[0], 1) # print s.shape s = np.asfarray(s, dtype=theano.config.floatX) processed_trial.append(s) ### end of channel iteration ### if rejected: continue # next trial processed_trial = np.asfarray([processed_trial], dtype=theano.config.floatX) # processed_trial = processed_trial.reshape((1, processed_trial.shape)) processed_trial = np.rollaxis(processed_trial, 1, 4) # optional (external) trial processing, e.g. 
windowing # trials will be in b01c format with tf layout for 01-axes for trial_processor in trial_processors: processed_trial = trial_processor.process(processed_trial, trial_meta) trials.append(processed_trial) for k in range(len(processed_trial)): meta.append(trial_meta) if use_targets: targets.append(target) else: labels.append(label) ### end of datafile iteration ### # turn into numpy arrays self.trials = np.vstack(trials) assert not np.isnan(np.sum(self.trials)) # prepare targets / labels if use_targets: self.targets = np.vstack(targets) assert not np.isnan(np.sum(self.targets)) else: labels = np.hstack(labels) if label_map is None: one_hot_formatter = OneHotFormatter(max(labels) + 1) else: one_hot_formatter = OneHotFormatter(max(label_map.values()) + 1) one_hot_y = one_hot_formatter.format(labels) self.targets = one_hot_y self.metadata = meta if layout == 'ft': # swap axes to (batch, feature, time, channels) self.trials = self.trials.swapaxes(1, 2) # transform after finalizing the data structure for transformer in transformers: self.trials, self.targets = transformer.process(self.trials, self.targets) self.trials = np.asarray(self.trials, dtype=theano.config.floatX) log.debug('final dataset shape: {} (b,0,1,c)'.format(self.trials.shape)) # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c']) self.X = self.trials.reshape(self.trials.shape[0], np.prod(self.trials.shape[1:])) self.y = self.targets log.info('generated dataset "{}" with shape X={}={} y={} targets={} '. format(self.name, self.X.shape, self.trials.shape, self.y.shape, self.targets.shape)) # determine data specs features_space = Conv2DSpace( shape=[self.trials.shape[1], self.trials.shape[2]], num_channels=self.trials.shape[3] ) features_source = 'features' targets_space = VectorSpace(dim=self.targets.shape[-1]) targets_source = 'targets' space_components = [features_space, targets_space] source_components = [features_source, targets_source] # additional support for meta information self.meta_maps = dict() for meta_source in meta_sources: self.meta_maps[meta_source] = sorted(list(set([m[meta_source] for m in self.metadata]))) space_components.extend([VectorSpace(dim=1)]) source_components.extend([meta_source]) log.info('Generated meta-source "{}" with value map: {}' .format(meta_source, self.meta_maps[meta_source])) space = CompositeSpace(space_components) source = tuple(source_components) self.data_specs = (space, source) log.debug('data specs: {}'.format(self.data_specs))
# tvars = tf.trainable_variables()
# grads_raw = tf.gradients(cost, tvars)
# grads, _ = tf.clip_by_global_norm(grads_raw, 5.0)
# apply_grads = optimizer.apply_gradients(zip(grads, tvars))

##################################
# DATA

fname = env.dataset([f for f in os.listdir(env.dataset()) if f.endswith(".wav")][0])

df = env.run("test_data.pkl")
if not os.path.exists(df):
    song_data_raw, source_sr = lr.load(fname)
    print "Got sampling rate {}, resampling to {} ...".format(source_sr, target_sr)
    song_data = lr.resample(song_data_raw, source_sr, target_sr, scale=True)
    song_data = song_data[:30000]
    np.save(open(df, "wb"), song_data)
else:
    song_data = np.load(open(df, "rb"))

inputs_v, data_denom = norm(song_data)

##################################
# EVALUATION

sess = tf.Session()
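# --- Hedged sketch (not the original code) -----------------------------------
# The same cache-on-first-use pattern in isolation: load a wav, resample it
# once, and store the array so later runs skip the resampling step. The file
# names and target_sr below are illustrative; cache_path should end in ".npy"
# because np.save appends that suffix otherwise.
import os
import numpy as np
import librosa as lr

def load_resampled(wav_path, cache_path, target_sr=3000):
    if not os.path.exists(cache_path):
        y, source_sr = lr.load(wav_path)
        y = lr.resample(y, source_sr, target_sr, scale=True)  # scale preserves energy
        np.save(cache_path, y)   # np.save opens the file in binary mode itself
    else:
        y = np.load(cache_path)
    return y
# ------------------------------------------------------------------------------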
def __init__(self, path,
             name='',                        # optional name
             # selectors
             subjects='all',                 # optional selector (list) or 'all'
             trial_types='all',              # optional selector (list) or 'all'
             trial_numbers='all',            # optional selector (list) or 'all'
             conditions='all',               # optional selector (list) or 'all'
             partitioner=None,
             channel_filter=NoChannelFilter(),  # optional channel filter, default: keep all
             channel_names=None,             # optional channel names (for metadata)
             label_map=None,                 # optional conversion of labels
             remove_dc_offset=False,         # optional subtraction of channel mean, usually done already earlier
             resample=None,                  # optional down-sampling
             # optional sub-sequence selection
             start_sample=0,
             stop_sample=None,               # optional for selection of sub-sequences
             # optional signal filter to be applied before splitting the signal
             signal_filter=None,
             # windowing parameters
             frame_size=-1,
             hop_size=-1,                    # values > 0 will lead to windowing
             hop_fraction=None,              # alternative to specifying absolute hop_size
             # optional spectrum parameters, n_fft = 0 keeps raw data
             n_fft=0,
             n_freq_bins=None,
             spectrum_log_amplitude=False,
             spectrum_normalization_mode=None,
             include_phase=False,
             flatten_channels=False,
             layout='tf',                    # (0,1)-axes layout tf=time x features or ft=features x time
             save_matrix_path=None,
             keep_metadata=False,
             ):
    '''
    Constructor
    '''

    # save params
    self.params = locals().copy()
    del self.params['self']
    # print self.params

    # TODO: get the whole filtering into an extra class
    datafiles_metadata, metadb = load_datafiles_metadata(path)
    # print datafiles_metadata

    def apply_filters(filters, node):
        if isinstance(node, dict):
            filtered = []
            keepkeys = filters[0]
            for key, value in node.items():
                if keepkeys == 'all' or key in keepkeys:
                    filtered.extend(apply_filters(filters[1:], value))
            return filtered
        else:
            return node  # [node]

    # keep only files that match the metadata filters
    self.datafiles = apply_filters([subjects, trial_types, trial_numbers, conditions], datafiles_metadata)

    # copy metadata for retained files
    self.metadb = {}
    for datafile in self.datafiles:
        self.metadb[datafile] = metadb[datafile]

    # print self.datafiles
    # print self.metadb

    self.name = name

    if partitioner is not None:
        self.datafiles = partitioner.get_partition(self.name, self.metadb)

    self.include_phase = include_phase
    self.spectrum_normalization_mode = spectrum_normalization_mode
    self.spectrum_log_amplitude = spectrum_log_amplitude

    self.sequence_partitions = []  # used to keep track of original sequences

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    n_sequences = 0

    if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
        hop_size = np.ceil(frame_size / hop_fraction)

    for i in xrange(len(self.datafiles)):
        with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):

            # save start of next sequence
            self.sequence_partitions.append(n_sequences)

            data, metadata = load(os.path.join(path, self.datafiles[i]))

            label = metadata['label']
            if label_map is not None:
                label = label_map[label]

            multi_channel_frames = []

            # process 1 channel at a time
            for channel in xrange(data.shape[1]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = data[:, channel]

                # subtract channel mean
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1])

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample, stop_sample, samples.shape))
                samples = samples[start_sample:stop_sample]

                if n_fft is not None and n_fft > 0:
                    ### frequency spectrum branch ###

                    # transform to spectrogram
                    hop_length = n_fft / 4

                    '''
                    from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                    >>> # Get a power spectrogram from a waveform y
                    >>> S = np.abs(librosa.stft(y)) ** 2
                    >>> log_S = librosa.logamplitude(S)
                    '''

                    S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                    # mag = np.abs(S)       # magnitude spectrum
                    mag = np.abs(S) ** 2    # power spectrum

                    # include phase information if requested
                    if self.include_phase:
                        # phase = np.unwrap(np.angle(S))
                        phase = np.angle(S)

                    # optionally cut off high bands
                    if n_freq_bins is not None:
                        mag = mag[0:n_freq_bins, :]
                        if self.include_phase:
                            phase = phase[0:n_freq_bins, :]

                    if self.spectrum_log_amplitude:
                        mag = librosa.logamplitude(mag)

                    s = mag  # for normalization

                    '''
                    NOTE on normalization:
                    It depends on the structure of the neural network and (even more) on the
                    properties of the data. There is no single best normalization algorithm;
                    if there were one, it would be used everywhere by default.

                    In theory, there is no requirement for the data to be normalized at all.
                    It is a purely practical matter: convergence can take very long if the
                    input is spread out too much. The simplest approach is to scale the data
                    to (-1, 1) (or (0, 1), depending on the activation function), which works
                    in most cases. If the algorithm converges well, that is the answer; if
                    not, there are too many possible problems and methods to outline here
                    without knowing the actual data.
                    '''

                    ## normalize to mean 0, std 1
                    if self.spectrum_normalization_mode == 'mean0_std1':
                        # s = preprocessing.scale(s, axis=0)
                        mean = np.mean(s)
                        std = np.std(s)
                        s = (s - mean) / std

                    ## normalize by linear transform to [0,1]
                    elif self.spectrum_normalization_mode == 'linear_0_1':
                        s = s / np.max(s)

                    ## normalize by linear transform to [-1,1]
                    elif self.spectrum_normalization_mode == 'linear_-1_1':
                        s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))

                    elif self.spectrum_normalization_mode is not None:
                        raise ValueError(
                            'unsupported spectrum normalization mode {}'.format(
                                self.spectrum_normalization_mode)
                        )

                    # print s.mean(axis=0)
                    # print s.std(axis=0)

                    # include phase information if requested
                    if self.include_phase:
                        # normalize phase to [-1,1]
                        phase = phase / np.pi
                        s = np.vstack([s, phase])

                    # transpose to fit pylearn2 layout
                    s = np.transpose(s)
                    # print s.shape

                    ### end of frequency spectrum branch ###
                else:
                    ### raw waveform branch ###

                    # normalize to max amplitude 1
                    s = librosa.util.normalize(samples)

                    # add 2nd data dimension
                    s = s.reshape(s.shape[0], 1)
                    # print s.shape

                    ### end of raw waveform branch ###

                s = np.asfarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset -
                    # OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                    s = s.copy()
                    frames = frame(s, frame_length=frame_size, hop_length=hop_size)
                else:
                    frames = s
                del s
                # print frames.shape

                if flatten_channels:
                    # add artificial channel dimension
                    frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1))
                    # print frames.shape

                    sequences.append(frames)

                    # increment counter by new number of frames
                    n_sequences += frames.shape[0]

                    if keep_metadata:
                        # determine channel name
                        channel_name = None
                        if channel_names is not None:
                            channel_name = channel_names[channel]
                        elif 'channels' in metadata:
                            channel_name = metadata['channels'][channel]

                        self.metadata.append({
                            'subject': metadata['subject'],
                            'trial_type': metadata['trial_type'],
                            'trial_no': metadata['trial_no'],
                            'condition': metadata['condition'],
                            'channel': channel,
                            'channel_name': channel_name,
                            'start': self.sequence_partitions[-1],
                            'stop': n_sequences
                        })

                    for _ in xrange(frames.shape[0]):
                        labels.append(label)
                else:
                    multi_channel_frames.append(frames)

            ### end of channel iteration ###

            if not flatten_channels:
                # turn list into array
                multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                # [channels x frames x time x freq] -> cb01
                # [channels x frames x time x 1] -> cb0.
                # move channel dimension to end
                multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4)
                # print multi_channel_frames.shape
                # log.debug(multi_channel_frames.shape)

                sequences.append(multi_channel_frames)

                # increment counter by new number of frames
                n_sequences += multi_channel_frames.shape[0]

                if keep_metadata:
                    self.metadata.append({
                        'subject': metadata['subject'],
                        'trial_type': metadata['trial_type'],
                        'trial_no': metadata['trial_no'],
                        'condition': metadata['condition'],
                        'channel': 'all',
                        'start': self.sequence_partitions[-1],
                        'stop': n_sequences
                    })

                for _ in xrange(multi_channel_frames.shape[0]):
                    labels.append(label)

    ### end of datafile iteration ###

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # print sequences.shape

    labels = np.hstack(labels)

    # one_hot_y = one_hot(labels)
    one_hot_formatter = OneHotFormatter(labels.max() + 1)  # FIXME!
    one_hot_y = one_hot_formatter.format(labels)

    self.labels = labels

    if layout == 'ft':
        # swap axes to (batch, feature, time, channels)
        sequences = sequences.swapaxes(1, 2)

    log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
    super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])

    log.info('generated dataset "{}" with shape X={}={} y={} labels={} '.
             format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape))

    if save_matrix_path is not None:
        matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
            serial.save(save_matrix_path, matrix)
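# --- Hedged sketch (not part of the original class) ---------------------------
# The spectrum-normalization modes from the constructor above, isolated into a
# small helper so they can be tested on their own. The function name is
# illustrative; the modes and formulas mirror the branch above.
import numpy as np

def normalize_spectrum(s, mode):
    if mode == 'mean0_std1':
        return (s - np.mean(s)) / np.std(s)   # zero mean, unit variance
    elif mode == 'linear_0_1':
        return s / np.max(s)                  # divide by global max ([0,1] for non-negative input)
    elif mode == 'linear_-1_1':
        return -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))  # map to [-1,1]
    elif mode is None:
        return s
    raise ValueError('unsupported spectrum normalization mode {}'.format(mode))

# usage on a random power spectrogram
spec = np.abs(np.random.randn(513, 100)) ** 2
scaled = normalize_spectrum(spec, 'linear_0_1')
# -------------------------------------------------------------------------------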