def decompose(y, n_components=8):
    # How about something more advanced? Let's decompose a spectrogram
    # with NMF, and then resynthesize an individual component.
    D = librosa.stft(y)

    # Separate the magnitude and phase
    S, phase = librosa.magphase(D)

    # Decompose by NMF
    components, activations = librosa.decompose.decompose(S, n_components,
                                                          sort=True)

    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    librosa.display.specshow(librosa.logamplitude(components**2.0,
                                                  ref_power=np.max),
                             y_axis='log')
    plt.xlabel('Component')
    plt.ylabel('Frequency')
    plt.title('Components')

    plt.subplot(1, 2, 2)
    librosa.display.specshow(activations)
    plt.xlabel('Time')
    plt.ylabel('Component')
    plt.title('Activations')

    plt.tight_layout()
    plt.savefig('components_activations.png')

    print('components', components.shape)
    print('activations', activations.shape)

    return components, activations, phase
def transform_audio(self, y):
    mag, phase = librosa.magphase(librosa.stft(y,
                                               hop_length=self.hop_length,
                                               n_fft=self.n_fft,
                                               dtype=np.float32))
    return {'mag': mag.T, 'phase': np.angle(phase.T)}
def __test_consistency(frame_length, hop_length, center):
    y, sr = librosa.load(__EXAMPLE_FILE, sr=None)

    # Ensure audio is divisible into frame size.
    y = librosa.util.fix_length(y, y.size - y.size % frame_length)
    assert y.size % frame_length == 0

    # STFT magnitudes with a constant windowing function and no centering.
    S = librosa.magphase(librosa.stft(y,
                                      n_fft=frame_length,
                                      hop_length=hop_length,
                                      window=np.ones,
                                      center=center))[0]

    # Try both RMS methods.
    rms1 = librosa.feature.rms(S=S, frame_length=frame_length,
                               hop_length=hop_length)
    rms2 = librosa.feature.rms(y=y, frame_length=frame_length,
                               hop_length=hop_length, center=center)

    assert rms1.shape == rms2.shape

    # Normalize envelopes.
    rms1 /= rms1.max()
    rms2 /= rms2.max()

    # Ensure results are similar.
    np.testing.assert_allclose(rms1, rms2, rtol=5e-2)
def parse_audio(path, audio_conf, windows, normalize=False):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for the spectrogram
        windows    : dict, window types
    Output:
        spect      : FloatTensor, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate'] * audio_conf['window_size'])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate'] * audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = torch.FloatTensor(spect)
    spect = spect.log1p()
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect.transpose(0, 1)
def parse_audio(path, audio_conf, windows, normalize=False):
    '''Compute the log-magnitude spectrogram of an audio file with librosa.

    Args:
        path(string)     : path of the audio file
        audio_conf(dict) : parameters for the spectrogram
        windows(dict)    : window types
    Returns:
        spect(FloatTensor) : log-magnitude spectrogram (numFrames * nFeatures),
                             where nFeatures = n_fft / 2 + 1
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate'] * audio_conf['window_size'])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate'] * audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = torch.FloatTensor(spect)
    spect = spect.log1p()
    # Normalize each utterance on its own
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect.transpose(0, 1)
def parse_audio(self, audio_path):
    if self.augment:
        y = load_randomly_augmented_audio(audio_path, self.sample_rate)
    else:
        y = load_audio(audio_path)
    if self.noiseInjector:
        add_noise = np.random.binomial(1, self.noise_prob)
        if add_noise:
            y = self.noiseInjector.inject_noise(y)
    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)
    # STFT
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    spect, phase = librosa.magphase(D)
    # S = log(S + 1)
    spect = np.log1p(spect)
    spect = torch.FloatTensor(spect)
    if self.normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect
def parse_audio(path, audio_conf, windows, normalize=True):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for the spectrogram
        windows    : dict, window types
    Output:
        spect      : ndarray, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate'] * audio_conf['window_size'])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate'] * audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = np.log1p(spect)
    if normalize:
        mean = spect.mean()
        std = spect.std()
        spect = np.add(spect, -mean)
        spect = np.divide(spect, std)
    return spect.transpose()
def Magnitude_phase(spectrogram):
    Magnitude_list = []
    Phase_list = []
    for X in spectrogram:
        mag, phase = librosa.magphase(X)
        Magnitude_list.append(mag)
        Phase_list.append(phase)
    return Magnitude_list, Phase_list
def test_magphase():
    (y, sr) = librosa.load(os.path.join('data', 'test1_22050.wav'))
    D = librosa.stft(y)
    S, P = librosa.magphase(D)
    assert np.allclose(S * P, D)
def mix_by_spectogram(src_path, tgt_paths, n_fft=4096, hop_length=1024):
    # STFT from source
    src_signal, sr = librosa.load(src_path)
    src_stft = librosa.stft(src_signal, n_fft, hop_length)
    src_mag, src_phase = librosa.magphase(src_stft)
    src_spectra = Spectra(src_mag, src_phase, sr)

    targets = {}
    # STFT from paths
    for path in tgt_paths:
        signal, sr = librosa.load(path)
        D_tgt = librosa.stft(signal, n_fft, hop_length)
        tgt_mag, tgt_phase = librosa.magphase(D_tgt)
        tgt_spectra = Spectra(tgt_mag, tgt_phase, sr)
        targets[path] = tgt_spectra

    length = len(src_stft[0])

    # Compute distances
    for i in range(len(src_spectra.magnitude)):
        print(i)
        distance = None
        closest = src_spectra.magnitude * 0
        for target in targets.values():
            try:
                cap = min(len(target.magnitude[i]),
                          len(src_spectra.magnitude[i]))
                new_dist = norm(target.magnitude[i][:cap] - src_mag[i][:cap])
                if distance is None or new_dist < distance:
                    distance = new_dist
                    closest = target
            except IndexError:
                print('IDX Error')
        cap = min(len(closest.magnitude[i]), len(src_mag[i]))
        # Add magnitudes and phases
        src_spectra.magnitude[i][:cap] += closest.magnitude[i][:cap]
        src_spectra.phase[i][:cap] += closest.phase[i][:cap]

    # Average magnitudes and phases
    src_spectra.magnitude *= 0.5
    src_spectra.phase *= 0.5

    signal = librosa.istft(src_spectra.magnitude * src_spectra.phase)
    librosa.output.write_wav(src_path[:-4] + "-mix.wav", signal, 2 * sr)
def transform_audio(self, y):
    cqt, phase = librosa.magphase(
        librosa.cqt(y=y, sr=self.sr,
                    hop_length=self.hop_length,
                    fmin=self.fmin,
                    n_bins=self.n_octaves * self.over_sample * 12,
                    bins_per_octave=self.over_sample * 12,
                    real=False))
    return {'mag': cqt.T.astype(np.float32),
            'phase': np.angle(phase).T.astype(np.float32)}
def spectrogram2wav(mag, n_fft, win_length, hop_length, num_iters,
                    phase_angle=None, length=None):
    assert num_iters > 0
    if phase_angle is None:
        phase_angle = np.pi * np.random.rand(*mag.shape)
    spec = mag * np.exp(1.j * phase_angle)
    for i in range(num_iters):
        wav = librosa.istft(spec, win_length=win_length,
                            hop_length=hop_length, length=length)
        if i != num_iters - 1:
            spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length,
                                hop_length=hop_length)
            _, phase = librosa.magphase(spec)
            phase_angle = np.angle(phase)
            spec = mag * np.exp(1.j * phase_angle)
    return wav
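A minimal usage sketch for spectrogram2wav above. The file name and STFT parameters here are illustrative assumptions, not part of the original snippet:

import librosa
import numpy as np

# Hypothetical input file and parameters; only the call pattern matters.
y, sr = librosa.load('example.wav', sr=None)
n_fft, hop = 1024, 256
mag, _ = librosa.magphase(librosa.stft(y, n_fft=n_fft, hop_length=hop))
y_rec = spectrogram2wav(mag, n_fft=n_fft, win_length=n_fft,
                        hop_length=hop, num_iters=30, length=len(y))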
def _get_features(self, audio_data):
    _fs = 16000  # sampling rate
    hop_length = 160
    win_length = 400
    # utt, sr = librosa.load(audio_path, sr=None)
    audio_data = audio_data / np.max(audio_data)
    utt = self.pre_emp(audio_data)
    linear_spect = self.lin_spectogram_from_wav(utt, hop_length, win_length,
                                                n_fft=512)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    spec = mag.T
    logspec = np.log(spec + 1e-8).T
    return logspec
def wav_to_spectrogram_clips(wav_file):
    """Convert audio into a spectrogram, then chop it into 2-D segments of
    100 frames each."""
    # Convert audio into a spectrogram
    sound, sr = librosa.load(wav_file, sr=SR, mono=True)
    stft = librosa.stft(sound, n_fft=N_FFT, hop_length=HOP_LEN,
                        win_length=WIN_LEN)
    mag, phase = librosa.magphase(stft)
    # Chop the magnitude spectrogram into clips of 1025 bins x 100 frames
    stft_clips = np.empty((0, FREQ_BINS, 100))
    for i in range(mag.shape[1] // 100):
        stft_clips = np.concatenate(
            (stft_clips, mag[np.newaxis, :, i * 100:(i + 1) * 100]))
    return stft_clips
def make_feature(y, sr):
    if FEATURE == 'fft':
        S = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LEN, window=hamming)
        feature, _ = librosa.magphase(S)
        feature = np.log1p(feature)
        feature = feature.transpose()
    else:
        if FEATURE == 'logfbank':
            feature = logfbank(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        else:
            feature = mfcc(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        feature_d1 = delta(feature, N=1)
        feature_d2 = delta(feature, N=2)
        feature = np.hstack([feature, feature_d1, feature_d2])
    return normalize(feature)
def load_custom_data(args):
    datalist = os.listdir(args.custompath)
    input_data = list()
    for dataname in datalist:
        speech_data, _ = librosa.load(args.custompath + dataname, sr=16000)
        speech_data = speech_data / np.max(speech_data)
        mel_data = get_mel_feature(speech_data, args)
        mel_data, _ = librosa.magphase(mel_data)
        text = os.path.splitext(dataname)[0]
        filtered_data = sentence_filter(text)
        jamo = split_syllables(filtered_data)
        label = np.array(jamo_to_label(jamo))
        input_data.append([mel_data, filtered_data, label])
    return input_data
def __inverse_transform(self, data, n_iter):
    data = self.scaler.inverse_transform(data)
    data = data.T
    complex_specgram = self.inv_magphase(data, 0.0)
    for i in range(n_iter):
        audio = librosa.core.istft(complex_specgram, win_length=self.n_fft)
        if i != n_iter - 1:
            complex_specgram = librosa.core.stft(audio, n_fft=self.n_fft)
            _, phase = librosa.magphase(complex_specgram)
            phase_angle = np.angle(phase)
            complex_specgram = self.inv_magphase(data, phase_angle)
    return audio
def spectralCent(song):
    # Forward slashes avoid the invalid "\U" escape in the original path.
    y, sr = librosa.load("C:/Users/Katherine/Music/" + song + ".mp3",
                         duration=60)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    S, phase = librosa.magphase(librosa.stft(y=y))
    librosa.feature.spectral_centroid(S=S)
    if_gram, D = librosa.ifgram(y)
    librosa.feature.spectral_centroid(S=np.abs(D), freq=if_gram)
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.semilogy(cent.T, label=song)
    plt.ylabel('Hz')
    plt.xticks([])
    plt.xlim([0, cent.shape[-1]])
    plt.legend()
def __getitem__(self, item):
    noisy_path, clean_path = self.dataset_list[item].split(" ")
    name = os.path.splitext(os.path.basename(noisy_path))[0]
    noisy, _ = librosa.load(os.path.abspath(os.path.expanduser(noisy_path)),
                            sr=self.sr)
    clean, _ = librosa.load(os.path.abspath(os.path.expanduser(clean_path)),
                            sr=self.sr)
    if self.train:
        noisy_mag, _ = librosa.magphase(
            librosa.stft(noisy, n_fft=self.n_fft,
                         hop_length=self.hop_length,
                         win_length=self.n_fft))
        clean_mag, _ = librosa.magphase(
            librosa.stft(clean, n_fft=self.n_fft,
                         hop_length=self.hop_length,
                         win_length=self.n_fft))
        return noisy_mag, clean_mag, noisy_mag.shape[-1], name
    else:
        return noisy, clean, name
def griffin_lim(magnitude, n_fft, hop_length, n_iterations):
    """Iterative algorithm for phase retrieval from a magnitude spectrogram."""
    phase_angle = np.pi * np.random.rand(*magnitude.shape)
    D = invert_magnitude_phase(magnitude, phase_angle)
    signal = librosa.istft(D, hop_length=hop_length)
    for i in range(n_iterations):
        D = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
        _, phase = librosa.magphase(D)
        phase_angle = np.angle(phase)
        D = invert_magnitude_phase(magnitude, phase_angle)
        signal = librosa.istft(D, hop_length=hop_length)
    return signal
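The snippet above depends on an invert_magnitude_phase helper that is not shown. A plausible one-line definition, consistent with how its result is fed to librosa.istft, might look like this (an assumption, not the original helper):

import numpy as np

def invert_magnitude_phase(magnitude, phase_angle):
    # Assumed helper: recombine a magnitude spectrogram and a phase angle
    # into the complex spectrogram expected by librosa.istft.
    return magnitude * np.exp(1j * phase_angle)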
def griffin_lim(self, magnitude, iters=30):
    '''
    Based on: https://github.com/soobinseo/Tacotron-pytorch/blob/master/data.py
    which in turn follows the librosa implementation of Griffin-Lim;
    see https://github.com/librosa/librosa/issues/434
    '''
    angles = np.exp(2j * np.pi * np.random.rand(*magnitude.shape))
    S_complex = np.abs(magnitude).astype(complex)
    y = librosa.istft(S_complex * angles)
    for i in range(iters):
        _, angles = librosa.magphase(librosa.stft(y))
        y = librosa.istft(S_complex * angles)
    return y
def mix_strategies(src, tgt):
    src_signal, sr = librosa.load(src)
    src_stft = librosa.stft(src_signal, n_fft, hop_length)
    src_mag, src_phase = librosa.magphase(src_stft)
    src_spectra = Spectra(src_mag, src_phase, sr)

    tgt_signal, sr = librosa.load(tgt)
    tgt_stft = librosa.stft(tgt_signal, n_fft, hop_length)
    tgt_mag, tgt_phase = librosa.magphase(tgt_stft)
    tgt_spectra = Spectra(tgt_mag, tgt_phase, sr)

    # Average of magnitude and phase
    for i in range(len(src_spectra.magnitude)):
        cap = min(len(src_spectra.magnitude[i]),
                  len(tgt_spectra.magnitude[i]))
        src_spectra.magnitude[i][:cap] += tgt_spectra.magnitude[i][:cap]
        src_spectra.phase[i][:cap] += tgt_spectra.phase[i][:cap]
    src_spectra.magnitude *= 0.5
    src_spectra.phase *= 0.5

    new_spectra = src_spectra
    new_signal = librosa.istft(new_spectra.magnitude * new_spectra.phase)
    librosa.output.write_wav(src[:-4] + "-mix.wav", new_signal,
                             2 * new_spectra.sr)
def decompose(self):
    # Filter out percussive parts
    hpss_y = self.hpss()
    # Perform short-time Fourier transform
    D = librosa.stft(hpss_y)
    # Separate the magnitude and phase
    S, phase = librosa.magphase(D)
    # NMF decomposition into components
    components, activations = self.decomposeNMF(hpss_y, S, self.n_components)
    # Reconstruct and return
    return [
        self.reconstructComponent(components[:, i], activations[i], phase)
        for i in range(0, len(activations))
    ]
def add_noise(audio_path, noise_path, percent=0.5, sr=16000):
    src, sr = librosa.load(audio_path, sr=sr)
    src_noise, sr = librosa.load(noise_path, sr=sr)
    # print(len(src), len(src_noise))
    if len(src) > len(src_noise):
        n = int(len(src) / len(src_noise))
        src_noise = src_noise.repeat(n + 1)
    flag = random.randint(0, len(src_noise) - len(src))
    src_noise = src_noise[flag:flag + len(src)]
    percent = 0.002 * random.randint(1, 5)
    src = src + percent * src_noise
    S = librosa.core.stft(src, n_fft=N_FFT, hop_length=HOP_LEN,
                          window=hamming)
    feature, _ = librosa.magphase(S)
    return feature
def load_data(path, win_length=400, sr=16000, hop_length=160, n_fft=512,
              spec_len=250, mode='train'):
    wav = load_wav(path, sr=sr, mode=mode)
    linear_spect = lin_spectogram_from_wav(wav, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    if mode == 'train':
        randtime = np.random.randint(0, time - spec_len)
        spec_mag = mag_T[:, randtime:randtime + spec_len]
    else:
        spec_mag = mag_T
    # Preprocessing: subtract mean, divide by time-wise std
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    return (spec_mag - mu) / (std + 1e-5)
def load_vocal_audio(self, y, sr):
    S_full, phase = librosa.magphase(librosa.stft(y))
    S_filter = librosa.decompose.nn_filter(
        S_full, aggregate=np.median, metric='cosine',
        width=int(librosa.time_to_frames(2, sr=sr)))
    S_filter = np.minimum(S_full, S_filter)
    margin_i, margin_v = 2, 10
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)
    S_foreground = mask_v * S_full
    output_data = librosa.griffinlim(S_foreground)
    return output_data, sr
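A brief usage sketch for load_vocal_audio. The instance name, input path, and the soundfile call are assumptions for illustration only:

import librosa
import soundfile as sf

y, sr = librosa.load('mix.wav', sr=None)        # hypothetical mixture file
vocals, sr = separator.load_vocal_audio(y, sr)  # `separator`: hypothetical instance
sf.write('vocals_estimate.wav', vocals, sr)     # write the foreground estimate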
def get_spectrum(track_name, sr, N, M, H):
    W = np.hanning(M)  # window

    # Load WAV file (mono=False must be a bool, not the string 'False')
    track, sr = load(track_name + '.wav', sr=sr, mono=False)

    # Perform short-time Fourier transform
    stft_ = stft(y=track, n_fft=N, win_length=M, hop_length=H, window='hann')

    # Magnitudes (excluding phase)
    magnitude, _ = magphase(stft_)
    magnitude = magnitude / np.sum(W)  # normalize STFT output

    # Spectrum average
    spec_avg = np.average(magnitude, axis=1)
    spec_avg = spec_avg / np.max(spec_avg)
    len_signal = spec_avg.shape[0]  # filter bank length

    return spec_avg, len_signal
def test_rms(y_ex, y2, frame_length, hop_length, center):
    y1, sr = y_ex

    # Ensure audio is divisible into frame size.
    y1 = librosa.util.fix_length(y1, y1.size - y1.size % frame_length)
    y2 = librosa.util.fix_length(y2, y2.size - y2.size % frame_length)
    assert y1.size % frame_length == 0
    assert y2.size % frame_length == 0

    # STFT magnitudes with a constant windowing function and no centering.
    S1 = librosa.magphase(
        librosa.stft(y1, n_fft=frame_length, hop_length=hop_length,
                     window=np.ones, center=center))[0]
    S2 = librosa.magphase(
        librosa.stft(y2, n_fft=frame_length, hop_length=hop_length,
                     window=np.ones, center=center))[0]

    # Try both RMS methods.
    rms1 = librosa.feature.rms(S=S1, frame_length=frame_length,
                               hop_length=hop_length)
    rms2 = librosa.feature.rms(y=y1, frame_length=frame_length,
                               hop_length=hop_length, center=center)
    rms3 = librosa.feature.rms(S=S2, frame_length=frame_length,
                               hop_length=hop_length)
    rms4 = librosa.feature.rms(y=y2, frame_length=frame_length,
                               hop_length=hop_length, center=center)

    assert rms1.shape == rms2.shape
    assert rms3.shape == rms4.shape

    # Ensure results are similar.
    np.testing.assert_allclose(rms1, rms2, atol=5e-4)
    np.testing.assert_allclose(rms3, rms4, atol=5e-4)
def spectrogram(wav, normalize=True):
    D = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spec, phase = librosa.magphase(D)
    spec = np.log1p(spec)
    spec = torch.FloatTensor(spec)
    if normalize:
        spec = (spec - spec.mean()) / spec.std()
    return spec
def load_data(path_spk_tuples, win_length=400, sr=16000, hop_length=160,
              n_fft=512, min_win_time=240, max_win_time=1600):
    # Window length in [240, 1600] ms
    win_time = np.random.randint(min_win_time, max_win_time, 1)[0]
    # Window and hop lengths in spectrum frames
    win_spec = win_time // (1000 // (sr // hop_length))
    hop_spec = win_spec // 2

    wavs = np.array([])
    change_points = []
    paths = list(zip(*path_spk_tuples))[0]
    speakers = list(zip(*path_spk_tuples))[1]

    for path in paths:
        wav = load_wav(path, sr=sr)  # VAD
        wavs = np.concatenate((wavs, wav))
        # Change point in spectrum frames
        change_points.append(wavs.shape[0] // hop_length)

    linear_spect = lin_spectogram_from_wav(wavs, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    spec_mag = mag_T

    utterance_specs = []
    utterance_speakers = []

    cur_spec = 0
    cur_speaker = speakers[0]
    i = 0
    while True:
        if cur_spec + win_spec > time:
            break
        spec_mag = mag_T[:, cur_spec:cur_spec + win_spec]

        # Current window spans into the next speaker
        if cur_spec + win_spec // 2 > change_points[i]:
            i += 1
            cur_speaker = speakers[i]

        # Preprocessing: subtract mean, divide by time-wise std
        mu = np.mean(spec_mag, 0, keepdims=True)
        std = np.std(spec_mag, 0, keepdims=True)
        spec_mag = (spec_mag - mu) / (std + 1e-5)

        utterance_specs.append(spec_mag)
        utterance_speakers.append(cur_speaker)
        cur_spec += hop_spec

    return utterance_specs, utterance_speakers
def calculate_SDR(music, model, n_fft=2048, hop_length=512, slice_duration=2):
    model.eval()
    scores = []
    sr = music.rate
    ind = 0
    mixture = librosa.to_mono(music.audio.transpose())
    vocal = librosa.to_mono(music.targets['vocals'].audio.transpose())
    for i in range(0, len(music.audio), slice_duration * sr):
        ind += 1
        # Slice fresh segments from the full signals; the original
        # re-assigned `mixture`/`vocal` and shrank them every iteration.
        mixture_seg = mixture[i:i + slice_duration * sr]
        vocal_seg = vocal[i:i + slice_duration * sr]
        if np.all(vocal_seg == 0):
            # print('[!] - all 0s, skipping')
            continue
        if i + 2 * sr >= len(music.audio):
            break
        mixture_stft = librosa.stft(mixture_seg, n_fft=n_fft,
                                    hop_length=hop_length,
                                    window='hann', center=True)
        magnitude_mixture_stft, mixture_phase = librosa.magphase(mixture_stft)
        normalized_magnitude_mixture_stft = torch.Tensor(
            Normalize().forward([magnitude_mixture_stft])[0])
        with torch.no_grad():
            mask = model.forward(
                normalized_magnitude_mixture_stft.unsqueeze(0)).squeeze(0)
        out = mask * normalized_magnitude_mixture_stft
        predicted_vocal_stft = out.numpy() * mixture_phase
        predicted_vocal_audio = librosa.istft(predicted_vocal_stft,
                                              win_length=n_fft,
                                              hop_length=hop_length,
                                              window='hann', center=True)
        try:
            scores.append(
                mir_eval.separation.bss_eval_sources(
                    vocal_seg[:predicted_vocal_audio.shape[0]],
                    predicted_vocal_audio)[0])
        except ValueError:
            print(vocal_seg.all() == 0)
            print(predicted_vocal_stft.all() == 0)
            print('Error but skipping')
    return scores
def spect_loader(path, window_size, window_stride, window, normalize,
                 max_len=101, augment=False, allow_speedandpitch=False,
                 allow_pitch=False, allow_speed=False, allow_dyn=False,
                 allow_noise=False, allow_timeshift=False):
    y, sr = librosa.load(path, sr=None)
    n_fft = int(sr * window_size)
    win_length = n_fft
    hop_length = int(sr * window_stride)

    # STFT
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)

    # S = log(S + 1)
    spect = np.log1p(spect)

    # Make all spectrograms the same size
    # TODO: change that in the future
    if spect.shape[1] < max_len:
        pad = np.zeros((spect.shape[0], max_len - spect.shape[1]))
        spect = np.hstack((spect, pad))
    elif spect.shape[1] > max_len:
        # Truncate along the time axis (the original sliced the wrong axis)
        spect = spect[:, :max_len]
    spect = np.resize(spect, (1, spect.shape[0], spect.shape[1]))
    # spect = torch.FloatTensor(spect)

    # z-score normalization
    if normalize:
        mean = np.mean(np.ravel(spect))
        std = np.std(np.ravel(spect))
        if std != 0:
            spect = spect - mean
            spect = spect / std

    return spect
def my_filter(y, sr):
    Y = librosa.stft(y, n_fft=4096, hop_length=512)
    Y_dB = librosa.amplitude_to_db(Y, ref=np.max)
    var_trust = var_trust_func(Y_dB)
    contrast = contrast_trust_func(np.abs(Y), sr)
    mask = np.multiply(contrast, var_trust)
    mask = (mask - np.min(mask)) / (np.max(mask) - np.min(mask))
    mag, phase = librosa.magphase(Y)
    newmag = np.multiply(mag, mask)
    # `phase` from magphase is already the unit phasor exp(1j * angle),
    # so recombining is a plain multiplication.
    Y_rec = np.multiply(newmag, phase)
    y_rec = librosa.istft(Y_rec, hop_length=512)
    return y_rec, Y_rec
def parse_audio(self, audio_path):
    y = load_audio(audio_path)
    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    spect, phase = librosa.magphase(D)
    spect = np.log1p(spect)
    mean = spect.mean()
    std = spect.std()
    spect = np.add(spect, -mean)
    spect = spect / std
    return spect
def mask_from_timeseries(sr, x):
    x = x.astype('float16')
    S, ph = librosa.magphase(librosa.stft(x))
    # The right value of `time` is unclear; 0.1 works well for segment
    # lengths of 0.5 seconds.
    time = 0.1
    S_filter = librosa.decompose.nn_filter(
        S, aggregate=np.median, metric='cosine',
        width=int(librosa.time_to_frames(time, sr=sr)))
    S_filter = np.minimum(S, S_filter)
    margin = 5
    power = 2
    mask = librosa.util.softmask(S - S_filter, margin * S_filter, power=power)
    return mask
def make_feature(y, sr):
    # Extract features: y is the audio data, sr is the sample rate.
    if FEATURE == 'fft':
        # FFT features: short-time Fourier transform;
        # the parameters are defined at the top of the file.
        S = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LEN, window=hamming)
        feature, _ = librosa.magphase(S)
        feature = np.log1p(feature)  # log1p
        feature = feature.transpose()
    else:
        if FEATURE == 'logfbank':
            # fbank features
            feature = logfbank(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        else:
            # MFCC features
            feature = mfcc(y, sr, winlen=WIN_LEN, winstep=WIN_STEP)
        # Append two deltas, tripling the feature dimension
        feature_d1 = delta(feature, N=1)
        feature_d2 = delta(feature, N=2)
        feature = np.hstack([feature, feature_d1, feature_d2])
    return normalize(feature)  # return normalized features
def plot_spectral_centroid(number):
    example_mp3, sr, song_name = load_music.load_song(number)
    cent = librosa.feature.spectral_centroid(example_mp3, sr)
    S, phase = librosa.magphase(librosa.stft(y=example_mp3))
    fig, ax = plt.subplots()
    times = librosa.times_like(cent)
    librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
                             y_axis='log', x_axis='time', ax=ax)
    ax.plot(times, cent.T, label='Spectral centroid', color='w')
    ax.legend(loc='upper right')
    fig.suptitle('log power spectrogram of ' + song_name, fontsize=8)
    plt.show()
def plt_stft(data, title):
    titles = ['Sample', '440Hz Beep', 'No Sound', 'English Male',
              'English Female', 'Japanese Female', 'Japanese Male',
              'OK, Google']
    plt.figure(figsize=(16, 9))
    plt.suptitle(title)
    for i in range(0, len(titles)):
        D = data[i]
        plt.subplot(2, 4, i + 1)
        librosa.display.specshow(
            librosa.amplitude_to_db(librosa.magphase(D)[0]),
            y_axis='log', x_axis='time')
        plt.title(titles[i])
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.savefig('./fig/fig_stft/' + title + '.png')
def filter_background(self, margin_v: int = 5, power: int = 2):
    stft = librosa.stft(self.waveform)
    S_full, phase = librosa.magphase(stft)
    S_filter = librosa.decompose.nn_filter(
        S_full, aggregate=np.median, metric='cosine',
        width=int(librosa.time_to_frames(2, sr=self.sample_rate)))
    S_filter = np.minimum(S_full, S_filter)
    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)
    S_foreground = mask_v * S_full
    y_foreground = librosa.istft(S_foreground * phase)
    self.waveform = y_foreground
    return self
def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
    """Griffin-Lim algorithm to convert magnitude spectrograms to audio
    signals."""
    phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
    complex_spec = magnitudes * phase
    signal = librosa.istft(complex_spec)
    if not np.isfinite(signal).all():
        logging.warning("audio was not finite, skipping audio saving")
        return np.array([0])
    for _ in range(n_iters):
        _, phase = librosa.magphase(librosa.stft(signal, n_fft=n_fft))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)
    return signal
def parse_audio(self, audio_path):
    """Parse audio."""
    audio = self.load_audio(audio_path)
    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)
    D = librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    mag, _ = librosa.magphase(D)
    mag = np.log1p(mag)
    if self.is_normalization:
        mean = mag.mean()
        std = mag.std()
        mag = (mag - mean) / std
    return mag
def reconstruct(file, spec1):
    y, sr_ = librosa.load(file)
    D = librosa.core.stft(y, n_fft=512)
    mag, phase = librosa.magphase(D)
    spec1 = np.transpose(spec1)
    mask = np.abs(spec1) / np.abs(D)
    rec_a = D * mask
    rec_b = D * (1 - mask)
    o_a = librosa.core.istft(rec_a)
    o_b = librosa.core.istft(rec_b)
    return o_a, o_b
def decompose(filename, offset=0, duration=30, voice=True):
    '''Decompose a song into its pieces

    :parameters:
      - filename : str
          path to the audio
      - offset : float
          initial offset for loading audio
      - duration : float
          maximum amount of audio to load

    :returns:
      - D : np.array, dtype=complex
          STFT of the full signal
      - D_inst : np.array, dtype=complex
          STFT of the instruments
      - D_vox : np.array, dtype=complex
          STFT of the vocals
      - D_inst_harm : np.array, dtype=complex
          STFT of the instrument harmonics
      - D_inst_perc : np.array, dtype=complex
          STFT of the instrument percussives
    '''
    y, sr = librosa.load(filename, sr=SR, offset=offset, duration=duration)

    # Step 1: compute STFT
    D = librosa.stft(y, n_fft=N_FFT,
                     hop_length=HOP_LENGTH).astype(np.complex64)

    # Step 2: separate magnitude and phase
    S, P = librosa.magphase(D)
    S = S / S.max()

    if voice:
        # Integer division keeps tau usable as an index
        tau = (D.shape[0] * 3) // 4
        # Step 3: RPCA to separate voice and background
        S1, S2, _ = rpca.robust_pca(S[:tau, :], max_iter=25)
        S1, S2 = rpca_correct(S[:tau, :], S1, S2)
        S1 = np.vstack((S1, S[tau:, :]))
        S2 = np.vstack((S2, S[tau:, :]))
    else:
        S1, S2 = librosa.hpss.hpss_median(S, win_H=WIN_HPSS, win_P=WIN_HPSS,
                                          p=1.0)

    # Step 4: recombine with phase
    return D, S1 * P, S2 * P
def reconstruct(y, a_W, a_H, b_W, b_H):
    a = np.dot(a_W, a_H)
    b = np.dot(b_W, b_H)
    D = librosa.core.stft(y, n_fft=NMF.d_w, hop_length=NMF.d_h)
    mag, phase = librosa.magphase(D)
    rec_a = a * phase
    # Soft mask for the second source
    mask_b = np.nan_to_num(b**1 / (a**1 + b**1))
    rec_b = b * mask_b * phase
    # rec_b = np.abs(D) * mask_b * phase
    o_a = librosa.core.istft(rec_a, win_length=NMF.d_w, hop_length=NMF.d_h)
    o_b = librosa.core.istft(rec_b, win_length=NMF.d_w, hop_length=NMF.d_h)
    return o_a, o_b
def tf_wave_to_stft(wave):
    sample_rate = 16000
    window_size = 0.02
    window_stride = 0.01
    window = 'hamming'
    normalize = True
    # y = librosa.core.load(wave, sr=sample_rate)[0]
    # print(len(y))
    n_fft = 320       # int(sample_rate * window_size)
    win_length = n_fft
    hop_length = 160  # int(sample_rate * window_stride)

    # STFT
    D = librosa.stft(wave, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)

    # S = log(S + 1)
    spect = np.log1p(spect)
    return spect
def parse_audio(path, audio_conf, windows):
    '''
    Input:
        path       : string, path of the audio file to load
        audio_conf : dict, audio parameters for the spectrogram
        windows    : dict, window types
    Output:
        spect      : FloatTensor, spectrum of each frame
    '''
    y = load_audio(path)
    n_fft = int(audio_conf['sample_rate'] * audio_conf['window_size'])
    win_length = n_fft
    hop_length = int(audio_conf['sample_rate'] * audio_conf['window_stride'])
    window = windows[audio_conf['window']]
    # D = librosa.cqt(y, sr=audio_conf['sample_rate'])
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, phase = librosa.magphase(D)
    spect = np.log1p(spect)
    spect = torch.FloatTensor(spect)
    return spect.transpose(0, 1)
def griffin_lim(mag, phase_angle, n_fft, hop, num_iters):
    """Iterative algorithm for phase retrieval from a magnitude spectrogram.

    Args:
        mag: Magnitude spectrogram.
        phase_angle: Initial condition for phase.
        n_fft: Size of the FFT.
        hop: Stride of FFT. Defaults to n_fft / 2.
        num_iters: Griffin-Lim iterations to perform.

    Returns:
        audio: 1-D array of float32 sound samples.
    """
    fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop,
                      center=True)
    ifft_config = dict(win_length=n_fft, hop_length=hop, center=True)
    complex_specgram = inv_magphase(mag, phase_angle)
    for i in range(num_iters):
        audio = librosa.istft(complex_specgram, **ifft_config)
        if i != num_iters - 1:
            complex_specgram = librosa.stft(audio, **fft_config)
            _, phase = librosa.magphase(complex_specgram)
            phase_angle = np.angle(phase)
            complex_specgram = inv_magphase(mag, phase_angle)
    return audio
def spectrogram2wav(mag, n_fft, win_length, hop_length, num_iters,
                    phase_angle=None, length=None):
    '''
    :param mag: [f, t]
    :param n_fft: n_fft
    :param win_length: window length
    :param hop_length: hop length
    :param num_iters: number of Griffin-Lim reconstruction iterations
    :param phase_angle: phase angle
    :param length: length of wav
    :return:
    '''
    assert num_iters > 0
    if phase_angle is None:
        phase_angle = np.pi * np.random.rand(*mag.shape)
    spec = mag * np.exp(1.j * phase_angle)
    for i in range(num_iters):
        wav = librosa.istft(spec, win_length=win_length,
                            hop_length=hop_length, length=length)
        if i != num_iters - 1:
            spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length,
                                hop_length=hop_length)
            _, phase = librosa.magphase(spec)
            phase_angle = np.angle(phase)
            spec = mag * np.exp(1.j * phase_angle)
    return wav
def extract_features(self, audio_path):
    # torchaudio loading options recently changed. It's probably
    # straightforward to rewrite the audio handling to make use of
    # up-to-date torchaudio, but in the meantime there is a legacy
    # method which uses the old defaults.
    sound, sample_rate_ = torchaudio.legacy.load(audio_path)
    if self.truncate and self.truncate > 0:
        if sound.size(0) > self.truncate:
            sound = sound[:self.truncate]

    assert sample_rate_ == self.sample_rate, \
        'Sample rate of %s != -sample_rate (%d vs %d)' \
        % (audio_path, sample_rate_, self.sample_rate)

    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # average multiple channels

    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)
    # STFT
    d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    spect, _ = librosa.magphase(d)
    spect = np.log1p(spect)
    spect = torch.FloatTensor(spect)
    if self.normalize_audio:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect
def get_sound_features(audio_path):
    print(audio_path)
    y, sr = librosa.load(audio_path)
    n_fft = 2048
    hop_length = 512
    output = {}

    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    output['y_harmonic_mean'] = y_harmonic.mean()
    output['y_harmonic_std'] = y_harmonic.std()
    output['y_percussive_mean'] = y_percussive.mean()
    output['y_percussive_std'] = y_percussive.std()

    # Beat track on the percussive signal
    tempo, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr)
    output['tempo'] = tempo

    # Compute MFCC features from the raw signal
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    output['mfcc_mean'] = mfcc.mean()
    output['mfcc_std'] = mfcc.std()

    # And the first-order differences (delta features)
    mfcc_delta = librosa.feature.delta(mfcc)
    output['mfcc_delta_mean'] = mfcc_delta.mean()
    output['mfcc_delta_std'] = mfcc_delta.std()

    # Stack and synchronize between beat events;
    # this time using the mean value (default) instead of median
    beat_mfcc_delta = librosa.feature.sync(np.vstack([mfcc, mfcc_delta]),
                                           beat_frames)
    output['beat_mfcc_delta_mean'] = beat_mfcc_delta.mean()
    output['beat_mfcc_delta_std'] = beat_mfcc_delta.std()

    # Compute chroma features from the harmonic signal
    chromagram = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
    output['chromagram_mean'] = chromagram.mean()
    output['chromagram_std'] = chromagram.std()

    # Aggregate chroma features between beat events,
    # using the median value of each feature between beat frames
    beat_chroma = librosa.feature.sync(chromagram, beat_frames,
                                       aggregate=np.median)
    output['beat_chroma_mean'] = beat_chroma.mean()
    output['beat_chroma_std'] = beat_chroma.std()

    # Finally, stack all beat-synchronous features together
    # beat_features = np.vstack([beat_chroma, beat_mfcc_delta])
    # output['beat_features'] = beat_features

    # Compute a chromagram from a waveform or power spectrogram
    # (the reference example uses n_fft=4096 here)
    S = np.abs(librosa.stft(y, n_fft=n_fft))
    chroma_stft = librosa.feature.chroma_stft(S=S, sr=sr)
    output['chroma_stft_mean'] = chroma_stft.mean()
    output['chroma_stft_std'] = chroma_stft.std()

    # Constant-Q chromagram
    chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr)
    output['chroma_cq_mean'] = chroma_cq.mean()
    output['chroma_cq_std'] = chroma_cq.std()

    # Compute a mel-scaled power spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    output['S_mean'] = S.mean()
    output['S_std'] = S.std()

    # Compute root-mean-square (RMS) energy for each frame
    S, phase = librosa.magphase(librosa.stft(y))
    rms = librosa.feature.rmse(S=S)
    output['rms_mean'] = rms.mean()
    output['rms_std'] = rms.std()

    # Compute the spectral centroid
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    output['cent_mean'] = cent.mean()
    output['cent_std'] = cent.std()

    # Compute the spectral bandwidth
    S, phase = librosa.magphase(librosa.stft(y=y))
    spec_bw = librosa.feature.spectral_bandwidth(S=S)
    output['spec_bw_mean'] = spec_bw.mean()
    output['spec_bw_std'] = spec_bw.std()

    # Compute spectral contrast
    S = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    output['contrast_mean'] = contrast.mean()
    output['contrast_std'] = contrast.std()

    # Tonal centroid features (tonnetz)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    output['tonnetz_mean'] = tonnetz.mean()
    output['tonnetz_std'] = tonnetz.std()

    # Zero-crossing rate
    zero_cross = librosa.feature.zero_crossing_rate(y)
    output['zero_cross_mean'] = zero_cross.mean()
    output['zero_cross_std'] = zero_cross.std()

    # Onset strength envelope and tempogram
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    output['oenv_mean'] = oenv.mean()
    output['oenv_std'] = oenv.std()
    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
                                          hop_length=hop_length)
    output['tempogram_mean'] = tempogram.mean()
    output['tempogram_std'] = tempogram.std()

    # Log-frequency spectrogram
    L = librosa.feature.logfsgram(y=y, sr=sr)
    output['L_mean'] = L.mean()
    output['L_std'] = L.std()

    # Recurrence matrix of the MFCCs
    R = librosa.segment.recurrence_matrix(mfcc)
    output['R_mean'] = R.mean()
    output['R_std'] = R.std()

    output['audio_path'] = audio_path
    return output
def LoadAudio(file_path):
    y, sr = load(file_path, sr=SR)
    stft = librosa.stft(y, n_fft=window_size, hop_length=hop_length)
    mag, phase = librosa.magphase(stft)
    return mag.astype(np.float32), phase
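A matching inverse for LoadAudio above, sketched under the assumption that the same SR, window_size, and hop_length globals are in scope; the soundfile dependency and the SaveAudio name are illustrative, not from the original:

import librosa
import soundfile as sf

def SaveAudio(file_path, mag, phase):
    # Recombine magnitude with the unit-phase matrix returned by
    # librosa.magphase, invert the STFT, and write the waveform.
    y = librosa.istft(mag * phase, hop_length=hop_length,
                      win_length=window_size)
    sf.write(file_path, y, SR)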
def read_audio_file(path, src_dir, side, sample_rate, window_size,
                    window_stride, window, normalize_audio, truncate=None):
    """
    Args:
        path (str): location of a src file containing audio paths.
        src_dir (str): location of source audio files.
        side (str): 'src' or 'tgt'.
        sample_rate (int): sample_rate.
        window_size (float): window size for spectrogram in seconds.
        window_stride (float): window stride for spectrogram in seconds.
        window (str): window type for spectrogram generation.
        normalize_audio (bool): whether to subtract spectrogram mean and
            divide by std.
        truncate (int): maximum audio length (0 or None for unlimited).

    Yields:
        A dictionary containing audio data for each line.
    """
    assert (src_dir is not None) and os.path.exists(src_dir), \
        "src_dir must be a valid directory if data_type is audio"

    global torchaudio, librosa, np
    import torchaudio
    import librosa
    import numpy as np

    with codecs.open(path, "r", "utf-8") as corpus_file:
        index = 0
        for line in corpus_file:
            audio_path = os.path.join(src_dir, line.strip())
            if not os.path.exists(audio_path):
                audio_path = line

            assert os.path.exists(audio_path), \
                'audio path %s not found' % (line.strip())

            # Use a distinct name so the check against the expected
            # sample rate is not trivially true.
            sound, sample_rate_ = torchaudio.load(audio_path)
            if truncate and truncate > 0:
                if sound.size(0) > truncate:
                    continue

            assert sample_rate_ == sample_rate, \
                'Sample rate of %s != -sample_rate (%d vs %d)' \
                % (audio_path, sample_rate_, sample_rate)

            sound = sound.numpy()
            if len(sound.shape) > 1:
                if sound.shape[1] == 1:
                    sound = sound.squeeze()
                else:
                    sound = sound.mean(axis=1)  # average multiple channels

            n_fft = int(sample_rate * window_size)
            win_length = n_fft
            hop_length = int(sample_rate * window_stride)
            # STFT
            d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                             win_length=win_length, window=window)
            spect, _ = librosa.magphase(d)
            spect = np.log1p(spect)
            spect = torch.FloatTensor(spect)
            if normalize_audio:
                mean = spect.mean()
                std = spect.std()
                spect.add_(-mean)
                spect.div_(std)

            example_dict = {side: spect,
                            side + '_path': line.strip(),
                            'indices': index}
            index += 1
            yield example_dict
##################
# Standard imports
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt

import librosa
import librosa.display

#############################################
# Load an example signal
y, sr = librosa.load('audio/sir_duke_slow.mp3')

# And compute the spectrogram magnitude and phase
S_full, phase = librosa.magphase(librosa.stft(y))

###################
# Plot the spectrum
plt.figure(figsize=(12, 4))
librosa.display.specshow(librosa.amplitude_to_db(S_full, ref=np.max),
                         y_axis='log', x_axis='time', sr=sr)
plt.colorbar()
plt.tight_layout()

###########################################################
# As you can see, there are periods of silence and
# non-silence throughout this recording.
#
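#############################################################
# A possible next step, sketched here on the assumption that we
# proceed as in the nn_filter / softmask snippets elsewhere in this
# collection: estimate the repeating background with a nearest-neighbor
# filter, then lift the foreground with a soft mask.
S_filter = librosa.decompose.nn_filter(S_full,
                                       aggregate=np.median,
                                       metric='cosine',
                                       width=int(librosa.time_to_frames(2, sr=sr)))
S_filter = np.minimum(S_full, S_filter)
mask_v = librosa.util.softmask(S_full - S_filter, 10 * S_filter, power=2)
y_foreground = librosa.istft(mask_v * S_full * phase)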
def get_librosa_large(path):
    # Load an audio file as a floating-point time series
    y, sr = librosa.load(path)

    # Decompose an audio time series into harmonic and percussive components
    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # -------------BEAT AND TEMPO--------------------
    # Compute a spectral flux onset strength envelope
    hop_length = 512
    onset_env = librosa.onset.onset_strength(y=y_percussive, sr=sr,
                                             aggregate=numpy.median,
                                             hop_length=hop_length)

    # Dynamic programming beat tracker
    tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)

    # Locate note onset events by picking peaks in an onset strength envelope
    onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)

    # Compute the tempogram: local autocorrelation of the onset strength envelope
    tempogram = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr,
                                          hop_length=hop_length)

    # --------------SPECTRAL FEATURES----------------
    # Compute a chromagram from a waveform or power spectrogram
    chroma = librosa.feature.chroma_stft(y=y_harmonic, sr=sr)

    # Compute a mel-scaled power spectrogram
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_h = librosa.feature.melspectrogram(y=y_harmonic, sr=sr)
    mel_p = librosa.feature.melspectrogram(y=y_percussive, sr=sr)

    # Convert to log scale (dB), using the peak power as reference
    log_mel = librosa.logamplitude(mel, ref_power=numpy.max)
    log_mel_h = librosa.logamplitude(mel_h, ref_power=numpy.max)
    log_mel_p = librosa.logamplitude(mel_p, ref_power=numpy.max)

    # Mel-frequency cepstral coefficients
    mfcc = librosa.feature.mfcc(S=log_mel)
    delta_mfcc = librosa.feature.delta(mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)

    # Compute root-mean-square (RMS) energy for each frame
    S, phase = librosa.magphase(chroma)
    rms = librosa.feature.rmse(S=S)

    # Compute the spectral centroid
    cent = librosa.feature.spectral_centroid(S=S)

    # Compute p'th-order spectral bandwidth
    spec_bw = librosa.feature.spectral_bandwidth(S=S)

    # Compute spectral contrast
    S_abs = numpy.abs(chroma)
    # contrast = librosa.feature.spectral_contrast(S=S_abs, sr=sr)

    # Compute roll-off frequency
    rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr)

    # Fit nth-order polynomials to the columns of a spectrogram
    line = librosa.feature.poly_features(S=S_abs, sr=sr)
    quad = librosa.feature.poly_features(S=S_abs, order=2)

    # Compute the zero-crossing rate of an audio time series
    z_cross = librosa.feature.zero_crossing_rate(y)

    # -------------WRITING TO FILE----------------
    features = [tempo, onset_env, beats, onset_frames, tempogram, chroma,
                log_mel, log_mel_h, log_mel_p, mfcc, delta_mfcc, delta2_mfcc,
                rms, cent, spec_bw, rolloff, line, quad, z_cross]
    features_names = ["Tempo", "Onset strength envelope", "Beats",
                      "Onset events", "Tempogram", "Chromagram",
                      "Log-scaled melspectrogram",
                      "Log-scaled harmonic melspectrogram",
                      "Log-scaled percussive melspectrogram",
                      "Mel-frequency cepstral coefficients",
                      "Delta features", "Delta square features",
                      "Root mean square energy", "Spectral centroid",
                      "Spectral bandwidth", "Roll-off frequency",
                      "Linear coefficients", "Quadratic coefficients",
                      "Zero crossing rate"]

    suf_ind = path.find(".")
    if suf_ind != -1:
        file_name = path[0:suf_ind] + ".dat"
    else:
        file_name = path + ".dat"

    with open(file_name, 'w') as f:
        tmp = "%s : %s\n" % (features_names[0], str(features[0]))  # Tempo
        f.write(tmp)
        for i in range(1, len(features)):
            tmp = "%s : %s\n" % (features_names[i], str(features[i].tolist()))
            f.write(tmp)
    return