def time_stretch_hpss(audio, f):
    """Time-stretch `audio` by factor `f`, handling harmonic and percussive parts separately."""
    if f == 1.0:
        return audio
    stft = core.stft(audio)
    # Perform HPSS (harmonic/percussive source separation)
    stft_harm, stft_perc = decompose.hpss(stft, kernel_size=31)  # original kernel size 31
    # OLA the percussive part
    y_perc = librosa.util.fix_length(core.istft(stft_perc, dtype=audio.dtype), len(audio))
    y_perc = time_stretch_sola(y_perc, f)
    #~ # Phase-vocode the harmonic part
    #~ stft_stretch = core.phase_vocoder(stft_harm, 1.0/f)
    #~ # Inverse STFT of harmonic
    #~ y_harm = librosa.util.fix_length(core.istft(stft_stretch, dtype=y_perc.dtype), len(y_perc))
    # WSOLA the harmonic part instead of phase-vocoding it
    y_harm = librosa.util.fix_length(
        time_stretch_sola(core.istft(stft_harm, dtype=audio.dtype), f, wsola=True),
        len(y_perc))
    # Add them together
    return y_harm + y_perc
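# Usage sketch (an illustration, not part of the original code): time_stretch_hpss
# depends on librosa and on a time_stretch_sola() helper defined elsewhere in this
# codebase; the file names below are hypothetical.
import librosa
from librosa import core, decompose

y, sr = librosa.load('input.wav', sr=None)       # hypothetical input file
y_stretched = time_stretch_hpss(y, 0.8)          # factor semantics follow time_stretch_sola
librosa.output.write_wav('stretched.wav', y_stretched, sr)  # old librosa API, as used elsewhere here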
def main(argv):
    os.makedirs(FLAGS.output_dir, exist_ok=True)

    ''' Initialize model '''
    unet = Unet()
    restore(net=unet, ckpt_path=FLAGS.ckpt_path)

    ''' Load data '''
    mix_wav, _ = load(FLAGS.original_wav, sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    ''' Load ground truth '''
    if FLAGS.gt:
        gt_wav, _ = load(FLAGS.original_gt, sr=SAMPLE_RATE)
        gt_wav_mag, gt_wav_phase = magphase(
            stft(gt_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
        gt_wav_mag = gt_wav_mag[:, START:END]
        gt_wav_phase = gt_wav_phase[:, START:END]

    ''' Save input spectrogram image and ground truth '''
    write_wav(FLAGS.output_dir + 'original_mix.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(FLAGS.output_dir + 'original_mix.wav', 0)
    if FLAGS.gt:
        write_wav(FLAGS.output_dir + 'gt.wav',
                  istft(gt_wav_mag * gt_wav_phase,
                        win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
                  SAMPLE_RATE, norm=True)
        spectogram_librosa(FLAGS.output_dir + 'gt.wav', 0)

    ''' Run the network '''
    inputs = mix_wav_mag[1:].reshape(1, 512, 128, 1)
    mask = unet(inputs).numpy().reshape(512, 128)
    predict = inputs.reshape(512, 128) * mask

    ''' Evaluation metrics '''
    if FLAGS.gt:
        expand_pre = np.expand_dims(predict.flatten(), axis=0)
        expand_gt = np.expand_dims(gt_wav_mag[1:].flatten(), axis=0)
        expand_input = np.expand_dims(inputs.flatten(), axis=0)
        (SDR, SIR, SAR, _) = mir_eval.separation.bss_eval_sources(expand_gt, expand_pre)
        (SDR2, _, _, _) = mir_eval.separation.bss_eval_sources(expand_gt, expand_input)
        NSDR = SDR - SDR2  # SDR(Se, Sr) - SDR(Sm, Sr)
        line = ('*****SDR = ' + str(SDR) + ', SIR = ' + str(SIR) +
                ', SAR = ' + str(SAR) + ', NSDR = ' + str(NSDR) + '*****')
        print(line)
        with open(FLAGS.output_dir + 'metrics.txt', 'a') as fout:
            fout.write(line)

    ''' Convert model output to target magnitude '''
    target_pred_mag = np.vstack((np.zeros((128)), predict))

    ''' Write vocal prediction audio files '''
    write_wav(FLAGS.output_dir + 'pred_vocal.wav',
              istft(target_pred_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(FLAGS.output_dir + 'pred_vocal.wav', 1)
def SaveStereoAudio(fname, mag, phase, norm=True, save_path=None):
    y_l = istft(mag[0] * phase[0], hop_length=C.H, win_length=C.FFT_SIZE, window=C.WINDOW)
    y_r = istft(mag[1] * phase[1], hop_length=C.H, win_length=C.FFT_SIZE, window=C.WINDOW)
    stereo = np.array((y_l, y_r))
    if save_path is None:
        write_wav(C.PATH_MUSIC / fname, stereo, C.SR, norm=norm)
    else:
        write_wav(save_path / fname, stereo, C.SR, norm=norm)
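# A hypothetical usage sketch for SaveStereoAudio(), assuming the same config
# module C used above: build per-channel magnitude and phase stacks from a
# stereo file and resynthesize them. The input file name is an assumption.
import numpy as np
import librosa

y, _ = librosa.load('stereo_mix.wav', sr=C.SR, mono=False)  # y has shape (2, n)
specs = [librosa.stft(ch, n_fft=C.FFT_SIZE, hop_length=C.H, window=C.WINDOW) for ch in y]
mag = np.array([np.abs(s) for s in specs])                  # (2, freq, frames)
phase = np.array([np.exp(1j * np.angle(s)) for s in specs])
SaveStereoAudio('stereo_recon.wav', mag, phase)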
def test():
    vis = Visualizer(env='svs')
    model = getattr(models, 'Unet')().eval()
    # model.cuda()
    model.load_state_dict(
        t.load('G:/Unet_svs/check/epoch_219__0724_16_57_35.pth'))
    mix_wav, _ = load("C:/Users/lenovo/Music/c.mp3", sr=8192)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=1024, hop_length=768))
    START = 700
    END = START + 128
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]
    print(mix_wav_mag.shape)

    gg = mix_wav_mag[1:]
    gg = t.from_numpy(gg)
    gg.unsqueeze_(0)
    gg.unsqueeze_(0)
    vis.img('a', gg)
    print(gg.shape)
    with t.no_grad():
        gg = Variable(gg)
        score = model(gg)
    predict = gg.data * score.data
    print(predict.shape)
    target_pred_mag = predict.view(512, 128).cpu().numpy()
    target_pred_mag = np.vstack((np.zeros((128)), target_pred_mag))
    vis.img('b', t.from_numpy(target_pred_mag))
    print(target_pred_mag.shape)
    write_wav('C:/Users/lenovo/Music/pred_vocal.wav',
              istft(target_pred_mag * mix_wav_phase,
                    # (mix_wav_mag * target_pred_mag) * mix_wav_phase
                    win_length=1024, hop_length=768),
              8192, norm=True)
    write_wav('C:/Users/lenovo/Music/pred_mix.wav',
              istft(mix_wav_mag * mix_wav_phase, win_length=1024, hop_length=768),
              8192, norm=True)
def spectrogram_to_audio(input_spectrogram, hop_length, win_length, window='hann', center=True):
    return audio.istft(input_spectrogram, hop_length, win_length, window, center)
def encode(data_file, output_file, key_file=None):
    print('* * encoding message in audio file...')
    data_file_size = os.path.getsize(data_file)
    if key_file is not None:
        signal, sr = librosa.load(key_file, sr=RATE)
        spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)
    else:
        signal = make_sinewave(1, math.ceil(data_file_size / 20.), RATE)
        spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)
    print('data file size:', data_file_size)
    print('spec shape', spec.shape)
    with open(data_file, 'rb') as dfile:
        d = dfile.read(1)
        i = 0
        while d:
            h = d[0]  # byte value 0-255; replaces Python 2's int(d.encode("hex"), 16)
            if key_file is not None:
                spec[h][i] = np.max(
                    np.abs([spec[x][i] for x in range(spec.shape[0])])) + 200
            else:
                spec[h][i] = np.max(
                    np.abs([spec[x][i] for x in range(spec.shape[0])])) * 200
            spec[h - 1][i] = 0
            spec[h + 1][i] = 0
            d = dfile.read(1)
            i += 1
    spec = spec[:, :i]
    spec = add_start_stop(spec)
    wavwrite(output_file, istft(spec, 1024, 2048), RATE)
def gl_rec(S):
    sr, nfft, wlen, hop = 22050, 1022, 1022, 256
    S = 10**(S)
    # Random (real-valued) initial phase estimate
    angles = 3.1415 * (np.random.randn(S.shape[0], S.shape[1]) - 0.5)
    y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    num_samples = y.shape[0]
    for i in range(40):
        angles = core.stft(y, n_fft=nfft, hop_length=hop, win_length=wlen)
        S = S[:, :angles.shape[1]]
        _, angles = core.magphase(angles)
        y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    # y = y[:num_samples]
    return y
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):
    if os.path.isdir(PATH_INPUT):
        # If the input is a directory, build a file list
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # If the input is a single file
        filelist_mixdown = [PATH_INPUT]
    print('number of mixdown file', len(filelist_mixdown))

    # Create the output directory if it does not exist
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print('path_output_ext', path_output_ext)
    if len(path_output_ext) == 0 and not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # Load the model
    unet = train.UNet()
    chainer.serializers.load_npz(MODEL, unet)
    config.train = False
    config.enable_backprop = False

    # Load each mixture and try to separate the vocal (speech)
    for fmixdown in filelist_mixdown:
        # If audioread raises an error, fall back to scipy
        try:
            y_mixdown, _ = load(fmixdown, sr=SR, mono=True)
        except:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # Compute the short-time spectrum of the input and normalize it
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j * np.angle(spec))
        print('mag.shape', mag.shape)
        start = 0
        # Choose a value no larger than the number of input frames,
        # appropriate for the network definition.
        end = 128 * (mag.shape[1] // 128)

        # Estimate the mask that separates speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))

        # Apply the mask to the input spectrum and resynthesize via the inverse FFT
        mag2 = mag[:, start:end] * mask
        phase2 = phase[:, start:end]
        y = istft(mag2 * phase2, hop_length=H, win_length=FFT_SIZE)

        # Save the separated speech (vocal) as the output file
        if len(path_output_ext) == 0:
            # Output into the directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # Output to the specified file
            fname = PATH_OUTPUT
        print('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
def gl_rec(mag_stft, hop, wlen, init_rec, n_iter=40):
    # Griffin-Lim reconstruction, starting from an initial time-domain estimate.
    # Note: `nfft` is read from the enclosing scope.
    rec = 1.0 * init_rec
    rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
    angles = rec_stft / np.abs(rec_stft)
    for i in range(n_iter):
        rec = core.istft(np.abs(mag_stft**1.2) * angles, hop, wlen)
        rec_stft = core.stft(rec, n_fft=nfft, hop_length=hop, win_length=wlen)
        angles = rec_stft / np.abs(rec_stft)
    return rec
def eval(net1, net2, speech_file_loc, melody_file_loc):
    # Evaluates the result of net1, net2 on a given speech file and melody file.
    # speech_file_loc and melody_file_loc are strings specifying the locations
    # of the respective audio files.
    network1, network2 = net1.eval(), net2.eval()

    # Read input audio
    orig_speech = core.load(speech_file_loc, sr)[0]
    inp_speech = DL.remove_silent_frames(orig_speech)
    # inp_speech = 1.0 * orig_speech
    stft_inp = core.stft(inp_speech, n_fft=nfft, hop_length=hop, win_length=wlen)

    # Extract melody and create its image
    melody = utils.MelodyExt.melody_extraction(melody_file_loc,
                                               'runtime_folder/ref_melody')[0]
    ref_pc = melody[:, 1]
    ref_time = melody[:, 0]
    const = hop * 1.0 / sr
    new_sampling_time = np.arange(const, ref_time[-1], const)
    interp_melody = np.interp(new_sampling_time, ref_time, ref_pc)
    n_frames = new_sampling_time.shape[0]
    idx1 = (1.0 * interp_melody * nfft / sr).astype(int)
    idx2 = np.array(range(n_frames))
    pc = np.zeros([1 + nfft // 2, n_frames])  # integer division (was nfft / 2, a float under Python 3)
    pc[idx1, idx2] = 1
    pc[-1] = 1 * pc[0]
    pc[0] = 0 * pc[0]

    # Complete input preprocessing
    rate = stft_inp.shape[1] * 1.0 / n_frames
    # Stretch input speech to the target length
    stft_inp = core.phase_vocoder(stft_inp, rate, hop)
    # Append zeros to make it suitable for the network
    n_frames += 8 - n_frames % 8
    stft_inp = np.concatenate([
        stft_inp,
        np.zeros([stft_inp.shape[0], n_frames - stft_inp.shape[1]])
    ], axis=1)
    pc = np.concatenate(
        [pc, np.zeros([pc.shape[0], n_frames - pc.shape[1]])], axis=1)
    stft_inp = np.log(1 + np.abs(stft_inp))
    # Make tensors
    stft_inp = torch.from_numpy(stft_inp).float().unsqueeze(0)
    pc = torch.from_numpy(pc).float().unsqueeze(0)

    # Extract output
    encode2 = network2(Variable(pc.to(device)))
    pred, encode1 = network1(Variable(stft_inp.to(device)), encode2)
    pred = pred[0].cpu().data.numpy()
    pred[pred < 0] = 0
    pred = np.exp(pred) - 1
    # Adding a multiplier to increase loudness
    time_pred = 3.0 * utils.gl_rec(pred, hop, wlen, core.istft(pred, hop, wlen))
    return time_pred
def extract(spec, model, max_norm, fs, frame_size, shift_size):
    input = np.abs(spec[None, None, 1:, :].copy())
    input /= max_norm
    soft_mask = model(Variable(torch.from_numpy(
        input)).cuda()).data.cpu().numpy().squeeze()
    hard_mask = soft_mask > 0.5
    soft_vocal = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), soft_mask)), shift_size)
    soft_accom = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), 1 - soft_mask)), shift_size)
    hard_vocal = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), hard_mask)), shift_size)
    hard_accom = istft(
        spec * np.vstack((np.zeros_like(spec[0, :]), 1 - hard_mask)), shift_size)
    return soft_vocal, soft_accom, hard_vocal, hard_accom
def istft(self, _stft=None):
    # Inverse STFT using librosa.core.istft; returns a numpy array.
    # (A None default replaces the original mutable default argument `[]`.)
    if _stft is None:
        X = self.Xc  # if no array is specified, use Xc (stft of xOriginal)
    else:
        X = _stft  # otherwise, use the specified signal
    _x = istft(X, hop_length=self.hopSize, win_length=self.frameSize)
    return _x
def decode(audioName, locations, model, device):
    PSD_frames = spectralImages_1D(audioName, locations['audioloc'])
    nframes = len([key for key in PSD_frames if 'Phase' in key])
    audio = {}
    noisy_mag = []
    noisy_phase = []
    for k in range(nframes):
        uttname = 'MagdB_' + audioName + '_frame_' + str(k)
        noisy_mag.append(PSD_frames[uttname])
        noisy_phase.append(PSD_frames[uttname.replace('MagdB', 'Phase')])
        noisy_norm = PSD_frames[uttname.replace('MagdB', 'Norm').split('_frame')[0]]
        samples = PSD_frames[uttname.replace('MagdB', 'Samples').split('_frame')[0]]
    audio['noisy_mag'] = torch.from_numpy(np.expand_dims(noisy_mag, axis=1))
    audio['noisy_phase'] = np.hstack(noisy_phase)
    audio['noisy_norm'] = noisy_norm
    audio['utt_samples'] = int(samples)
    audio['uttname'] = audioName

    with torch.no_grad():
        input_mag = audio['noisy_mag'].float().to(device)
        enhanced_mag = model(input_mag).cpu().numpy()
    if enhanced_mag.shape[0] > 1:
        enhanced_mag = np.hstack(np.squeeze(enhanced_mag))
    else:
        enhanced_mag = np.squeeze(enhanced_mag)
    noisy_mag = np.hstack(np.squeeze(audio['noisy_mag'].numpy()))
    noisy_mag = np.interp(noisy_mag, [-1, 1], audio['noisy_norm'])
    enhanced_mag = np.interp(enhanced_mag, [-1, 1], audio['noisy_norm'])

    # Rebuild a complex spectrogram from the enhanced magnitude (dB) and noisy phase
    temp = np.zeros((257, enhanced_mag.shape[1]), dtype=np.complex128)
    temp[:-1, :] = 10**(enhanced_mag / 20) * (
        np.cos(audio['noisy_phase']) + np.sin(audio['noisy_phase']) * 1j)
    enhanced_audio = istft(temp)
    enhanced_audio = 0.98 * enhanced_audio / np.max(np.abs(enhanced_audio))
    enhanced_audio = enhanced_audio[:audio['utt_samples']]
    enhanceloc = locations['enhanceloc']
    Path(os.path.dirname(enhanceloc)).mkdir(parents=True, exist_ok=True)
    sf.write(enhanceloc, enhanced_audio, 16000)
    return
def fastgl_rec(mag_stft, hop, wlen, n_iter=40):
    # Fast Griffin-Lim with momentum; `nfft` is read from the enclosing scope.
    angles = np.exp(2j * np.pi * np.random.rand(*mag_stft.shape))
    momentum = 1.1
    rebuilt = 0
    for i in range(n_iter):
        tprev = 1 * rebuilt
        inverse = core.istft(np.abs(mag_stft**1.2) * angles,
                             hop_length=hop, win_length=wlen)
        rebuilt = core.stft(inverse, n_fft=nfft, hop_length=hop, win_length=wlen)
        angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev
        angles[:] /= np.abs(angles) + 1e-16
    return inverse
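# Usage sketch for fastgl_rec() (an illustration, not from the original code).
# `nfft` is read from the enclosing scope, so it must match the FFT size used to
# build the magnitude; note the function raises the magnitude to the 1.2 power
# internally. The file name is hypothetical.
import numpy as np
import librosa
from librosa import core

nfft, wlen, hop = 1022, 1022, 256  # matches the settings used in gl_rec above
y, sr = librosa.load('input.wav', sr=22050)
mag = np.abs(core.stft(y, n_fft=nfft, hop_length=hop, win_length=wlen))
y_rec = fastgl_rec(mag, hop, wlen, n_iter=40)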
def loudnessSTFTMatrix(matrix, sr, **kwargs):
    """Calculates the loudness of a signal encoded by its STFT matrix.

    Args:
        matrix (np.ndarray): STFT matrix of the actual signal.
        sr (int): The sample rate of the input signal.
        **kwargs: Keywords for istft() (see
            https://librosa.github.io/librosa/generated/librosa.core.istft.html)

    Returns:
        float: The negative replay gain as the loudness in dB of the signal.
    """
    # Convert the STFT matrix to a signal and use loudnessSignal() to obtain loudness
    # TODO: this is inefficient
    y = istft(matrix, **kwargs)
    return loudnessSignal(y, sr)
def change_speed(input_signal, rate):
    """Change the playback speed of an audio signal.

    Parameters
    ----------
    input_signal : numpy.array
        Input array, must have numerical type.
    rate : numeric
        Desired rate of change to the speed. To increase the speed,
        pass in a value greater than 1.0. To decrease the speed,
        pass in a value between 0.0 and 1.0.

    Returns
    -------
    numpy.array representing the audio signal with changed speed.
    """
    if input_signal.dtype.kind not in 'iuf':
        raise TypeError("'input_signal' must be an array of integers or floats")
    if rate <= 0:
        raise Exception('rate must be a positive number')

    # Convert the input signal to a -1.0 to 1.0 float if it has an integer type
    if input_signal.dtype.kind in 'iu':
        i = np.iinfo(input_signal.dtype)  # was np.iinfo('float32'), which raises ValueError
        abs_max = 2**(i.bits - 1)
        offset = i.min + abs_max
        input_signal = (input_signal.astype('float32') - offset) / abs_max

    # Transform the signal to the frequency domain
    frequency_domain_signal = core.stft(input_signal)
    # Change speed with the phase-vocoding method
    fds_changed_speed = core.phase_vocoder(frequency_domain_signal, rate)
    # Transform the frequency-domain signal back to the time domain
    output_signal = core.istft(fds_changed_speed, dtype=input_signal.dtype)
    return output_signal
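# Example usage for change_speed() (hypothetical file name): phase vocoding
# changes duration without shifting pitch.
import librosa
from librosa import core
import numpy as np

y, sr = librosa.load('speech.wav', sr=None)
faster = change_speed(y, 1.5)   # roughly 2/3 of the original duration
slower = change_speed(y, 0.75)  # roughly 4/3 of the original duration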
def istft(self, spectrogram, length):
    result = []
    # Store in column-major (Fortran) order, i.e. channel-major
    spectrogram = np.asfortranarray(spectrogram)
    window = hann(self.frame_length, sym=False)
    channels = spectrogram.shape[-1]
    for c in range(channels):
        data = spectrogram[..., c].T
        wave = istft(data, hop_length=self.frame_step, window=window,
                     center=False, length=length)
        wave = np.expand_dims(wave.T, axis=1)
        result.append(wave)
    result = np.concatenate(result, axis=-1)
    return result
def SaveAudio(fname, mag, phase, norm=True):
    """
    Commented-out alternative that writes with soundfile directly:

        util.valid_audio(y, mono=False)
        if norm and np.issubdtype(y.dtype, np.floating):
            wav = util.normalize(y, norm=np.inf, axis=None)
        else:
            wav = y
        if wav.ndim > 1 and wav.shape[0] == 2:
            wav = wav.T
        sf.write(file=C.PATH_MUSIC / fname, data=wav, samplerate=C.SR)
    """
    y = istft(mag * phase, hop_length=C.H, win_length=C.FFT_SIZE, window=C.WINDOW)
    write_wav(C.PATH_MUSIC / fname, y, C.SR, norm=norm)
def decode(wavfile, key_file=None):
    print('* decoding signal')
    signal, sr = librosa.load(wavfile, sr=RATE)
    spec = stft(signal, WINDOW_LENGTH, HOP_SIZE)
    message = ""
    if key_file is not None:
        key_signal, sr = librosa.load(key_file, sr=RATE)
        signal = np.pad(signal, (0, key_signal.shape[0] - signal.shape[0]), 'edge')
        try:
            spec = np.subtract(stft(key_signal, WINDOW_LENGTH, HOP_SIZE), spec)
            wavwrite('minus.wav', istft(spec, 1024, 2048), RATE)
        except ValueError:
            print("Oops! Your encoded signal must be at least as long as the key signal")
            return
    i, decode = 0, False
    while i < spec.shape[1]:
        h = np.argmax(np.abs([spec[x][i] for x in range(spec.shape[0])]))
        if h == 500:  # start marker
            decode = True
            i = i + 1
            continue
        if h == 550:  # stop marker
            break
        if decode:
            while h > 255:
                spec[h][i] = 0
                h = np.argmax(
                    np.abs([spec[x][i] for x in range(spec.shape[0])]))
            char = str(chr(h))
            message += char
        i = i + 1
    print(message)
    return message
def istft(mel_spec, phase):
    """
    mel_spec: numpy array 256*256
    phase: numpy array 512*256
    audio_proc: numpy array 65280*1
    """
    # Collapse duplicate mel bins (averaging repeated indices), then
    # interpolate back onto the linear frequency grid.
    yt = np.zeros((253, 256))
    counter = 0
    for i in range(len(mel_idx)):
        if i > 0:
            if mel_idx[i] == mel_idx[i - 1]:
                yt[counter - 1] = (yt[counter - 1] + mel_spec[i]) / 2
            else:
                yt[counter] = mel_spec[i]
                counter += 1
        else:
            yt[counter] = mel_spec[i]
            counter += 1
    f = interpolate.interp1d(mel_dedup, yt, 'cubic', axis=0)
    ynew = f(all_idx)
    ynew = ynew[:512]
    # Recombine the magnitude with the phase and invert
    j = np.array([1j], dtype='complex')
    linear_spec = np.multiply(ynew, np.cos(phase)) + j * np.multiply(ynew, np.sin(phase))
    audio_proc = lc.istft(linear_spec, hop_length=256, win_length=1022)
    return audio_proc
def deocdeData(dataloc, specImageloc, destfolder, model, device):
    for dataset in ['Dev', 'Eval']:
        audiofiles = __readscpFiles__(dataloc + '/' + dataset + '_SimData.scp')
        audiofiles.update(
            __readscpFiles__(dataloc + '/' + dataset + '_RealData.scp'))
        pbar = pkbar.Pbar(name='Decoding ' + dataset + ' AudioFiles: ',
                          target=len(audiofiles))

        data = SpecImages(specImageloc + '/' + dataset, mode='decode')
        with torch.no_grad():
            for i, (k, v) in enumerate(audiofiles.items()):
                uttID = data.uttname2idx('MagdB_' + k)
                audio = data.__getaudio__(uttID)
                input_mag = audio['noisy_mag'].unsqueeze(1).to(device)
                enhanced_mag = model(input_mag).cpu().numpy()
                if enhanced_mag.shape[0] > 1:
                    enhanced_mag = np.hstack(np.squeeze(enhanced_mag))
                else:
                    enhanced_mag = np.squeeze(enhanced_mag)
                enhanced_mag = np.interp(enhanced_mag, [-1, 1],
                                         audio['noisy_norm'])

                # Rebuild a complex spectrogram from the enhanced magnitude (dB)
                # and the noisy phase
                temp = np.zeros((257, enhanced_mag.shape[1]), dtype=np.complex128)
                temp[:-1, :] = 10**(enhanced_mag / 20) * (
                    np.cos(audio['noisy_phase']) +
                    np.sin(audio['noisy_phase']) * 1j)
                enhanced_audio = istft(temp)
                enhanced_audio = 0.98 * enhanced_audio / np.max(np.abs(enhanced_audio))
                enhanced_audio = enhanced_audio[:audio['utt_samples']]

                destloc = destfolder + v.split('Reverb_Challenge')[1]
                Path(os.path.dirname(destloc)).mkdir(parents=True, exist_ok=True)
                sf.write(destloc, enhanced_audio, 16000)
                del audio, input_mag, enhanced_mag, temp, enhanced_audio
                pbar.update(i)
def stretch(x, factor, nfft=2048):
    '''
    From this repository: https://github.com/gaganbahga/time_stretch
    Stretch an audio sequence by a factor using an FFT of size nfft,
    converting to the frequency domain.
    :param x: np.ndarray, audio array in PCM float32 format
    :param factor: float, stretching or shrinking factor, depending on whether
        it is > or < 1 respectively
    :param nfft: int, FFT size
    :return: np.ndarray, time-stretched audio
    '''
    stft = core.stft(x, n_fft=nfft).transpose()  # time-major layout
    stft_rows = stft.shape[0]
    stft_cols = stft.shape[1]

    times = np.arange(0, stft.shape[0], factor)  # times at which the new FFT is evaluated
    hop = nfft / 4  # frame shift
    stft_new = np.zeros((len(times), stft_cols), dtype=np.complex_)
    phase_adv = (2 * np.pi * hop * np.arange(0, stft_cols)) / nfft
    phase = np.angle(stft[0])

    stft = np.concatenate((stft, np.zeros((1, stft_cols))), axis=0)
    for i, time in enumerate(times):
        left_frame = int(np.floor(time))
        local_frames = stft[[left_frame, left_frame + 1], :]
        right_wt = time - np.floor(time)  # weight on the right of the two frames
        local_mag = ((1 - right_wt) * np.absolute(local_frames[0, :]) +
                     right_wt * np.absolute(local_frames[1, :]))
        local_dphi = (np.angle(local_frames[1, :]) -
                      np.angle(local_frames[0, :]) - phase_adv)
        local_dphi = local_dphi - 2 * np.pi * np.floor(local_dphi / (2 * np.pi))
        stft_new[i, :] = local_mag * np.exp(phase * 1j)
        phase += local_dphi + phase_adv
    return core.istft(stft_new.transpose())
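# Usage sketch for stretch() (illustrative, not from the original repository).
# As written, the factor steps through analysis frames, so factor < 1 yields
# more output frames (a longer, slower signal) and factor > 1 a shorter one.
# The file name is hypothetical.
import librosa

y, sr = librosa.load('input.wav', sr=None)
y_slow = stretch(y, 0.5)  # roughly doubles the duration
y_fast = stretch(y, 2.0)  # roughly halves the duration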
os.makedirs(saveFolderLoc)
monoLoader = es.MonoLoader(filename=mixFile, sampleRate=44100)
x = monoLoader()[:nSeconds * 44100]
_stft = stft(x, n_fft=fftSize, hop_length=hopSize,
             win_length=frameSize, window=winType)
# Get harmonic and percussive stfts
X_H, X_P = hpss(_stft, kernel_size=150)
# Convert stfts to time-domain signals
x_h = istft(X_H, hop_length=hopSize, win_length=frameSize)
x_p = istft(X_P, hop_length=hopSize, win_length=frameSize)

# Write to file
MonoWriter = es.MonoWriter(sampleRate=44100, format="mp3")
MonoWriter.configure(filename=saveFolderLoc + filename + "_median_percussive.mp3")
MonoWriter(array(x_p))
MonoWriter = es.MonoWriter(sampleRate=44100, format="mp3")
MonoWriter.configure(filename=saveFolderLoc + filename + "_median_harmonic.mp3")
MonoWriter(array(x_h))
def save_audio(mag, phase):
    # Despite its name, this returns the resynthesized time-domain signal
    # rather than writing a file.
    return istft(mag * phase, hop_length=512, win_length=1024)
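# Hypothetical usage for save_audio() above. The STFT parameters here mirror
# the hop_length=512 / win_length=1024 values baked into the function; the
# file name is an assumption.
import librosa

y, sr = librosa.load('mix.wav', sr=None)
mag, phase = librosa.magphase(librosa.stft(y, n_fft=1024, hop_length=512))
y_rec = save_audio(mag, phase)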
#!/usr/bin/env python
from librosa.core import stft, istft
import numpy as np
import scipy

# Round-trip a random signal through STFT/ISTFT with mismatched analysis and
# synthesis windows and print the squared reconstruction error.
y = np.random.rand(44032)
stft_matrix = stft(y, window=scipy.signal.hann(2048), hop_length=1024)
y_hat = istft(stft_matrix, window=np.ones(2048), hop_length=1024)
diff = y - y_hat
print(np.dot(diff, diff))
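# For comparison (a sketch, not part of the original script): with matched
# analysis/synthesis windows, librosa's least-squares inverse typically drives
# the round-trip error down to numerical noise.
stft_matched = stft(y, window=scipy.signal.hann(2048), hop_length=1024)
y_matched = istft(stft_matched, window=scipy.signal.hann(2048), hop_length=1024)
d = y[:len(y_matched)] - y_matched
print(np.dot(d, d))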
def SaveAudio(fname, mag, phase):
    y = istft(mag * phase, hop_length=C.H, win_length=C.FFT_SIZE)
    write_wav(fname, y, C.SR, norm=True)
vocal_wav_mag = vocal_wav_mag[:, START:END]
vocal_wav_phase = vocal_wav_phase[:, START:END]

# Load the saved model
model = keras.models.load_model('../models/vocal_20_test_model.h5')
# model = keras.models.load_model('../models/vocal_20.h5')

# Predict and write to file
X = mix_wav_mag[1:].reshape(1, 512, 128, 1)
y = model.predict(X, batch_size=32)

target_pred_mag = np.vstack((np.zeros((128)), y.reshape(512, 128)))

write_wav('../wav_files/vocal_20_sample_py.wav',
          istft(target_pred_mag * mix_wav_phase,
                win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
          SAMPLE_RATE, norm=True)
write_wav('../wav_files/mix_downsampled.wav',
          istft(mix_wav_mag * mix_wav_phase,
                win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
          SAMPLE_RATE, norm=True)
write_wav('../wav_files/vocals_downsampled.wav',
          istft(vocal_wav_mag * vocal_wav_phase,
                win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
          SAMPLE_RATE, norm=True)
def random_pred(model_list=['PMTL'], n_samp=2, min_length=1.0,
                fld=test_fld, psongs=test_psongs):
    # Predicts the output of the specified list of systems on random samples
    # from the dataset. Also takes the number of samples for evaluation and
    # the minimum length of each sample.
    nus_train_data = DL.NUS_48E(data_key, [sr, nfft, wlen, hop])
    sampler = DL.nus_samp(data_dir, 1, n_samp, fld, psongs, use_word=True,
                          randomize=True, print_elem=True, min_len=min_length)
    dataload = DataLoader(dataset=nus_train_data, batch_sampler=sampler,
                          collate_fn=my_collate_e8)
    samp_idx = -1
    for data in dataload:
        # Initialize, load the networks and their weights properly,
        # taking into account the exceptions
        samp_idx += 1
        print('Processing sample', samp_idx)
        for idx in range(len(model_list)):
            cur_model = model_list[idx]
            suffix = suffix_dict[cur_model]
            network2 = defModel.exp_net(512, 512, freq=513).to(device)
            if cur_model == 'B2' or cur_model == 'b2':
                network1 = defModel.net_base(512, 512, freq=513).to(device)
            else:
                network1 = defModel.net_in_v2(512, 512, freq=513).to(device)
            if not (cur_model == 'B1' or cur_model == 'b1'):
                network2.load_state_dict(
                    torch.load('output/models/net2_' + suffix + '.pt',
                               map_location=device))
            network1.load_state_dict(
                torch.load('output/models/net1_' + suffix + '.pt',
                           map_location=device))
            network1, network2 = network1.eval(), network2.eval()

            # Make predictions
            encode2 = int(not cur_model == 'B1') * network2(
                Variable(data[3].to(device)))
            pred, encode1 = network1(Variable(data[0].to(device)), encode2)
            pred = pred.cpu().data.numpy()
            pred[pred < 0] = 0

            # Save log-STFTs of input, target and prediction
            saving_dir = 'output/random_predictions/'
            logstft_inp = data[0].numpy()
            logstft_out = data[1].numpy()
            logstft_pred = 1.0 * pred
            np.save(saving_dir + 'inp_lgstft' + str(samp_idx), logstft_inp)
            np.save(saving_dir + 'out_lgstft' + str(samp_idx), logstft_out)
            np.save(saving_dir + 'pred_lgstft' + str(samp_idx), logstft_pred)

            # Get time-domain signals
            stft_pred = np.zeros([513, pred.shape[2]])
            stft_pred[:pred.shape[1]] = np.exp(pred[0]) - 1
            time_pred = utils.gl_rec(stft_pred, hop, wlen,
                                     core.istft(stft_pred**1.0, hop, wlen))
            time_inp_orig = core.istft(data[4][0], hop, wlen)
            time_inp_phase = core.istft(data[5][0], hop, wlen)
            time_target_phase = core.istft(data[6][0], hop, wlen)

            # Save predictions
            librosa.output.write_wav(
                saving_dir + 'original_speech_' + str(samp_idx) + '.wav',
                time_inp_orig, sr)
            librosa.output.write_wav(
                saving_dir + 'stretched_speech_' + str(samp_idx) + '.wav',
                time_inp_phase, sr)
            librosa.output.write_wav(
                saving_dir + 'true_singing_' + str(samp_idx) + '.wav',
                time_target_phase, sr)
            librosa.output.write_wav(
                saving_dir + 'predicted_singing_' + str(samp_idx) + cur_model + '.wav',
                time_pred, sr)
    return
def to_time(image):
    """
    :param image: STFT with the real part in one channel and the imaginary
        part in the other (the two channels are recombined as re + 1j*im below).
    :return: Raw audio
    """
    return lc.istft(image[:, :, 0] + 1j * image[:, :, 1])
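# A minimal companion encoder for to_time() (an assumption based on how the
# channels are combined above: channel 0 = real part, channel 1 = imaginary part).
import numpy as np
import librosa.core as lc

def to_image(y):
    # Stack the real and imaginary parts of the STFT as two trailing channels
    spec = lc.stft(y)
    return np.stack([spec.real, spec.imag], axis=-1)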
def score(self, loader, framewise=False, save_dir=None):
    """
    Score the model.

    Args
    ----
    loader : PyTorch DataLoader.
    """
    self.model.eval()
    class_sdr = defaultdict(list)
    class_sir = defaultdict(list)
    class_sar = defaultdict(list)

    # only perform framewise evaluation at testing time
    if self.n_fft == 1025:
        rate = 22050
        hop = 512
        win = 2048
    elif self.n_fft == 2049:
        rate = 44100
        hop = 1024
        win = 4096
    if not framewise:
        rate = np.inf

    if save_dir:
        class_map = {0: 'bass', 1: 'drums', 2: 'other', 3: 'vocals'}
        mus = musdb.DB(root_dir="data/musdb18")

    # list of batches
    preds, ys, cs, ts, _, nm = self.predict(loader)
    # for each batch
    for b_preds, b_ys, b_cs, b_ts, b_nm in tqdm(list(zip(preds, ys, cs, ts, nm))):
        # for each sample
        for pred, y, c, t, n in zip(b_preds, b_ys, b_cs, b_ts, b_nm):
            pred_recons = []
            y_recons = []
            pred_cs = []
            pred_recons_dict = defaultdict(list)
            y_recons_dict = defaultdict(list)
            # for each class
            for i, (c_pred, c_y, c_c) in enumerate(zip(pred, y, c)):
                # if the class exists in the source signal
                if c_c == 1 and np.abs(c_y).sum() > 0:
                    c_pred = c_pred[..., :t]
                    c_y = c_y[..., :t]
                    # predictions can be over multiple channels
                    pred_recon = []
                    y_recon = []
                    for c_pred_chan, c_y_chan in zip(c_pred, c_y):
                        pred_recon += [istft(c_pred_chan, hop_length=hop, win_length=win)]
                        y_recon += [istft(c_y_chan, hop_length=hop, win_length=win)]
                    pred_recon = np.stack(pred_recon, axis=-1)
                    y_recon = np.stack(y_recon, axis=-1)
                    # accumulate list of reconstructions for stacking
                    pred_recons += [pred_recon]
                    y_recons += [y_recon]
                    pred_cs += [i]
                    if save_dir:
                        pred_recons_dict[class_map[i]] = pred_recon
                        y_recons_dict[class_map[i]] = y_recon
            # possible to sample from targets that are all zeros
            if pred_recons:
                pred_recons = np.stack(pred_recons)
                # possible to predict all zeros...
                # TODO: Figure out how to handle this case properly
                if np.abs(pred_recons.sum()) > 0:
                    y_recons = np.stack(y_recons)
                    # nclasses x time
                    if self.eval_version == 'v3':
                        sdr, sir, sar, _ = bss_eval_sources(
                            y_recons, pred_recons, compute_permutation=False)
                    elif self.eval_version == 'v4':
                        if save_dir:
                            name = loader.dataset.metadata.at[
                                int(n.cpu().numpy()), 'urlId']
                            track = mus.load_mus_tracks(tracknames=[name])[0]
                            sdr, isr, sir, sar = evaluate(
                                y_recons, pred_recons,
                                win=rate, hop=rate, padding=True)
                            data = self._to_evalstore(
                                sdr, sir, isr, sar, rate, rate, class_map)
                            self._save_framewise(data, save_dir, track)
                            continue
                        else:
                            sdr, isr, sir, sar = evaluate(
                                y_recons, pred_recons,
                                win=rate, hop=rate, padding=True)
                        cmb_sdr = np.concatenate([x for x in sdr])
                        sdr = np.nanmean(sdr, axis=1)
                        sir = np.nanmean(sir, axis=1)
                        sar = np.nanmean(sar, axis=1)
                    for m1, m2, m3, cl in zip(sdr, sir, sar, pred_cs):
                        class_sdr[cl] += [m1]
                        class_sir[cl] += [m2]
                        class_sar[cl] += [m3]

    class_sdr_out = defaultdict(list)
    class_sir_out = defaultdict(list)
    class_sar_out = defaultdict(list)
    class_sdr_out['median'] = {k: np.round(np.median(v), 2)
                               for k, v in class_sdr.items()}
    class_sdr_out['mean'] = {k: np.round(np.mean(v), 2)
                             for k, v in class_sdr.items()}
    class_sir_out['median'] = {k: np.round(np.median(v), 2)
                               for k, v in class_sir.items()}
    class_sir_out['mean'] = {k: np.round(np.mean(v), 2)
                             for k, v in class_sir.items()}
    class_sar_out['median'] = {k: np.round(np.median(v), 2)
                               for k, v in class_sar.items()}
    class_sar_out['mean'] = {k: np.round(np.mean(v), 2)
                             for k, v in class_sar.items()}
    return class_sdr_out, class_sir_out, class_sar_out, cmb_sdr
def eval_sys(model_list=['PMTL', 'PMSE', 'B1', 'B2'], n_samp=30,
             min_length=1.0, random=True, fld=test_fld, psongs=test_psongs):
    # Evaluates the specified models on the NUS dataset for the given songs;
    # the default songs comprise our test set. Also takes the number of samples
    # for evaluation (n_samp) and the minimum length of speech in each sample
    # (min_length). Returns an array of all computed LSDs and prints the mean
    # LSD for each model.
    nus_train_data = DL.NUS_48E(data_key, [sr, nfft, wlen, hop])
    sampler = DL.nus_samp(data_dir, 1, n_samp, fld, psongs, use_word=True,
                          randomize=random, print_elem=False, min_len=min_length)
    dataload = DataLoader(dataset=nus_train_data, batch_sampler=sampler,
                          collate_fn=my_collate_e8)
    samp_idx = -1
    lsd = []
    for data in dataload:
        # Initialize, load the networks and their weights properly,
        # taking into account the exceptions
        samp_idx += 1
        print('Processing sample', samp_idx)
        for idx in range(len(model_list)):
            cur_model = model_list[idx]
            suffix = suffix_dict[cur_model]
            network2 = defModel.exp_net(512, 512, freq=513).to(device)
            if cur_model == 'B2':
                network1 = defModel.net_base(512, 512, freq=513).to(device)
            else:
                network1 = defModel.net_in_v2(512, 512, freq=513).to(device)
            if not cur_model == 'B1':
                network2.load_state_dict(
                    torch.load('output/models/net2_' + suffix + '.pt',
                               map_location=device))
            network1.load_state_dict(
                torch.load('output/models/net1_' + suffix + '.pt',
                           map_location=device))
            network1, network2 = network1.eval(), network2.eval()

            # Make predictions
            encode2 = int(not cur_model == 'B1') * network2(
                Variable(data[3].to(device)))
            pred, encode1 = network1(Variable(data[0].to(device)), encode2)
            pred = pred.cpu().data.numpy()
            pred[pred < 0] = 0

            # Temporarily save log-STFTs of input, target and prediction
            logstft_inp = data[0].numpy()
            logstft_out = data[1].numpy()
            logstft_pred = 1.0 * pred
            np.save('runtime_folder/inp_stft', logstft_inp)
            np.save('runtime_folder/out_stft', logstft_out)
            np.save('runtime_folder/pred_stft', logstft_pred)

            # Get time-domain signals
            stft_inp = np.zeros([513, pred.shape[2]])
            stft_pred = np.zeros([513, pred.shape[2]])
            stft_target = np.zeros([513, pred.shape[2]])
            stft_pred[:pred.shape[1]] = np.exp(pred[0]) - 1
            time_pred = utils.gl_rec(stft_pred, hop, wlen,
                                     core.istft(stft_pred**1.0, hop, wlen))
            time_target_phase = core.istft(data[6][0], hop, wlen)

            # Save predictions in the runtime folder
            true_file = 'runtime_folder/runtime_true.wav'
            pred_file = 'runtime_folder/runtime_pred.wav'
            librosa.output.write_wav(true_file, time_target_phase, sr)
            librosa.output.write_wav(pred_file, time_pred, sr)
            calc_lsd = utils.comp_lsd(true_file, pred_file)
            # print(cur_model, calc_lsd)
            lsd.append(calc_lsd)

    # Print the results
    arr = np.zeros([len(model_list), n_samp])
    for i in range(len(model_list) * n_samp):
        arr[i % len(model_list), i // len(model_list)] = lsd[i]
    for i in range(len(model_list)):
        print(model_list[i] + ' (mean LSD):', np.mean(arr[i]))
    return lsd