def reconstruct_audio(self, description, irm=None, mask=None, idx=None, test=False):
    """Reconstruct estimated audio and write it to disk (or report NMSE).

    The output path of file ``j`` is ``self.y_wavfiles[j]`` with the
    substring 'scaled' replaced by 'enhanced_<description>'; missing output
    directories are created.

    Parameters
    ----------
    description : string
        Tag inserted into the output file name.
    irm : array or None
        Optional mask data. In the "all files" and "list" modes it is only
        used when `mask` is given as well: ``irm[j, :n, :].T`` is passed as
        ``mask=`` to ``self.reconstruct_x``, with ``n = sum(mask[j, :])``.
        In the single-index mode it is passed to ``self.reconstruct_x``
        unchanged.
    mask : array or None
        Optional array whose row sums select how many leading rows of
        ``irm[j]`` are kept (not used in the single-index mode).
    idx : None, list or index
        None processes every entry of ``self.x_wavfiles``; a list processes
        the listed indices; anything else is treated as one index.
    test : bool
        Only honoured when `idx` is a list. When true, nothing is written;
        instead the NMSE between the files on disk and the reconstructions
        is printed.

    Returns
    -------
    None
    """

    def estimate(j):
        # Both irm and mask are required to apply a mask in batch modes.
        if irm is None or mask is None:
            return self.reconstruct_x(j)
        # int() keeps the slice bound valid when mask has a float dtype.
        n_rows = int(np.sum(mask[j, :]))
        return self.reconstruct_x(j, mask=irm[j, :n_rows, :].T)

    def write_enhanced(j, yest):
        wavfile_enhanced = self.y_wavfiles[j].replace(
            'scaled', 'enhanced_%s' % description)
        out_dir = os.path.dirname(wavfile_enhanced)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        util.wavwrite(wavfile_enhanced, 16e3, yest)

    if idx is None:
        for j in range(len(self.x_wavfiles)):
            write_enhanced(j, estimate(j))
    elif isinstance(idx, list):
        for j in idx:
            yest = estimate(j)
            if not test:
                write_enhanced(j, yest)
                continue
            # The y reconstruction is only needed for the NMSE report.
            y = self.reconstruct_y(j)
            y_orig = util.wavread(self.y_wavfiles[j])[0:1, :]
            x = util.wavread(self.x_wavfiles[j])[0:1, :]
            # Trim reconstructions that are longer than the file on disk.
            if yest.shape[1] > x.shape[1]:
                yest = yest[:, :x.shape[1]]
            if y.shape[1] > y_orig.shape[1]:
                y = y[:, :y_orig.shape[1]]
            # Single-argument print(...) is valid in Python 2 and Python 3.
            print("For file %d, NMSE between original x and yest is %e" % (
                j, np.mean((x - yest)**2) / np.mean(x**2)))
            print("For file %d, NMSE between original y_orig and y is %e" % (
                j, np.mean((y_orig - y)**2) / np.mean(y_orig**2)))
    else:
        # Single index: irm (if any) is handed over as the mask as-is.
        if irm is None:
            yest = self.reconstruct_x(idx)
        else:
            yest = self.reconstruct_x(idx, mask=irm)
        write_enhanced(idx, yest)
    return
def load_wav(fname, fs):
    """Read a wav file, check its sampling rate and return a mono signal.

    Parameters
    ----------
    fname : string
        wav input file name
    fs : int
        expected sampling rate of the file, in Hz

    Returns
    -------
    sig : np.array
        mono-channel signal; multi-channel input is averaged over its
        channel axis (axis 1)

    Raises
    ------
    ValueError
        if the sampling rate returned by ``wavread`` differs from `fs`
    """
    sig, found_fs = wavread(fname)
    if fs != found_fs:
        raise ValueError('sampling rate should be {0}, not {1}. '
                         'please resample.'.format(fs, found_fs))
    if len(sig.shape) > 1:
        warnings.warn('stereo audio: merging channels')
        # mean() accumulates integer samples in float64, so summing two
        # int16 channels can no longer wrap around, and inputs with more
        # than two channels are averaged over all of them.
        sig = sig.mean(axis=1)
    return sig
def __getitem__(self, idx):
    """Load the mixture/vocal pair at position ``idx``.

    Both files are read with ``util.wavread``. When ``self.mono`` is set,
    multi-channel signals are averaged over their last axis. The pair is
    returned as a dict with float32 arrays under the keys 'mixture' and
    'vocal', after being passed through ``self.transform`` when one is
    configured.

    Raises
    ------
    IndexError
        if ``idx`` is not smaller than ``len(self.mixturePaths)``
    """
    if idx >= len(self.mixturePaths):
        raise IndexError
    # The sampling rates returned by wavread are not used here.
    mixture, _ = util.wavread(self.mixturePaths[idx])
    vocal, _ = util.wavread(self.vocalPaths[idx])
    if self.mono:
        # Downmix only signals that have a channel axis: averaging a 1-D
        # (already mono) signal over its last axis would reduce it to a
        # single number.
        if np.ndim(mixture) > 1:
            mixture = np.mean(mixture, axis=-1)
        if np.ndim(vocal) > 1:
            vocal = np.mean(vocal, axis=-1)
    sample = {
        'mixture': mixture.astype(np.float32),
        'vocal': vocal.astype(np.float32)
    }
    if self.transform is not None:
        sample = self.transform(sample)
    return sample
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    """Build one feature matrix (and optionally one label vector) per file.

    For every distinct ``df.filename`` the audio is read with ``wavread``,
    encoded with ``Spectral(**spec_kwargs)`` and ``VAD(**vad_kwargs)``, and
    the frames between ``int(row.start * frate)`` and
    ``int(row.end * frate)`` of every row of that file are stacked.

    Returns a dict ``filename -> float32 matrix``. When `return_y` is true
    and `df` has a 'label' column, returns ``(X, y)`` where ``y`` maps
    ``filename`` to the per-frame ``label2ix[row.label]`` values.

    Raises ValueError when a file's sampling rate differs from
    ``spec_kwargs['fs']``.
    """
    # Labels are only produced when asked for AND available in the frame.
    with_labels = bool(return_y) and 'label' in df

    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)

    features = {}
    labels = {}
    for fname in df.filename.unique():
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))

        # Spectral features, normalised per column over the whole file.
        spec = spectrum_encoder.transform(sig)
        centered = spec - spec.mean(0)
        spec = centered / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)

        # VAD activations, flattened to one row per frame.
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)

        file_feats = []
        file_labels = []
        for _, row in df[df.filename == fname].iterrows():
            first = int(row.start * frate)
            last = int(row.end * frate)
            segment = np.hstack((spec[first:last], vad[first:last]))
            file_feats.append(segment.astype(np.float32))
            if with_labels:
                file_labels.append(
                    np.ones(segment.shape[0], dtype=np.uint8) *
                    label2ix[row.label])

        features[fname] = np.vstack(file_feats)
        if with_labels:
            labels[fname] = np.hstack(file_labels)

    if with_labels:
        return features, labels
    return features
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    """Extract stacked spectral + VAD frames for the intervals listed in `df`.

    `df` is read through its ``filename``, ``start``, ``end`` and (when
    labels are requested) ``label`` columns. Each file is loaded once with
    ``wavread``; its sampling rate must equal ``spec_kwargs['fs']``,
    otherwise ValueError is raised. Interval bounds are converted to frame
    indices with ``int(value * frate)``.

    Returns ``X`` (dict: filename -> float32 feature matrix), or ``(X, y)``
    when `return_y` is true and `df` contains a 'label' column; ``y`` holds,
    per file, one ``label2ix[row.label]`` entry per extracted frame.
    """

    def maybe_stack(frames):
        # Frame stacking is only applied for stacksize > 1.
        if stacksize > 1:
            return roll_array(frames, stacksize)
        return frames

    if return_y:
        # Silently fall back to features only if there is no label column.
        return_y = 'label' in df

    X = {}
    y = {} if return_y else None
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)

    for fname in df.filename.unique():
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            message = 'expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs)
            raise ValueError(message)

        spec = spectrum_encoder.transform(sig)
        # Zero mean / unit variance per column, computed over this file.
        spec = maybe_stack((spec - spec.mean(0)) / spec.std(0))

        vad = vad_encoder.activations(sig)
        vad = maybe_stack(vad.reshape(vad.shape[0], -1))

        chunks = []
        targets = []
        rows_of_file = df[df.filename == fname]
        for _, row in rows_of_file.iterrows():
            window = slice(int(row.start * frate), int(row.end * frate))
            feat = np.hstack((spec[window], vad[window]))
            chunks.append(feat.astype(np.float32))
            if return_y:
                ones = np.ones(feat.shape[0], dtype=np.uint8)
                targets.append(ones * label2ix[row.label])

        X[fname] = np.vstack(chunks)
        if return_y:
            y[fname] = np.hstack(targets)

    return (X, y) if return_y else X
def _load_wav(self, fname):
    """Memoized audio loader.

    Returns the signal stored under `fname` in ``self.wav_cache``. On a
    cache miss the file is read with ``wavread``, its sampling rate is
    checked against ``self.fs``, multi-channel audio is averaged over its
    channel axis (axis 1), and the result is cached before being returned.

    Raises
    ------
    ValueError
        if the sampling rate returned by ``wavread`` differs from
        ``self.fs``
    """
    key = fname
    if key not in self.wav_cache:
        sig, fs_ = wavread(fname)
        if self.fs != fs_:
            raise ValueError('sampling rate should be {0}, not {1}. '
                             'please resample.'.format(self.fs, fs_))
        if len(sig.shape) > 1:
            warnings.warn('stereo audio: merging channels')
            # mean() accumulates integer samples in float64, so adding two
            # int16 channels can no longer wrap around, and inputs with
            # more than two channels are averaged over all of them.
            sig = sig.mean(axis=1)
        self.wav_cache[key] = sig
    return self.wav_cache[key]
def run(self):
    """Stream min/max values of imported wav files into the plot data.

    While ``self.event`` is set and ``self.wavfiles`` exists, every file is
    read with ``util.wavread``, rescaled with ``(x + 32768) / 16`` and
    walked in 1024-sample chunks. For each chunk every series in ``padata``
    is shifted left by one slot, the chunk's max and min are written to the
    last slot of ``padata['max']`` / ``padata['min']``, ``padata`` is
    published in ``padict`` under ``self.figurename``, ``self.plotsignal``
    is emitted when ``self.plotmode`` is "by data", and the loop sleeps for
    ``self.importwavspreed / self.importspreed`` seconds.

    NOTE(review): the nesting below was reconstructed from a source whose
    whitespace was collapsed -- confirm which loop the publish/emit/sleep
    statements belong to.
    NOTE(review): ``padict`` is not defined in this block; presumably a
    module-level dict -- verify.
    """
    import algorithm
    padata = algorithm.creat_data(algorithm.Names)
    # NOTE(review): when 'wavfiles' is missing this loop spins without
    # sleeping until the event is cleared.
    while self.event.isSet():
        if hasattr(self, 'wavfiles'):
            for wavfile in self.wavfiles:
                # `unicode` is a Python 2 builtin; fs, bits and N are unused.
                x, fs, bits, N = util.wavread(unicode(wavfile))
                self.x = (x + 32768) / 16
                # Relies on Python 2 integer division for the range bound.
                # NOTE(review): i runs from 1 to len(x)/1024 - 1, so the
                # final complete chunk is never read -- confirm intended.
                for i in range(1, len(x) / 1024):
                    # Shift every series left by one to free the last slot.
                    for key in padata:
                        padata[key][:-1] = padata[key][1:]
                    raw_data = self.x[1024 * (i - 1):1024 * i]
                    padata['max'][-1] = max(raw_data)
                    padata['min'][-1] = min(raw_data)
                    padict.update({self.figurename: padata})
                    if self.plotmode == "by data":
                        self.plotsignal.emit()
                    time.sleep(self.importwavspreed / self.importspreed)
import numpy as np
import Model_fcn  # Model
import torch
import util
import sys

# Command-line entry point: read one wav file, compute its STFT magnitude and
# phase, and load the pretrained model from "Modelfcn.pt" in eval mode.
if __name__ == "__main__":
    blockSize = 4096
    hopSize = 2048
    if len(sys.argv) != 3:
        print("Usage:\n", sys.argv[0], "input_path output_path")
        sys.exit(1)
    # read the wav file
    x, fs = util.wavread(sys.argv[1])
    # downmix to single channel; a signal without a channel axis is left
    # untouched, because averaging a 1-D signal over its last axis would
    # reduce it to a single number
    if np.ndim(x) > 1:
        x = np.mean(x, axis=-1)
    # perform stft
    S = util.stft_real(x, blockSize=blockSize, hopSize=hopSize)
    magnitude = np.abs(S).astype(np.float32)
    angle = np.angle(S).astype(np.float32)
    # initialize the model
    model = Model_fcn.ModelSingleStep(blockSize)
    # load the pretrained model; the map_location callable returns each
    # storage as deserialized instead of moving it to its saved device
    model.load_state_dict(
        torch.load("Modelfcn.pt", map_location=lambda storage, loc: storage))
    # switch to eval mode
    model.eval()
maxlen = None maxlen = 500 print "Loading data..." # development data D_valid = AudioDataset(config['taskfile_x_valid'], config['taskfile_y_valid'], datafile=config['datafile_valid'], params_stft=config['params_stft']) #print " Loading validation data..." #x_valid, y_valid, mask_valid = D_valid.get_padded_data_matrix(transform_x=transform_x, transform_y=transform_y, pad_value=mask_value, maxlen=maxlen) for i in range(10): x = util.wavread(D_valid.x_wavfiles[i])[0:1, :] xr = D_valid.reconstruct_x(i)[0:1, :] if xr.shape[1] > x.shape[1]: xr = xr[:, :x.shape[1]] print "For file %d, NMSE between original x and reconstructed x is %e" % ( i, np.mean((x - xr)**2) / np.mean(x**2)) y = util.wavread(D_valid.y_wavfiles[i])[0:1, :] yr = D_valid.reconstruct_y(i) if yr.shape[1] > y.shape[1]: yr = yr[:, :y.shape[1]] print "For file %d, NMSE between original y and reconstructed y is %e" % ( i, np.mean((y - yr)**2) / np.mean(y**2)) D_valid.reconstruct_audio(description="test_reconstruction_audio", idx=range(10),
# Map every file found under `mix` to its output path under `est`.
# `inpath`, `mix` and `est` are defined before this fragment.
outpath = {}
# NOTE(review): paths are built as mix + song, i.e. the directory yielded by
# os.walk (i) is ignored and `mix` must end with a path separator; files in
# sub-directories of `mix` would get a wrong input path -- confirm that
# `mix` is always a flat directory.
for i, j, k in os.walk(mix):
    for song in k:
        inpath += [mix + song]
        outpath[mix + song] = est + song
for f in inpath:
    input_path = f
    output_path = outpath[f]
    print(f)
    blockSize = 4096
    hopSize = 2048
    # read the wav file
    x, fs = util.wavread(input_path)
    # downmix to single channel; a signal without a channel axis is left
    # untouched, because averaging a 1-D signal over its last axis would
    # reduce it to a single number
    if np.ndim(x) > 1:
        x = np.mean(x, axis=-1)
    # perform stft
    S = util.stft_real(x, blockSize=blockSize, hopSize=hopSize)
    magnitude = np.abs(S).astype(np.float32)
    angle = np.angle(S).astype(np.float32)
    # initialize the model
    # NOTE(review): the model is rebuilt and the checkpoint re-read for every
    # file; if nothing later in the loop depends on a fresh instance this
    # could be done once before the loop.
    model = Model.ModelSingleStep(blockSize)
    # load the pretrained model; the map_location callable returns each
    # storage as deserialized instead of moving it to its saved device
    checkpoint = torch.load("savedModel_RNN_best.pt",
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])