def main_denoising(wav_dir, out_dir, gpu_id, truncate_minutes):
    if not os.path.exists(wav_dir):
        raise RuntimeError('Cannot locate input directory: %s' % wav_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Since some clips in DIHARD are long, split each file into sub-clips to
    # avoid GPU memory problems during LSTM inference.

    # Load global MVN statistics.
    glo_mean_var = sio.loadmat('./model/global_mvn_stats.mat')
    mean = glo_mean_var['global_mean']
    var = glo_mean_var['global_var']

    wav_files = [os.path.join(wav_dir, fn) for fn in os.listdir(wav_dir)]
    for wav in wav_files:
        if not wav.endswith('.wav'):
            continue
        rate, wav_data = wav_io.read(wav)
        sample_length = wav_data.size

        # Apply peak-normalization first.
        wav_data = utils.peak_normalization(wav_data)

        # Perform denoising in chunks of chunk_length samples.
        chunk_length = int(truncate_minutes * rate * 60)
        total_chunks = int(math.ceil(float(sample_length) / chunk_length))
        se_data_total = np.array([], dtype=np.int16)
        for i in range(1, total_chunks + 1):
            # Get samples for this chunk.
            bi = (i - 1) * chunk_length  # Index of first sample of chunk.
            temp = wav_data[bi:bi + chunk_length]
            print('Currently processing wav: %s, segment: %d/%d.'
                  % (wav, i, total_chunks))

            # Skip denoising if the chunk is shorter than half a frame.
            if temp.shape[0] < 256:
                se_data_total = np.append(se_data_total,
                                          temp.astype(np.int16))
                continue

            # Process the audio via temporary files.
            noisy_normed_lps = 'temp_normed.lps'
            enhanced_wav = 'temp_se.wav'

            # Extract LPS features from the waveform.
            noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(512))

            # Do MVN before decoding.
            normed_noisy = (noisy_htkdata - mean) / var
            utils.writeHtk(noisy_normed_lps, normed_noisy,
                           sampPeriod=160000, parmKind=9)

            # Write the decoding script file in the format CNTK expects.
            cntk_len = noisy_htkdata.shape[0] - 1
            with open('./test_normed.scp', 'w') as flist:
                flist.write('test.normedlsp=temp_normed.lps[0,%d]\n'
                            % cntk_len)

            # Run CNTK model decoding.
            os.system('python decode_model.py %d' % gpu_id)

            # Read the decoded ideal ratio mask (IRM) and directly mask the
            # original LPS features.
            SE_mat = sio.loadmat('enhanced_norm_fea_mat/test.normedlsp.mat')
            IRM = SE_mat['SE']
            masked_lps = noisy_htkdata + np.log(IRM)

            # Back to the time domain; write the enhanced chunk and read it
            # back in as int16 samples.
            wave_recon = utils.logspec2wav(masked_lps, temp,
                                           window=np.hamming(512),
                                           nperseg=512, noverlap=256)
            wav_io.write(enhanced_wav, 16000, np.asarray(wave_recon))
            rate, data_se = wav_io.read(enhanced_wav)
            se_data_total = np.append(se_data_total, data_se)

        output_wav = os.path.join(out_dir, os.path.basename(wav))
        wav_io.write(output_wav, 16000, np.asarray(se_data_total))
        print('Processing wav: %s, done.' % wav)
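
# Illustrative driver for main_denoising() above; a minimal sketch, not part
# of the original script. All argument values (paths, GPU id, chunk length)
# are hypothetical placeholders.
def _example_main_denoising():
    main_denoising(wav_dir='data/noisy_wavs',
                   out_dir='data/enhanced_wavs',
                   gpu_id=0,
                   truncate_minutes=10)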
def denoise_wav(src_wav_file, dest_wav_file, global_mean, global_var, use_gpu,
                gpu_id, truncate_minutes):
    """Apply speech enhancement to audio in WAV file.

    Parameters
    ----------
    src_wav_file : str
        Path to WAV to denoise.
    dest_wav_file : str
        Output path for denoised WAV.
    global_mean : ndarray, (n_feats,)
        Global mean for LPS features. Used for CMVN.
    global_var : ndarray, (n_feats,)
        Global variances for LPS features. Used for CMVN.
    use_gpu : bool
        If True and a GPU is available, perform all processing on GPU.
    gpu_id : int
        Id of GPU on which to do computation.
    truncate_minutes : float
        Maximum size in minutes to process at a time. The enhancement will be
        done on chunks of audio no greater than ``truncate_minutes`` minutes
        duration.
    """
    # Read noisy audio WAV file. As scipy.io.wavfile.read is FAR faster than
    # librosa.load, we use the former.
    rate, wav_data = wav_io.read(src_wav_file)

    # Apply peak-normalization.
    wav_data = utils.peak_normalization(wav_data)

    # Perform denoising in chunks of size chunk_length samples.
    chunk_length = int(truncate_minutes * rate * 60)
    total_chunks = int(math.ceil(wav_data.size / chunk_length))
    data_se = []  # Will hold enhanced audio data for each chunk.
    for i in range(1, total_chunks + 1):
        tmp_dir = tempfile.mkdtemp()
        try:
            # Get samples for this chunk.
            bi = (i - 1) * chunk_length  # Index of first sample of chunk.
            ei = bi + chunk_length  # Index of last sample of chunk + 1.
            temp = wav_data[bi:ei]
            print('Processing file: %s, segment: %d/%d.'
                  % (src_wav_file, i, total_chunks))

            # Skip denoising if chunk is too short.
            if temp.shape[0] < WL2:
                data_se.append(temp)
                continue

            # Determine paths to the temporary files to be created.
            noisy_normed_lps_fn = os.path.join(
                tmp_dir, 'noisy_normed_lps.htk')
            noisy_normed_lps_scp_fn = os.path.join(
                tmp_dir, 'noisy_normed_lps.scp')
            irm_fn = os.path.join(tmp_dir, 'irm.mat')

            # Extract LPS features from waveform.
            noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(WL))

            # Do MVN before decoding.
            normed_noisy = (noisy_htkdata - global_mean) / global_var

            # Write features to HTK binary format, making sure to also
            # create a script file.
            utils.write_htk(noisy_normed_lps_fn, normed_noisy,
                            samp_period=SR, parm_kind=9)
            cntk_len = noisy_htkdata.shape[0] - 1
            with open(noisy_normed_lps_scp_fn, 'w') as f:
                f.write('irm=%s[0,%d]\n' % (noisy_normed_lps_fn, cntk_len))

            # Apply the CNTK model to determine the ideal ratio mask (IRM),
            # which is output to the temp directory as irm.mat.
            decode_model(noisy_normed_lps_scp_fn, tmp_dir, NFREQS, use_gpu,
                         gpu_id)

            # Read in IRM and directly mask the original LPS features.
            irm = sio.loadmat(irm_fn)['IRM']
            masked_lps = noisy_htkdata + np.log(irm)

            # Reconstruct audio.
            wave_recon = utils.logspec2wav(masked_lps, temp,
                                           window=np.hamming(WL),
                                           n_per_seg=WL, noverlap=WL2)
            data_se.append(wave_recon)
        finally:
            shutil.rmtree(tmp_dir)
    data_se = np.concatenate(data_se)
    wav_io.write(dest_wav_file, SR, data_se)
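
# Illustrative single-file usage of the denoise_wav() variant defined
# directly above; a minimal sketch assuming the global MVN statistics live in
# the same .mat file loaded by main_denoising(). File names are hypothetical.
def _example_denoise_wav():
    glo_mean_var = sio.loadmat('./model/global_mvn_stats.mat')
    denoise_wav(src_wav_file='noisy.wav',
                dest_wav_file='enhanced.wav',
                global_mean=glo_mean_var['global_mean'],
                global_var=glo_mean_var['global_var'],
                use_gpu=True,
                gpu_id=0,
                truncate_minutes=10)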
def denoise_wav(src_wav_file, dest_wav_file, global_mean, global_var, use_gpu,
                truncate_minutes, mode, stage_select):
    """Apply speech enhancement to audio in WAV file.

    Parameters
    ----------
    src_wav_file : str
        Path to WAV to denoise.
    dest_wav_file : str
        Output path for denoised WAV.
    global_mean : ndarray, (n_feats,)
        Global mean for LPS features. Used for CMVN.
    global_var : ndarray, (n_feats,)
        Global variances for LPS features. Used for CMVN.
    use_gpu : bool
        If True and a GPU is available, perform all processing on GPU.
    truncate_minutes : float
        Maximum size in minutes to process at a time. The enhancement will be
        done on chunks of audio no greater than ``truncate_minutes`` minutes
        duration.
    mode : int
        Output selection: 1 uses the estimated ideal ratio mask (IRM; more
        conservative), 2 uses the estimated LPS features (more aggressive),
        and 3 fuses both with equal weights (trade-off).
    stage_select : int
        1-based index of the model stage whose outputs are used.
    """
    # Read noisy audio WAV file. As scipy.io.wavfile.read is FAR faster than
    # librosa.load, we use the former.
    rate, wav_data = wav_io.read(src_wav_file)

    if mode == 1:
        print('### Selecting the estimated ideal ratio masks in mode 1 '
              '(more conservative). ###')
    elif mode == 2:
        print('### Selecting the estimated log-power-spectral features in '
              'mode 2 (more aggressive). ###')
    elif mode == 3:
        print('### Selecting both estimated IRM and LPS outputs with equal '
              'weights in mode 3 (trade-off). ###')

    # Apply peak-normalization.
    wav_data = utils.peak_normalization(wav_data)

    # Perform denoising in chunks of size chunk_length samples.
    chunk_length = int(truncate_minutes * rate * 60)
    total_chunks = int(math.ceil(wav_data.size / chunk_length))
    data_se = []  # Will hold enhanced audio data for each chunk.

    # Fetch the pre-trained model if it is not cached locally, then load it.
    model_pth = os.path.join(HERE, '1000h_se.pth')
    if not os.path.exists(model_pth):
        cmd = 'cp {} {}'.format(
            '/export/fs01/jsalt19/leisun/speech_enhancement/'
            'speech_denoising_pytorch/model/1000h_se.pth',
            model_pth)
        os.system(cmd)
    device = torch.device(
        'cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
    nnet = LSTM_SE_PL_Dense_MTL(257, 7, 1024, 3, 257, 'false')
    nnet.load_state_dict(torch.load(model_pth, map_location='cpu'))
    nnet = nnet.to(device)
    nnet.eval()

    for i in range(1, total_chunks + 1):
        # Get samples for this chunk.
        bi = (i - 1) * chunk_length  # Index of first sample of chunk.
        ei = bi + chunk_length  # Index of last sample of chunk + 1.
        temp = wav_data[bi:ei]
        print('Processing file: %s, segment: %d/%d.'
              % (src_wav_file, i, total_chunks))

        # Skip denoising if chunk is too short.
        if temp.shape[0] < WL2:
            data_se.append(temp)
            continue

        # Extract LPS features from waveform.
        noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(WL))

        # Frame expansion of the input (3 frames of left/right context).
        noisy_htkdata_expand = utils.expand_frames(noisy_htkdata, [3, 3])
        inputs = torch.from_numpy(
            (noisy_htkdata_expand - global_mean) / global_var)
        with torch.no_grad():
            lps_outputs, irm_outputs = nnet(
                torch.unsqueeze(inputs, 1).to(device).float())

        if mode == 1:
            print('  Using the estimated IRM.')
            irm = torch.squeeze(
                irm_outputs[stage_select - 1]).cpu().data.numpy()
            recovered_lps = noisy_htkdata + np.log(irm)
        elif mode == 2:
            print('  Using the estimated LPS.')
            lps = torch.squeeze(
                lps_outputs[stage_select - 1]).cpu().data.numpy()
            recovered_lps = lps * global_var[:257] + global_mean[:257]
        elif mode == 3:
            print('  Using the fusion of estimated IRM and LPS.')
            irm = torch.squeeze(
                irm_outputs[stage_select - 1]).cpu().data.numpy()
            lps = torch.squeeze(
                lps_outputs[stage_select - 1]).cpu().data.numpy()
            recovered_lps = (
                0.5 * (noisy_htkdata + np.log(irm))
                + 0.5 * (lps * global_var[:257] + global_mean[:257]))

        # Reconstruct audio.
        wave_recon = utils.logspec2wav(recovered_lps, temp,
                                       window=np.hamming(WL),
                                       n_per_seg=WL, noverlap=WL2)
        data_se.append(wave_recon)

    data_se = [x.astype(np.int16, copy=False) for x in data_se]
    data_se = np.concatenate(data_se)
    wav_io.write(dest_wav_file, SR, data_se)
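
# Sketch of the context splicing that utils.expand_frames(feats, [3, 3])
# plausibly performs: each frame is concatenated with its 3 left and 3 right
# neighbours (edges padded by repetition), yielding 7 * 257 = 1799 features
# per frame, consistent with the (257, 7, ...) arguments passed to
# LSTM_SE_PL_Dense_MTL above. This is an assumption about the helper's
# behaviour, not its actual implementation.
def _expand_frames_sketch(feats, context=(3, 3)):
    left, right = context
    padded = np.pad(feats, ((left, right), (0, 0)), mode='edge')
    n_frames = feats.shape[0]
    # Stack shifted copies side by side: [t-3, ..., t, ..., t+3].
    return np.hstack([padded[k:k + n_frames]
                      for k in range(left + right + 1)])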
def denoise_wav(src_wav_file, dest_wav_file, global_mean, global_var, use_gpu,
                gpu_id, truncate_minutes, mode, model_select, stage_select):
    """Apply speech enhancement to audio in WAV file.

    Parameters
    ----------
    src_wav_file : str
        Path to WAV to denoise.
    dest_wav_file : str
        Output path for denoised WAV.
    global_mean : ndarray, (n_feats,)
        Global mean for LPS features. Used for CMVN.
    global_var : ndarray, (n_feats,)
        Global variances for LPS features. Used for CMVN.
    use_gpu : bool
        If True and a GPU is available, perform all processing on GPU.
    gpu_id : int
        Id of GPU on which to do computation.
    truncate_minutes : float
        Maximum size in minutes to process at a time. The enhancement will be
        done on chunks of audio no greater than ``truncate_minutes`` minutes
        duration.
    mode : int
        Output selection: 1 uses the estimated ideal ratio mask (IRM; more
        conservative), 2 uses the estimated LPS features (more aggressive),
        and 3 fuses both with equal weights (trade-off).
    model_select : str
        Which pre-trained model to use; one of {'400h', '1000h'}.
    stage_select : int
        1-based index of the model stage whose outputs are used.
    """
    # Read noisy audio WAV file. As scipy.io.wavfile.read is FAR faster than
    # librosa.load, we use the former.
    rate, wav_data = wav_io.read(src_wav_file)

    if mode == 1:
        print('### Selecting the estimated ideal ratio masks in mode 1 '
              '(more conservative). ###')
    elif mode == 2:
        print('### Selecting the estimated log-power-spectral features in '
              'mode 2 (more aggressive). ###')
    elif mode == 3:
        print('### Selecting both estimated IRM and LPS outputs with equal '
              'weights in mode 3 (trade-off). ###')
    print('Using the pre-trained {} speech enhancement model.'.format(
        model_select))

    # Apply peak-normalization.
    wav_data = utils.peak_normalization(wav_data)

    # Perform denoising in chunks of size chunk_length samples.
    chunk_length = int(truncate_minutes * rate * 60)
    total_chunks = int(math.ceil(wav_data.size / chunk_length))
    data_se = []  # Will hold enhanced audio data for each chunk.
    for i in range(1, total_chunks + 1):
        tmp_dir = tempfile.mkdtemp()
        try:
            # Get samples for this chunk.
            bi = (i - 1) * chunk_length  # Index of first sample of chunk.
            ei = bi + chunk_length  # Index of last sample of chunk + 1.
            temp = wav_data[bi:ei]
            print('Processing file: %s, segment: %d/%d.'
                  % (src_wav_file, i, total_chunks))

            # Skip denoising if chunk is too short.
            if temp.shape[0] < WL2:
                data_se.append(temp)
                continue

            # Determine paths to the temporary files to be created.
            noisy_normed_lps_fn = os.path.join(
                tmp_dir, 'noisy_normed_lps.htk')
            noisy_normed_lps_scp_fn = os.path.join(
                tmp_dir, 'noisy_normed_lps.scp')
            outputs_fn = os.path.join(tmp_dir, 'irm.mat')

            # Extract LPS features from waveform.
            noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(WL))

            # Do MVN before decoding.
            normed_noisy = (noisy_htkdata - global_mean) / global_var

            # Write features to HTK binary format, making sure to also
            # create a script file. The 1000h model already integrates MVN
            # internally, so it is fed the raw LPS features.
            if model_select.lower() == '400h':
                utils.write_htk(noisy_normed_lps_fn, normed_noisy,
                                samp_period=SR, parm_kind=9)
            elif model_select.lower() == '1000h':
                utils.write_htk(noisy_normed_lps_fn, noisy_htkdata,
                                samp_period=SR, parm_kind=9)
            cntk_len = noisy_htkdata.shape[0] - 1
            with open(noisy_normed_lps_scp_fn, 'w') as f:
                f.write('irm=%s[0,%d]\n' % (noisy_normed_lps_fn, cntk_len))

            # Apply the CNTK model to estimate the ideal ratio mask (IRM)
            # and enhanced LPS features, which are written to the temp
            # directory as irm.mat. To avoid a memory leak, this must be done
            # in a separate process which is then killed.
            p = Process(target=decode_model,
                        args=(noisy_normed_lps_scp_fn, tmp_dir, NFREQS,
                              use_gpu, gpu_id, mode, model_select,
                              stage_select))
            p.start()
            p.join()
            if p.exception:
                e, tb = p.exception
                raise type(e)(tb)

            # Read in the outputs and recover the enhanced LPS features.
            outputs = sio.loadmat(outputs_fn)
            irm = outputs['IRM']
            lps = outputs['LPS']
            if mode == 1:
                recovered_lps = noisy_htkdata + np.log(irm)
            elif mode == 2:
                recovered_lps = lps * global_var + global_mean
            elif mode == 3:
                recovered_lps = (
                    0.5 * (noisy_htkdata + np.log(irm))
                    + 0.5 * (lps * global_var + global_mean))

            # Reconstruct audio.
            wave_recon = utils.logspec2wav(recovered_lps, temp,
                                           window=np.hamming(WL),
                                           n_per_seg=WL, noverlap=WL2)
            data_se.append(wave_recon)
        finally:
            shutil.rmtree(tmp_dir)

    data_se = [x.astype(np.int16, copy=False) for x in data_se]
    data_se = np.concatenate(data_se)
    wav_io.write(dest_wav_file, SR, data_se)
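
# The loop above reads ``p.exception``, which plain multiprocessing.Process
# does not provide. The repository presumably defines (or imports as
# ``Process``) a small subclass along the lines of this well-known recipe;
# this is a sketch, not the actual class. The child's exception and traceback
# are sent over a pipe so the parent can re-raise them after join().
import multiprocessing
import traceback


class ExceptionCapturingProcess(multiprocessing.Process):
    def __init__(self, *args, **kwargs):
        super(ExceptionCapturingProcess, self).__init__(*args, **kwargs)
        self._parent_conn, self._child_conn = multiprocessing.Pipe()
        self._exception = None

    def run(self):
        # Runs in the child process; forward any exception to the parent.
        try:
            super(ExceptionCapturingProcess, self).run()
            self._child_conn.send(None)
        except Exception as e:
            self._child_conn.send((e, traceback.format_exc()))

    @property
    def exception(self):
        if self._parent_conn.poll():
            self._exception = self._parent_conn.recv()
        return self._exception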