def prep_dataset(params):
    """Create the speech + noise mixtures at several input SNRs and record them.

    For each of the 'n_mix' mixtures, one clean speech file is paired with a
    randomly located noise excerpt of the same length. For every input SNR
    the noise is rescaled, both sources go through an STFT / iSTFT round
    trip, and the result is recorded under 'data/SNR_<iSNR>/<mix index>/'.
    The noise start indices of the mixtures from index 50 onwards are split
    into three groups (first / middle / last third of the concatenated
    noise) and saved to 'data/noise_ind.npz'.

    Args:
        params: dictionary with fields:
            'noise_data_dir': string - the path to the noise data
            'speech_data_dir': string - the path to the clean speech data
            'sample_rate': int - the sampling frequency
            'n_mix': int - the number of mixtures to create
            'input_SNR_list': list - the list of input SNRs to consider
            'n_fft': int - the number of FFT points
            'hop_length': int - the hop size of the STFT
            'win_length': int - the window length
            'win_type': string - the STFT window type

    Raises:
        ValueError: if a clean speech signal is longer than the total
            available noise, so that no full-length excerpt can be drawn.
    """
    # Load the noises, keep the 1st of the 16 channels
    noise_files = [
        path
        for path in librosa.util.find_files(params['noise_data_dir'], ext='wav')
        if 'ch01' in path
    ]

    # Concatenate all the files in one go (instead of growing the array file
    # after file). The leading empty float64 array keeps the result in
    # float64 and makes the call valid even when no file was found.
    noise_total = np.concatenate(
        [np.array([])]
        + [librosa.core.load(path, sr=params['sample_rate'])[0]
           for path in noise_files])
    noise_total_len = noise_total.shape[0]
    noise_beg_ind = []

    # Load the list of clean speech files and shuffle it
    speech_data_list = librosa.util.find_files(params['speech_data_dir'],
                                               ext='wav')
    np.random.shuffle(speech_data_list)

    # Create the mixtures
    for n in range(params['n_mix']):
        print(f"Creating mix {n + 1} / {params['n_mix']}")

        # Load the speech
        clean = librosa.core.load(speech_data_list[n],
                                  sr=params['sample_rate'])[0]
        len_clean = clean.shape[0]

        # Take a piece of the noise of same length. The start index is drawn
        # so that the excerpt never runs past the end of the noise; otherwise
        # the slice would be silently truncated and no longer match the
        # speech length.
        if len_clean > noise_total_len:
            raise ValueError(
                'Speech file ' + str(speech_data_list[n]) + ' has '
                + str(len_clean) + ' samples but only '
                + str(noise_total_len) + ' noise samples are available')
        rand_sample_noise_beg = np.random.randint(
            noise_total_len - len_clean + 1)
        noise = noise_total[
            rand_sample_noise_beg:rand_sample_noise_beg + len_clean]

        # Collect the noise index (to further study the results as a
        # function of the noise type)
        noise_beg_ind.append(rand_sample_noise_beg)

        # Adjust the input SNR and record audio
        for iSNR in params['input_SNR_list']:
            # Adjust the noise at target input SNR
            noise_adj = adjust_noise_at_isnr(clean, noise, input_snr=iSNR)
            src_ref = np.concatenate(
                (clean[:, np.newaxis], noise_adj[:, np.newaxis]), axis=1)

            # Take the STFT and iSTFT to ensure the length is fixed
            src_ref_stft = my_stft(src_ref,
                                   n_fft=params['n_fft'],
                                   hop_length=params['hop_length'],
                                   win_length=params['win_length'],
                                   win_type=params['win_type'])
            src_ref = my_istft(src_ref_stft,
                               hop_length=params['hop_length'],
                               win_length=params['win_length'],
                               win_type=params['win_type'])

            # Create the folder to record the wav (if necessary)
            rec_dir = 'data/SNR_' + str(iSNR) + '/' + str(n)
            os.makedirs(rec_dir, exist_ok=True)

            # Record wav
            record_src(rec_dir + '/', src_ref, params['sample_rate'],
                       rec_mix=True)

    # Get the indices of noise type for each mixture in the test set and
    # record. The first 50 mixtures are skipped here.
    # NOTE(review): presumably the first 50 mixtures form the validation
    # subset and the noise is three equally long noise types - confirm.
    noise_beg_ind = np.array(noise_beg_ind)
    noise_beg_ind = noise_beg_ind[50:]
    ind_noise_1 = noise_beg_ind < noise_total_len // 3
    ind_noise_3 = noise_beg_ind > 2 * noise_total_len // 3
    # Complement of the two boolean masks; this form yields an integer 0/1
    # array, kept as is so the saved file format does not change.
    ind_noise_2 = 1 - (ind_noise_1 + ind_noise_3)

    # Make sure the target folder exists even if no mixture was recorded
    os.makedirs('data', exist_ok=True)
    np.savez('data/noise_ind.npz',
             ind_noise_1=ind_noise_1,
             ind_noise_2=ind_noise_2,
             ind_noise_3=ind_noise_3)

    return
def validation(params, val_sdr_path='outputs/val_sdr.npz'):
    """Grid-search the gradient descent algorithm on the validation subset.

    Every mixture stored under 'data/SNR_<iSNR>/<mix index>/' is processed
    with each (beta, step size) pair of the grids, and the SDR over
    iterations returned by the algorithm is collected and saved.

    Args:
        params: dictionary with fields:
            'sample_rate': int - the sampling frequency
            'n_mix': int - the number of mixtures to process
            'max_iter': int - the number of iterations of the proposed algorithm
            'input_SNR_list': list - the list of input SNRs to consider
            'grad_step_range': numpy array - the step size grid
            'beta_range': numpy array - the beta-divergence parameter grid
            'hop_length': int - the hop size of the STFT
            'win_length': int - the window length
            'n_fft': int - the number of FFT points
            'win_type': string - the STFT window type (e.g., Hann, Hamming, Blackman...)
        val_sdr_path: string - the path where to store the validation SDR

    The saved file holds one array 'sdr' of shape
    (max_iter + 1, n_grad, n_beta, 2, 2, n_isnr, n_mix); the two axes of
    size 2 index d (1, 2) and the problem ("right", "left") respectively.
    """
    # Grid sizes
    snr_list = params['input_SNR_list']
    n_isnr = len(snr_list)
    n_grad = params['grad_step_range'].shape[0]
    n_beta = params['beta_range'].shape[0]
    n_iter = params['max_iter']
    n_mix = params['n_mix']

    # One SDR curve (initial estimate + every iteration) per configuration
    sdr_val = np.zeros((n_iter + 1, n_grad, n_beta, 2, 2, n_isnr, n_mix))

    # Output key of bregmisi_all -> (d axis index, right/left axis index)
    sdr_slots = (
        ('sdr_1r', 0, 0),
        ('sdr_2r', 1, 0),
        ('sdr_1l', 0, 1),
        ('sdr_2l', 1, 1),
    )

    for index_isnr, isnr in enumerate(snr_list):
        for index_mix in range(n_mix):

            # Time-domain signals and STFT of the mixture
            audio_path = f'data/SNR_{isnr}/{index_mix}/'
            src_ref, mix = load_src(audio_path, params['sample_rate'])
            stft_opts = {
                key: params[key]
                for key in ('n_fft', 'hop_length', 'win_length', 'win_type')
            }
            mix_stft = my_stft(mix, **stft_opts)[:, :, 0]

            # Magnitude spectrogram estimates
            spectro_mag = estim_spectro_from_mix(mix)

            # Sweep the (beta, step size) grid
            for index_b, b in enumerate(params['beta_range']):
                for index_g, g in enumerate(params['grad_step_range']):
                    print(f'iSNR {index_isnr + 1} / {n_isnr}'
                          f' -- Mix {index_mix + 1} / {n_mix}'
                          f' -- Beta {index_b + 1} / {n_beta}'
                          f' -- Step size {index_g + 1} / {n_grad}')

                    # Gradient descent for d=1,2 and for the "right" and
                    # "left" problems, all with the same step size g
                    out = bregmisi_all(mix_stft, spectro_mag,
                                       src_ref=src_ref,
                                       win_length=stft_opts['win_length'],
                                       hop_length=stft_opts['hop_length'],
                                       win_type=stft_opts['win_type'],
                                       beta=b,
                                       grad_step=g * np.ones((2, 2)),
                                       max_iter=params['max_iter'])

                    # Keep the SDR over iterations
                    for key, i_d, i_direc in sdr_slots:
                        sdr_val[:, index_g, index_b, i_d, i_direc,
                                index_isnr, index_mix] = out[key]

    # Save results
    np.savez(val_sdr_path, sdr=sdr_val)

    return
def misi(mix_stft, spectro_mag, win_length=None, hop_length=None, src_ref=None,
         max_iter=20, win_type='hann'):
    """Multiple input spectrogram inversion (MISI) for source separation.

    Starting from the amplitude-mask estimate, each iteration takes the STFT
    of the current sources, imposes the target magnitudes, spreads the
    mixing error equally over the sources and goes back to the time domain.

    Args:
        mix_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectro_mag: numpy.ndarray (nfreqs, nframes, nrsc) - the target
            sources' magnitude spectrograms
        win_length: int - the window length; defaults to the number of FFT
            points, i.e. 2 * (nfreqs - 1)
        hop_length: int - the hop size of the STFT; defaults to
            win_length // 2
        src_ref: numpy.ndarray (nsamples, nrsc) - reference sources for
            computing the SDR over iterations (no SDR if None)
        max_iter: int - number of iterations
        win_type: string - window type

    Returns:
        src_est: numpy.ndarray (nsamples, nrsc) - the time-domain estimated
            sources
        error: list (max_iter) - magnitude mismatch measured at the start of
            each iteration
        sdr: list - score of the initial estimate followed by the score
            after each iteration (max_iter + 1 values); empty if src_ref is
            None
    """
    # Sizes and default analysis parameters
    n_src = spectro_mag.shape[2]
    n_fft = 2 * (spectro_mag.shape[0] - 1)
    win_length = n_fft if win_length is None else win_length
    hop_length = win_length // 2 if hop_length is None else hop_length
    frame_opts = {'win_length': win_length,
                  'hop_length': hop_length,
                  'win_type': win_type}

    track_sdr = src_ref is not None
    error, sdr = [], []

    # Initialization with amplitude mask
    src_est = amplitude_mask(spectro_mag, mix_stft, **frame_opts)
    if track_sdr:
        sdr.append(get_score(src_ref, src_est))

    for _ in range(max_iter):
        # STFT of the current estimates and its magnitude
        stft_est = my_stft(src_est, n_fft=n_fft, **frame_opts)
        current_magnitude = np.abs(stft_est)

        # Magnitude mismatch of the current estimates
        error.append(np.linalg.norm(current_magnitude - spectro_mag))

        # Normalize to the target amplitude (epsilon guards against a
        # division by zero)
        stft_est = stft_est * spectro_mag / (current_magnitude
                                             + sys.float_info.epsilon)

        # Compute the mixing error and give each source an equal share
        residual = mix_stft - np.sum(stft_est, axis=2)
        stft_est += np.repeat(residual[:, :, np.newaxis], n_src,
                              axis=2) / n_src

        # Inverse STFT
        src_est = my_istft(stft_est, **frame_opts)

        # BSS score
        if track_sdr:
            sdr.append(get_score(src_ref, src_est))

    return src_est, error, sdr
def testing(params, test_sdr_path='outputs/test_sdr.npz'):
    """Run the proposed algorithm and the MISI and AM baselines on the test subset.

    The test mixtures are read from 'data/SNR_<iSNR>/<index>/' with
    index = index_mix + params['n_mix'], i.e. the first params['n_mix']
    mixtures are skipped (they are used for validation). The estimates of
    the amplitude mask ('am_'), MISI ('misi_') and of the gradient descent
    at beta = 1.25, d = 2, "left" ('gd_') are recorded next to the mixture.

    Args:
        params: dictionary with fields:
            'sample_rate': int - the sampling frequency
            'n_mix': int - the number of mixtures to process
            'max_iter': int - the number of iterations of the proposed algorithm
            'input_SNR_list': list - the list of input SNRs to consider
            'beta_range': numpy array - the beta-divergence parameter grid
            'hop_length': int - the hop size of the STFT
            'win_length': int - the window length
            'n_fft': int - the number of FFT points
            'win_type': string - the STFT window type (e.g., Hann, Hamming, Blackman...)
        test_sdr_path: string - the path where to store the test SDR

    The optimal step sizes are read from the 'gd_step' array of
    'outputs/val_gd_step.npz', indexed as [beta, :, :, iSNR]. The saved file
    holds 'sdr_am' and 'sdr_misi' of shape (n_isnr, n_mix) and 'sdr_gd' of
    shape (n_beta, 2, 2, n_isnr, n_mix).
    """
    # Define some parameters and initialize the SDR arrays
    n_isnr = len(params['input_SNR_list'])
    n_mix = params['n_mix']
    n_beta = params['beta_range'].shape[0]
    sdr_am = np.zeros((n_isnr, n_mix))
    sdr_misi = np.zeros((n_isnr, n_mix))
    sdr_gd = np.zeros((n_beta, 2, 2, n_isnr, n_mix))

    # Load the optimal step sizes from validation
    gd_step_opt = np.load('outputs/val_gd_step.npz')['gd_step']

    # Loop over iSNRs, mixtures and parameters
    for index_isnr, isnr in enumerate(params['input_SNR_list']):
        for index_mix in range(n_mix):

            # Load data (skip the first n_mix mixtures, which are the
            # validation ones)
            audio_path = ('data/SNR_' + str(isnr) + '/'
                          + str(index_mix + n_mix) + '/')
            src_ref, mix = load_src(audio_path, params['sample_rate'])
            mix_stft = my_stft(mix,
                               n_fft=params['n_fft'],
                               hop_length=params['hop_length'],
                               win_length=params['win_length'],
                               win_type=params['win_type'])[:, :, 0]

            # Estimate the magnitude spectrograms
            spectro_mag = estim_spectro_from_mix(mix)

            # Amplitude mask
            src_est_am = amplitude_mask(spectro_mag, mix_stft,
                                        win_length=params['win_length'],
                                        hop_length=params['hop_length'],
                                        win_type=params['win_type'])
            sdr_am[index_isnr, index_mix] = get_score(src_ref, src_est_am)
            record_src(audio_path + 'am_', src_est_am, params['sample_rate'])

            # MISI - the window type is forwarded so that this baseline uses
            # the same analysis window as the other methods instead of
            # silently falling back to the default one
            src_est_misi = misi(mix_stft, spectro_mag,
                                win_length=params['win_length'],
                                hop_length=params['hop_length'],
                                max_iter=params['max_iter'],
                                win_type=params['win_type'])[0]
            sdr_misi[index_isnr, index_mix] = get_score(src_ref, src_est_misi)
            record_src(audio_path + 'misi_', src_est_misi,
                       params['sample_rate'])

            # Gradient descent
            for index_b, b in enumerate(params['beta_range']):
                # The total number of betas comes from the grid itself
                print('iSNR ' + str(index_isnr + 1) + ' / ' + str(n_isnr)
                      + ' -- Mix ' + str(index_mix + 1) + ' / ' + str(n_mix)
                      + ' -- Beta ' + str(index_b + 1) + ' / ' + str(n_beta))

                # Get the optimal step size(s) for this beta / iSNR
                my_steps = gd_step_opt[index_b, :, :, index_isnr]

                # Run the gradient descent algorithm for d=1,2 and for the
                # "right" and "left" problems
                out = bregmisi_all(mix_stft, spectro_mag,
                                   src_ref=src_ref,
                                   win_length=params['win_length'],
                                   hop_length=params['hop_length'],
                                   win_type=params['win_type'],
                                   beta=b,
                                   grad_step=my_steps,
                                   max_iter=params['max_iter'])

                # Store the SDR
                sdr_gd[index_b, 0, 0, index_isnr, index_mix] = get_score(
                    src_ref, out['src_est_1r'])
                sdr_gd[index_b, 1, 0, index_isnr, index_mix] = get_score(
                    src_ref, out['src_est_2r'])
                sdr_gd[index_b, 0, 1, index_isnr, index_mix] = get_score(
                    src_ref, out['src_est_1l'])
                sdr_gd[index_b, 1, 1, index_isnr, index_mix] = get_score(
                    src_ref, out['src_est_2l'])

                # Record in the nice setting (beta=1.25 d=2, left); compare
                # with a tolerance since beta comes from a float grid
                if np.isclose(b, 1.25):
                    record_src(audio_path + 'gd_', out['src_est_2l'],
                               params['sample_rate'])

    # Save results
    np.savez(test_sdr_path, sdr_am=sdr_am, sdr_misi=sdr_misi, sdr_gd=sdr_gd)

    return
def bregmisi(mix_stft, spectro, win_length=None, hop_length=None,
             win_type='hann', src_ref=None, beta=2., d=1, grad_step=1e-3,
             direc='right', max_iter=20, eps=1e-8):
    """Gradient descent algorithm for phase recovery in audio source separation.

    Starting from the amplitude-mask estimate, each iteration takes the STFT
    of the current sources, performs one gradient step in the
    time-frequency domain, spreads the mixing error equally over the
    sources and goes back to the time domain.

    Args:
        mix_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectro: numpy.ndarray (nfreqs, nframes, nrsc) - the target sources'
            magnitude (d=1) or power (d=2) spectrograms
        win_length: int - the window length; defaults to the number of FFT
            points, i.e. 2 * (nfreqs - 1)
        hop_length: int - the hop size of the STFT; defaults to
            win_length // 2
        win_type: string - window type
        src_ref: numpy.ndarray (nsamples, nrsc) - reference sources for
            computing the SDR over iterations (no SDR if None)
        beta: float - parameter of the beta-divergence
        d: int - magnitude (1) or power (2) measurements
        grad_step: float - step size for the gradient descent
        direc: string - the problem formulation ("right" or "left"), handed
            over unchanged to grad_beta_eps
        max_iter: int - number of iterations
        eps: float - small ridge added to the loss for avoiding numerical
            issues

    Returns:
        src_est: numpy.ndarray (nsamples, nrsc) - the time-domain estimated
            sources
        sdr: list - score of the initial estimate followed by the score
            after each iteration (max_iter + 1 values); empty if src_ref is
            None
    """
    # Sizes and default analysis parameters
    n_src = spectro.shape[2]
    n_fft = 2 * (spectro.shape[0] - 1)
    win_length = n_fft if win_length is None else win_length
    hop_length = win_length // 2 if hop_length is None else hop_length
    frame_opts = {'win_length': win_length,
                  'hop_length': hop_length,
                  'win_type': win_type}

    track_sdr = src_ref is not None
    sdr = []

    # Initialization with amplitude mask (always fed with magnitudes, hence
    # the 1 / d power)
    src_est = amplitude_mask(np.power(spectro, 1 / d), mix_stft, **frame_opts)
    if track_sdr:
        sdr.append(get_score(src_ref, src_est))

    # Loop over iterations
    for _ in range(max_iter):
        # Get the STFTs
        stft_est = my_stft(src_est, n_fft=n_fft, **frame_opts)

        # Gradient descent in the TF domain; eps also smooths |stft|^(d - 2)
        G = grad_beta_eps(stft_est, spectro, d, beta, direc, eps)
        smoothed_power = (np.abs(stft_est) ** 2 + eps) ** (d / 2 - 1)
        breg_grad = d * (stft_est * smoothed_power * G)
        stft_est -= grad_step * breg_grad

        # Compute the mixing error and give each source an equal share
        residual = mix_stft - np.sum(stft_est, axis=2)
        corrected_stft = stft_est + np.repeat(
            residual[:, :, np.newaxis], n_src, axis=2) / n_src

        # Back to time domain and score
        src_est = my_istft(corrected_stft, **frame_opts)
        if track_sdr:
            sdr.append(get_score(src_ref, src_est))

    return src_est, sdr