def misi(mixture_stft, spectrograms_target, win_length, hop_length=None, src_ref=None, max_iter=15,
         win_type='hann'):
    """The multiple input spectrogram inversion (MISI) algorithm for source separation.

    Args:
        mixture_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectrograms_target: numpy.ndarray (nfreqs, nframes, nsrc) - the target sources' magnitude spectrograms
        win_length: int - the window length
        hop_length: int - the hop size of the STFT
        src_ref: numpy.ndarray (nsamples, nsrc) - reference sources for computing the SDR over iterations
        max_iter: int - number of iterations
        win_type: string - window type

    Returns:
        estimated_sources: numpy.ndarray (nsamples, nsrc) - the time-domain estimated sources
        error: list (max_iter) - loss function (magnitude mismatch) over iterations
        sdr: list (max_iter) - score (SI-SDR in dB) over iterations
    """
    if hop_length is None:
        hop_length = win_length // 2

    compute_sdr = src_ref is not None

    # Parameters
    number_sources = spectrograms_target.shape[2]
    n_fft = (spectrograms_target.shape[0] - 1) * 2

    # Pre-allocate the SDR and error lists
    sdr = []
    error = []

    # Initialize the time-domain estimates with the mixture, split equally among the sources
    mixture_time = my_istft(mixture_stft, hop_length=hop_length, win_length=win_length, win_type=win_type)
    estimated_sources = np.repeat(mixture_time[:, np.newaxis], number_sources, axis=1) / number_sources

    for iteration_number in range(max_iter):
        # STFT of the current estimates
        stft_reest = my_stft(estimated_sources, n_fft=n_fft, hop_length=hop_length, win_length=win_length,
                             win_type=win_type)
        current_magnitude = np.abs(stft_reest)

        # Compute and distribute the mixing error
        mixing_error = mixture_stft - np.sum(stft_reest, axis=2)
        corrected_stft = stft_reest + np.repeat(mixing_error[:, :, np.newaxis], number_sources,
                                                axis=2) / number_sources

        # Normalize to the target magnitude
        stft_estim = corrected_stft * spectrograms_target / (np.abs(corrected_stft) + sys.float_info.epsilon)

        # Inverse STFT
        estimated_sources = my_istft(stft_estim, win_length=win_length, hop_length=hop_length, win_type=win_type)

        # BSS score
        if compute_sdr:
            sdr.append(get_separation_score(src_ref, estimated_sources))

        # Magnitude mismatch error
        error.append(np.linalg.norm(current_magnitude - spectrograms_target))

    return estimated_sources, error, sdr
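
# Hypothetical usage sketch (not part of the original code): it shows how misi() can be called
# with oracle magnitudes built from reference sources. It relies on the module-level imports
# (numpy as np) and on my_stft()/misi() defined in this file; the `sources` argument, the
# default window length, and n_fft = win_length are illustrative assumptions.
def _demo_misi(sources, win_length=1024, hop_length=None, n_fft=None):
    """Hypothetical example: build a mixture with oracle magnitudes and invert it with MISI."""
    if hop_length is None:
        hop_length = win_length // 2
    if n_fft is None:
        n_fft = win_length
    # Oracle magnitudes: STFT each reference source and keep its magnitude
    src_stft = my_stft(sources, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mixture_stft = np.sum(src_stft, axis=2)
    spectro_target = np.abs(src_stft)
    # Run MISI; passing src_ref also yields the SI-SDR over iterations
    estimates, error, sdr = misi(mixture_stft, spectro_target, win_length, hop_length=hop_length,
                                 src_ref=sources, max_iter=15)
    return estimates, error, sdr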

def omisi(mixture_stft, spectrograms_target, win_length, hop_length=None, init_method='mix', future_frames=1,
          src_ref=None, phase_true=None, max_iter=5, win_type='hann'):
    """The online multiple input spectrogram inversion (oMISI) algorithm for source separation.

    Args:
        mixture_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectrograms_target: numpy.ndarray (nfreqs, nframes, nsrc) - the target sources' magnitude spectrograms
        win_length: int - the window length
        hop_length: int - the hop size of the STFT
        init_method: string - phase initialization method ('mix', 'sinus', or 'true')
        future_frames: int - number of future frames to account for (look-ahead)
        src_ref: numpy.ndarray (nsamples, nsrc) - reference sources for computing the SDR over iterations
        phase_true: numpy.ndarray (nfreqs, nframes, nsrc) - the ground truth sources' phase (for the ideal phase mask)
        max_iter: int - number of iterations
        win_type: string - window type

    Returns:
        estimated_sources: numpy.ndarray (nsamples, nsrc) - the time-domain estimated sources
        sdr: list (max_iter) - score (SI-SDR in dB) over iterations
    """
    # For a null look-ahead, use a slightly faster version of oMISI
    if future_frames == 0:
        estimated_sources, sdr = omisi_fast(mixture_stft, spectrograms_target, win_length, hop_length, init_method,
                                            src_ref, phase_true, max_iter, win_type)
        return estimated_sources, sdr

    if hop_length is None:
        hop_length = win_length // 2

    # Parameters
    n_freqs, n_frames, nsrc = spectrograms_target.shape
    n_fft = (n_freqs - 1) * 2

    # Pre-allocate the time-domain signals with their proper size
    expected_signal_len = win_length + hop_length * (n_frames - 1)
    estimated_sources = np.zeros((expected_signal_len, nsrc))
    s_current = np.zeros((win_length + future_frames * hop_length, nsrc))

    # Initial phase allocation
    phase_current = np.repeat(np.angle(mixture_stft[:, 0:future_frames])[:, :, np.newaxis], nsrc, axis=2)

    # Loop over time frames
    for i in range(n_frames - future_frames):
        sample_start = i * hop_length
        mag = spectrograms_target[:, i:i + future_frames + 1, :]

        # Initialization of the new frame
        if init_method == 'mix':
            phase_new = np.repeat(np.angle(mixture_stft[:, i + future_frames])[:, np.newaxis, np.newaxis], nsrc,
                                  axis=2)
        elif init_method == 'sinus':
            phase_new = phase_current[:, -1, :] + get_normalized_frequencies_multi_sources(mag[:, -1, :]) \
                        * 2 * np.pi * hop_length
            phase_new = phase_new.reshape((n_freqs, 1, nsrc))
        elif init_method == 'true':
            phase_new = phase_true[:, i + future_frames:i + future_frames + 1, :]
        else:
            raise ValueError('Unknown initialization scheme')

        phase_ini = np.concatenate((phase_current, phase_new), axis=1)
        Y_dft_corrected = mag * np.exp(1j * phase_ini)

        # Partial iSTFT
        s_wind = my_istft(Y_dft_corrected, win_length=win_length, hop_length=hop_length, win_type=win_type)

        # Overlap-add
        s_ola = s_current + s_wind

        for iteration_number in range(max_iter):
            # Partial STFT
            Y_dft = my_stft(s_ola, win_length=win_length, hop_length=hop_length, n_fft=n_fft, win_type=win_type)

            # Compute and distribute the mixing error
            mixing_error = mixture_stft[:, i:i + future_frames + 1] - np.sum(Y_dft, axis=2)
            Y_dft_corrected = Y_dft + np.repeat(mixing_error[:, :, np.newaxis], nsrc, axis=2) / nsrc

            # Normalize to the target magnitude (Griffin-Lim step)
            Y_dft_norm = mag * np.exp(1j * np.angle(Y_dft_corrected))

            # Partial iSTFT
            s_wind = my_istft(Y_dft_norm, win_length=win_length, hop_length=hop_length, win_type=win_type)

            # Overlap-add with the already committed past samples
            s_ola = s_current + s_wind

        phase_current = np.angle(Y_dft_corrected)[:, 1:, :]

        estimated_sources[sample_start:(sample_start + win_length + future_frames * hop_length), :] = s_ola

        # Update the current fixed segment (ignore the future frames but account for the overlapped past frames)
        s_partial = my_istft(np.reshape(Y_dft_norm[:, 0, :], (n_freqs, 1, nsrc)), win_length=win_length,
                             hop_length=hop_length, win_type=win_type)
        s_current = np.concatenate((s_current[hop_length:win_length, :] + s_partial[hop_length:, :],
                                    np.zeros(((future_frames + 1) * hop_length, nsrc))))

    sdr = []
    if src_ref is not None:
        sdr = get_separation_score(src_ref, estimated_sources)

    return estimated_sources, sdr
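
# Hypothetical usage sketch (not part of the original code): it runs omisi() with a one-frame
# look-ahead and compares the 'mix' and 'sinus' phase initializations. The inputs are assumed
# to follow the shapes documented in the docstring above.
def _demo_omisi(mixture_stft, spectro_target, src_ref, win_length=1024):
    """Hypothetical example: oMISI with a one-frame look-ahead and two phase initializations."""
    results = {}
    for init_method in ('mix', 'sinus'):
        estimates, sdr = omisi(mixture_stft, spectro_target, win_length,
                               init_method=init_method, future_frames=1,
                               src_ref=src_ref, max_iter=5)
        results[init_method] = (estimates, sdr)
    return results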

def main(folders, parameters):
    """The main function that benchmarks all algorithms over the dataset.

    Args:
        folders: dict with fields:
            'data': the dataset path
            'speakers': the name of the folder corresponding to the chosen speaker pair
            'outputs': the folder where the outputs (audio files, metrics, models) are stored
        parameters: dict with audio parameters:
            'sample_rate': int - the sample rate
            'win_length': int - the window length for the STFT
            'hop_length': int - the hop size of the STFT
            'n_fft': int - number of FFT points
            'win_type': string - window type
    """
    # Get parameters
    n_fft = parameters['n_fft']
    hop_length = parameters['hop_length']
    win_length = parameters['win_length']
    fs = parameters['sample_rate']
    win_type = parameters['win_type']

    # Create an object for each speaker to get the file list.
    # Test mixtures are created by summing files from the lists.
    test_data_1 = HINT_audio_handler(folders['data'], [folders['speakers'][0]], 'test', parameters['sample_rate'])
    test_data_2 = HINT_audio_handler(folders['data'], [folders['speakers'][1]], 'test', parameters['sample_rate'])
    test_data_1.get_files_list()
    test_data_2.get_files_list()

    # Number of sentences per speaker
    list_range = 2

    # Pre-allocate the scores
    metric_omisi = np.zeros([5, list_range ** 2])
    metric_misi = np.zeros([15, 2, list_range ** 2])

    # Loop over the test dataset
    ic = 0
    for file_num_1 in np.arange(list_range):
        for file_num_2 in np.arange(list_range, list_range * 2):

            # Load the test data for each sentence
            audio_in_1, audio_name_1 = test_data_1.get_file_from_list(file_num_1)
            audio_in_2, audio_name_2 = test_data_2.get_file_from_list(file_num_2)

            # Adjust to the same length and stack in an array
            min_len = min(len(audio_in_1), len(audio_in_2))
            audio_in_1 = audio_in_1[:min_len]
            audio_in_2 = audio_in_2[:min_len]
            src_ref = np.stack((audio_in_1, audio_in_2), axis=1)

            # STFTs
            src_ref_stft = my_stft(src_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
            mixture_stft = np.sum(src_ref_stft, axis=2)
            spectro_ref = np.abs(src_ref_stft)

            # iSTFT (to get a consistent time-domain size)
            src_ref = my_istft(src_ref_stft, hop_length=hop_length, win_length=win_length)

            # Create the folder to record audio files
            audio_1 = audio_name_1[audio_name_1.find('L'):][:-4]
            audio_2 = audio_name_2[audio_name_2.find('L'):][:-4]
            audio_folder_path = os.path.join(folders['outputs'], 'audio_files', audio_1 + '_' + audio_2)
            if not os.path.isdir(audio_folder_path):
                os.makedirs(audio_folder_path)

            # Separation algorithms
            sdr_omisi, sdr_misi, err_misi = \
                apply_separation_algos(mixture_stft, spectro_ref, src_ref, audio_folder_path, win_length=win_length,
                                       hop_length=hop_length, max_iter=15, fs=fs, win_type=win_type)

            # Record the scores
            metric_omisi[:, ic] = sdr_omisi
            metric_misi[:, 0, ic] = sdr_misi
            metric_misi[:, 1, ic] = err_misi
            ic += 1

    np.savez(os.path.join(folders['outputs'], 'metrics.npz'), metric_omisi=metric_omisi, metric_misi=metric_misi)
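
# Hypothetical entry point (not part of the original code) illustrating the dicts that main()
# expects; the dataset path, speaker folder names, and STFT settings below are placeholders,
# not values taken from the original project.
if __name__ == '__main__':
    folders = {'data': 'path/to/HINT/dataset',          # placeholder dataset root
               'speakers': ['speaker_1', 'speaker_2'],  # placeholder speaker pair folders
               'outputs': 'outputs'}                    # where metrics and audio files are written
    parameters = {'sample_rate': 16000,
                  'win_length': 1024,
                  'hop_length': 256,
                  'n_fft': 1024,
                  'win_type': 'hann'}
    main(folders, parameters)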