# Example #1
def misi(mixture_stft, spectrograms_target, win_length, hop_length=None, src_ref=None, max_iter=15, win_type='hann'):
    """The multiple input spectrogram inversion algorithm for source separation.

    Args:
        mixture_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectrograms_target: numpy.ndarray (nfreqs, nframes, nrsc) - the target sources' magnitude spectrograms
        win_length: int - the window length
        hop_length: int - the hop size of the STFT (defaults to win_length // 2 when None)
        src_ref: numpy.ndarray (nsamples, nrsc) - reference sources for computing the SDR over iterations
            (when None, no score is computed)
        max_iter: int - number of iterations
        win_type: string - window type
    Returns:
        estimated_sources: numpy.ndarray (nsamples, nrsc) - the time-domain estimated sources
        error: list (max_iter) - loss function (magnitude mismatch) over iterations
        sdr: list (max_iter) - score (SI-SDR in dB) over iterations; empty when src_ref is None
    """
    if hop_length is None:
        hop_length = win_length // 2

    # The score is only tracked when reference sources are available
    compute_sdr = src_ref is not None

    # Parameters
    number_sources = spectrograms_target.shape[2]
    n_fft = (spectrograms_target.shape[0] - 1) * 2

    # Score and error over iterations
    sdr = []
    error = []

    # Initialize the time domain estimates with the mixture, split evenly between the sources
    mixture_time = my_istft(mixture_stft, hop_length=hop_length, win_length=win_length, win_type=win_type)
    estimated_sources = np.repeat(mixture_time[:, np.newaxis], number_sources, axis=1) / number_sources

    for _ in range(max_iter):
        # STFT of the current estimates
        stft_reest = my_stft(estimated_sources, n_fft=n_fft, hop_length=hop_length, win_length=win_length,
                             win_type=win_type)
        # Magnitude of the estimates entering this iteration (used for the error below)
        current_magnitude = np.abs(stft_reest)
        # Compute the mixing error and distribute it evenly over the sources (broadcast on the last axis)
        mixing_error = mixture_stft - np.sum(stft_reest, axis=2)
        corrected_stft = stft_reest + mixing_error[:, :, np.newaxis] / number_sources
        # Normalize to the target amplitude (epsilon avoids a division by zero)
        stft_estim = corrected_stft * spectrograms_target / (np.abs(corrected_stft) + sys.float_info.epsilon)
        # Inverse STFT
        estimated_sources = my_istft(stft_estim, win_length=win_length, hop_length=hop_length, win_type=win_type)
        # BSS score
        if compute_sdr:
            sdr.append(get_separation_score(src_ref, estimated_sources))
        # Error
        error.append(np.linalg.norm(current_magnitude - spectrograms_target))

    return estimated_sources, error, sdr
# Example #2
def omisi(mixture_stft, spectrograms_target, win_length, hop_length=None, init_method='mix', future_frames=1, src_ref=None, phase_true=None, max_iter=5, win_type='hann'):
    """The online multiple input spectrogram inversion algorithm for source separation.

    Args:
        mixture_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectrograms_target: numpy.ndarray (nfreqs, nframes, nrsc) - the target sources' magnitude spectrograms
        win_length: int - the window length
        hop_length: int - the hop size of the STFT (defaults to win_length // 2 when None)
        init_method: string - phase initialization method for each new frame:
            'mix' (phase of the mixture), 'sinus' (previous phase advanced by the normalized
            frequencies), or 'true' (taken from phase_true)
        future_frames: int - number of future frames to account for (0 delegates to omisi_fast)
        src_ref: numpy.ndarray (nsamples, nrsc) - reference sources for computing the score
        phase_true: numpy.ndarray (nfreqs, nframes, nrsc) - the ground truth sources' phase
            (only read when init_method is 'true')
        max_iter: int - number of iterations per frame
        win_type: string - window type
    Returns:
        estimated_sources: numpy.ndarray (nsamples, nrsc) - the time-domain estimated sources
        sdr: the output of get_separation_score for the final estimates, or an empty list
            when src_ref is None
    Raises:
        ValueError: if init_method is not 'mix', 'sinus' or 'true'
    """
    # For a null look-ahead, use a slightly faster version of oMISI
    if future_frames == 0:
        estimated_sources, sdr = omisi_fast(mixture_stft, spectrograms_target, win_length, hop_length, init_method,
                                            src_ref, phase_true, max_iter, win_type)
        return estimated_sources, sdr

    if hop_length is None:
        hop_length = win_length // 2

    # Parameters
    n_freqs, n_frames, nsrc = spectrograms_target.shape
    n_fft = (n_freqs - 1) * 2

    # Pre allocate proper size time domain signals
    expected_signal_len = win_length + hop_length * (n_frames - 1)
    estimated_sources = np.zeros((expected_signal_len, nsrc))
    s_current = np.zeros((win_length + future_frames * hop_length, nsrc))

    # Initial phase allocation: the mixture phase of the first frames, copied for every source
    phase_current = np.repeat(np.angle(mixture_stft[:, 0:future_frames])[:, :, np.newaxis], nsrc, axis=2)

    # Loop over time frames
    for i in range(n_frames - future_frames):

        sample_start = i * hop_length
        mag = spectrograms_target[:, i:i+future_frames+1, :]

        # Initialization of the new frame; every branch yields an (n_freqs, 1, nsrc) array so that
        # it can be concatenated with phase_current along the frame axis
        if init_method == 'mix':
            phase_new = np.repeat(np.angle(mixture_stft[:, i+future_frames])[:, np.newaxis, np.newaxis], nsrc, axis=2)
        elif init_method == 'sinus':
            phase_new = phase_current[:, -1, :] + get_normalized_frequencies_multi_sources(mag[:, -1, :]) * 2 * np.pi * hop_length
            phase_new = phase_new.reshape((n_freqs, 1, nsrc))
        elif init_method == 'true':
            phase_new = phase_true[:, i+future_frames, :].reshape((n_freqs, 1, nsrc))
        else:
            raise ValueError('Unknown initialization scheme')

        phase_ini = np.concatenate((phase_current, phase_new), axis=1)
        Y_dft_corrected = mag * np.exp(1j * phase_ini)
        # The initial estimate already has the target magnitude; keeping it here lets the
        # frame update below work even when max_iter is 0
        Y_dft_norm = Y_dft_corrected

        # partial iSTFT
        s_wind = my_istft(Y_dft_corrected, win_length=win_length, hop_length=hop_length, win_type=win_type)

        # Overlap add
        s_ola = s_current + s_wind

        for _ in range(max_iter):
            # partial STFT (same window type as the iSTFT calls)
            Y_dft = my_stft(s_ola, win_length=win_length, hop_length=hop_length, n_fft=n_fft, win_type=win_type)
            # Compute the mixing error and distribute it evenly over the sources
            mixing_error = mixture_stft[:, i:i+future_frames+1] - np.sum(Y_dft, axis=2)
            Y_dft_corrected = Y_dft + mixing_error[:, :, np.newaxis] / nsrc
            # Normalize to the target magnitude (GL)
            Y_dft_norm = mag * np.exp(1j * np.angle(Y_dft_corrected))
            # partial iSTFT
            s_wind = my_istft(Y_dft_norm, win_length=win_length, hop_length=hop_length, win_type=win_type)
            # Overlap add with the previous exited frame
            s_ola = s_current + s_wind

        # Drop the oldest frame: its phase is no longer updated
        phase_current = np.angle(Y_dft_corrected)[:, 1:, :]
        estimated_sources[sample_start:(sample_start + win_length + future_frames * hop_length), :] = s_ola
        # Update the current fixed segment (ignore the future frames but account for the overlapped past frames)
        s_partial = my_istft(np.reshape(Y_dft_norm[:, 0, :], (n_freqs, 1, nsrc)), win_length=win_length,
                             hop_length=hop_length, win_type=win_type)
        s_current = np.concatenate((s_current[hop_length:win_length, :] + s_partial[hop_length:, :],
                                    np.zeros(((future_frames + 1) * hop_length, nsrc))))

    # Score of the final estimates (computed once, after all frames are processed)
    sdr = []
    if src_ref is not None:
        sdr = get_separation_score(src_ref, estimated_sources)

    return estimated_sources, sdr
# Example #3
def main(folders, parameters):
    """The main function that benchmarks all algorithm over the dataset
        folders: dict with fields:
            'data': the dataset path
            'speakers': the name of the folder corresponding to the chosen speaker pair
            'outputs': the folder where the outputs (audio files, metrics, models) are stored
        parameters: dict with audio parameters:
            'sample_rate': int - the sample rate
            'win_length': int - the window length for the STFT
            'hop_length': int - the hop size of the STFT
            'n_fft': int - number of FFT points
            'fs': int - sample rate
            'win_type': string - window type
    # Get parameters
    n_fft = parameters['n_fft']
    hop_length = parameters['hop_length']
    win_length = parameters['win_length']
    fs = parameters['sample_rate']
    win_type = parameters['win_type']

    # Create an object for each speaker to get the file list. Test mixtures are created by summing files from the lists
    test_data_1 = HINT_audio_handler(folders['data'], [folders['speakers'][0]],
                                     'test', parameters['sample_rate'])
    test_data_2 = HINT_audio_handler(folders['data'], [folders['speakers'][1]],
                                     'test', parameters['sample_rate'])

    # Number of sentences per speaker
    list_range = 2

    # Pre-allocate score
    metric_omisi = np.zeros([5, list_range**2])
    metric_misi = np.zeros([15, 2, list_range**2])

    # Loop over testing dataset
    ic = 0
    for file_num_1 in np.arange(list_range):
        for file_num_2 in np.arange(list_range, list_range * 2):
            # Load the test data for each sentence
            audio_in_1, audio_name_1 = test_data_1.get_file_from_list(
            audio_in_2, audio_name_2 = test_data_2.get_file_from_list(

            # Adjust to the same length and stack in an array
            min_len = min(len(audio_in_1), len(audio_in_2))
            audio_in_1 = audio_in_1[:min_len]
            audio_in_2 = audio_in_2[:min_len]
            src_ref = np.stack((audio_in_1, audio_in_2), axis=1)

            # STFTs
            src_ref_stft = my_stft(src_ref,
            mixture_stft = np.sum(src_ref_stft, axis=2)
            spectro_ref = np.abs(src_ref_stft)

            # iSTFT (for having proper time domain size)
            src_ref = my_istft(src_ref_stft,

            # Create the folder to record audio files
            audio_1 = audio_name_1[audio_name_1.find('L'):][:-4]
            audio_2 = audio_name_2[audio_name_2.find('L'):][:-4]
            audio_folder_path = os.path.join(folders['outputs'], 'audio_files',
                                             audio_1 + '_' + audio_2)
            if not os.path.isdir(audio_folder_path):

            # Separation algorithms
            sdr_omisi, sdr_misi, err_misi =\
                apply_separation_algos(mixture_stft, spectro_ref, src_ref, audio_folder_path, win_length=win_length,
                                       hop_length=hop_length, max_iter=15, fs=fs, win_type=win_type)

            # Record score
            metric_omisi[:, ic] = sdr_omisi
            metric_misi[:, 0, ic] = sdr_misi
            metric_misi[:, 1, ic] = err_misi

            ic += 1

    np.savez(folders['outputs'] + '/metrics.npz',