Example #1
def convergence_callback(Y):
    global SDR, SIR
    from mir_eval.separation import bss_eval_images
    ref = np.moveaxis(separate_recordings, 1, 2)
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_back=L)
        for ch in range(Y.shape[2])
    ])
    sdr, isr, sir, sar, perm = bss_eval_images(ref[:, :, 0],
                                               y[:, :ref.shape[1]])
    SDR.append(sdr)
    SIR.append(sir)
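A minimal sketch of how this callback could be hooked into the separation routine, assuming the same module-level globals (separate_recordings, L, SDR, SIR, np, pra) used in the examples below; the auxiva keyword arguments mirror the fragment in Example #5, and X is a hypothetical STFT of the mixture:

import numpy as np
import pyroomacoustics as pra

SDR, SIR = [], []   # filled in by convergence_callback at every iteration

# X: hypothetical STFT of the microphone mixture, shape (n_frames, n_bins, n_channels)
Y = pra.bss.auxiva(X, n_iter=20, proj_back=True,
                   callback=convergence_callback)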
Example #2
def parallel_loop(args):
    ''' Body of the embarrassingly parallel loop '''

    # expand positional arguments
    src_locs_ind, partial_length, seed = args

    # now the keyword arguments
    result_file = parameters['result_file']
    stft_win_len = parameters['stft_win_len']
    fs = parameters['fs']
    room = parameters['room']
    partial_rirs = parameters['partial_rirs']
    single_sources = parameters['single_sources']
    single_sources_anechoic = parameters['single_sources_anechoic']
    n_latent_var = parameters['n_latent_var']
    W_dict = parameters['W_dict']
    n_iter = parameters['n_iter']
    base_dir = parameters['base_dir']
    method = parameters['method']

    # make sure base dir is in path
    import sys, os
    if base_dir not in sys.path:
        sys.path.append(base_dir)

    import numpy as np
    from mir_eval.separation import bss_eval_images
    from multinmf_conv_mu import multinmf_conv_mu_wrapper
    from multinmf_conv_em import multinmf_conv_em_wrapper
    from utilities import partial_rir
    from sim_tools import json_append

    try:
        import mkl as mkl_service
        # for such parallel processing, it is better
        # to deactivate multithreading in mkl
        if not use_mkl:
            mkl_service.set_num_threads(1)
    except ImportError:
        pass

    # select between echoic and anechoic signals
    if partial_length != 'anechoic':
        clean_sources = single_sources
    else:
        # anechoic propagation
        clean_sources = single_sources_anechoic

    n_channels = clean_sources.shape[-1]
    n_sources = clean_sources.shape[0]
    n_bins = stft_win_len // 2 + 1
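    # (n_bins counts the non-negative frequency bins of a length-stft_win_len
    # real FFT, i.e. the rows returned by np.fft.rfft)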

    # mix the sources
    mic_signals = np.zeros(clean_sources.shape[-2:])  # (n_samples, n_mics)
    for speech_index, loc_index in enumerate(src_locs_ind):
        mic_signals += clean_sources[speech_index, loc_index, :, :]

    # shape (n_mics, n_src, n_bins)
    if partial_length == 'anechoic':
        # in anechoic conditions, we have flat responses everywhere
        partial_rirs_sources = np.ones((n_channels, n_sources, n_bins))
        if method == 'em':
            freqvec = np.fft.rfftfreq(parameters['stft_win_len'], 1 / room.fs)
            partial_rirs_sources = np.swapaxes(
                partial_rirs[0][src_locs_ind, :, :], 0, 1)
    elif partial_length == 'learn':
        partial_rirs_sources = None
    elif partial_length >= 0:
        partial_rirs_sources = np.swapaxes(
            partial_rirs[partial_length][src_locs_ind, :, :], 0, 1)
    else:
        raise ValueError('Partial length needs to be non-negative')

    if method == 'mu':
        # L1 reg parameter
        gamma = parameters['gamma_opt'][partial_length]

        # separate using MU
        sep_sources = multinmf_conv_mu_wrapper(
            mic_signals,
            n_sources,
            n_latent_var,
            stft_win_len,
            partial_rirs=partial_rirs_sources,
            W_dict=W_dict,
            l1_reg=gamma,
            n_iter=n_iter,
            verbose=False,
            random_seed=seed)

    elif method == 'em':
        # separate using EM
        sep_sources = multinmf_conv_em_wrapper(mic_signals,
                                               n_sources,
                                               stft_win_len,
                                               n_latent_var,
                                               n_iter=n_iter,
                                               A_init=partial_rirs_sources,
                                               W_init=W_dict,
                                               update_a=False,
                                               update_w=False,
                                               verbose=False)
    else:
        raise ValueError('Unknown algorithm {} requested'.format(method))

    # #render sources
    # for j, s in enumerate(sep_sources):
    #     # write the separated source to a wav file
    #     out_filename = 'data/Speech/' + 'speech_source_' + str(j) + '_' + str(partial_length) + '_EM.wav'
    #     wavfile.write(out_filename, room.fs, s)

    # compute the metrics
    n_samples = np.minimum(clean_sources.shape[2], sep_sources.shape[1])

    reference_signals = []
    for speech_ind, loc_ind in enumerate(src_locs_ind):
        reference_signals.append(clean_sources[speech_ind,
                                               loc_ind, :n_samples, :])
    reference_signals = np.array(reference_signals)

    ret = bss_eval_images(reference_signals, sep_sources[:, :n_samples, :])

    entry = dict(
        src_locs_ind=src_locs_ind,
        partial_length=partial_length,
        algorithm=method,
        seed=seed,
        sdr=ret[0].tolist(),
        isr=ret[1].tolist(),
        sir=ret[2].tolist(),
        sar=ret[3].tolist(),
    )

    filename = result_file.format(os.getpid())
    json_append(filename, entry)

    return entry
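A hedged sketch of how parallel_loop might be dispatched over a process pool; the job tuples and the module-level parameters dict are assumptions inferred from how the function unpacks its arguments:

from multiprocessing import Pool

if __name__ == '__main__':
    # hypothetical job list of (src_locs_ind, partial_length, seed) tuples
    all_args = [([0, 1], 'anechoic', 0), ([0, 1], 0, 1)]

    # `parameters` must be defined at module level before the pool is created,
    # so that every forked worker inherits a copy
    with Pool() as pool:
        results = pool.map(parallel_loop, all_args)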
Example #3
def process_experiment_max_sinr(SIR, mic, args):

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(
        file_pattern.format('camera_leds_zero_hold', 'mix', SIR))
    fs_snd, audio = wavfile.read(
        file_pattern.format(mic_choices[mic], 'mix', SIR))
    assert fs_led == fs_snd

    # read in the ref signals
    r, noise_ref = wavfile.read(
        file_pattern.format(mic_choices[mic], 'noise_ref', SIR))
    assert r == fs_snd
    r, speech_ref = wavfile.read(file_speech_ref.format(mic_choices[mic]))
    assert r == fs_snd
    r, leds_ref = wavfile.read(file_speech_ref.format('camera_leds_zero_hold'))
    assert r == fs_snd

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = noise_ref + speech_ref

    # get the geometry information to get nice plots.
    mics_loc = np.array(protocol['geometry']['microphones'][mic]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if mic == 'pyramic':
        I = list(range(8, 16)) + list(range(24, 32)) + list(range(
            40, 48))  # flat part
        #I = list(range(24,32)) + list(range(40,48)) # flat part
        #I = list(range(8,16))
        #I = list(range(48))
        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        speech_ref = speech_ref[:, I].copy()
        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'olympus':
        mics_positions = mics_geom['olympus'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # perform VAD
    vad_snd = leds > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################

    print('STFT and stuff')
    sys.stdout.flush()

    engine = pra.realtime.STFT(nfft,
                               nfft // 2,
                               pra.hann(nfft),
                               channels=audio.shape[1])

    def analysis(x):
        engine.analysis(x)
        return np.moveaxis(engine.X, 1, 0)

    # Now compute the STFT of the microphone input
    X = analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = analysis(audio * vad_guarded[:, None])
    X_noise = analysis(audio * (1 - vad_guarded[:, None]))

    S_ref = analysis(speech_ref)
    N_ref = analysis(noise_ref)

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################

    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))

    # compute covariances with reference signals to check everything is working correctly
    #Rs = np.einsum('i...j,i...k->...jk', S_ref, np.conj(S_ref))
    #Rn = np.einsum('i...j,i...k->...jk', N_ref, np.conj(N_ref))

    # compute the MaxSINR beamformer
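    # (for each frequency bin, the weight vector is the generalized eigenvector
    # of the pencil (Rs, Rn) with the largest eigenvalue, i.e. the direction
    # maximizing the ratio of speech to noise energy; the DC bin is skipped
    # here and a dummy all-ones weight is prepended below)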
    w = [
        la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
        for rs, rn in zip(Rs[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w,
                         X_speech,
                         X_speech[:, :, 0],
                         clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########

    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T,
                               fs=fs_snd,
                               N=nfft,
                               hop=nfft,
                               zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    ref = np.vstack([speech_ref[:, 0], noise_ref[:, 0]])
    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(
        int(pra.tdoa(out, speech_ref[:, 0].astype(np.float), phat=True)))
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    else:
        out_trunc = np.concatenate(
            (np.zeros(-delay), out[:ref.shape[1] + delay]))
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    sig_eval = np.vstack([out_trunc, noise_eval])

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, None], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    SDR_out = metric[0][0]
    SIR_out = metric[2][0]

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        upper = np.maximum(audio[:, 0].max(), out.max())
        sig_in = pra.highpass(audio[:, 0].astype(np.float) / upper,
                              fs_snd,
                              fc=150)
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)

        f1 = os.path.join(args.save_sample,
                          '{}_ch0_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_in)
        f2 = os.path.join(args.save_sample,
                          '{}_out_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(speech_ref[:, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(leds.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, leds, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(leds)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, leds * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        #plt.plot(a_time, speech_ref[:,0])
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines(
            [180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)], 0,
            nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2],
                           fs=16000,
                           max_order=1)

        room.add_source(noise_loc[:2])  # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(
            protocol['geometry']['speakers']['locations'][1][:2])  # signal

        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return SDR_out, SIR_out
Example #4
def test_sparseauxiva():
    fs = 16000

    signals = [
        np.concatenate([
            wavfile.read(f)[1].astype(np.float32, order='C')
            for f in source_files
        ]) for source_files in wav_files
    ]

    wavfile.write('sample1.wav', fs, np.asarray(signals[0], dtype=np.int16))
    wavfile.write('sample2.wav', fs, np.asarray(signals[1], dtype=np.int16))

    # Define the room environment, as well as the microphone array and source locations.

    # Room 8m by 9m
    room_dim = [8, 9]
    # source locations and delays
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]
    # create a mildly reverberant room with sources and mics
    room = pra.ShoeBox(room_dim,
                       fs=16000,
                       max_order=15,
                       absorption=0.35,
                       sigma2_awgn=1e-8)

    # add the sources to the room with silent signals for now;
    # each source's actual signal is swapped in later, one at a time
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array

    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], room.fs))

    # Compute the RIRs as in the Room Impulse Response generation section.

    # compute RIRs
    room.compute_rir()

    # Record each source separately

    separate_recordings = []
    for source, signal in zip(room.sources, signals):
        source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    # save mixed signals as wav files
    wavfile.write('mix1.wav', fs, np.asarray(mics_signals[0].T,
                                             dtype=np.int16))
    wavfile.write('mix2.wav', fs, np.asarray(mics_signals[1].T,
                                             dtype=np.int16))
    wavfile.write(
        'mix1_norm.wav', fs,
        np.asarray(mics_signals[0].T / np.max(np.abs(mics_signals[0].T)) *
                   32767,
                   dtype=np.int16))
    wavfile.write(
        'mix2_norm.wav', fs,
        np.asarray(mics_signals[1].T / np.max(np.abs(mics_signals[1].T)) *
                   32767,
                   dtype=np.int16))

    # STFT frame length
    L = 2048

    # START BSS
    ###########

    # Preprocessing
    # Observation vector in the STFT domain
    X = np.array([
        pra.stft(ch,
                 L,
                 L,
                 transform=np.fft.rfft,
                 zp_front=L // 2,
                 zp_back=L // 2) for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Reference signal to calculate performance of BSS
    ref = np.moveaxis(separate_recordings, 1, 2)

    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    S = np.argpartition(average, -k)[-k:]
    S = np.sort(S)
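    # S holds the (sorted) indices of the k most energetic frequency bins,
    # averaged over channels and frames; only these bins are handed to
    # sparseauxiva below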
    n_iter = 30

    # Run SparseAuxIva
    Y = pra.bss.sparseauxiva(X, S, n_iter, lasso=True)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch],
                  L,
                  L,
                  transform=np.fft.irfft,
                  zp_front=L // 2,
                  zp_back=L // 2) for ch in range(Y.shape[2])
    ])

    # Compare SIR and SDR with our reference signal
    sdr, isr, sir, sar, perm = bss_eval_images(
        ref[:, :y.shape[1] - L // 2, 0], y[:, L // 2:ref.shape[1] + L // 2])
    print('SDR: {0}, SIR: {1}'.format(sdr, sir))

    wavfile.write('demix1.wav', fs, np.asarray(y[0].T, dtype=np.int16))
    wavfile.write('demix2.wav', fs, np.asarray(y[1].T, dtype=np.int16))
    wavfile.write(
        'demix1_norm.wav', fs,
        np.asarray(y[0].T / np.max(np.abs(y[0].T)) * 32767, dtype=np.int16))
    wavfile.write(
        'demix2_norm.wav', fs,
        np.asarray(y[1].T / np.max(np.abs(y[1].T)) * 32767, dtype=np.int16))
Example #5
                       proj_back=True,
                       callback=convergence_callback)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch],
                  L,
                  L,
                  transform=np.fft.irfft,
                  zp_front=L // 2,
                  zp_back=L // 2) for ch in range(Y.shape[2])
    ])

    # Compare SIR
    #############
    sdr, isr, sir, sar, perm = bss_eval_images(
        ref[:, :y.shape[1] - L // 2, 0], y[:, L // 2:ref.shape[1] + L // 2])

    print('SDR:', sdr)
    print('SIR:', sir)

    import matplotlib.pyplot as plt
    plt.figure()
    plt.subplot(2, 2, 1)
    plt.specgram(ref[0, :, 0], NFFT=1024, Fs=room.fs)
    plt.title('Source 0 (clean)')

    plt.subplot(2, 2, 2)
    plt.specgram(ref[1, :, 0], NFFT=1024, Fs=room.fs)
    plt.title('Source 1 (clean)')

    plt.subplot(2, 2, 3)
Example #6
def process_experiment_max_sinr(SIR, mic, blinky, args):

    session = args.session
    target = args.target

    with open(metadata_file.format(session=args.session), 'r') as f:
        metadata = json.load(f)

    file_pattern = os.path.join(experiment_folder, metadata['filename_pattern'])

    with open(protocol_file.format(session=args.session), 'r') as f:
        protocol = json.load(f)

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds   = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=blinky, source='mix', fs=fs))
    fs_snd, audio  = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=mic_choices[mic], source='mix', fs=fs))
    assert fs_led == fs_snd

    # read in the ref signals
    sources_ref  = dict(zip(target_choices,
        [ wavfile.read(file_pattern.format(
                session=session, mic=mic_choices[mic], snr=SIR, source=ch, fs=fs))[1]
            for ch in target_choices ]))
    leds_ref  = dict(zip(target_choices,
        [ wavfile.read(file_pattern.format(
                session=session, mic=blinky, snr=SIR, source=ch, fs=fs))[1]
            for ch in target_choices ]))

    # reorder with target in first position
    ref = np.array([sources_ref[target]] + [sources_ref[ch]
                for ch in target_choices if ch != target])

    noise_ref = np.zeros_like(sources_ref[target])
    n_ch = [ch for ch in target_choices if ch != target]
    for ch in n_ch:
        noise_ref += sources_ref[ch]

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = sources_ref[target] + noise_ref

    # get the geometry information to get nice plots.
    mics_geom = {
            'pyramic' : np.array(protocol['geometry']['microphones']['pyramic']['locations']),
            'camera'  : np.array(protocol['geometry']['microphones']['camera']['locations']),
            }

    mics_loc = np.array(protocol['geometry']['microphones'][mic_choices[mic]]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if 'pyramic' in mic:

        if mic == 'pyramic_2':
            I = pyramic_bss_2ch
        elif mic == 'pyramic_4':
            I = pyramic_bss_4ch
        elif mic == 'pyramic_24':
            I = list(range(8,16)) + list(range(24,32)) + list(range(40,48)) # flat part
        elif mic == 'pyramic_48':
            I = list(range(48))
        else:
            raise ValueError('Unsupported configuration')

        audio = audio[:,I]
        noise_ref = noise_ref[:,I].copy()
        ref = ref[:,:,I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None,:]
        mics_positions[:,2] -= np.max(mics_positions[:,2])
        mics_positions += mics_loc

    elif mic == 'camera':
        mics_positions = mics_geom['camera'].copy() + mics_loc


    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # adjust length of led signal if necessary
    if leds.shape[0] < audio.shape[0]:
        z_missing = audio.shape[0] - leds.shape[0]
        leds = np.pad(leds, (0,z_missing), 'constant')
    elif leds.shape[0] > audio.shape[0]:
        leds = leds[:audio.shape[0],]

    # perform VAD
    led_target = leds[:,blinky_source_map[target]]
    vad_snd = led_target > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i,v in enumerate(vad_snd):
            if np.any(vad_snd[i-vad_guard:i+vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################

    print('STFT and stuff')
    sys.stdout.flush()

    a_win = pra.hann(nfft)
    s_win = pra.realtime.compute_synthesis_window(a_win, nfft // 2)

    engine = pra.realtime.STFT(nfft, nfft // 2,
            analysis_window=a_win, synthesis_window=s_win,
            channels=audio.shape[1])

    # Now compute the STFT of the microphone input
    X = engine.analysis(audio)
    X_time = np.arange(1, X.shape[0]+1) * (nfft / 2) / fs_snd

    X_speech = engine.analysis(audio * vad_guarded[:audio.shape[0],None])
    X_noise = engine.analysis(audio * (1 - vad_guarded[:audio.shape[0],None]))

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################

    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise)) 
    Rall = Rs + Rn

    # compute the MaxSINR beamformer
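    # (note: using Rall = Rs + Rn in place of Rs leaves the generalized
    # eigenvectors unchanged, since Rs v = lambda * Rn v implies
    # Rall v = (lambda + 1) * Rn v; the DC bin is skipped and added back below)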
    w = [la.eigh(rs, b=rn, eigvals=(n_channels-1,n_channels-1))[1] for rs,rn in zip(Rall[1:], Rn[1:])]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10,:] /= nw[nw > 1e-10,None]
    w = np.concatenate([np.ones((1,n_channels)), w], axis=0)  # add dummy beamformer at DC

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:,:,0], clip_up=args.clip_gain)
        w *= z[:,None]


    ###########
    ## APPLY ##
    ###########

    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:,:2].T, fs=fs_snd, N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = int(pra.tdoa(out, ref[0,:,0].astype(np.float), phat=True))
    print(delay)
    delay = np.abs(delay)
    if delay > 0:
        out_trunc = out[delay:delay+ref.shape[1]]
    else:
        out_trunc = np.concatenate((np.zeros(-delay), out[:ref.shape[1]+delay]))
    sig_eval = np.vstack([out_trunc] * len(target_choices))

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:,:,0], sig_eval[:,:,None])

    # we are only interested in SDR and SIR for the speech source
    ret = { 'Max-SINR' : {'SDR' : metric[0][0], 'SIR' : metric[2][0]} }


    #############################
    ## BLIND SOURCE SEPARATION ##
    #############################

    if mic in ['camera', 'pyramic_2', 'pyramic_4']:

        Y = pra.bss.auxiva(X, n_iter=40)
        bss = pra.realtime.synthesis(Y, nfft, nfft // 2, win=s_win)

        match = []
        for col in range(bss.shape[1]):
            xcorr = fast_corr(bss[:,col], ref[0,:,0])
            match.append(np.max(xcorr))
        best_col = np.argmax(match)

        # Not sure why the delay is sometimes negative here... Need to check more
        delay = np.abs(int(pra.tdoa(bss[:,best_col], ref[0,:,0].astype(np.float), phat=True)))
        if delay > 0:
            bss_trunc = bss[delay:delay+ref.shape[1],]
        elif delay < 0:
            bss_trunc = np.concatenate((np.zeros((-delay, bss.shape[1])), bss[:ref.shape[1]+delay]))
        else:
            bss_trunc = bss[:ref.shape[1],]

        if ref.shape[1] > bss_trunc.shape[0]:
            ref_lim = bss_trunc.shape[0]
        else:
            ref_lim = ref.shape[1]

        if mic in ['camera', 'pyramic_2']:
            bss_trunc = np.hstack([bss_trunc] * 2)

        metric = bss_eval_images(ref[:,:ref_lim,0,None], bss_trunc.T[:,:,None])
        SDR_bss = metric[0][0]
        SIR_bss = metric[2][0]
        ret['BSS'] = { 'SDR' : metric[0][0], 'SIR' : metric[2][0] }

    #################################
    ## Estimate SDR and SIR of mix ##
    #################################

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(int(pra.tdoa(audio[:,0], ref[0,:,0].astype(np.float), phat=True)))
    if delay > 0:
        audio_trunc = audio[delay:delay+ref.shape[1],0]
    elif delay < 0:
        audio_trunc = np.concatenate((np.zeros(-delay), audio[:ref.shape[1]+delay,0]))
    else:
        audio_trunc = audio[:ref.shape[1],0]

    if ref.shape[1] > audio_trunc.shape[0]:
        ref_lim = audio_trunc.shape[0]
    else:
        ref_lim = ref.shape[1]

    audio_trunc = np.vstack([audio_trunc] * len(ref))

    metric = bss_eval_images(ref[:,:ref_lim,0,None], audio_trunc[:,:,None])
    SDR_bss = metric[0][0]
    SIR_bss = metric[2][0]
    ret['Mix'] = { 'SDR' : metric[0][0], 'SIR' : metric[2][0] }

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        if not os.path.exists(args.save_sample):
            os.makedirs(args.save_sample)

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            upper = np.max([audio[:,0].max(), out.max(), bss.max(), ref[0,:,0].max()])
        else:
            upper = np.max([audio[:,0].max(), out.max(), ref[0,:,0].max()])


        # Clean signal for reference
        sig_ref = pra.highpass(ref[0,:,0].astype(np.float) / upper, fs_snd, fc=150)
        f0 = os.path.join(args.save_sample, '{}_ref_SIR_NA_dB.wav'.format(mic))
        wavfile.write(f0, fs_snd, sig_ref)

        # Mix signal for reference
        sig_mix = pra.highpass(audio[:,0].astype(np.float) / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample, '{}_mix_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_mix)

        # Output of MaxSINR
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f2 = os.path.join(args.save_sample, '{}_maxsinr_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

        # Output of BSS
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            sig_bss = pra.highpass(bss[:,best_col] / upper, fs_snd, fc=150)
            f3 = os.path.join(args.save_sample, '{}_bss_SIR_{}_dB.wav'.format(mic, SIR))
            wavfile.write(f3, fs_snd, sig_bss)


    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(ref[0,:,0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(led_target.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, led_target, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(led_target)

        plt.figure()
        plt.plot(audio_time, audio[:,0], 'b') 
        plt.plot(led_time, led_target * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio','VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:,0])
        plt.plot(a_time, out_trunc)
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        '''
        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180+np.degrees(theta_speech), 180-np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)

        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal

        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])
        '''

        plt.figure()
        mic_array.plot()

        plt.show()


    # Return SDR and SIR
    return ret
Example #7
                                               W_init=W_dict,
                                               update_a=False,
                                               update_w=False,
                                               verbose=True)
    else:
        raise ValueError('Unknown algorithm {} requested'.format(method))

    n_samples = np.minimum(single_sources.shape[2], sep_sources.shape[1])
    reference_signals = []
    for speech_ind in range(n_speech):
        reference_signals.append(single_sources[speech_ind,
                                                speech_ind, :n_samples, :])
    reference_signals = np.array(reference_signals)

    ret = bss_eval_images(reference_signals, sep_sources[:, :n_samples, :])

    print('SDR={} ISR={} SIR={} SAR={}'.format(*ret[:4]))

    mic_norm = 0.7 / np.max(np.abs(mic_signals))
    sep_src_norm = 0.7 / np.max(np.abs(sep_sources))

    if args.play:
        import sounddevice as sd
        sd.play(mic_signals[:, :2] * mic_norm, samplerate=fs, blocking=True)
        for s in range(n_speech):
            sd.play(sep_sources[s, :, :2] * sep_src_norm,
                    samplerate=fs,
                    blocking=True)

    if args.save is not None: