def convergence_callback(Y):
    global SDR, SIR
    from mir_eval.separation import bss_eval_images

    ref = np.moveaxis(separate_recordings, 1, 2)
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft, zp_back=L)
        for ch in range(Y.shape[2])
    ])
    sdr, isr, sir, sar, perm = bss_eval_images(ref[:, :, 0], y[:, :ref.shape[1]])
    SDR.append(sdr)
    SIR.append(sir)
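# Illustrative sketch (not part of the original script): the callback above
# assumes the module-level lists SDR and SIR exist and that
# `separate_recordings`, `L`, and `pra` are in scope. A minimal way to wire
# it into an AuxIVA run could look like the following; the names `X` and
# `n_iter` are assumed to match the preprocessing code further down.
#
#     SDR, SIR = [], []
#     Y = pra.bss.auxiva(X, n_iter=n_iter, proj_back=True,
#                        callback=convergence_callback)
#     # SDR and SIR then hold the metrics recorded by the callback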
def parallel_loop(args):
    '''
    Runs a single experiment instance. This function is meant to be
    embarrassingly parallel.
    '''
    # expand positional arguments
    src_locs_ind, partial_length, seed = args

    # now the keyword arguments
    result_file = parameters['result_file']
    stft_win_len = parameters['stft_win_len']
    fs = parameters['fs']
    room = parameters['room']
    partial_rirs = parameters['partial_rirs']
    single_sources = parameters['single_sources']
    single_sources_anechoic = parameters['single_sources_anechoic']
    n_latent_var = parameters['n_latent_var']
    W_dict = parameters['W_dict']
    n_iter = parameters['n_iter']
    base_dir = parameters['base_dir']
    method = parameters['method']

    # make sure base dir is in path
    import sys, os
    if base_dir not in sys.path:
        sys.path.append(base_dir)

    import numpy as np
    from mir_eval.separation import bss_eval_images
    from multinmf_conv_mu import multinmf_conv_mu_wrapper
    from multinmf_conv_em import multinmf_conv_em_wrapper
    from utilities import partial_rir
    from sim_tools import json_append

    try:
        import mkl as mkl_service
        # for such parallel processing, it is better
        # to deactivate multithreading in mkl
        # (`use_mkl` is expected to be a module-level flag set by the calling script)
        if not use_mkl:
            mkl_service.set_num_threads(1)
    except ImportError:
        pass

    # select between echoic and anechoic signals
    if partial_length != 'anechoic':
        clean_sources = single_sources
    else:
        # anechoic propagation
        clean_sources = single_sources_anechoic

    n_channels = clean_sources.shape[-1]
    n_sources = clean_sources.shape[0]
    n_bins = stft_win_len // 2 + 1

    # mix the sources
    mic_signals = np.zeros(clean_sources.shape[-2:])  # (n_samples, n_mics)
    for speech_index, loc_index in enumerate(src_locs_ind):
        mic_signals += clean_sources[speech_index, loc_index, :, :]

    # shape (n_mics, n_src, n_bins)
    if partial_length == 'anechoic':
        # in anechoic conditions, we have flat responses everywhere
        partial_rirs_sources = np.ones((n_channels, n_sources, n_bins))
        if method == 'em':
            freqvec = np.fft.rfftfreq(parameters['stft_win_len'], 1 / room.fs)
            partial_rirs_sources = np.swapaxes(
                partial_rirs[0][src_locs_ind, :, :], 0, 1)
    elif partial_length == 'learn':
        partial_rirs_sources = None
    elif partial_length >= 0:
        partial_rirs_sources = np.swapaxes(
            partial_rirs[partial_length][src_locs_ind, :, :], 0, 1)
    else:
        raise ValueError('Partial length needs to be non-negative')

    if method == 'mu':
        # L1 reg parameter
        gamma = parameters['gamma_opt'][partial_length]

        # separate using MU
        sep_sources = multinmf_conv_mu_wrapper(
            mic_signals, n_sources, n_latent_var, stft_win_len,
            partial_rirs=partial_rirs_sources,
            W_dict=W_dict, l1_reg=gamma,
            n_iter=n_iter, verbose=False, random_seed=seed)
    elif method == 'em':
        # separate using EM
        sep_sources = multinmf_conv_em_wrapper(
            mic_signals, n_sources, stft_win_len,
            n_latent_var, n_iter=n_iter,
            A_init=partial_rirs_sources,
            W_init=W_dict,
            update_a=False, update_w=False,
            verbose=False)
    else:
        raise ValueError('Unknown algorithm {} requested'.format(method))

    # # render sources
    # for j, s in enumerate(sep_sources):
    #     # write the separated source to a wav file
    #     out_filename = ('data/Speech/' + 'speech_source_' + str(j)
    #                     + '_' + str(partial_length) + '_EM.wav')
    #     wavfile.write(out_filename, room.fs, s)

    # compute the metrics
    n_samples = np.minimum(clean_sources.shape[2], sep_sources.shape[1])

    reference_signals = []
    for speech_ind, loc_ind in enumerate(src_locs_ind):
        reference_signals.append(clean_sources[speech_ind, loc_ind, :n_samples, :])
    reference_signals = np.array(reference_signals)

    ret = \
        bss_eval_images(reference_signals, sep_sources[:, :n_samples, :])

    entry = dict(
        src_locs_ind=src_locs_ind,
        partial_length=partial_length,
        algorithm=method,
        seed=seed,
        sdr=ret[0].tolist(),
        isr=ret[1].tolist(),
        sir=ret[2].tolist(),
        sar=ret[3].tolist(),
    )

    filename = result_file.format(os.getpid())
    json_append(filename, entry)

    return entry
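# Illustrative sketch (not from the original script): `parallel_loop` only
# needs a picklable tuple of (src_locs_ind, partial_length, seed), so one
# possible dispatcher is a plain multiprocessing pool. The names
# `all_src_locs_ind`, `all_partial_lengths`, and `n_repeat` are hypothetical
# placeholders for the experiment grid, and the module-level `parameters`
# dict is assumed to be available in the workers (e.g. through fork).
#
#     import itertools
#     from multiprocessing import Pool
#
#     arg_list = [
#         (locs, plen, seed)
#         for locs, plen, seed in itertools.product(
#             all_src_locs_ind, all_partial_lengths, range(n_repeat))
#     ]
#     with Pool() as pool:
#         results = pool.map(parallel_loop, arg_list)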
def process_experiment_max_sinr(SIR, mic, args):

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(
        file_pattern.format('camera_leds_zero_hold', 'mix', SIR))
    fs_snd, audio = wavfile.read(
        file_pattern.format(mic_choices[mic], 'mix', SIR))
    assert fs_led == fs_snd

    # read in the ref signals
    r, noise_ref = wavfile.read(
        file_pattern.format(mic_choices[mic], 'noise_ref', SIR))
    assert r == fs_snd
    r, speech_ref = wavfile.read(file_speech_ref.format(mic_choices[mic]))
    assert r == fs_snd
    r, leds_ref = wavfile.read(file_speech_ref.format('camera_leds_zero_hold'))
    assert r == fs_snd

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = noise_ref + speech_ref

    # get the geometry information to get nice plots.
    mics_loc = np.array(protocol['geometry']['microphones'][mic]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if mic == 'pyramic':

        I = list(range(8, 16)) + list(range(24, 32)) + list(range(40, 48))  # flat part
        #I = list(range(24,32)) + list(range(40,48))  # flat part
        #I = list(range(8,16))
        #I = list(range(48))
        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        speech_ref = speech_ref[:, I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'olympus':
        mics_positions = mics_geom['olympus'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # perform VAD
    vad_snd = leds > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################
    print('STFT and stuff')
    sys.stdout.flush()

    engine = pra.realtime.STFT(nfft, nfft // 2, pra.hann(nfft),
                               channels=audio.shape[1])

    def analysis(x):
        engine.analysis(x)
        return np.moveaxis(engine.X, 1, 0)

    # Now compute the STFT of the microphone input
    X = analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = analysis(audio * vad_guarded[:, None])
    X_noise = analysis(audio * (1 - vad_guarded[:, None]))

    S_ref = analysis(speech_ref)
    N_ref = analysis(noise_ref)

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################
    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))

    # compute covariances with reference signals to check everything is working correctly
    #Rs = np.einsum('i...j,i...k->...jk', S_ref, np.conj(S_ref))
    #Rn = np.einsum('i...j,i...k->...jk', N_ref, np.conj(N_ref))

    # compute the MaxSINR beamformer
    w = [
        la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
        for rs, rn in zip(Rs[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:, :, 0], clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########
    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T, fs=fs_snd,
                               N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    ref = np.vstack([speech_ref[:, 0], noise_ref[:, 0]])

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(
        int(pra.tdoa(out, speech_ref[:, 0].astype(np.float), phat=True)))
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    else:
        out_trunc = np.concatenate(
            (np.zeros(-delay), out[:ref.shape[1] + delay]))
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    sig_eval = np.vstack([out_trunc, noise_eval])

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, None], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    SDR_out = metric[0][0]
    SIR_out = metric[2][0]

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        upper = np.maximum(audio[:, 0].max(), out.max())
        sig_in = pra.highpass(audio[:, 0].astype(np.float) / upper, fs_snd, fc=150)
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)

        f1 = os.path.join(args.save_sample,
                          '{}_ch0_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_in)
        f2 = os.path.join(args.save_sample,
                          '{}_out_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(speech_ref[:, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(leds.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, leds, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(leds)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, leds * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        #plt.plot(a_time, speech_ref[:,0])
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines(
            [180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)],
            0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)
        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(
            protocol['geometry']['speakers']['locations'][1][:2])  # signal
        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return SDR_out, SIR_out
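# Illustrative sketch (not part of the original experiment code): the
# per-frequency Max-SINR weights above come from the generalized eigenvalue
# problem Rs w = lambda Rn w, keeping the eigenvector of the largest
# eigenvalue. The helper below shows the same computation on a single pair
# of hypothetical covariance matrices; it mirrors the `la.eigh` call used in
# the function above.

def _max_sinr_weight_example(Rs_f, Rn_f):
    """Return the unit-norm Max-SINR weight for one frequency bin.

    Rs_f, Rn_f : (n_channels, n_channels) Hermitian covariance matrices
    of the speech-dominated and noise-dominated frames, respectively.
    """
    import numpy as np
    from scipy import linalg as la

    n_channels = Rs_f.shape[0]
    # eigenvector associated with the largest generalized eigenvalue
    w = la.eigh(Rs_f, b=Rn_f, eigvals=(n_channels - 1, n_channels - 1))[1][:, 0]
    norm = np.linalg.norm(w)
    return w / norm if norm > 1e-10 else w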
def test_sparseauxiva():

    fs = 16000

    signals = [
        np.concatenate([
            wavfile.read(f)[1].astype(np.float32, order='C')
            for f in source_files
        ])
        for source_files in wav_files
    ]
    wavfile.write('sample1.wav', fs, np.asarray(signals[0], dtype=np.int16))
    wavfile.write('sample2.wav', fs, np.asarray(signals[1], dtype=np.int16))

    # Define the room environment, as well as the microphone array and source locations.

    # Room 8m by 9m
    room_dim = [8, 9]

    # source locations and delays
    locations = [[2.5, 3], [2.5, 6]]
    delays = [1., 0.]

    # create the room with sources and mics
    room = pra.ShoeBox(room_dim, fs=16000, max_order=15,
                       absorption=0.35, sigma2_awgn=1e-8)

    # Add the sources to the room with silent signals for now;
    # each source is activated in turn below
    for sig, d, loc in zip(signals, delays, locations):
        room.add_source(loc, signal=np.zeros_like(sig), delay=d)

    # add microphone array
    room.add_microphone_array(
        pra.MicrophoneArray(np.c_[[6.5, 4.49], [6.5, 4.51]], room.fs))

    # Compute the RIRs as in the Room Impulse Response generation section.

    # compute RIRs
    room.compute_rir()

    # Record each source separately
    separate_recordings = []
    for source, signal in zip(room.sources, signals):
        source.signal[:] = signal
        room.simulate()
        separate_recordings.append(room.mic_array.signals)
        source.signal[:] = 0.
    separate_recordings = np.array(separate_recordings)

    # Mix down the recorded signals
    mics_signals = np.sum(separate_recordings, axis=0)

    # save mixed signals as wav files
    wavfile.write('mix1.wav', fs,
                  np.asarray(mics_signals[0].T, dtype=np.int16))
    wavfile.write('mix2.wav', fs,
                  np.asarray(mics_signals[1].T, dtype=np.int16))
    wavfile.write(
        'mix1_norm.wav', fs,
        np.asarray(mics_signals[0].T / np.max(np.abs(mics_signals[0].T)) * 32767,
                   dtype=np.int16))
    wavfile.write(
        'mix2_norm.wav', fs,
        np.asarray(mics_signals[1].T / np.max(np.abs(mics_signals[1].T)) * 32767,
                   dtype=np.int16))

    # STFT frame length
    L = 2048

    # START BSS
    ###########

    # Preprocessing
    # Observation vector in the STFT domain
    X = np.array([
        pra.stft(ch, L, L, transform=np.fft.rfft,
                 zp_front=L // 2, zp_back=L // 2)
        for ch in mics_signals
    ])
    X = np.moveaxis(X, 0, 2)

    # Reference signal to calculate performance of BSS
    ref = np.moveaxis(separate_recordings, 1, 2)

    # select the fraction `ratio` of frequency bins with the largest average magnitude
    ratio = 0.35
    average = np.abs(np.mean(np.mean(X, axis=2), axis=0))
    k = np.int_(average.shape[0] * ratio)
    S = np.argpartition(average, -k)[-k:]
    S = np.sort(S)

    n_iter = 30

    # Run SparseAuxIva
    Y = pra.bss.sparseauxiva(X, S, n_iter, lasso=True)

    # run iSTFT
    y = np.array([
        pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft,
                  zp_front=L // 2, zp_back=L // 2)
        for ch in range(Y.shape[2])
    ])

    # Compare SIR and SDR with our reference signal
    sdr, isr, sir, sar, perm = bss_eval_images(
        ref[:, :y.shape[1] - L // 2, 0],
        y[:, L // 2:ref.shape[1] + L // 2])
    print('SDR: {0}, SIR: {1}'.format(sdr, sir))

    wavfile.write('demix1.wav', fs, np.asarray(y[0].T, dtype=np.int16))
    wavfile.write('demix2.wav', fs, np.asarray(y[1].T, dtype=np.int16))
    wavfile.write(
        'demix1_norm.wav', fs,
        np.asarray(y[0].T / np.max(np.abs(y[0].T)) * 32767, dtype=np.int16))
    wavfile.write(
        'demix2_norm.wav', fs,
        np.asarray(y[1].T / np.max(np.abs(y[1].T)) * 32767, dtype=np.int16))
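# Illustrative sketch (not part of the test above): the sparse bin selection
# keeps the `k` frequency bins with the largest average magnitude. On a toy
# array, `np.argpartition` picks the indices of the top-k values without a
# full sort, and `np.sort` restores ascending bin order:
#
#     import numpy as np
#
#     average = np.array([0.1, 0.9, 0.3, 0.7, 0.2])
#     k = 2
#     S = np.sort(np.argpartition(average, -k)[-k:])
#     # S == array([1, 3]) -> the two most energetic bins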
    proj_back=True,
    callback=convergence_callback)

# run iSTFT
y = np.array([
    pra.istft(Y[:, :, ch], L, L, transform=np.fft.irfft,
              zp_front=L // 2, zp_back=L // 2)
    for ch in range(Y.shape[2])
])

# Compare SIR
#############
sdr, isr, sir, sar, perm = bss_eval_images(
    ref[:, :y.shape[1] - L // 2, 0],
    y[:, L // 2:ref.shape[1] + L // 2])
print('SDR:', sdr)
print('SIR:', sir)

import matplotlib.pyplot as plt

plt.figure()

plt.subplot(2, 2, 1)
plt.specgram(ref[0, :, 0], NFFT=1024, Fs=room.fs)
plt.title('Source 0 (clean)')

plt.subplot(2, 2, 2)
plt.specgram(ref[1, :, 0], NFFT=1024, Fs=room.fs)
plt.title('Source 1 (clean)')

plt.subplot(2, 2, 3)
def process_experiment_max_sinr(SIR, mic, blinky, args):

    session = args.session
    target = args.target

    with open(metadata_file.format(session=args.session), 'r') as f:
        metadata = json.load(f)

    file_pattern = os.path.join(experiment_folder, metadata['filename_pattern'])

    with open(protocol_file.format(session=args.session), 'r') as f:
        protocol = json.load(f)

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=blinky, source='mix', fs=fs))
    fs_snd, audio = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=mic_choices[mic], source='mix', fs=fs))
    assert fs_led == fs_snd

    # read in the ref signals
    sources_ref = dict(zip(target_choices, [
        wavfile.read(file_pattern.format(
            session=session, mic=mic_choices[mic], snr=SIR, source=ch, fs=fs))[1]
        for ch in target_choices
    ]))
    leds_ref = dict(zip(target_choices, [
        wavfile.read(file_pattern.format(
            session=session, mic=blinky, snr=SIR, source=ch, fs=fs))[1]
        for ch in target_choices
    ]))

    # reorder with target in first position
    ref = np.array([sources_ref[target]]
                   + [sources_ref[ch] for ch in target_choices if ch != target])

    noise_ref = np.zeros_like(sources_ref[target])
    n_ch = [ch for ch in target_choices if ch != target]
    for ch in n_ch:
        noise_ref += sources_ref[ch]

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = sources_ref[target] + noise_ref

    # get the geometry information to get nice plots.
    mics_geom = {
        'pyramic': np.array(protocol['geometry']['microphones']['pyramic']['locations']),
        'camera': np.array(protocol['geometry']['microphones']['camera']['locations']),
    }

    mics_loc = np.array(
        protocol['geometry']['microphones'][mic_choices[mic]]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if 'pyramic' in mic:

        if mic == 'pyramic_2':
            I = pyramic_bss_2ch
        elif mic == 'pyramic_4':
            I = pyramic_bss_4ch
        elif mic == 'pyramic_24':
            I = list(range(8, 16)) + list(range(24, 32)) + list(range(40, 48))  # flat part
        elif mic == 'pyramic_48':
            I = list(range(48))
        else:
            raise ValueError('Unsupported configuration')

        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        ref = ref[:, :, I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'camera':
        mics_positions = mics_geom['camera'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # adjust length of led signal if necessary
    if leds.shape[0] < audio.shape[0]:
        z_missing = audio.shape[0] - leds.shape[0]
        leds = np.pad(leds, (0, z_missing), 'constant')
    elif leds.shape[0] > audio.shape[0]:
        leds = leds[:audio.shape[0], ]

    # perform VAD
    led_target = leds[:, blinky_source_map[target]]
    vad_snd = led_target > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################
    print('STFT and stuff')
    sys.stdout.flush()

    a_win = pra.hann(nfft)
    s_win = pra.realtime.compute_synthesis_window(a_win, nfft // 2)

    engine = pra.realtime.STFT(nfft, nfft // 2,
                               analysis_window=a_win,
                               synthesis_window=s_win,
                               channels=audio.shape[1])

    # Now compute the STFT of the microphone input
    X = engine.analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = engine.analysis(audio * vad_guarded[:audio.shape[0], None])
    X_noise = engine.analysis(audio * (1 - vad_guarded[:audio.shape[0], None]))

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################
    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))
    Rall = Rs + Rn

    # compute the MaxSINR beamformer
    w = [
        la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
        for rs, rn in zip(Rall[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)  # add dummy beamformer at DC

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:, :, 0], clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########
    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T, fs=fs_snd,
                               N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    # Not sure why the delay is sometimes negative here... Need to check more
    delay = int(pra.tdoa(out, ref[0, :, 0].astype(np.float), phat=True))
    print(delay)
    delay = np.abs(delay)
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
    else:
        out_trunc = np.concatenate(
            (np.zeros(-delay), out[:ref.shape[1] + delay]))
    sig_eval = np.vstack([out_trunc] * len(target_choices))

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, 0], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    ret = {'Max-SINR': {'SDR': metric[0][0], 'SIR': metric[2][0]}}

    #############################
    ## BLIND SOURCE SEPARATION ##
    #############################

    if mic in ['camera', 'pyramic_2', 'pyramic_4']:

        Y = pra.bss.auxiva(X, n_iter=40)
        bss = pra.realtime.synthesis(Y, nfft, nfft // 2, win=s_win)

        match = []
        for col in range(bss.shape[1]):
            xcorr = fast_corr(bss[:, col], ref[0, :, 0])
            match.append(np.max(xcorr))
        best_col = np.argmax(match)

        # Not sure why the delay is sometimes negative here... Need to check more
        delay = np.abs(
            int(pra.tdoa(bss[:, best_col], ref[0, :, 0].astype(np.float), phat=True)))
        if delay > 0:
            bss_trunc = bss[delay:delay + ref.shape[1], ]
        elif delay < 0:
            bss_trunc = np.concatenate(
                (np.zeros((-delay, bss.shape[1])), bss[:ref.shape[1] + delay]))
        else:
            bss_trunc = bss[:ref.shape[1], ]

        if ref.shape[1] > bss_trunc.shape[0]:
            ref_lim = bss_trunc.shape[0]
        else:
            ref_lim = ref.shape[1]

        if mic in ['camera', 'pyramic_2']:
            bss_trunc = np.hstack([bss_trunc] * 2)

        metric = bss_eval_images(ref[:, :ref_lim, 0, None], bss_trunc.T[:, :, None])
        SDR_bss = metric[0][0]
        SIR_bss = metric[2][0]
        ret['BSS'] = {'SDR': metric[0][0], 'SIR': metric[2][0]}

    #################################
    ## Estimate SDR and SIR of mix ##
    #################################

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(
        int(pra.tdoa(audio[:, 0], ref[0, :, 0].astype(np.float), phat=True)))
    if delay > 0:
        audio_trunc = audio[delay:delay + ref.shape[1], 0]
    elif delay < 0:
        audio_trunc = np.concatenate(
            (np.zeros(-delay), audio[:ref.shape[1] + delay, 0]))
    else:
        audio_trunc = audio[:ref.shape[1], 0]

    if ref.shape[1] > audio_trunc.shape[0]:
        ref_lim = audio_trunc.shape[0]
    else:
        ref_lim = ref.shape[1]

    audio_trunc = np.vstack([audio_trunc] * len(ref))

    metric = bss_eval_images(ref[:, :ref_lim, 0, None], audio_trunc[:, :, None])
    SDR_bss = metric[0][0]
    SIR_bss = metric[2][0]
    ret['Mix'] = {'SDR': metric[0][0], 'SIR': metric[2][0]}

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        if not os.path.exists(args.save_sample):
            os.makedirs(args.save_sample)

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            upper = np.max([audio[:, 0].max(), out.max(), bss.max(), ref[0, :, 0].max()])
        else:
            upper = np.max([audio[:, 0].max(), out.max(), ref[0, :, 0].max()])

        # Clean signal for reference
        sig_ref = pra.highpass(ref[0, :, 0].astype(np.float) / upper, fs_snd, fc=150)
        f0 = os.path.join(args.save_sample, '{}_ref_SIR_NA_dB.wav'.format(mic))
        wavfile.write(f0, fs_snd, sig_ref)

        # Mix signal for reference
        sig_mix = pra.highpass(audio[:, 0].astype(np.float) / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample, '{}_mix_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_mix)

        # Output of MaxSINR
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f2 = os.path.join(args.save_sample, '{}_maxsinr_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

        # Output of BSS
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            sig_bss = pra.highpass(bss[:, best_col] / upper, fs_snd, fc=150)
            f3 = os.path.join(args.save_sample, '{}_bss_SIR_{}_dB.wav'.format(mic, SIR))
            wavfile.write(f3, fs_snd, sig_bss)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(ref[0, :, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(led_target.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, led_target, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(led_target)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, led_target * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        '''
        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180+np.degrees(theta_speech), 180-np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)
        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal
        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])
        '''

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return ret
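# Illustrative sketch (not part of the original script): the alignment blocks
# above all follow the same pattern, i.e. estimate the delay of a signal
# against the reference with `pra.tdoa` and then truncate or zero-pad before
# calling `bss_eval_images`. A hypothetical helper capturing that pattern for
# 1-D signals could look like this; the name and behaviour are illustrative
# only.

def _align_to_reference_example(sig, ref_sig):
    """Align `sig` to `ref_sig` by the (absolute) GCC-PHAT delay and
    truncate or zero-pad it to the reference length."""
    import numpy as np
    import pyroomacoustics as pra

    delay = int(pra.tdoa(sig, ref_sig.astype(np.float64), phat=True))
    delay = np.abs(delay)  # the experiment code also folds negative delays
    n = ref_sig.shape[0]
    aligned = sig[delay:delay + n] if delay > 0 else sig[:n]
    # zero-pad if the aligned signal ended up shorter than the reference
    if aligned.shape[0] < n:
        aligned = np.concatenate([aligned, np.zeros(n - aligned.shape[0])])
    return aligned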
        W_init=W_dict,
        update_a=False, update_w=False,
        verbose=True)
else:
    raise ValueError('Unknown algorithm {} requested'.format(method))

n_samples = np.minimum(single_sources.shape[2], sep_sources.shape[1])

reference_signals = []
for speech_ind in range(n_speech):
    reference_signals.append(single_sources[speech_ind, speech_ind, :n_samples, :])
reference_signals = np.array(reference_signals)

ret = \
    bss_eval_images(reference_signals, sep_sources[:, :n_samples, :])
print('SDR={} ISR={} SIR={} SAR={}'.format(*ret[:4]))

mic_norm = 0.7 / np.max(np.abs(mic_signals))
sep_src_norm = 0.7 / np.max(np.abs(sep_sources))

if args.play:
    import sounddevice as sd

    # scale signals to at most 0.7 full scale before playback
    sd.play(mic_signals[:, :2] * mic_norm, samplerate=fs, blocking=True)

    for s in range(n_speech):
        sd.play(sep_sources[s, :, :2] * sep_src_norm, samplerate=fs, blocking=True)

if args.save is not None: