def save_main_beam(self, rs_x, rs_y, sp_x, sp_y, is_circular, mic_num,
                   c_x, c_y, i_m_d, angle, room_frequency, freques,
                   room_temp=None, room_humidity=None, is_airAbsorption=None):
    # Create an rs_x by rs_y metres shoe box room
    room = pra.ShoeBox([rs_x, rs_y], fs=room_frequency, temperature=room_temp,
                       humidity=room_humidity, air_absorption=is_airAbsorption)

    # Add a source somewhere in the room
    room.add_source([sp_x, sp_y])

    # Create a circular or linear array beamformer with mic_num microphones,
    # rotation `angle`, and inter-mic distance (or radius) `i_m_d`
    if is_circular:
        R = pra.circular_2D_array([c_x, c_y], mic_num, angle, i_m_d)
    else:
        R = pra.linear_2D_array([c_x, c_y], mic_num, angle, i_m_d)
    room.add_microphone_array(pra.Beamformer(R, room.fs))

    # Now compute the delay-and-sum weights for the beamformer
    room.mic_array.rake_delay_and_sum_weights(room.sources[0][:1])

    # plot the room and the resulting beam pattern
    room.plot(freq=freques, img_order=0)
    # plt.show()
    plt.savefig('./fig.png')
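# A minimal usage sketch for save_main_beam, assuming it is a method of some
# host class (here a hypothetical RoomSimulator) and that pyroomacoustics and
# matplotlib.pyplot are imported as pra and plt at module level. All numeric
# values below are illustrative only.
sim = RoomSimulator()  # hypothetical host class
sim.save_main_beam(
    rs_x=4.0, rs_y=6.0,           # room size in metres
    sp_x=2.5, sp_y=4.5,           # source position
    is_circular=False,            # use a linear array
    mic_num=4,                    # number of microphones
    c_x=2.0, c_y=1.5,             # array centre
    i_m_d=0.1,                    # inter-mic distance (or radius), metres
    angle=0.0,                    # array rotation in radians
    room_frequency=16000,         # sampling frequency
    freques=[1000, 2000, 4000],   # frequencies for the beam pattern plot
)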
def shoebox_rir(room_dim, source, mic):
    # Note: this snippet uses the legacy (pre-0.1.x) pyroomacoustics API
    # (linear2DArray, Room.shoeBox2D, addSource, compute_RIR).

    # Some simulation parameters
    Fs = 8000
    t0 = 1. / (Fs * np.pi * 1e-2)  # starting time function of sinc decay in RIR response
    absorption = 0.90
    max_order_sim = 10

    # create a microphone array with a single microphone
    R = pra.linear2DArray(mic, 1, 0, 1)
    mics = pra.Beamformer(R, Fs)

    # create the room with sources and mics
    room1 = pra.Room.shoeBox2D(
        [0, 0],
        room_dim,
        Fs,
        t0=t0,
        max_order=max_order_sim,
        absorption=absorption,
        sigma2_awgn=0)

    # add the source and the microphone array, then compute the RIR
    room1.addSource(source)
    room1.addMicrophoneArray(mics)
    room1.compute_RIR()
    h = room1.rir[0][0]

    return h
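# A short usage sketch for shoebox_rir, assuming numpy and matplotlib are
# imported as np and plt and the same legacy pyroomacoustics version is
# installed. The coordinates are illustrative.
h = shoebox_rir(room_dim=[4., 6.], source=[2.5, 4.5], mic=np.array([2., 1.5]))
plt.plot(np.arange(h.shape[0]) / 8000., h)  # Fs is fixed to 8000 inside
plt.xlabel('Time [s]')
plt.ylabel('RIR amplitude')
plt.show()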
def DAB_generate(source_audio, out_folder, name):
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    # number of microphones
    M = 4

    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])
    distances = np.random.randint(1, 20, M)

    mic_pos = []
    for m in range(M):
        mic_distance = distances[m]
        # random way: guess microphone position until it's in the room;
        # can take a very long time for small rooms
        mic_m = guess_microphone(source_position, mic_distance)
        mic_pos.append(mic_m)

    # log microphone positions and distances
    out_mic_file = os.path.join(out_folder, 'log_%s.txt' % name)
    if os.path.exists(out_mic_file):
        os.remove(out_mic_file)
    with open(out_mic_file, 'w') as f1:
        for l in range(M):
            f1.write("%s, %f\n" % (str(mic_pos[l]), distances[l]))

    Lg_t = 0.100                  # filter size in seconds
    Lg = int(np.ceil(Lg_t * fs))  # in samples
    fft_len = 512
    mics = pra.Beamformer(np.asarray(mic_pos).T, shoebox.fs, N=fft_len, Lg=Lg)
    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)

    shoebox.compute_rir()
    shoebox.simulate()

    # ADDING NOISE AND SAVING
    for n in range(M):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)
        mixed_signal = add_noise(source_audio, signal)
        mixed_signal = np.array(mixed_signal, dtype=np.int16)
        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)
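# guess_microphone is used above but not defined in this snippet. Based on
# the inline comment, a minimal sketch could be the rejection sampler below
# (an assumption, not the original implementation; it reuses the module-level
# room_dimensions and assumes random, math, and numpy are imported).
def guess_microphone(source_position, mic_distance):
    # draw random directions until the microphone lands inside the room
    while True:
        theta = random.uniform(0, 2 * math.pi)
        mic = source_position + mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        if 0 <= mic[0] <= room_dimensions[0] and 0 <= mic[1] <= room_dimensions[1]:
            return mic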
def beamformed_das(comb, people_num, sr=16000):
    f1 = comb[0]
    f2 = comb[1]
    # def beamformed_das(f1, f2, people_num, sr=16000):
    f1_data = f1['data']
    f2_data = f2['data']
    signal_len = len(f1['data'])

    distance = 1.5
    # azimuth = np.array([math.atan2(1.5, 0.5), math.atan2(1.5, -0.5)])
    azimuth = np.array([90., 270.]) * np.pi / 180
    # centre = [2, 1.5]

    room_dim = np.r_[4, 6]
    room = pra.ShoeBox(room_dim, fs=sr)

    echo = pra.linear_2D_array(center=(room_dim / 2), M=5, phi=0, d=0.5)
    echo = np.concatenate((echo, np.array((room_dim / 2), ndmin=2).T), axis=1)
    mics = pra.Beamformer(echo, room.fs)
    room.add_microphone_array(mics)

    # room.add_source(np.array([1.5, 4.5]), delay=0., signal=f1_data)
    # room.add_source(np.array([2.5, 4.5]), delay=0., signal=f2_data[:len(f1_data)])
    signals = [f1_data, f2_data]
    for i, ang in enumerate(azimuth):
        source_location = room_dim / 2 + distance * np.r_[np.cos(ang), np.sin(ang)]
        source_signal = signals[i]
        room.add_source(source_location, signal=source_signal[:signal_len], delay=0)

    mics.rake_delay_and_sum_weights(room.sources[0][:1])

    # room.plot(freq=[300, 400, 500, 1000, 2000, 4000], img_order=0)
    # plt.show()
    # ax.legend(['300', '400', '500', '1000', '2000', '4000'])
    # fig.set_size_inches(20, 8)

    room.compute_rir()
    room.simulate()

    filename = 'beamformeded_%05d-%05d' % (f1['filename'], f2['filename']) + '.wav'
    with open(TXT_PATH + 'build_beamformeded.txt', 'a') as f:
        f.write(filename)
        f.write('\n')
    for i in range(5):
        wavfile.write(MICS_PATH + 'mic%d/' % (i + 1) + filename, sr,
                      room.mic_array.signals[i, :])
def mic_rever_generator(room_size, target_location, target, fs, microphone_array, amplifier, absorption_value):
    '''
    This function is used to implement a single-source microphone array reverberant speech generator.

    Usage: mic_rever_generator(room_size, target_location, target, fs, microphone_array, amplifier, absorption_value)

    room_size - the size of the room [length, width, height]
    target_location - the location of the target speech [x, y, z]
    target - the array of the target speech file
    fs - sampling frequency
    microphone_array - the location of the microphone array
    amplifier - the multiple of the microphone's built-in amplifier
    absorption_value - absorption value of the room walls

    Example call:
    clean_rever = mic_rever_generator(room_size, target_location, target, fs, microphone_array, amplifier, absorption_value)

    References: microphone array speech generator release 0.1
    Author: Rui Cheng
    '''
    # create the room
    room = pra.ShoeBox(room_size, fs=fs, absorption=absorption_value, max_order=17)
    room.add_source(target_location, signal=target, delay=0)
    #room.add_source([3.5, 3.0, 1.76], signal=interf[:len(target)], delay=0)

    # add microphone array
    R = microphone_array
    fft_len = 512
    Lg_t = 0.100
    Lg = int(np.ceil(Lg_t * room.fs))
    mic_array = pra.Beamformer(R, room.fs, N=fft_len, Lg=Lg)
    room.add_microphone_array(mic_array)

    # create the room impulse response: compute image sources
    room.image_source_model(use_libroom=True)

    # microphone speech
    room.simulate()

    # reverberant speech in each channel
    clean_rever = amplifier * room.mic_array.signals.astype("int16")

    return clean_rever
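# A hedged usage sketch for mic_rever_generator: a 3D shoe box room with a
# 4-microphone linear array. All values are illustrative; `speech` stands for
# a 1-D int16 speech array loaded elsewhere.
room_size = [4.5, 6.5, 3.0]           # length, width, height in metres
target_location = [1.0, 3.0, 1.7]     # speaker position
R = np.array([[2.0, 2.1, 2.2, 2.3],   # mic x coordinates
              [1.5, 1.5, 1.5, 1.5],   # mic y coordinates
              [1.2, 1.2, 1.2, 1.2]])  # mic z coordinates
clean_rever = mic_rever_generator(room_size, target_location, speech, 16000,
                                  R, amplifier=1, absorption_value=0.3)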
def Beamformer_Distortionless(mixed: np.ndarray, state: dict, options: dict):
    # Get options
    nSources = options['nSources']
    stft_size = options['stft_size'] if 'stft_size' in options else 1024
    delay = options['delay'] if 'delay' in options else 0.05
    nPaths = options['nPaths'] if 'nPaths' in options else 1
    FD = options['FD'] if 'FD' in options else False
    if 'room_object' in options:
        room = options['room_object']
    else:
        warnings.warn('room_object is required in algorithm options for beamforming')
        return np.zeros((nSources, mixed.shape[1])), state
    fs = room.fs

    # Check that the number of sources is equal to 2
    if nSources != 2:
        warnings.warn(
            'Perceptual beamformer is implemented only for 2 sources ({} were requested)'
            .format(nSources))
        return np.zeros((nSources, mixed.shape[1])), state

    # Create beamformer object
    bmfr = pra.Beamformer(room.mic_array.R, fs, N=stft_size)

    # "Record" mixed data with beamformer
    bmfr.record(mixed, fs)

    # Create filters that point to source 1
    bmfr.rake_distortionless_filters(room.sources[0][0:nPaths],
                                     room.sources[1][0:nPaths],
                                     room.sigma2_awgn * np.eye(bmfr.Lg * bmfr.M),
                                     delay=delay)
    s1 = bmfr.process(FD)

    # Create filters that point to source 2
    bmfr.rake_distortionless_filters(room.sources[1][0:nPaths],
                                     room.sources[0][0:nPaths],
                                     room.sigma2_awgn * np.eye(bmfr.Lg * bmfr.M),
                                     delay=delay)
    s2 = bmfr.process(FD)

    return np.stack([s1, s2], axis=0), state
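# A minimal usage sketch, assuming `room` is a simulated pra.Room with two
# sources, a microphone array, and sigma2_awgn set, like the rooms built
# elsewhere in this file. The mix is the (n_mics, n_samples) array recorded
# at the array; the options mirror the defaults read by the function.
options = {
    'nSources': 2,
    'stft_size': 1024,
    'delay': 0.05,
    'nPaths': 1,
    'FD': False,
    'room_object': room,
}
separated, state = Beamformer_Distortionless(room.mic_array.signals, {}, options)
s1, s2 = separated  # one enhanced signal per source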
def __init__(self):
    super(Room, self).__init__()

    # Create a ~4 by ~7 metres shoe box room
    rx = random.uniform(3.8, 4.2)
    ry = random.uniform(6.8, 7.2)
    self.room = pra.ShoeBox([rx, ry], fs=16000)

    # Create 2 microphones, 20 cm apart
    self.x = random.uniform(0.5, 3.5)
    self.my = random.uniform(0.5, 1.5)
    R = np.c_[[self.x - 0.1, self.my],  # mic 1
              [self.x + 0.1, self.my],  # mic 2
              ]
    self.room.add_microphone_array(pra.Beamformer(R, self.room.fs))
    self.delay = 0
    self.rate = 16000
Lg_t = 0.100                  # filter size in seconds
Lg = int(np.ceil(Lg_t * Fs))  # filter size in samples
alphas = np.arange(0.1, 1, 0.05)

source = np.array([1, 4.5])
interferer = np.array([3.5, 3.])
radius = 0.15
roomDim = [8, 6]
center = [1, 3.5]
fft_len = 512

echo = pra.circular_2D_array(center=center, M=6, phi0=0, radius=radius)
echo = np.concatenate((echo, np.array(center, ndmin=2).T), axis=1)

for alpha in alphas:
    room_bf = pra.ShoeBox(roomDim, fs=Fs, max_order=64, absorption=alpha)
    mics = pra.Beamformer(echo, room_bf.fs, N=fft_len, Lg=Lg)
    room_bf.add_microphone_array(mics)
    room_bf.add_source(source, delay=0., signal=xtone)
    room_bf.add_source(interferer, delay=0, signal=silence)

    # Compute DAS weights
    mics.rake_delay_and_sum_weights(room_bf.sources[0][:1])

    # Do Beamforming
    room_bf.image_source_model(use_libroom=True)
    room_bf.compute_rir()
    room_bf.simulate()
    # signal_das = mics.process(FD=False)
""" This example shows how to create delay and sum beamformers """ from __future__ import print_function, division import numpy as np import matplotlib.pyplot as plt import pyroomacoustics as pra # Create a 4 by 6 metres shoe box room room = pra.ShoeBox([4, 6]) # Add a source somewhere in the room room.add_source([2.5, 4.5]) # Create a linear array beamformer with 4 microphones # with angle 0 degrees and inter mic distance 10 cm R = pra.linear_2D_array([2, 1.5], 4, 0, 0.04) room.add_microphone_array(pra.Beamformer(R, room.fs)) # Now compute the delay and sum weights for the beamformer room.mic_array.rake_delay_and_sum_weights(room.sources[0][:1]) # plot the room and resulting beamformer room.plot(freq=[1000, 2000, 4000, 8000], img_order=0) plt.show()
def process_experiment_max_sinr(SIR, mic, args):

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(
        file_pattern.format('camera_leds_zero_hold', 'mix', SIR))
    fs_snd, audio = wavfile.read(
        file_pattern.format(mic_choices[mic], 'mix', SIR))
    assert fs_led == fs_snd

    # read in the ref signals
    r, noise_ref = wavfile.read(
        file_pattern.format(mic_choices[mic], 'noise_ref', SIR))
    assert r == fs_snd
    r, speech_ref = wavfile.read(file_speech_ref.format(mic_choices[mic]))
    assert r == fs_snd
    r, leds_ref = wavfile.read(file_speech_ref.format('camera_leds_zero_hold'))
    assert r == fs_snd

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = noise_ref + speech_ref

    # get the geometry information to get nice plots.
    mics_loc = np.array(protocol['geometry']['microphones'][mic]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if mic == 'pyramic':
        I = list(range(8, 16)) + list(range(24, 32)) + list(range(40, 48))  # flat part
        #I = list(range(24,32)) + list(range(40,48))  # flat part
        #I = list(range(8,16))
        #I = list(range(48))
        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        speech_ref = speech_ref[:, I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'olympus':
        mics_positions = mics_geom['olympus'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # perform VAD
    vad_snd = leds > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################
    print('STFT and stuff')
    sys.stdout.flush()

    engine = pra.realtime.STFT(nfft, nfft // 2, pra.hann(nfft),
                               channels=audio.shape[1])

    def analysis(x):
        engine.analysis(x)
        return np.moveaxis(engine.X, 1, 0)

    # Now compute the STFT of the microphone input
    X = analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = analysis(audio * vad_guarded[:, None])
    X_noise = analysis(audio * (1 - vad_guarded[:, None]))

    S_ref = analysis(speech_ref)
    N_ref = analysis(noise_ref)

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################
    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))

    # compute covariances with reference signals to check everything is working correctly
    #Rs = np.einsum('i...j,i...k->...jk', S_ref, np.conj(S_ref))
    #Rn = np.einsum('i...j,i...k->...jk', N_ref, np.conj(N_ref))

    # compute the MaxSINR beamformer
    w = [
        la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
        for rs, rn in zip(Rs[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:, :, 0], clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########
    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T, fs=fs_snd,
                               N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    ref = np.vstack([speech_ref[:, 0], noise_ref[:, 0]])

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(
        int(pra.tdoa(out, speech_ref[:, 0].astype(float), phat=True)))
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    else:
        out_trunc = np.concatenate(
            (np.zeros(-delay), out[:ref.shape[1] + delay]))
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    sig_eval = np.vstack([out_trunc, noise_eval])

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, None], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    SDR_out = metric[0][0]
    SIR_out = metric[2][0]

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:
        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        upper = np.maximum(audio[:, 0].max(), out.max())
        sig_in = pra.highpass(audio[:, 0].astype(float) / upper, fs_snd, fc=150)
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample,
                          '{}_ch0_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_in)
        f2 = os.path.join(args.save_sample,
                          '{}_out_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(speech_ref[:, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(leds.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, leds, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(leds)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, leds * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        #plt.plot(a_time, speech_ref[:,0])
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines(
            [180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)],
            0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)
        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(
            protocol['geometry']['speakers']['locations'][1][:2])  # signal
        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return SDR_out, SIR_out
import numpy as np
import pyroomacoustics as pra

room = pra.ShoeBox([4, 6], fs=16000, max_order=1)

# add sources in the room
room.add_source([2, 1.5])  # nice source
room.add_source([2, 4.5])  # interferer

# add a circular beamforming array
shape = pra.circular_2D_array([2.5, 3], 8, 0.0, 0.15)
bf = pra.Beamformer(shape, room.fs, Lg=500)
room.add_microphone_array(bf)

# run the ISM
room.image_source_model()

# the noise matrix, note that the size is the number of
# sensors multiplied by the filter size
Rn = np.eye(bf.M * bf.Lg) * 1e-5


def test_rake_max_udr_filters():
    # no interferer
    bf.rake_max_udr_filters(room.sources[0][:4], R_n=Rn, delay=0.015,
                            epsilon=1e-2)
    # with interferer; the remaining arguments are assumed to mirror the
    # no-interferer call above
    bf.rake_max_udr_filters(
        room.sources[0][:4],
        interferer=room.sources[1][:4],
        R_n=Rn,
        delay=0.015,
        epsilon=1e-2,
    )
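# Running the test directly (a sketch): the room and beamformer above are
# module-level objects, so a single call exercises both filter designs.
if __name__ == '__main__':
    test_rake_max_udr_filters()
    print('rake_max_udr_filters ran without error')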
def mic_clean_generator(room_size, target_location, target, fs, microphone_array, amplifier):
    '''
    This function is used to implement a single-source microphone array clean speech generator.

    Usage: mic_clean_generator(room_size, target_location, target, fs, microphone_array, amplifier)

    room_size - the size of the room [length, width, height]
    target_location - the location of the target speech [x, y, z]
    target - the array of the target speech file
    fs - sampling frequency
    microphone_array - the location of the microphone array
    amplifier - the multiple of the microphone's built-in amplifier

    Example call:
    clean = mic_clean_generator(room_size, target_location, target, fs, microphone_array, amplifier)

    References: microphone array speech generator release 0.1
    Author: Rui Cheng
    '''
    # create the room
    room = pra.ShoeBox(room_size, fs=fs, absorption=1.0, max_order=17)
    '''fig, ax = room.plot()
    ax.set_xlim([0, 4.5])
    ax.set_ylim([0, 6.5])
    ax.set_zlim([0, 4])
    plt.show()
    '''

    # add source
    room.add_source(target_location, signal=target, delay=0)
    #room.add_source([3.5, 3.0, 1.76], signal=interf[:len(target)], delay=0)  # for multi-source
    '''fig, ax = room.plot()
    ax.set_xlim([0, 4.5])
    ax.set_ylim([0, 6.5])
    ax.set_zlim([0, 4])
    plt.show()'''

    # add microphone array
    R = microphone_array
    fft_len = 512
    Lg_t = 0.100
    Lg = int(np.ceil(Lg_t * room.fs))
    mic_array = pra.Beamformer(R, room.fs, N=fft_len, Lg=Lg)
    room.add_microphone_array(mic_array)
    '''fig, ax = room.plot()
    ax.set_xlim([0, 4.5])
    ax.set_ylim([0, 6.5])
    ax.set_zlim([0, 4])
    plt.show()'''

    # create the room impulse response: compute image sources
    room.image_source_model(use_libroom=True)

    # visualize 3D polyhedron room and image sources
    '''fig, ax = room.plot(img_order=3)
    fig.set_size_inches(20, 10)
    plt.show()'''
    '''room.plot_rir()
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()'''

    # microphone speech
    room.simulate()

    # clean speech in each channel
    clean = amplifier * room.mic_array.signals.astype("int16")

    return clean
def modify_input_wav_beamforming(wav, noise, room_dim, max_order, snr_vals,
                                 mic_array, pos_source, pos_noise, N):

    fs_s, audio_anechoic = wavfile.read(wav)
    fs_n, noise_anechoic = wavfile.read(noise)

    # Create a room for the signal
    room_signal = pra.ShoeBox(room_dim, absorption=0.2, fs=fs_s,
                              max_order=max_order)

    # Create a room for the noise
    room_noise = pra.ShoeBox(room_dim, absorption=0.2, fs=fs_n,
                             max_order=max_order)

    # source of the signal and of the noise in their respective boxes
    room_signal.add_source(pos_source, signal=audio_anechoic)
    room_noise.add_source(pos_noise, signal=noise_anechoic)

    # add the microphone array
    mics_signal = pra.Beamformer(mic_array, room_signal.fs, N)
    mics_noisy = pra.Beamformer(mic_array, room_noise.fs, N)
    room_signal.add_microphone_array(mics_signal)
    room_noise.add_microphone_array(mics_noisy)

    # simulate both rooms
    room_signal.simulate()
    room_noise.simulate()

    # take the mic_array.signals from each room
    audio_reverb = room_signal.mic_array.signals
    noise_reverb = room_noise.mic_array.signals

    # design beamforming filters; both beamformers point at the speech source
    mics_signal.rake_delay_and_sum_weights(room_signal.sources[0][:1])
    mics_noisy.rake_delay_and_sum_weights(room_signal.sources[0][:1])

    output_signal = mics_signal.process()
    output_noise = mics_noisy.process()

    # we're going to normalize the noise
    size = np.shape(audio_reverb)
    noise_normalized = np.zeros(size)

    # the noise must cover the whole signal
    if len(noise_reverb[0]) < len(audio_reverb[0]):
        raise ValueError(
            'the noise signal is shorter than the audio signal!')

    output_noise = output_noise[:len(output_signal)]
    norm_fact = np.linalg.norm(noise_reverb[-1])
    noise_normalized = output_noise / norm_fact

    # initialize the array of noisy signals
    noisy_signal = np.zeros([len(snr_vals), np.shape(output_signal)[0]])
    for i, snr in enumerate(snr_vals):
        noise_std = np.linalg.norm(audio_reverb[-1]) / (10**(snr / 20.))
        final_noise = noise_normalized * noise_std
        noisy_signal[i] = pra.normalize(
            pra.highpass(output_signal + final_noise, fs_s))

    return noisy_signal
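# A hedged usage sketch for modify_input_wav_beamforming. File names and
# geometry are illustrative; mic_array is a (2, M) array of mic positions
# like the circular layouts built elsewhere in this file.
snr_vals = np.arange(60, -25, -5)
mic_array = pra.circular_2D_array([2., 1.5], M=6, phi0=0, radius=0.15)
noisy = modify_input_wav_beamforming(
    'speech.wav', 'noise.wav', room_dim=[4., 6.], max_order=15,
    snr_vals=snr_vals, mic_array=mic_array,
    pos_source=[2.5, 4.5], pos_noise=[1., 1.], N=1024)
# noisy[i] holds the beamformed mixture at SNR snr_vals[i]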
# create the room
room = pra.ShoeBox(
    room_dim,
    absorption=0.2,
    fs=fs_s,
    t0=t0,
    max_order=max_order,
    sigma2_awgn=5e-7)

# add the sources
room.add_source(pos_source, signal=audio_anechoic, delay=0.)
room.add_source(pos_noise, signal=noise_anechoic, delay=1.0)

# add the microphone array and compute RIR
mics = pra.Beamformer(R, room.fs, N, Lg=Lg)
room.add_microphone_array(mics)
room.compute_rir()
room.simulate()

# design the beamforming filters using some of the image sources
good_sources = room.sources[0][:max_order_design + 1]
bad_sources = room.sources[1][:max_order_design + 1]
mics.rake_mvdr_filters(good_sources, bad_sources,
                       5e-7 * np.eye(mics.Lg * mics.M), delay=delay)

# process the signal
noisy_signal_beamforming = mics.process()
out_RakeMVDR = pra.highpass(noisy_signal_beamforming, room.fs).astype(np.int16)
dest = os.path.join(dest_dir, "beamforming_signal.wav")
wavfile.write(dest, 16000, out_RakeMVDR)
score_beamformer = label_wav(dest, labels_file, graph_file,
                             speech.meta.as_dict()['word'])
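# The fragment above relies on variables defined earlier in the original
# script. A plausible set of definitions (all values here are assumptions,
# shown only to make the fragment self-contained) could be:
fs_s = 16000                     # sampling frequency of the speech file
t0 = 0.0                         # simulation start time
max_order = 15                   # image source model order for simulation
max_order_design = 1             # image order used for rake filter design
N = 1024                         # FFT length of the beamformer
Lg = int(np.ceil(0.100 * fs_s))  # filter length in samples (100 ms)
delay = 0.03                     # beamformer delay in seconds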
def DB_generate(source_audio, out_folder, name):
    #source_audio = pra.normalize(source_audio, bits=16)
    mic_distance = random.randint(1, 20)  # mean distance from source to microphones
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    # random way: guess array center until it's in the room;
    # can take a very long time for small rooms
    mic_in_room = False
    while not mic_in_room:
        theta = random.uniform(0, 2 * math.pi)
        mic_center = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        print(mic_center)
        if (0 <= mic_center[0] <= room_dimensions[0]) and (
                0 <= mic_center[1] <= room_dimensions[1]):
            mic_in_room = True

    # number of lateral microphones
    M = 4
    # counterclockwise rotation of array:
    phi = 0
    # distance between microphones
    d = 0.4

    mic_pos = pra.beamforming.linear_2D_array(mic_center, M, phi, d)
    mic_pos = np.concatenate((mic_pos, np.array(mic_center, ndmin=2).T), axis=1)

    distances = []
    for m in range(M):
        d = math.sqrt((source_position[0] - mic_pos[0, m])**2 +
                      (source_position[1] - mic_pos[1, m])**2)
        distances.append(d)

    # create room
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )
    # shoebox.mic_array.to_wav(os.path.join(out_folder + '_DB', 'mix_' + name), norm=True, bitdepth=np.int16)

    Lg_t = 0.100                  # filter size in seconds
    Lg = int(np.ceil(Lg_t * fs))  # in samples
    fft_len = 512
    mics = pra.Beamformer(mic_pos, shoebox.fs, N=fft_len, Lg=Lg)
    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)

    shoebox.compute_rir()
    shoebox.simulate()

    # ADDING NOISE
    for n in range(M + 1):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)
        mixed_signal = add_noise(source_audio, signal)
        mixed_signal = np.array(mixed_signal, dtype=np.int16)
        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)
noise_loc = np.r_[2.5, 4.5]

SIR = 25        # decibels
SINR = SIR - 1  # decibels
sigma_i, sigma_n = compute_variances(SIR, SINR, src_loc, noise_loc,
                                     mics_loc.mean(axis=1), sigma_s=sigma_s)
interference_audio = np.random.randn(target_audio.shape[0] + fs_sound) * sigma_i

room = pra.ShoeBox([6, 5], fs=16000, max_order=12, absorption=0.4,
                   sigma2_awgn=sigma_n**2)
room.add_source(src_loc, signal=target_audio)
room.add_source(noise_loc, signal=interference_audio)

# conventional microphone array
M = mics_loc.shape[1]
mics = pra.Beamformer(mics_loc, fs=fs_sound, N=nfft, hop=nfft // 2, zpb=nfft)
room.add_microphone_array(mics)
room.simulate()

# sound-to-light sensor
# we assume there is no propagation delay between speaker and sensor
leds = LightArray2(src_loc, fs=fs_light)
leds.record(target_audio + np.random.randn(*target_audio.shape) * sigma_n,
            fs=fs_sound)
leds_sig = leds.signals - leds.signals.min()
leds_sig /= leds_sig.max()
leds_time = np.arange(leds.signals.shape[0]) / fs_light

# perform VAD on the light signal
vad = leds.signals > vad_thresh
max_order_design = 1  # maximum image generation used in design
shape = 'Circular'    # array shape

# TD filter length
Lg_t = 0.05  # Filter size in seconds
Lg = int(np.ceil(Lg_t * Fs))
Lgp = np.floor(0.4 * Lg)
Lgm = Lg - Lgp
print('Lg=', Lg)

# create a microphone array
if shape == 'Circular':
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear_2D_array(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N, Lg=Lg, hop=hop, zpf=zp, zpb=zp)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.
def process_experiment_max_sinr(SIR, mic, blinky, args):
    session = args.session
    target = args.target

    with open(metadata_file.format(session=args.session), 'r') as f:
        metadata = json.load(f)

    file_pattern = os.path.join(experiment_folder, metadata['filename_pattern'])

    with open(protocol_file.format(session=args.session), 'r') as f:
        protocol = json.load(f)

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=blinky, source='mix', fs=fs))
    fs_snd, audio = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=mic_choices[mic], source='mix', fs=fs))
    assert fs_led == fs_snd

    # read in the ref signals
    sources_ref = dict(zip(target_choices, [
        wavfile.read(file_pattern.format(
            session=session, mic=mic_choices[mic], snr=SIR, source=ch, fs=fs))[1]
        for ch in target_choices
    ]))
    leds_ref = dict(zip(target_choices, [
        wavfile.read(file_pattern.format(
            session=session, mic=blinky, snr=SIR, source=ch, fs=fs))[1]
        for ch in target_choices
    ]))

    # reorder with target in first position
    ref = np.array([sources_ref[target]] +
                   [sources_ref[ch] for ch in target_choices if ch != target])
    noise_ref = np.zeros_like(sources_ref[target])
    n_ch = [ch for ch in target_choices if ch != target]
    for ch in n_ch:
        noise_ref += sources_ref[ch]

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = sources_ref[target] + noise_ref

    # get the geometry information to get nice plots.
    mics_geom = {
        'pyramic': np.array(protocol['geometry']['microphones']['pyramic']['locations']),
        'camera': np.array(protocol['geometry']['microphones']['camera']['locations']),
    }

    mics_loc = np.array(protocol['geometry']['microphones'][mic_choices[mic]]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if 'pyramic' in mic:

        if mic == 'pyramic_2':
            I = pyramic_bss_2ch
        elif mic == 'pyramic_4':
            I = pyramic_bss_4ch
        elif mic == 'pyramic_24':
            I = list(range(8, 16)) + list(range(24, 32)) + list(range(40, 48))  # flat part
        elif mic == 'pyramic_48':
            I = list(range(48))
        else:
            raise ValueError('Unsupported configuration')

        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        ref = ref[:, :, I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'camera':
        mics_positions = mics_geom['camera'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # adjust length of led signal if necessary
    if leds.shape[0] < audio.shape[0]:
        z_missing = audio.shape[0] - leds.shape[0]
        leds = np.pad(leds, (0, z_missing), 'constant')
    elif leds.shape[0] > audio.shape[0]:
        leds = leds[:audio.shape[0], ]

    # perform VAD
    led_target = leds[:, blinky_source_map[target]]
    vad_snd = led_target > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################
    print('STFT and stuff')
    sys.stdout.flush()

    a_win = pra.hann(nfft)
    s_win = pra.realtime.compute_synthesis_window(a_win, nfft // 2)

    engine = pra.realtime.STFT(nfft, nfft // 2,
                               analysis_window=a_win,
                               synthesis_window=s_win,
                               channels=audio.shape[1])

    # Now compute the STFT of the microphone input
    X = engine.analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = engine.analysis(audio * vad_guarded[:audio.shape[0], None])
    X_noise = engine.analysis(audio * (1 - vad_guarded[:audio.shape[0], None]))

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################
    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))
    Rall = Rs + Rn

    # compute the MaxSINR beamformer
    w = [la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
         for rs, rn in zip(Rall[1:], Rn[1:])]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)  # add dummy beamformer at DC

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:, :, 0], clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########
    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T, fs=fs_snd,
                               N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    # Not sure why the delay is sometimes negative here... Need to check more
    delay = int(pra.tdoa(out, ref[0, :, 0].astype(float), phat=True))
    print(delay)
    delay = np.abs(delay)
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
    else:
        out_trunc = np.concatenate((np.zeros(-delay), out[:ref.shape[1] + delay]))
    sig_eval = np.vstack([out_trunc] * len(target_choices))

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, 0], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    ret = {'Max-SINR': {'SDR': metric[0][0], 'SIR': metric[2][0]}}

    #############################
    ## BLIND SOURCE SEPARATION ##
    #############################

    if mic in ['camera', 'pyramic_2', 'pyramic_4']:
        Y = pra.bss.auxiva(X, n_iter=40)
        bss = pra.realtime.synthesis(Y, nfft, nfft // 2, win=s_win)

        match = []
        for col in range(bss.shape[1]):
            xcorr = fast_corr(bss[:, col], ref[0, :, 0])
            match.append(np.max(xcorr))
        best_col = np.argmax(match)

        # Not sure why the delay is sometimes negative here... Need to check more
        delay = np.abs(int(pra.tdoa(bss[:, best_col],
                                    ref[0, :, 0].astype(float), phat=True)))
        if delay > 0:
            bss_trunc = bss[delay:delay + ref.shape[1], ]
        elif delay < 0:
            bss_trunc = np.concatenate((np.zeros((-delay, bss.shape[1])),
                                        bss[:ref.shape[1] + delay]))
        else:
            bss_trunc = bss[:ref.shape[1], ]

        if ref.shape[1] > bss_trunc.shape[0]:
            ref_lim = bss_trunc.shape[0]
        else:
            ref_lim = ref.shape[1]

        if mic in ['camera', 'pyramic_2']:
            bss_trunc = np.hstack([bss_trunc] * 2)

        metric = bss_eval_images(ref[:, :ref_lim, 0, None], bss_trunc.T[:, :, None])
        SDR_bss = metric[0][0]
        SIR_bss = metric[2][0]
        ret['BSS'] = {'SDR': metric[0][0], 'SIR': metric[2][0]}

    #################################
    ## Estimate SDR and SIR of mix ##
    #################################

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(int(pra.tdoa(audio[:, 0], ref[0, :, 0].astype(float), phat=True)))
    if delay > 0:
        audio_trunc = audio[delay:delay + ref.shape[1], 0]
    elif delay < 0:
        audio_trunc = np.concatenate((np.zeros(-delay),
                                      audio[:ref.shape[1] + delay, 0]))
    else:
        audio_trunc = audio[:ref.shape[1], 0]

    if ref.shape[1] > audio_trunc.shape[0]:
        ref_lim = audio_trunc.shape[0]
    else:
        ref_lim = ref.shape[1]

    audio_trunc = np.vstack([audio_trunc] * len(ref))

    metric = bss_eval_images(ref[:, :ref_lim, 0, None], audio_trunc[:, :, None])
    SDR_bss = metric[0][0]
    SIR_bss = metric[2][0]
    ret['Mix'] = {'SDR': metric[0][0], 'SIR': metric[2][0]}

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        if not os.path.exists(args.save_sample):
            os.makedirs(args.save_sample)

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            upper = np.max([audio[:, 0].max(), out.max(), bss.max(), ref[0, :, 0].max()])
        else:
            upper = np.max([audio[:, 0].max(), out.max(), ref[0, :, 0].max()])

        # Clean signal for reference
        sig_ref = pra.highpass(ref[0, :, 0].astype(float) / upper, fs_snd, fc=150)
        f0 = os.path.join(args.save_sample, '{}_ref_SIR_NA_dB.wav'.format(mic))
        wavfile.write(f0, fs_snd, sig_ref)

        # Mix signal for reference
        sig_mix = pra.highpass(audio[:, 0].astype(float) / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample, '{}_mix_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_mix)

        # Output of MaxSINR
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f2 = os.path.join(args.save_sample, '{}_maxsinr_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

        # Output of BSS
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            sig_bss = pra.highpass(bss[:, best_col] / upper, fs_snd, fc=150)
            f3 = os.path.join(args.save_sample, '{}_bss_SIR_{}_dB.wav'.format(mic, SIR))
            wavfile.write(f3, fs_snd, sig_bss)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(ref[0, :, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(led_target.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, led_target, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(led_target)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, led_target * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        '''
        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180+np.degrees(theta_speech), 180-np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)
        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal
        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])
        '''

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return ret
shape = 'Linear'              # array shape
Lg_t = 0.050                  # Filter size in seconds
Lg = int(np.ceil(Lg_t * Fs))  # Filter size in samples
delay = 0.03

# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
elif shape == 'Poisson':
    R = pra.poisson_2D_array(mic1, M, d)
else:
    R = pra.linear_2D_array(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.
center = [1, 2]
fft_len = 512
echo = pra.circular_2D_array(center=center, M=6, phi0=0, radius=radius)
echo = np.concatenate((echo, np.array(center, ndmin=2).T), axis=1)

sigma2_n = 5e-7
max_order_design = 1

for alpha in alphas:
    corners = np.array([[0, 0], [0, 4], [6, 4], [6, 1], [2, 1], [2, 0]]).T  # [x, y]
    roomPoly = pra.Room.from_corners(corners, fs=Fs, max_order=12,
                                     absorption=alpha)

    mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
    roomPoly.add_microphone_array(mics)
    roomPoly.add_source(source, delay=0, signal=xtone)
    roomPoly.add_source(interferer, delay=0, signal=silence)
    roomPoly.image_source_model(use_libroom=True)
    roomPoly.compute_rir()
    roomPoly.simulate()

    # Rake MVDR simulation
    BeamformerType = 'RakeMVDR'
    good_sources = roomPoly.sources[0][:max_order_design + 1]
    bad_sources = roomPoly.sources[1][:max_order_design + 1]
    mics.rake_mvdr_filters(good_sources, bad_sources,
                           sigma2_n * np.eye(mics.Lg * mics.M))
    output = mics.process()
    out = pra.normalize(pra.highpass(output, Fs))
def perceptual_quality_evaluation(room_dim, mics, good_pos, good_index,
                                  bad_pos, bad_index, rir_location):
    print('start')

    import numpy as np
    from scipy.io import wavfile
    from os import getpid

    import pyroomacoustics as pra

    # Note: this function uses the legacy (camelCase) pyroomacoustics API.

    # number of sources to consider
    n_sources = np.arange(1, 8)
    S = n_sources.shape[0]

    # number of mics
    n_mic = mics.shape[1]

    # Set the speed of sound to match that of the measured RIR
    pra.constants.set('c', 345.5)

    Fs = 8000.
    N = 1024
    Lg = int(0.03 * Fs)  # 30 ms long filter
    delay_bf = 0.02
    sigma2_n = 1e-6

    # reflection coefficients from the walls (hand-waving)
    reflection = {
        'ground': 0.8,
        'south': 0.8,
        'west': 0.8,
        'north': 0.8,
        'east': 0.8,
        'ceilling': 0.5
    }

    speech_sample1 = 'samples/fq_sample1_8000.wav'
    speech_sample2 = 'samples/fq_sample2_8000.wav'

    # Create the room
    room = pra.ShoeBox3D(np.zeros(3), room_dim, Fs,
                         max_order=1,
                         absorption=reflection,
                         sigma2_awgn=sigma2_n)

    # Create the beamformer
    bf = pra.Beamformer(mics, Fs, N=N, Lg=Lg)
    room.addMicrophoneArray(bf)

    # data receptacles
    beamformer_names = ['Rake Perceptual', 'Rake MVDR']
    bf_weights_fun = [bf.rakePerceptualFilters, bf.rakeMVDRFilters]
    bf_fnames = ['1', '2']
    NBF = len(beamformer_names)

    # receptacle arrays
    pesq_input = np.zeros(2)
    pesq_bf = np.zeros((2, NBF, S))

    # create a single reference mic at position of microphone 4
    ref_mic_n = 4
    ref_mic = pra.MicrophoneArray(bf.R[:, ref_mic_n, np.newaxis], Fs)

    # since we run multiple threads, we need to uniquely identify filenames
    pid = str(getpid())

    file_ref = 'output_samples/fqref' + pid + '.wav'
    file_suffix = '-' + pid + '.wav'
    files_bf = [
        'output_samples/fq' + str(i + 1) + file_suffix for i in range(NBF)
    ]
    file_raw = 'output_samples/fqraw' + pid + '.wav'

    # index of good and bad sources
    good = good_index
    bad = bad_index

    # Read the two speech samples used
    rate, good_signal = wavfile.read(speech_sample1)
    good_signal = np.array(good_signal, dtype='float64')
    good_signal = pra.normalize(good_signal)
    good_signal = pra.highpass(good_signal, rate)
    good_len = good_signal.shape[0] / float(Fs)

    rate, bad_signal = wavfile.read(speech_sample2)
    bad_signal = np.array(bad_signal, dtype='float64')
    bad_signal = pra.normalize(bad_signal)
    bad_signal = pra.highpass(bad_signal, rate)
    bad_len = bad_signal.shape[0] / float(Fs)

    # variance of good signal
    good_sigma2 = np.mean(good_signal**2)

    # normalize interference signal to have equal power with desired signal
    bad_signal *= good_sigma2 / np.mean(bad_signal**2)

    # distance from the array center to the good source
    good_distance = np.linalg.norm(bf.center[:, 0] - good_pos)

    # distance from the array center to the bad source
    bad_distance = np.linalg.norm(bf.center[:, 0] - bad_pos)

    if good_len > bad_len:
        good_delay = 0
        bad_delay = (good_len - bad_len) / 2.
    else:
        bad_delay = 0
        good_delay = (bad_len - good_len) / 2.
    # create the reference room for freespace, noiseless, no interference simulation
    ref_room = pra.ShoeBox3D([0, 0, 0], room_dim, Fs, max_order=0)
    ref_room.addSource(good_pos, signal=good_signal, delay=good_delay)
    ref_room.addMicrophoneArray(ref_mic)
    ref_room.compute_RIR()
    ref_room.simulate()
    reference = pra.highpass(ref_mic.signals[0], Fs)
    reference_n = pra.normalize(reference)

    # save the reference desired signal
    #wavfile.write(file_ref, Fs, pra.to_16b(reference_n))
    new_ref = good_signal.copy()
    new_ref = pra.normalize(pra.highpass(new_ref, Fs))
    wavfile.write(file_ref, Fs, pra.to_16b(new_ref))

    # add the sources to the 'real' room
    room.addSource(good_pos, signal=good_signal, delay=good_delay)
    room.addSource(bad_pos, signal=bad_signal, delay=bad_delay)

    # read in the RIR from file
    for r in range(n_mic):
        room.rir.append([])
        for s in [good_index, bad_index]:
            # read wav file
            fname_rir = rir_location % (r + 1, s + 1)
            rir_fs, rir = wavfile.read(fname_rir)
            rir = np.array(rir, dtype='float64')

            if rir_fs != Fs:
                raise NameError(
                    'The RIR and the signals do not have the same sampling rate.')
                '''
                import scikits.samplerate as sr
                rir = sr.resample(rir, Fs/float(rir_fs), 'sinc_best')

                # the factor 2 was empirically determined to be necessary to get
                # amplitude of RIR in the correct ballpark.
                rir *= 2.
                '''

            room.rir[r].append(rir)

    # compute the input signal to the microphones
    room.simulate()

    # save degraded signal at reference microphone
    raw = bf.signals[ref_mic_n]
    raw_n = pra.normalize(pra.highpass(raw, Fs))
    wavfile.write(file_raw, Fs, pra.to_16b(raw_n))

    pesq_input = pra.pesq(file_ref, file_raw, Fs=Fs)

    for src in room.sources:
        src.setOrdering('strongest', ref_point=bf.center)

    for k, s in enumerate(n_sources):

        good_img = room.sources[0][:s]
        bad_img = room.sources[1][:s]

        for i, bfr in enumerate(beamformer_names):

            bf_weights_fun[i](good_img, bad_img,
                              sigma2_n * np.eye(n_mic * Lg),
                              delay=delay_bf)

            # run beamformer
            output = bf.process()
            output = pra.normalize(pra.highpass(output, Fs))
            output = pra.time_align(reference_n, output)

            # save files for PESQ evaluation
            wavfile.write(files_bf[i], Fs, pra.to_16b(output))

            # compute PESQ
            x = pra.pesq(file_ref, files_bf[i], Fs=Fs)
            pesq_bf[:, i, k] = pra.pesq(file_ref, files_bf[i], Fs=Fs).T

    '''
    This is how you can compare the true RIRs with the image source model
    generated ones:

    plt.figure()
    for m in range(n_mic):
        rir_sim = room.sources[0].getRIR(mics[:,m], Fs)
        plt.subplot(3,3,m+1)
        plt.plot(room.rir[m][0][:rir_sim.shape[0]])
        plt.plot(rir_sim)
    plt.show()
    '''

    print('Finished')

    return pesq_input, pesq_bf
pos_noise = [2.8, 4.3]

fft_len = 1024

# use circular array with center mic
center = np.array([2, 1.5])
radius = 0.2
R = pra.circular_2D_array(center, M=6, phi0=0, radius=radius)
R = np.concatenate((R, np.array(center, ndmin=2).T), axis=1)

# visualize the setup
room = pra.ShoeBox(room_dim, absorption=absorption_fact, max_order=max_order)
room.add_source(pos_source)
room.add_source(pos_noise)
room.add_microphone_array(pra.Beamformer(R, room.fs, N=fft_len))
room.mic_array.rake_delay_and_sum_weights(room.sources[0][:1])
room.plot(freq=[500, 1000, 2000, 4000], img_order=0)
plt.title("Simulation setup and polar patterns")
plt.legend(['500', '1000', '2000', '4000'])
plt.grid()

# create dataset object
dataset = pra.datasets.GoogleSpeechCommands(download=True, subset=1)

# separate the noise and the speech samples
noise_samps = dataset.filter(speech=0)
speech_samps = dataset.filter(speech=1)
speech_samps = speech_samps.filter(word=desired_word)

# pick one of each from WAV
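# A hedged continuation of the snippet above: pick one sample of each kind.
# pyroomacoustics datasets expose a `samples` list, and each sample carries
# its audio in `.data`, its rate in `.fs`, and its labels in `.meta`.
speech = speech_samps.samples[0]
noise = noise_samps.samples[0]
print(speech.meta)  # e.g. word and speaker of the chosen utterance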