## Read target speech audio
while True:
    spe_id = random.randint(start_spe_id, end_spe_id)
    utt_key = sp_utts_scp[spe_id][0]
    spe_path = sp_utts_scp[spe_id][1]
    spe_name = file_name(pathName=spe_path)
    sample_rate, spe_wav = wavfile.read(spe_path)
    if len(spe_wav.shape) > 1:
        spe_wav = np.mean(spe_wav, 1)
    spe_wav = spe_wav.astype(np.float64)
    if np.mean(np.abs(spe_wav)) > 0:
        break

spe_length = spe_wav.shape[0]
spe_wav = pra.normalize(spe_wav)
spe_wav = pra.highpass(spe_wav, Fs, 50)
room_mix.add_source(target_source, signal=spe_wav, delay=delay)
room_ref.add_source(target_source, signal=spe_wav, delay=delay)
#room_dir.add_source(target_source, signal=spe_wav, delay=delay)

## Read interfering speech audio
for it in range(0, interf_num):
    while True:
        while True:
            inf_id = random.randint(start_spe_id, end_spe_id)
            if np.abs(spe_id - inf_id) > 500:
                break
        inf_path = sp_utts_scp[inf_id][1]
        sample_rate, inf_wav = wavfile.read(inf_path)  # (nsample, nchannel)
# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some German speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D(
    [0, 0],
    room_dim,
    Fs,
    t0=t0,
fs_silence, rec_silence = wavfile.read(rec_folder + 'silence.wav')

if fs_file != fs_silence:
    raise ValueError('Weird: fs of signals and silence are different...')

# Resample the files if required
if fs_file != fs:
    print('Resampling signals')
    from scikits.samplerate import resample

    resampled_signals = []
    resampled_silence = []
    for i in R_flat_I:
        resampled_signals.append(
            pra.highpass(resample(rec_signals[:, i], fs / fs_file, 'sinc_best'), fs, fc=150.))
        resampled_silence.append(
            pra.highpass(resample(rec_silence[:, i], fs / fs_file, 'sinc_best'), fs, fc=150.))
    speech_signals = np.array(resampled_signals, dtype=np.float64).T
    silence = np.array(resampled_silence, dtype=np.float64).T
else:
    print('No need to resample signals')
    speech_signals = np.array(rec_signals[:, R_flat_I], dtype=np.float32)
    silence = np.array(rec_silence[:, R_flat_I], dtype=np.float32)

# highpass filter at 150
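# Note: scikits.samplerate is a legacy dependency. As a hedged alternative, the
# same resampling step could be done with scipy alone. This is a self-contained
# sketch with example rates and a random stand-in signal, not the original
# pipeline.
import numpy as np
from scipy.signal import resample_poly

fs_file, fs = 48000, 16000                  # example rates (assumed integers)
x = np.random.randn(fs_file)                # stand-in for one recorded channel
y = resample_poly(x, up=fs, down=fs_file)   # x resampled from fs_file to fs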
mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
roomPoly.add_microphone_array(mics)
roomPoly.add_source(source, delay=0, signal=xtone)
roomPoly.add_source(interferer, delay=0, signal=silence)
roomPoly.image_source_model(use_libroom=True)
roomPoly.compute_rir()
roomPoly.simulate()

# Rake MVDR simulation
BeamformerType = 'RakeMVDR'
good_sources = roomPoly.sources[0][:max_order_design + 1]
bad_sources = roomPoly.sources[1][:max_order_design + 1]
mics.rake_mvdr_filters(good_sources, bad_sources,
                       sigma2_n * np.eye(mics.Lg * mics.M))
output = mics.process()
out = pra.normalize(pra.highpass(output, Fs))
out = normalize(out)

# Rake Perceptual simulation
# BeamformerType = 'RakePerceptual'
# good_sources = room1.sources[0][:max_order_design+1]
# bad_sources = room1.sources[1][:max_order_design+1]
# mics.rake_perceptual_filters(good_sources,
#                              bad_sources,
#                              sigma2_n*np.eye(mics.Lg*mics.M))
# output = mics.process()
# out = pra.normalize(pra.highpass(output, Fs))

# input_mic = pra.normalize(pra.highpass(mics.signals[mics.M//2], Fs))
# input_mic = normalize(input_mic)
room = pra.ShoeBox(
    room_dim,
    absorption=0.2,
    fs=fs_s,
    t0=t0,
    max_order=max_order,
    sigma2_awgn=5e-7)

# add the sources
room.add_source(pos_source, signal=audio_anechoic, delay=0.)
room.add_source(pos_noise, signal=noise_anechoic, delay=1.0)

# add the microphone array and compute RIR
mics = pra.Beamformer(R, room.fs, N, Lg=Lg)
room.add_microphone_array(mics)
room.compute_rir()
room.simulate()

# design the beamforming filters using some of the image sources
good_sources = room.sources[0][:max_order_design + 1]
bad_sources = room.sources[1][:max_order_design + 1]
mics.rake_mvdr_filters(good_sources, bad_sources,
                       5e-7 * np.eye(mics.Lg * mics.M), delay=delay)

# process the signal
noisy_signal_beamforming = mics.process()
out_RakeMVDR = pra.highpass(noisy_signal_beamforming, room.fs).astype(np.int16)
dest = os.path.join(dest_dir, "beamforming_signal.wav")
wavfile.write(dest, 16000, out_RakeMVDR)
score_beamformer = label_wav(dest, labels_file, graph_file, speech.meta.as_dict()['word'])
print(score_beamformer)
Lgp = np.floor(0.4 * Lg)
Lgm = Lg - Lgp
print('Lg=', Lg)

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N, Lg=Lg, hop=hop, zpf=zp, zpb=zp)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some German speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D(
    [0, 0],
    room_dim,
    Fs,
    t0=t0,
def make_noisy(args, thread_id, num_make_utts):
    spe_utt_ids, noise_utt_ids, diffuse_utt_ids, text_dict, utt2spk_dict, utt2data_dict = load_data(args)
    audio_parser = AudioParser()

    spe_utt_size = len(spe_utt_ids) if spe_utt_ids is not None else 0
    noise_utt_size = len(noise_utt_ids) if noise_utt_ids is not None else 0
    diffuse_utt_size = len(diffuse_utt_ids) if diffuse_utt_ids is not None else 0

    noisy_scp_list = []
    noisy_utt2spk = []
    noisy_text_dict = []
    mix2info = []

    num_utts = 0
    all_angle = 360.0
    Targ_Ang_Num = args.num_targ_ang
    Targ_Ang_Resolution = all_angle / Targ_Ang_Num if Targ_Ang_Num > 0 else 0.0

    save_mix = args.save_mix
    save_reverb = args.save_reverb
    save_clean = args.save_clean

    while True:
        ## Pick a random room
        room_x = random.uniform(args.min_room_length, args.max_room_length)
        room_y = random.uniform(args.min_room_weidth, args.max_room_weidth)
        room_z = random.uniform(args.min_room_height, args.max_room_height)
        room_dim = [room_x, room_y, room_z]

        ## Create the rooms
        T60 = random.uniform(args.min_T60, args.max_T60)
        absorption, max_order = pra.inverse_sabine(T60, room_dim)
        if save_mix:
            room_mix = pra.ShoeBox(room_dim, fs=args.sample_rate,
                                   materials=pra.Material(absorption),
                                   max_order=max_order, sigma2_awgn=None)
        else:
            room_mix = None
        if save_reverb:
            room_ref = pra.ShoeBox(room_dim, fs=args.sample_rate,
                                   materials=pra.Material(absorption),
                                   max_order=max_order, sigma2_awgn=None)
        else:
            room_ref = None
        if save_clean:
            room_dir = pra.ShoeBox(room_dim, fs=args.sample_rate,
                                   materials=pra.Material(0.99999),
                                   max_order=max_order, sigma2_awgn=None)
        else:
            room_dir = None

        ## Pick a random position for the microphone array
        mic_x = random.uniform(args.min_mic_x, room_x - args.min_mic_x)
        mic_y = random.uniform(args.min_mic_y, room_y - args.min_mic_y)
        mic_z = random.uniform(args.min_mic_z, max(min(room_z - args.min_mic_z, 2.0), args.min_mic_z + 0.5))

        ## Compute the position of each microphone
        mic_xyz = []
        for m in range(args.num_mic):
            mic_pos = args.mic_pos[m]
            x = mic_x + mic_pos[0]
            y = mic_y + mic_pos[1]
            z = mic_z
            mic_xyz.append([x, y, z])
        mic_xyz = np.array(mic_xyz)  # (6, 3)
        mic_xyz = mic_xyz.T          # (3, 6)

        ## Add the microphone array
        mic_array = pra.MicrophoneArray(mic_xyz, args.sample_rate)
        if room_mix is not None:
            room_mix = room_mix.add_microphone_array(mic_array)
        if room_ref is not None:
            room_ref = room_ref.add_microphone_array(mic_array)
        if room_dir is not None:
            room_dir = room_dir.add_microphone_array(mic_array)
        ##print("room = [%.2f %.2f %.2f], micro = [%.2f %.2f %.2f]" % (room_x, room_y, room_z, mic_x, mic_y, mic_z))

        ## Add the target source to room_mix and room_ref
        target_source = None
        while True:
            if args.num_targ_ang <= 0.0:
                targ_ang = random.randint(0, int(all_angle))
            else:
                targ_ang = int(random.randint(0, Targ_Ang_Num - 1) * Targ_Ang_Resolution)
            targ_theta = np.pi * targ_ang / 180.0
            targ_dist = random.uniform(args.min_targ_distance, args.max_targ_distance)
            targ_x = mic_x + np.cos(targ_theta) * targ_dist
            targ_y = mic_y + np.sin(targ_theta) * targ_dist
            targ_z = mic_z
            target_source = [targ_x, targ_y, targ_z]
            if (targ_x < (room_x - 0.5) and targ_x > 0.5) and (targ_y < (room_y - 0.5) and targ_y > 0.5):
                break
        if target_source is None or not room_mix.is_inside(target_source):
            continue
        ##print("room = [%.2f %.2f %.2f], target_source = [%.2f %.2f %.2f]" % (room_x, room_y, room_z, target_source[0], target_source[1], target_source[2]))
        ##print("targ_ang = %d, targ_dist %.2f" % (targ_ang, targ_dist))

        targ_tdoa = targ_ang
        if args.is_linear_mic and targ_tdoa > 180:
            targ_tdoa = 360.0 - targ_tdoa

        ## Add interference sources to room_mix
        num_interf = min(random.randint(1, args.max_num_interf), 1)
        interf_angs = []
        interf_dists = []
        interf_source = []
        while True:
            interf_ang = random.randint(0, int(all_angle))
            interf_tdoa = interf_ang
            if args.is_linear_mic and interf_tdoa > 180:
                interf_tdoa = 360.0 - interf_tdoa
            if np.abs(targ_tdoa - interf_tdoa) < args.minAD:
                continue
            interf_theta = np.pi * interf_ang / 180.0
            interf_dist = random.uniform(args.min_interf_distance, args.max_interf_distance)
            interf_x = mic_x + np.cos(interf_theta) * interf_dist
            interf_y = mic_y + np.sin(interf_theta) * interf_dist
            interf_z = mic_z
            ainterf_source = [interf_x, interf_y, interf_z]
            if (interf_x < (room_x - 0.5) and interf_x > 0.5) and (interf_y < (room_y - 0.5) and interf_y > 0.5):
                interf_angs.append(interf_ang)
                interf_dists.append(interf_dist)
                interf_source.append(ainterf_source)
            if len(interf_source) >= num_interf:
                break
        ##print("interf_ang = %d, interf_dist %.2f, num_interf = %d" % (interf_ang, interf_dist, len(interf_source)))

        for sim in range(args.nutt_per_room):
            if room_mix is not None:
                room_mix.sources = []
            if room_ref is not None:
                room_ref.sources = []
            if room_dir is not None:
                room_dir.sources = []

            ## Add speech to the microphone array
            while True:
                spe_idx = random.randint(0, spe_utt_size - 1)
                spe_key, spe_path = spe_utt_ids[spe_idx]
                spe_wav = audio_parser.WaveData(spe_path, sample_rate=args.sample_rate)
                if spe_wav is None or spe_wav.shape[0] < args.sample_rate:
                    continue
                spe_wav = np.squeeze(spe_wav)
                if np.mean(np.abs(spe_wav)) > 0:
                    break
            spe_length = spe_wav.shape[0]
            spe_wav = pra.normalize(spe_wav)
            spe_wav = pra.highpass(spe_wav, args.sample_rate, 50)

            if room_mix is not None and room_mix.is_inside(target_source):
                room_mix = room_mix.add_source(target_source, signal=spe_wav, delay=0)
            else:
                print("target_source not in room_mix")
                continue
            if room_ref is not None and room_ref.is_inside(target_source):
                room_ref = room_ref.add_source(target_source, signal=spe_wav, delay=0)
            else:
                print("target_source not in room_ref")
            if room_dir is not None and room_dir.is_inside(target_source):
                room_dir = room_dir.add_source(target_source, signal=spe_wav, delay=0)
            else:
                print("target_source not in room_dir")

            if room_mix is not None and len(room_mix.sources) < 1:
                print("target_source not in room_mix")
                break
            if room_ref is not None and len(room_ref.sources) < 1:
                print("target_source not in room_ref")
                break
            if room_dir is not None and len(room_dir.sources) < 1:
                print("target_source not in room_dir")
                break

            ## Add interference to the microphone array
            for it in range(0, num_interf):
                while True:
                    inf_idx = random.randint(0, noise_utt_size - 1)
                    inf_path = noise_utt_ids[inf_idx]
                    inf_wav = audio_parser.WaveData(inf_path, sample_rate=args.sample_rate)
                    if inf_wav is None or inf_wav.shape[0] < args.sample_rate:
                        continue
                    inf_wav = np.squeeze(inf_wav)
                    if np.mean(np.abs(inf_wav)) > 0:
                        break
                inf_length = inf_wav.shape[0]
                inf_wav = pra.normalize(inf_wav)
                inf_wav = pra.highpass(inf_wav, args.sample_rate, 50)
                while inf_length < spe_length:
                    inf_wav = np.concatenate((inf_wav, inf_wav), axis=0)
                    inf_length = inf_wav.shape[0]
                inf_wav = inf_wav[:spe_length]
                if room_mix is not None and room_mix.is_inside(interf_source[it]):
                    room_mix = room_mix.add_source(interf_source[it], signal=inf_wav, delay=0)
                else:
                    print("interf_source not in room_mix")
                    continue
            if room_mix is not None and len(room_mix.sources) < 1:
                break

            ## Make the far-field mixture audio
            iSIR = random.uniform(args.lowSIR, args.upSIR)
            room_mix.simulate(
                callback_mix=callback_mix,
                callback_mix_kwargs={'snr': 30, 'sir': iSIR,
                                     'n_src': num_interf + 1, 'n_tgt': 1,
                                     'ref_mic': 0})
            mix_wav = room_mix.mic_array.signals.T  # (nsample, nchannel)
            mix_length, num_channel = mix_wav.shape

            ## Read diffuse noise
            if diffuse_utt_ids is not None:
                while True:
                    diff_idx = random.randint(0, diffuse_utt_size - 1)
                    diff_path = diffuse_utt_ids[diff_idx]
                    diff_wav = audio_parser.WaveData(diff_path, sample_rate=args.sample_rate,
                                                     id_channel=list(range(0, num_channel)))
                    if diff_wav is None or diff_wav.shape[0] < args.sample_rate:
                        continue
                    if np.mean(np.abs(diff_wav)) > 0:
                        break
                dif_length, num_channel = diff_wav.shape
                '''
                for i in range(int(num_channel / 2)):
                    ch_wav = diff_wav[:, i]
                    diff_wav[:, i] = diff_wav[:, num_channel - i - 1]
                    diff_wav[:, num_channel - i - 1] = ch_wav
                '''
                ## Add diffuse noise into the mix
                while dif_length < mix_length:
                    diff_wav = np.concatenate((diff_wav, diff_wav), axis=0)
                    dif_length = diff_wav.shape[0]
                diff_wav = diff_wav[0:mix_length, :]
                iSNR = random.uniform(args.lowSNR, args.upSNR)
                mix_wav = audio_parser.MixWave(mix_wav, diff_wav, snr=iSNR)

            ## Adapt the gain of the mixture audio to the given gain
            gain = random.uniform(args.lowGain, args.upGain)
            scale = gain / np.max(np.abs(mix_wav))
            mix_wav = mix_wav * scale
            mix_wav = mix_wav * 32767.0
            mix_wav = mix_wav.astype(np.int16)

            if room_dir is not None:
                ## Simulate the direct-path signal
                room_dir.simulate()
                dir_wav = room_dir.mic_array.signals[0, :].T  # (spe_length,)
                dir_wav = dir_wav * scale
                dir_wav = dir_wav * 32767.0
                dir_wav = dir_wav.astype(np.int16)
            else:
                dir_wav = None

            if room_ref is not None:
                ## Simulate the clean far-field signal used as reference when computing metrics
                room_ref.simulate()
                ref_wav = room_ref.mic_array.signals  # (num_channel, spe_length)
                ref_wav = ref_wav * scale             # (num_channel, spe_length)
            else:
                ref_wav = None

            if ref_wav is not None:
                if args.targ_bf is not None:
                    num_block = 1
                    ref_wav = ref_wav[np.newaxis, :, :]                     # (num_block, num_channel, spe_length)
                    ref_wav = torch.FloatTensor(ref_wav)                    # (num_block, num_channel, spe_length)
                    ref_wav = ref_wav.view(num_block * num_channel, 1, -1)  # (num_block * num_channel, 1, spe_length)
                    input_audio = ref_wav.to(args.device)                   # (num_block * num_channel, 1, spe_length)

                    mFFT = args.convstft(input_audio)  # (num_block * num_channel, num_bin * 2, num_frame)
                    num_frame = mFFT.size(2)
                    mFFT = mFFT.view(num_block, num_channel, num_bin * 2, -1)  # (num_block, num_channel, num_bin * 2, num_frame)
                    mFFT_r = mFFT[:, :, :num_bin, :]  # (num_block, num_channel, num_bin, num_frame)
                    mFFT_i = mFFT[:, :, num_bin:, :]  # (num_block, num_channel, num_bin, num_frame)
                    mFFT_r = mFFT_r.permute([0, 3, 2, 1]).contiguous()  # (num_block, num_frame, num_bin, num_channel)
                    mFFT_i = mFFT_i.permute([0, 3, 2, 1]).contiguous()  # (num_block, num_frame, num_bin, num_channel)
                    mFFT_r = mFFT_r.view(num_block * num_frame, num_bin, num_channel)
                    mFFT_i = mFFT_i.view(num_block * num_frame, num_bin, num_channel)
                    mFFT = torch.cat([torch.unsqueeze(mFFT_r, 1), torch.unsqueeze(mFFT_i, 1)], dim=1)  # (num_block * num_frame, 2, num_bin, num_channel)

                    # Compute the beam index from the target direction
                    targ_tdoa = targ_ang
                    if num_channel == 2 or args.is_linear_mic:
                        if targ_tdoa > 180:
                            targ_tdoa = 360.0 - targ_tdoa
                    bf_beam = targ_tdoa / args.bf_direction_resolution + 0.5
                    bf_beam = int(bf_beam) % args.num_beam
                    print("tdoa = %d, beam = %d" % (targ_ang, bf_beam))

                    rFFT = args.targ_bf(mFFT, bf_beam)                         # (num_block * num_frame, 2, num_bin, 1)
                    rFFT = rFFT[:, :, :, 0].view([num_block, -1, 2, num_bin])  # (num_block, num_frame, 2, num_bin)
                    rFFT = rFFT.permute([0, 2, 3, 1]).contiguous()             # (num_block, 2, num_bin, num_frame)
                    est_fft = torch.cat([rFFT[:, 0], rFFT[:, 1]], 1)           # (num_block, num_bin * 2, num_frame)
                    ref_wav = args.convistft(est_fft)                          # (num_block, 1, num_sample)
                    ref_wav = torch.squeeze(ref_wav, 1)                        # (num_block, num_sample)
                    ref_wav = ref_wav[0, :]                                    # (num_sample,)
                    ref_wav = ref_wav.data.cpu().numpy()                       # (num_sample,)
                else:
                    ref_wav = ref_wav[0, :]                                    # (num_sample,)
                ref_wav = ref_wav * 32767.0
                ref_wav = ref_wav.astype(np.int16)
            else:
                ref_wav = None

            ## Align mix_wav, ref_wav and dir_wav
            nsample = min(w.shape[0] for w in (mix_wav, ref_wav, dir_wav) if w is not None)
            mix_wav = mix_wav[:nsample]
            if ref_wav is not None:
                ref_wav = ref_wav[:nsample]
            if dir_wav is not None:
                dir_wav = dir_wav[:nsample]

            num_utts += 1
            _, spe_name, _ = file_parse.getFileInfo(spe_path)

            out_path = os.path.join(args.out_path, 'wav')
            if not os.path.exists(out_path):
                os.makedirs(out_path)
            if utt2data_dict is not None:
                data_key, data_id = utt2data_dict[spe_idx]
                out_path = os.path.join(out_path, data_id)
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            else:
                data_id = 'data01'
            if utt2spk_dict is not None:
                spk_key, spk_id = utt2spk_dict[spe_idx]
                out_path = os.path.join(out_path, spk_id)
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            else:
                spk_id = 'spk01'
            out_path = os.path.join(out_path, 'wav')
            if not os.path.exists(out_path):
                os.makedirs(out_path)

            spe_key = spe_key.replace('_', '').replace('-', '').replace('.', '')
            spk_id = spk_id.replace('_', '').replace('-', '').replace('.', '')
            #utt_id = spk_id + "_" + spe_key + "%02d%07d" % (thread_id, num_utts)
            utt_id = spk_id + "_" + "%02d%07d" % (thread_id, num_utts)

            if mix_wav is not None:
                ## Write the mixture audio
                filename = "%s_id%02d%07d_Doa%d_SIR%.1f_SNR%.1f" % (spe_key, thread_id, num_utts, targ_ang, iSIR, iSNR)
                mix_path = os.path.join(out_path, '%s.wav' % (filename))
                audio_parser.WriteWave(mix_path, mix_wav, args.sample_rate)
            else:
                mix_path = None

            if dir_wav is not None:
                filename = "%s_id%02d%07d_Doa%d_DS" % (spe_key, thread_id, num_utts, targ_ang)
                ds_path = os.path.join(out_path, '%s.wav' % (filename))
                audio_parser.WriteWave(ds_path, dir_wav, args.sample_rate)
            else:
                ds_path = None

            if ref_wav is not None:
                filename = "%s_id%02d%07d_Doa%d_Ref" % (spe_key, thread_id, num_utts, targ_ang)
                ref_path = os.path.join(out_path, '%s.wav' % (filename))
                audio_parser.WriteWave(ref_path, ref_wav, args.sample_rate)
            else:
                ref_path = None

            if text_dict is not None:
                text_key, text_value = text_dict[spe_idx]
            else:
                text_value = ' '

            noisy_scp_list.append((utt_id, mix_path, ds_path, ref_path, targ_ang, targ_dist, iSIR, iSNR, scale))
            noisy_utt2spk.append(spk_id)
            noisy_text_dict.append(text_value)
            info = (utt_id, spe_key, mix_path, ds_path, ref_path, targ_ang, targ_dist, interf_angs, interf_dists, iSIR, iSNR, scale)
            mix2info.append(info)

            print("%d / %d: %s" % (num_utts, num_make_utts, mix_path))
            if num_utts >= num_make_utts:
                return noisy_scp_list, noisy_utt2spk, noisy_text_dict, mix2info
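# A minimal sketch of the `callback_mix` helper assumed above. pyroomacoustics
# passes the un-mixed source images to this callback as `premix`, an array of
# shape (n_src, n_mics, n_samples), and expects the final mix of shape
# (n_mics, n_samples) in return. The scaling logic here (interferers rescaled
# to the requested SIR, white noise added at the requested SNR) is an
# assumption modeled on the pyroomacoustics documentation, not the original
# author's code.
import numpy as np

def callback_mix(premix, snr=0, sir=0, ref_mic=0, n_src=None, n_tgt=None):
    # the first n_tgt sources are the targets
    premix = premix.copy()

    # power of the target(s) at the reference microphone
    p_tgt = np.mean(premix[:n_tgt, ref_mic, :] ** 2)

    # scale the interferers to the desired SIR
    p_int = np.mean(premix[n_tgt:n_src, ref_mic, :] ** 2)
    premix[n_tgt:n_src, :, :] *= np.sqrt(p_tgt / (p_int * 10 ** (sir / 10)))

    # sum all sources and add white noise at the desired SNR
    mix = np.sum(premix[:n_src, :, :], axis=0)
    sigma_n = np.sqrt(p_tgt / 10 ** (snr / 10))
    mix += sigma_n * np.random.randn(*mix.shape)

    return mix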
def perceptual_quality_evaluation(room_dim, mics, good_pos, good_index, bad_pos, bad_index, rir_location):
    print('start')

    import numpy as np
    from scipy.io import wavfile
    from os import getpid

    import pyroomacoustics as pra

    # number of sources to consider
    n_sources = np.arange(1, 8)
    S = n_sources.shape[0]

    # number of mics
    n_mic = mics.shape[1]

    # Set the speed of sound to match that of the measured RIR
    pra.constants.set('c', 345.5)

    Fs = 8000.
    N = 1024
    Lg = int(0.03 * Fs)  # 30 ms long filter
    delay_bf = 0.02
    sigma2_n = 1e-6

    # reflection coefficients of the walls (hand-waving)
    reflection = {
        'ground': 0.8,
        'south': 0.8,
        'west': 0.8,
        'north': 0.8,
        'east': 0.8,
        'ceilling': 0.5
    }

    speech_sample1 = 'samples/fq_sample1_8000.wav'
    speech_sample2 = 'samples/fq_sample2_8000.wav'

    # Create the room
    room = pra.ShoeBox3D(np.zeros(3), room_dim, Fs,
                         max_order=1,
                         absorption=reflection,
                         sigma2_awgn=sigma2_n)

    # Create the beamformer
    bf = pra.Beamformer(mics, Fs, N=N, Lg=Lg)
    room.addMicrophoneArray(bf)

    # data receptacles
    beamformer_names = ['Rake Perceptual', 'Rake MVDR']
    bf_weights_fun = [bf.rakePerceptualFilters, bf.rakeMVDRFilters]
    bf_fnames = ['1', '2']
    NBF = len(beamformer_names)

    # receptacle arrays
    pesq_input = np.zeros(2)
    pesq_bf = np.zeros((2, NBF, S))

    # create a single reference mic at the position of microphone 4
    ref_mic_n = 4
    ref_mic = pra.MicrophoneArray(bf.R[:, ref_mic_n, np.newaxis], Fs)

    # since we run multiple threads, we need to uniquely identify filenames
    pid = str(getpid())

    file_ref = 'output_samples/fqref' + pid + '.wav'
    file_suffix = '-' + pid + '.wav'
    files_bf = ['output_samples/fq' + str(i + 1) + file_suffix for i in range(NBF)]
    file_raw = 'output_samples/fqraw' + pid + '.wav'

    # indices of good and bad sources
    good = good_index
    bad = bad_index

    # Read the two speech samples used
    rate, good_signal = wavfile.read(speech_sample1)
    good_signal = np.array(good_signal, dtype='float64')
    good_signal = pra.normalize(good_signal)
    good_signal = pra.highpass(good_signal, rate)
    good_len = good_signal.shape[0] / float(Fs)

    rate, bad_signal = wavfile.read(speech_sample2)
    bad_signal = np.array(bad_signal, dtype='float64')
    bad_signal = pra.normalize(bad_signal)
    bad_signal = pra.highpass(bad_signal, rate)
    bad_len = bad_signal.shape[0] / float(Fs)

    # variance of the good signal
    good_sigma2 = np.mean(good_signal**2)

    # normalize the interference signal to have equal power with the desired signal
    bad_signal *= good_sigma2 / np.mean(bad_signal**2)

    # distance from the array center to the good source
    good_distance = np.linalg.norm(bf.center[:, 0] - good_pos)

    # distance from the array center to the bad source
    bad_distance = np.linalg.norm(bf.center[:, 0] - bad_pos)

    if good_len > bad_len:
        good_delay = 0
        bad_delay = (good_len - bad_len) / 2.
    else:
        bad_delay = 0
        good_delay = (bad_len - good_len) / 2.

    # create the reference room for free-space, noiseless, interference-free simulation
    ref_room = pra.ShoeBox3D([0, 0, 0], room_dim, Fs, max_order=0)
    ref_room.addSource(good_pos, signal=good_signal, delay=good_delay)
    ref_room.addMicrophoneArray(ref_mic)
    ref_room.compute_RIR()
    ref_room.simulate()
    reference = pra.highpass(ref_mic.signals[0], Fs)
    reference_n = pra.normalize(reference)

    # save the reference desired signal
    #wavfile.write(file_ref, Fs, pra.to_16b(reference_n))
    new_ref = good_signal.copy()
    new_ref = pra.normalize(pra.highpass(new_ref, Fs))
    wavfile.write(file_ref, Fs, pra.to_16b(new_ref))

    # add the sources to the 'real' room
    room.addSource(good_pos, signal=good_signal, delay=good_delay)
    room.addSource(bad_pos, signal=bad_signal, delay=bad_delay)

    # read in the RIRs from file
    for r in range(n_mic):
        room.rir.append([])
        for s in [good_index, bad_index]:
            # read wav file
            fname_rir = rir_location % (r + 1, s + 1)
            rir_fs, rir = wavfile.read(fname_rir)
            rir = np.array(rir, dtype='float64')

            if rir_fs != Fs:
                raise NameError('The RIR and the signals do not have the same sampling rate.')
                '''
                import scikits.samplerate as sr
                rir = sr.resample(rir, Fs/float(rir_fs), 'sinc_best')

                # the factor 2 was empirically determined to be necessary to get
                # amplitude of RIR in the correct ballpark.
                rir *= 2.
                '''

            room.rir[r].append(rir)

    # compute the input signal to the microphones
    room.simulate()

    # save the degraded signal at the reference microphone
    raw = bf.signals[ref_mic_n]
    raw_n = pra.normalize(pra.highpass(raw, Fs))
    wavfile.write(file_raw, Fs, pra.to_16b(raw_n))

    pesq_input = pra.pesq(file_ref, file_raw, Fs=Fs)

    for src in room.sources:
        src.setOrdering('strongest', ref_point=bf.center)

    for k, s in enumerate(n_sources):

        good_img = room.sources[0][:s]
        bad_img = room.sources[1][:s]

        for i, bfr in enumerate(beamformer_names):

            bf_weights_fun[i](good_img, bad_img,
                              sigma2_n * np.eye(n_mic * Lg),
                              delay=delay_bf)

            # run beamformer
            output = bf.process()
            output = pra.normalize(pra.highpass(output, Fs))
            output = pra.time_align(reference_n, output)

            # save file for PESQ evaluation
            wavfile.write(files_bf[i], Fs, pra.to_16b(output))

            # compute PESQ
            pesq_bf[:, i, k] = pra.pesq(file_ref, files_bf[i], Fs=Fs).T

    '''
    This is how you can compare the true RIRs with the image source model generated ones:

    plt.figure()
    for m in range(n_mic):
        rir_sim = room.sources[0].getRIR(mics[:, m], Fs)
        plt.subplot(3, 3, m + 1)
        plt.plot(room.rir[m][0][:rir_sim.shape[0]])
        plt.plot(rir_sim)
    plt.show()
    '''

    print('Finished')

    return pesq_input, pesq_bf
def process_experiment_max_sinr(SIR, mic, blinky, args):

    session = args.session
    target = args.target

    with open(metadata_file.format(session=args.session), 'r') as f:
        metadata = json.load(f)

    file_pattern = os.path.join(experiment_folder, metadata['filename_pattern'])

    with open(protocol_file.format(session=args.session), 'r') as f:
        protocol = json.load(f)

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=blinky, source='mix', fs=fs))
    fs_snd, audio = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=mic_choices[mic], source='mix', fs=fs))
    assert fs_led == fs_snd

    # read in the ref signals
    sources_ref = dict(zip(target_choices, [
        wavfile.read(file_pattern.format(
            session=session, mic=mic_choices[mic], snr=SIR, source=ch, fs=fs))[1]
        for ch in target_choices
    ]))
    leds_ref = dict(zip(target_choices, [
        wavfile.read(file_pattern.format(
            session=session, mic=blinky, snr=SIR, source=ch, fs=fs))[1]
        for ch in target_choices
    ]))

    # reorder with target in first position
    ref = np.array([sources_ref[target]]
                   + [sources_ref[ch] for ch in target_choices if ch != target])

    noise_ref = np.zeros_like(sources_ref[target])
    n_ch = [ch for ch in target_choices if ch != target]
    for ch in n_ch:
        noise_ref += sources_ref[ch]

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = sources_ref[target] + noise_ref

    # get the geometry information to get nice plots.
    mics_geom = {
        'pyramic': np.array(protocol['geometry']['microphones']['pyramic']['locations']),
        'camera': np.array(protocol['geometry']['microphones']['camera']['locations']),
    }

    mics_loc = np.array(protocol['geometry']['microphones'][mic_choices[mic]]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if 'pyramic' in mic:

        if mic == 'pyramic_2':
            I = pyramic_bss_2ch
        elif mic == 'pyramic_4':
            I = pyramic_bss_4ch
        elif mic == 'pyramic_24':
            I = list(range(8, 16)) + list(range(24, 32)) + list(range(40, 48))  # flat part
        elif mic == 'pyramic_48':
            I = list(range(48))
        else:
            raise ValueError('Unsupported configuration')

        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        ref = ref[:, :, I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'camera':
        mics_positions = mics_geom['camera'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # adjust length of led signal if necessary
    if leds.shape[0] < audio.shape[0]:
        z_missing = audio.shape[0] - leds.shape[0]
        leds = np.pad(leds, (0, z_missing), 'constant')
    elif leds.shape[0] > audio.shape[0]:
        leds = leds[:audio.shape[0], ]

    # perform VAD
    led_target = leds[:, blinky_source_map[target]]
    vad_snd = led_target > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise
    # covariance matrix. For that, we remove frames neighbouring the detected speech.
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################
    print('STFT and stuff')
    sys.stdout.flush()

    a_win = pra.hann(nfft)
    s_win = pra.realtime.compute_synthesis_window(a_win, nfft // 2)

    engine = pra.realtime.STFT(nfft, nfft // 2,
                               analysis_window=a_win,
                               synthesis_window=s_win,
                               channels=audio.shape[1])

    # Now compute the STFT of the microphone input
    X = engine.analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = engine.analysis(audio * vad_guarded[:audio.shape[0], None])
    X_noise = engine.analysis(audio * (1 - vad_guarded[:audio.shape[0], None]))

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################
    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))
    Rall = Rs + Rn

    # compute the MaxSINR beamformer
    w = [la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
         for rs, rn in zip(Rall[1:], Rn[1:])]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)  # add dummy beamformer at DC

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:, :, 0], clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########
    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T, fs=fs_snd,
                               N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    # Not sure why the delay is sometimes negative here... Need to check more
    delay = int(pra.tdoa(out, ref[0, :, 0].astype(np.float64), phat=True))
    print(delay)
    delay = np.abs(delay)
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
    else:
        out_trunc = np.concatenate((np.zeros(-delay), out[:ref.shape[1] + delay]))
    sig_eval = np.vstack([out_trunc] * len(target_choices))

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, 0], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    ret = {'Max-SINR': {'SDR': metric[0][0], 'SIR': metric[2][0]}}

    #############################
    ## BLIND SOURCE SEPARATION ##
    #############################

    if mic in ['camera', 'pyramic_2', 'pyramic_4']:

        Y = pra.bss.auxiva(X, n_iter=40)
        bss = pra.realtime.synthesis(Y, nfft, nfft // 2, win=s_win)

        match = []
        for col in range(bss.shape[1]):
            xcorr = fast_corr(bss[:, col], ref[0, :, 0])
            match.append(np.max(xcorr))
        best_col = np.argmax(match)

        # Not sure why the delay is sometimes negative here... Need to check more
        delay = np.abs(int(pra.tdoa(bss[:, best_col], ref[0, :, 0].astype(np.float64), phat=True)))
        if delay > 0:
            bss_trunc = bss[delay:delay + ref.shape[1], ]
        elif delay < 0:
            bss_trunc = np.concatenate((np.zeros((-delay, bss.shape[1])), bss[:ref.shape[1] + delay]))
        else:
            bss_trunc = bss[:ref.shape[1], ]

        if ref.shape[1] > bss_trunc.shape[0]:
            ref_lim = bss_trunc.shape[0]
        else:
            ref_lim = ref.shape[1]

        if mic in ['camera', 'pyramic_2']:
            bss_trunc = np.hstack([bss_trunc] * 2)

        metric = bss_eval_images(ref[:, :ref_lim, 0, None], bss_trunc.T[:, :, None])
        SDR_bss = metric[0][0]
        SIR_bss = metric[2][0]
        ret['BSS'] = {'SDR': metric[0][0], 'SIR': metric[2][0]}

    #################################
    ## Estimate SDR and SIR of mix ##
    #################################

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(int(pra.tdoa(audio[:, 0], ref[0, :, 0].astype(np.float64), phat=True)))
    if delay > 0:
        audio_trunc = audio[delay:delay + ref.shape[1], 0]
    elif delay < 0:
        audio_trunc = np.concatenate((np.zeros(-delay), audio[:ref.shape[1] + delay, 0]))
    else:
        audio_trunc = audio[:ref.shape[1], 0]

    if ref.shape[1] > audio_trunc.shape[0]:
        ref_lim = audio_trunc.shape[0]
    else:
        ref_lim = ref.shape[1]

    audio_trunc = np.vstack([audio_trunc] * len(ref))

    metric = bss_eval_images(ref[:, :ref_lim, 0, None], audio_trunc[:, :, None])
    SDR_bss = metric[0][0]
    SIR_bss = metric[2][0]
    ret['Mix'] = {'SDR': metric[0][0], 'SIR': metric[2][0]}

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        if not os.path.exists(args.save_sample):
            os.makedirs(args.save_sample)

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            upper = np.max([audio[:, 0].max(), out.max(), bss.max(), ref[0, :, 0].max()])
        else:
            upper = np.max([audio[:, 0].max(), out.max(), ref[0, :, 0].max()])

        # Clean signal for reference
        sig_ref = pra.highpass(ref[0, :, 0].astype(np.float64) / upper, fs_snd, fc=150)
        f0 = os.path.join(args.save_sample, '{}_ref_SIR_NA_dB.wav'.format(mic))
        wavfile.write(f0, fs_snd, sig_ref)

        # Mix signal for reference
        sig_mix = pra.highpass(audio[:, 0].astype(np.float64) / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample, '{}_mix_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_mix)

        # Output of MaxSINR
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f2 = os.path.join(args.save_sample, '{}_maxsinr_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

        # Output of BSS
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            sig_bss = pra.highpass(bss[:, best_col] / upper, fs_snd, fc=150)
            f3 = os.path.join(args.save_sample, '{}_bss_SIR_{}_dB.wav'.format(mic, SIR))
            wavfile.write(f3, fs_snd, sig_bss)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(ref[0, :, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(led_target.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, led_target, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(led_target)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, led_target * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        '''
        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)
        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal
        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])
        '''

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return ret
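# The MaxSINR weights above come from a generalized eigenvalue problem solved
# per frequency bin. The following is a self-contained illustration of that one
# step: the beamformer is the generalized eigenvector of (Rs, Rn) with the
# largest eigenvalue. Random Hermitian PSD matrices stand in for the measured
# speech and noise covariances.
import numpy as np
from scipy import linalg as la

n_channels = 4
A = np.random.randn(n_channels, n_channels) + 1j * np.random.randn(n_channels, n_channels)
B = np.random.randn(n_channels, n_channels) + 1j * np.random.randn(n_channels, n_channels)
Rs = A @ A.conj().T                              # stand-in speech covariance
Rn = B @ B.conj().T + 1e-3 * np.eye(n_channels)  # stand-in noise covariance

# solve Rs w = lambda Rn w and keep the dominant eigenvector
vals, vecs = la.eigh(Rs, b=Rn)
w = vecs[:, -1]  # maximizes w^H Rs w / w^H Rn w, i.e. the output SINR
w /= la.norm(w)

sinr = np.real(w.conj() @ Rs @ w) / np.real(w.conj() @ Rn @ w)
print('output SINR:', sinr)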
def modify_input_wav_beamforming(wav, noise, room_dim, max_order, snr_vals,
                                 mic_array, pos_source, pos_noise, N):

    fs_s, audio_anechoic = wavfile.read(wav)
    fs_n, noise_anechoic = wavfile.read(noise)

    # Create a room for the signal
    room_signal = pra.ShoeBox(room_dim, absorption=0.2, fs=fs_s, max_order=max_order)

    # Create a room for the noise
    room_noise = pra.ShoeBox(room_dim, absorption=0.2, fs=fs_n, max_order=max_order)

    # source of the signal and of the noise in their respective rooms
    room_signal.add_source(pos_source, signal=audio_anechoic)
    room_noise.add_source(pos_noise, signal=noise_anechoic)

    # add the microphone array
    mics_signal = pra.Beamformer(mic_array, room_signal.fs, N)
    mics_noisy = pra.Beamformer(mic_array, room_noise.fs, N)
    room_signal.add_microphone_array(mics_signal)
    room_noise.add_microphone_array(mics_noisy)

    # simulate both rooms
    room_signal.simulate()
    room_noise.simulate()

    # take the mic_array.signals from each room
    audio_reverb = room_signal.mic_array.signals
    noise_reverb = room_noise.mic_array.signals

    # design beamforming filters
    mics_signal.rake_delay_and_sum_weights(room_signal.sources[0][:1])
    mics_noisy.rake_delay_and_sum_weights(room_signal.sources[0][:1])

    output_signal = mics_signal.process()
    output_noise = mics_noisy.process()

    # we're going to normalize the noise
    size = np.shape(audio_reverb)
    noise_normalized = np.zeros(size)

    # the noise must be at least as long as the audio signal
    if len(noise_reverb[0]) < len(audio_reverb[0]):
        raise ValueError('the noise signal is shorter than the audio signal!')

    output_noise = output_noise[:len(output_signal)]
    norm_fact = np.linalg.norm(noise_reverb[-1])
    noise_normalized = output_noise / norm_fact

    # initialize the array of noisy signals
    noisy_signal = np.zeros([len(snr_vals), np.shape(output_signal)[0]])

    for i, snr in enumerate(snr_vals):
        noise_std = np.linalg.norm(audio_reverb[-1]) / (10**(snr / 20.))
        final_noise = noise_normalized * noise_std
        noisy_signal[i] = pra.normalize(pra.highpass(output_signal + final_noise, fs_s))

    return noisy_signal
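# Hypothetical usage of modify_input_wav_beamforming; the file names, array
# geometry, and SNR grid below are made up for illustration.
import numpy as np
import pyroomacoustics as pra

room_dim = [4., 6.]
# 4-mic linear array centered in the room, 8 cm spacing (one column per mic)
mic_array = pra.linear_2D_array([2., 3.], M=4, phi=0., d=0.08)
snr_vals = np.arange(60, -15, -15)  # 60, 45, 30, 15, 0 dB

noisy = modify_input_wav_beamforming(
    'speech.wav', 'noise.wav', room_dim, max_order=17,
    snr_vals=snr_vals, mic_array=mic_array,
    pos_source=[1., 4.5], pos_noise=[2.8, 4.3], N=1024)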
under different SNRs.
"""

if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

# truncate beamformed noise
noise_bf = noise_bf[:len(speech_bf)]

# compute score for different SNR vals
print()
score_beamformed = np.empty(len(snr_vals))
score_single = np.empty(len(snr_vals))
for idx, snr in enumerate(snr_vals):

    noisy_signal = speech_bf + snr_facts[idx] * noise_bf
    noisy_signal = pra.normalize(pra.highpass(noisy_signal, fs_s), bits=16).astype(np.int16)
    dest = os.path.join(dest_dir, "das_bf_snr_db_%d.wav" % (snr))
    wavfile.write(dest, fs_s, noisy_signal)
    score_beamformed[idx] = label_wav(dest, labels_file, graph_file, speech_samp.meta.word)

    # compute score for single mic for reference
    single_mic = ref_mic_sig + snr_facts[idx] * ref_mic_noise
    single_mic = pra.normalize(pra.highpass(single_mic, fs_s), bits=16).astype(np.int16)
    dest = os.path.join(dest_dir, "single_mic_snr_db_%d.wav" % (snr))
    wavfile.write(dest, fs_s, single_mic)
    score_single[idx] = label_wav(dest, labels_file, graph_file, speech_samp.meta.word)
def process_experiment_max_sinr(SIR, mic, args):

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read in the mix signals
    fs_led, leds = wavfile.read(file_pattern.format('camera_leds_zero_hold', 'mix', SIR))
    fs_snd, audio = wavfile.read(file_pattern.format(mic_choices[mic], 'mix', SIR))
    assert fs_led == fs_snd

    # read in the ref signals
    r, noise_ref = wavfile.read(file_pattern.format(mic_choices[mic], 'noise_ref', SIR))
    assert r == fs_snd
    r, speech_ref = wavfile.read(file_speech_ref.format(mic_choices[mic]))
    assert r == fs_snd
    r, leds_ref = wavfile.read(file_speech_ref.format('camera_leds_zero_hold'))
    assert r == fs_snd

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = noise_ref + speech_ref

    # get the geometry information to get nice plots.
    mics_loc = np.array(protocol['geometry']['microphones'][mic]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if mic == 'pyramic':

        I = list(range(8, 16)) + list(range(24, 32)) + list(range(40, 48))  # flat part
        #I = list(range(24, 32)) + list(range(40, 48))  # flat part
        #I = list(range(8, 16))
        #I = list(range(48))

        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        speech_ref = speech_ref[:, I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'olympus':
        mics_positions = mics_geom['olympus'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # perform VAD
    vad_snd = leds > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise
    # covariance matrix. For that, we remove frames neighbouring the detected speech.
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[i - vad_guard:i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################
    print('STFT and stuff')
    sys.stdout.flush()

    engine = pra.realtime.STFT(nfft, nfft // 2, pra.hann(nfft), channels=audio.shape[1])

    def analysis(x):
        engine.analysis(x)
        return np.moveaxis(engine.X, 1, 0)

    # Now compute the STFT of the microphone input
    X = analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = analysis(audio * vad_guarded[:, None])
    X_noise = analysis(audio * (1 - vad_guarded[:, None]))

    S_ref = analysis(speech_ref)
    N_ref = analysis(noise_ref)

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################
    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))

    # compute covariances with reference signals to check everything is working correctly
    #Rs = np.einsum('i...j,i...k->...jk', S_ref, np.conj(S_ref))
    #Rn = np.einsum('i...j,i...k->...jk', N_ref, np.conj(N_ref))

    # compute the MaxSINR beamformer
    w = [
        la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
        for rs, rn in zip(Rs[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:, :, 0], clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########
    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T, fs=fs_snd, N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    ref = np.vstack([speech_ref[:, 0], noise_ref[:, 0]])

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(int(pra.tdoa(out, speech_ref[:, 0].astype(np.float64), phat=True)))
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    else:
        out_trunc = np.concatenate((np.zeros(-delay), out[:ref.shape[1] + delay]))
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    sig_eval = np.vstack([out_trunc, noise_eval])

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, None], sig_eval[:, :, None])

    # we are only interested in SDR and SIR for the speech source
    SDR_out = metric[0][0]
    SIR_out = metric[2][0]

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        upper = np.maximum(audio[:, 0].max(), out.max())
        sig_in = pra.highpass(audio[:, 0].astype(np.float64) / upper, fs_snd, fc=150)
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)

        f1 = os.path.join(args.save_sample, '{}_ch0_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_in)
        f2 = os.path.join(args.save_sample, '{}_out_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(speech_ref[:, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(leds.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, leds, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(leds)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, leds * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        #plt.plot(a_time, speech_ref[:, 0])
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)
        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal
        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return SDR_out, SIR_out
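# compute_gain is defined outside this snippet. As an assumption about its
# behavior (not the original implementation), it could compute a per-frequency
# scale that matches the beamformer output power to a reference channel, with
# optional clipping:
import numpy as np

def compute_gain(w, X, ref, clip_up=None, clip_down=None):
    # w: (n_freq, n_chan) weights, X: (n_frames, n_freq, n_chan) STFT frames,
    # ref: (n_frames, n_freq) STFT of the reference channel
    out = np.einsum('fc,tfc->tf', np.conj(w), X)      # beamformer output
    num = np.sqrt(np.mean(np.abs(ref) ** 2, axis=0))  # reference RMS per freq
    den = np.sqrt(np.mean(np.abs(out) ** 2, axis=0))  # output RMS per freq
    z = num / np.maximum(den, 1e-15)
    if clip_up is not None:
        z = np.minimum(z, clip_up)
    if clip_down is not None:
        z = np.maximum(z, clip_down)
    return z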
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
elif shape == 'Poisson':
    R = pra.poisson2DArray(mic1, M, d)
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some German speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D(
    [0, 0],
    room_dim,
    Fs,
    t0=t0,
    max_order=max_order_sim,
N = 1024

# Create a microphone array
if shape == "Circular":
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear_2D_array(mic1, M, phi, d)

# path to samples
path = os.path.dirname(__file__)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read(path + "/input_samples/singing_" + str(Fs) + ".wav")
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.0

# The second signal (interferer) is some German speech
rate2, signal2 = wavfile.read(path + "/input_samples/german_speech_" + str(Fs) + ".wav")
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.0

# Create the room
room_dim = [4, 6]
room1 = pra.ShoeBox(
    room_dim,
    absorption=absorption,
    fs=Fs,
def parallel_loop(filename, algo_names, pmt):
    ''' This is one loop of the computation
        extracted for parallelization '''

    # We need to do a bunch of imports
    import pyroomacoustics as pra
    import os
    import numpy as np
    from scipy.io import wavfile
    import mkl as mkl_service
    import copy

    import doa
    from tools import rfft

    # for such parallel processing, it is better
    # to deactivate multithreading in mkl
    mkl_service.set_num_threads(1)

    # extract the speaker names from filename
    name = os.path.splitext(os.path.basename(filename))[0]
    sources = name.split('-')

    # number of sources
    K = len(sources)

    # Import speech signal
    fs_file, rec_signals = wavfile.read(filename)

    # sanity check
    if pmt['fs'] != fs_file:
        raise ValueError("The sampling frequency of the files doesn't match that of the script")

    speech_signals = np.array(rec_signals[:, pmt['mic_select']], dtype=np.float32)

    # Remove the DC bias
    for s in speech_signals.T:
        s[:] = pra.highpass(s, pmt['fs'], 100.)

    if pmt['stft_win']:
        stft_win = np.hanning(pmt['nfft'])
    else:
        stft_win = None

    # Normalize the amplitude
    speech_signals *= pmt['scaling']

    # Compute STFT of signal
    # -------------------------
    y_mic_stft = []
    for k in range(speech_signals.shape[1]):
        y_stft = pra.stft(speech_signals[:, k], pmt['nfft'], pmt['stft_hop'],
                          transform=rfft, win=stft_win).T / np.sqrt(pmt['nfft'])
        y_mic_stft.append(y_stft)
    y_mic_stft = np.array(y_mic_stft)

    # estimate SNR in dB (on 1st microphone)
    sig_var = np.var(speech_signals)
    SNR = 10 * np.log10((sig_var - pmt['noise_var']) / pmt['noise_var'])

    freq_bins = copy.copy(pmt['freq_bins'][K - 1])

    # dict for output
    phi_recon = {}

    for alg in algo_names:

        # Use the convenient dictionary of algorithms defined
        d = doa.algos[alg](
            L=pmt['mic_array'],
            fs=pmt['fs'],
            nfft=pmt['nfft'],
            num_src=K,
            c=pmt['c'],
            theta=pmt['phi_grid'],
            max_four=pmt['M'],
            num_iter=pmt['num_iter'],
            G_iter=pmt['G_iter'])

        # perform localization
        d.locate_sources(y_mic_stft, freq_bins=freq_bins[alg])

        # store result
        phi_recon[alg] = d.phi_recon

    return SNR, sources, phi_recon
mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
room1.add_microphone_array(mics)
room1.add_source(source, delay=0, signal=xtone)
room1.add_source(interferer, delay=0, signal=silence)
room1.image_source_model(use_libroom=True)
room1.compute_rir()
room1.simulate()

# Rake MVDR simulation
BeamformerType = 'RakeMVDR'
good_sources = room1.sources[0][:max_order_design + 1]
bad_sources = room1.sources[1][:max_order_design + 1]
mics.rake_mvdr_filters(good_sources, bad_sources,
                       sigma2_n * np.eye(mics.Lg * mics.M))
output = mics.process()
out = pra.normalize(pra.highpass(output, Fs))
out = normalize(out)

# Rake Perceptual simulation
# BeamformerType = 'RakePerceptual'
# good_sources = room1.sources[0][:max_order_design+1]
# bad_sources = room1.sources[1][:max_order_design+1]
# mics.rake_perceptual_filters(good_sources,
#                              bad_sources,
#                              sigma2_n*np.eye(mics.Lg*mics.M))
# output = mics.process()
# out = pra.normalize(pra.highpass(output, Fs))

input_mic = pra.normalize(pra.highpass(mics.signals[mics.M // 2], Fs))
input_mic = normalize(input_mic)