def mix_and_separate2(sourceX, sourceY, noises):
    mix, y_hat, sir, sdr = createroom(
        sourceX, sourceY, noises, mic_p, mic_d, sour_p, sour_d,
        callback_mix, roomdim, absorption, max_order, n_mics, angle)
    sep1 = pra.normalize(y_hat.T[0], bits=16).astype(np.int16).T
    sep2 = pra.normalize(y_hat.T[1], bits=16).astype(np.int16).T
    return mix, sep1, sep2, sir, sdr
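# --- Usage sketch for mix_and_separate2 ---
# Hedged example: the wav file names below are hypothetical, `noises` is a
# list of noise wav paths, and the module-level geometry globals consumed by
# createroom (mic_p, mic_d, sour_p, sour_d, roomdim, absorption, max_order,
# n_mics, angle) are assumed to be defined already.
from scipy.io import wavfile

mix, sep1, sep2, sir, sdr = mix_and_separate2("bird1.wav", "bird2.wav", noises)
wavfile.write("mix.wav", 44100, mix)    # int16 mixture at the microphones
wavfile.write("sep1.wav", 44100, sep1)  # first separated source
wavfile.write("sep2.wav", 44100, sep2)  # second separated source
print("SIR:", sir, "SDR:", sdr)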
def perceptual_quality_evaluation(room_dim, mics, good_pos, good_index,
                                  bad_pos, bad_index, rir_location):
    print('start')

    import numpy as np
    from scipy.io import wavfile
    from os import getpid

    import pyroomacoustics as pra

    # number of sources to consider
    n_sources = np.arange(1, 8)
    S = n_sources.shape[0]

    # number of mics
    n_mic = mics.shape[1]

    # Set the speed of sound to match that of the measured RIR
    pra.constants.set('c', 345.5)

    Fs = 8000.
    N = 1024
    Lg = int(0.03 * Fs)  # 30 ms long filter
    delay_bf = 0.02
    sigma2_n = 1e-6

    # reflection coefficients of the walls (hand-waving)
    reflection = {
        'ground': 0.8,
        'south': 0.8,
        'west': 0.8,
        'north': 0.8,
        'east': 0.8,
        'ceilling': 0.5
    }

    speech_sample1 = 'samples/fq_sample1_8000.wav'
    speech_sample2 = 'samples/fq_sample2_8000.wav'

    # Create the room
    room = pra.ShoeBox3D(np.zeros(3), room_dim, Fs,
                         max_order=1,
                         absorption=reflection,
                         sigma2_awgn=sigma2_n)

    # Create the beamformer
    bf = pra.Beamformer(mics, Fs, N=N, Lg=Lg)
    room.addMicrophoneArray(bf)

    # data receptacles
    beamformer_names = ['Rake Perceptual', 'Rake MVDR']
    bf_weights_fun = [bf.rakePerceptualFilters, bf.rakeMVDRFilters]
    bf_fnames = ['1', '2']
    NBF = len(beamformer_names)

    # receptacle arrays
    pesq_input = np.zeros(2)
    pesq_bf = np.zeros((2, NBF, S))

    # create a single reference mic at the position of microphone 4
    ref_mic_n = 4
    ref_mic = pra.MicrophoneArray(bf.R[:, ref_mic_n, np.newaxis], Fs)

    # since we run multiple threads, we need to uniquely identify filenames
    pid = str(getpid())

    file_ref = 'output_samples/fqref' + pid + '.wav'
    file_suffix = '-' + pid + '.wav'
    files_bf = [
        'output_samples/fq' + str(i + 1) + file_suffix for i in range(NBF)
    ]
    file_raw = 'output_samples/fqraw' + pid + '.wav'

    # index of good and bad sources
    good = good_index
    bad = bad_index

    # Read the two speech samples used
    rate, good_signal = wavfile.read(speech_sample1)
    good_signal = np.array(good_signal, dtype='float64')
    good_signal = pra.normalize(good_signal)
    good_signal = pra.highpass(good_signal, rate)
    good_len = good_signal.shape[0] / float(Fs)

    rate, bad_signal = wavfile.read(speech_sample2)
    bad_signal = np.array(bad_signal, dtype='float64')
    bad_signal = pra.normalize(bad_signal)
    bad_signal = pra.highpass(bad_signal, rate)
    bad_len = bad_signal.shape[0] / float(Fs)

    # variance of good signal
    good_sigma2 = np.mean(good_signal**2)

    # normalize interference signal to have equal power with desired signal
    bad_signal *= good_sigma2 / np.mean(bad_signal**2)

    # distance from the array center to the good source
    good_distance = np.linalg.norm(bf.center[:, 0] - good_pos)

    # distance from the array center to the bad source
    bad_distance = np.linalg.norm(bf.center[:, 0] - bad_pos)

    if good_len > bad_len:
        good_delay = 0
        bad_delay = (good_len - bad_len) / 2.
    else:
        bad_delay = 0
        good_delay = (bad_len - good_len) / 2.
    # create the reference room for free-space, noiseless, interference-free simulation
    ref_room = pra.ShoeBox3D([0, 0, 0], room_dim, Fs, max_order=0)
    ref_room.addSource(good_pos, signal=good_signal, delay=good_delay)
    ref_room.addMicrophoneArray(ref_mic)
    ref_room.compute_RIR()
    ref_room.simulate()
    reference = pra.highpass(ref_mic.signals[0], Fs)
    reference_n = pra.normalize(reference)

    # save the reference desired signal
    #wavfile.write(file_ref, Fs, pra.to_16b(reference_n))
    new_ref = good_signal.copy()
    new_ref = pra.normalize(pra.highpass(new_ref, Fs))
    wavfile.write(file_ref, Fs, pra.to_16b(new_ref))

    # add the sources to the 'real' room
    room.addSource(good_pos, signal=good_signal, delay=good_delay)
    room.addSource(bad_pos, signal=bad_signal, delay=bad_delay)

    # read in the RIRs from file
    for r in range(n_mic):
        for s in [good_index, bad_index]:
            # read wav file
            fname_rir = rir_location % (r + 1, s + 1)
            rir_fs, rir = wavfile.read(fname_rir)
            rir = np.array(rir, dtype='float64')

            if rir_fs != Fs:
                raise NameError(
                    'The RIR and the signals do not have the same sampling rate.'
                )
                '''
                import scikits.samplerate as sr
                rir = sr.resample(rir, Fs/float(rir_fs), 'sinc_best')

                # the factor 2 was empirically determined to be necessary to get
                # amplitude of RIR in the correct ballpark.
                rir *= 2.
                '''

            room.rir.append([])
            room.rir[r].append(rir)

    # compute the input signal to the microphones
    room.simulate()

    # save the degraded signal at the reference microphone
    raw = bf.signals[ref_mic_n]
    raw_n = pra.normalize(pra.highpass(raw, Fs))
    wavfile.write(file_raw, Fs, pra.to_16b(raw_n))

    pesq_input = pra.pesq(file_ref, file_raw, Fs=Fs)

    for src in room.sources:
        src.setOrdering('strongest', ref_point=bf.center)

    for k, s in enumerate(n_sources):

        good_img = room.sources[0][:s]
        bad_img = room.sources[1][:s]

        for i, bfr in enumerate(beamformer_names):

            bf_weights_fun[i](good_img, bad_img,
                              sigma2_n * np.eye(n_mic * Lg),
                              delay=delay_bf)

            # run the beamformer
            output = bf.process()
            output = pra.normalize(pra.highpass(output, Fs))
            output = pra.time_align(reference_n, output)

            # save the file for PESQ evaluation
            wavfile.write(files_bf[i], Fs, pra.to_16b(output))

            # compute PESQ
            pesq_bf[:, i, k] = pra.pesq(file_ref, files_bf[i], Fs=Fs).T

    '''
    This is how you can compare the true RIRs with the image source model
    generated ones:

    plt.figure()
    for m in range(n_mic):
        rir_sim = room.sources[0].getRIR(mics[:,m], Fs)
        plt.subplot(3,3,m+1)
        plt.plot(room.rir[m][0][:rir_sim.shape[0]])
        plt.plot(rir_sim)
    plt.show()
    '''

    print('Finished')

    return pesq_input, pesq_bf
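# --- Usage sketch for perceptual_quality_evaluation ---
# Hedged example: the geometry and the RIR filename pattern are assumptions;
# mics must be a (3, n_mic) array of positions (at least 5 mics, since the
# reference mic index is 4) and rir_location a printf-style pattern indexed
# by (mic, source).
import numpy as np

room_dim = np.array([4.0, 6.0, 3.0])  # assumed room size
theta = np.arange(8) * 2 * np.pi / 8
mics = np.vstack([2.0 + 0.1 * np.cos(theta),
                  3.0 + 0.1 * np.sin(theta),
                  1.2 * np.ones(8)])   # (3, 8) circular microphone layout
good_pos = np.array([1.0, 1.0, 1.2])
bad_pos = np.array([3.0, 5.0, 1.2])

pesq_in, pesq_bf = perceptual_quality_evaluation(
    room_dim, mics, good_pos, good_index=0, bad_pos=bad_pos, bad_index=1,
    rir_location='rir/mic%d_source%d.wav')  # hypothetical filename pattern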
def play(self, src):
    sd.play(pra.normalize(src) * 0.75, samplerate=self.fs, blocking=False)
    speech_file_location, noise_file_location, room_dim, max_order,
    snr_vals, R, pos_source, pos_noise, N)

'''
Write to WAV + labelling of our processed noisy signals
'''
# drop the microphone dimension of the array, keeping only the first channel
noisy_signal_flatten = noisy_signal[:, 0, :]

# label the beamformed signals and compare their classification with the
# one obtained for the original noisy signals
score_processing = np.zeros(len(snr_vals))
score_original = np.zeros(len(snr_vals))

for i, snr in enumerate(snr_vals):
    print("SNR : %f dB" % snr)

    dest = os.path.join(dest_dir, "beamformed_signal_snr_db_%d.wav" % snr)
    signal = pra.normalize(noisy_signal_beamformed[i],
                           bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_processing[i] = label_wav(dest, labels_file, graph_file,
                                    speech.meta.as_dict()['word'])

    dest = os.path.join(dest_dir, "original_signal_snr_db_%d.wav" % (snr))
    signal = pra.normalize(noisy_signal_flatten[i],
                           bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_original[i] = label_wav(dest, labels_file, graph_file,
                                  speech.meta.as_dict()['word'])
    print()

# plot the results
plt.plot(snr_vals, score_processing, label="beamformed signal")
plt.plot(snr_vals, score_original, label="original")
    # back to time domain
    processed_audio[n:n + hop, ] = stft.synthesis(X)

    # update step
    n += hop

proc_time = time.time() - start_time
print("Processing time: {} minutes".format(proc_time / 60))

"""
Save and plot spectrogram
"""
wavfile.write(
    os.path.join(os.path.dirname(__file__), 'output_samples',
                 'denoise_output_IterativeWiener.wav'),
    fs,
    pra.normalize(processed_audio).astype(np.float32))
print("Noisy and denoised file written to: '%s'" %
      os.path.join(os.path.dirname(__file__), 'output_samples'))

signal_norm = signal / np.abs(signal).max()
processed_audio_norm = processed_audio / np.abs(processed_audio).max()

if plot_spec:
    min_val = -80
    max_val = -40
    plt.figure()
    plt.subplot(3, 1, 1)
    plt.specgram(noisy_signal[:n - hop], NFFT=256, Fs=fs, vmin=min_val,
             label=f"SIR {s+1}", marker="o")
plt.title(args.algo)
plt.legend()
plt.tight_layout(pad=0.5)

if not args.gui:
    plt.show()
else:
    plt.show(block=False)

if args.save:
    wavfile.write(
        "bss_iva_mix.wav",
        room.fs,
        pra.normalize(mix[0, :], bits=16).astype(np.int16),
    )
    for i, sig in enumerate(y_hat):
        wavfile.write(
            "bss_iva_source{}.wav".format(i + 1),
            room.fs,
            pra.normalize(sig, bits=16).astype(np.int16),
        )

if args.gui:
    from tkinter import Tk

    # Make a simple GUI to listen to the separated samples
    root = Tk()
    my_gui = PlaySoundGUI(root,
## Read target speech audio
while True:
    spe_id = random.randint(start_spe_id, end_spe_id)
    utt_key = sp_utts_scp[spe_id][0]
    spe_path = sp_utts_scp[spe_id][1]
    spe_name = file_name(pathName=spe_path)
    sample_rate, spe_wav = wavfile.read(spe_path)
    if len(spe_wav.shape) > 1:
        spe_wav = np.mean(spe_wav, 1)
    spe_wav = spe_wav.astype(np.float64)  # np.float is removed in recent NumPy
    if np.mean(np.abs(spe_wav)) > 0:
        break

spe_length = spe_wav.shape[0]
spe_wav = pra.normalize(spe_wav)
spe_wav = pra.highpass(spe_wav, Fs, 50)

room_mix.add_source(target_source, signal=spe_wav, delay=delay)
room_ref.add_source(target_source, signal=spe_wav, delay=delay)
#room_dir.add_source(target_source, signal=spe_wav, delay=delay)

## Read interfering speech audio
for it in range(0, interf_num):
    while True:
        while True:
            inf_id = random.randint(start_spe_id, end_spe_id)
            if np.abs(spe_id - inf_id) > 500:
                break
        inf_path = sp_utts_scp[inf_id][1]
        sample_rate, inf_wav = wavfile.read(
def modify_input_wav_beamforming(wav, noise, room_dim, max_order, snr_vals,
                                 mic_array, pos_source, pos_noise, N):

    fs_s, audio_anechoic = wavfile.read(wav)
    fs_n, noise_anechoic = wavfile.read(noise)

    # Create a room for the signal
    room_signal = pra.ShoeBox(room_dim,
                              absorption=0.2,
                              fs=fs_s,
                              max_order=max_order)

    # Create a room for the noise
    room_noise = pra.ShoeBox(room_dim,
                             absorption=0.2,
                             fs=fs_n,
                             max_order=max_order)

    # place the source of the signal and of the noise in their respective rooms
    room_signal.add_source(pos_source, signal=audio_anechoic)
    room_noise.add_source(pos_noise, signal=noise_anechoic)

    # add the microphone array
    mics_signal = pra.Beamformer(mic_array, room_signal.fs, N)
    mics_noisy = pra.Beamformer(mic_array, room_noise.fs, N)
    room_signal.add_microphone_array(mics_signal)
    room_noise.add_microphone_array(mics_noisy)

    # simulate both rooms
    room_signal.simulate()
    room_noise.simulate()

    # take the mic_array.signals from each room
    audio_reverb = room_signal.mic_array.signals
    noise_reverb = room_noise.mic_array.signals

    # design the beamforming filters
    mics_signal.rake_delay_and_sum_weights(room_signal.sources[0][:1])
    mics_noisy.rake_delay_and_sum_weights(room_signal.sources[0][:1])

    output_signal = mics_signal.process()
    output_noise = mics_noisy.process()

    # we are going to normalize the noise
    size = np.shape(audio_reverb)
    noise_normalized = np.zeros(size)

    # the noise must be at least as long as the audio signal
    if len(noise_reverb[0]) < len(audio_reverb[0]):
        raise ValueError(
            'the length of the noise signal is shorter than that of the audio signal!'
        )

    output_noise = output_noise[:len(output_signal)]
    norm_fact = np.linalg.norm(noise_reverb[-1])
    noise_normalized = output_noise / norm_fact

    # initialize the array of noisy signals
    noisy_signal = np.zeros([len(snr_vals), np.shape(output_signal)[0]])
    for i, snr in enumerate(snr_vals):
        noise_std = np.linalg.norm(audio_reverb[-1]) / (10**(snr / 20.))
        final_noise = noise_normalized * noise_std
        noisy_signal[i] = pra.normalize(
            pra.highpass(output_signal + final_noise, fs_s))

    return noisy_signal
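# --- Usage sketch for modify_input_wav_beamforming ---
# Hedged example under assumed geometry: the file paths are hypothetical and
# a small circular array in a 2-D room is used; mic_array must be shaped
# (ndim, M) since it is passed straight to pra.Beamformer.
import numpy as np
import pyroomacoustics as pra

room_dim = [5, 4]  # assumed 2-D room, metres
mic_array = pra.circular_2D_array(center=[2.5, 2.0], M=6, phi0=0.0,
                                  radius=0.05)  # (2, 6) positions

snr_vals = np.arange(0, 30, 5)  # assumed SNR range in dB
noisy = modify_input_wav_beamforming(
    "speech.wav", "noise.wav", room_dim, max_order=3, snr_vals=snr_vals,
    mic_array=mic_array, pos_source=[1.0, 1.0], pos_noise=[4.0, 3.0], N=1024)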
    processed_signal[i] = sp.process.denoise(noisy_signal[i], fft_len,
                                             lpc_order, iterations)
    processed_signal_VAD[i], _, _ = sp.process.denoise_with_vad(
        noisy_signal[i], sr, fft_len, lpc_order, iterations, alpha)

'''
Write to WAV + labelling of our processed noisy signals
'''
# label each denoised signal and compare its classification with the one
# obtained for the original noisy signal
score_processing = np.zeros(len(snr_vals))
score_processing_VAD = np.zeros(len(snr_vals))
score_original = np.zeros(len(snr_vals))

for i, snr in enumerate(snr_vals):
    print("SNR : %f dB" % snr)

    dest = os.path.join(dest_dir, "denoised_snr_db_%d.wav" % (snr))
    signal = pra.normalize(processed_signal[i], bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_processing[i] = label_wav(dest, labels_file, graph_file,
                                    speech.meta.as_dict()['word'])

    dest = os.path.join(dest_dir, "denoised_with_VAD_snr_db_%d.wav" % (snr))
    signal = pra.normalize(processed_signal_VAD[i],
                           bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_processing_VAD[i] = label_wav(dest, labels_file, graph_file,
                                        speech.meta.as_dict()['word'])

    dest = os.path.join(dest_dir, "noisy_snr_db_%d.wav" % (snr))
    signal = pra.normalize(noisy_signal[i], bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_map_original[w] = np.zeros([sub, len(snr_vals)])
    score_map_processing[w] = np.zeros([sub, len(snr_vals)])

# now compute the labelling
idx = 0
for s in speech_samps:
    for i, snr in enumerate(snr_vals):
        word = s.meta.as_dict()['word']

        # destination of the processed signal
        dest_pro = os.path.join(
            dest_dir, "processed_signal%d%s_snr_db_%d" % (idx, word, snr))

        # destination of the original signal
        dest_ori = os.path.join(
            dest_dir, "original_signal%d%s_snr_db_%d" % (idx, word, snr))

        # noisy processed signal
        noisy_pro = pra.normalize(processed_audio_map[s][i],
                                  bits=16).astype(np.int16)
        wavfile.write(dest_pro, 16000, noisy_pro)

        # noisy original signal
        noisy_ori = pra.normalize(noisy_signal[s][i],
                                  bits=16).astype(np.int16)
        wavfile.write(dest_ori, 16000, noisy_ori)

        # update the score maps
        print("score for processed signal: ")
        score_map_processing[word][idx][i] = label_wav(
            dest_pro, labels_file, graph_file, word)
        print()
        print("score for original signal: ")
        score_map_original[word][idx][i] = label_wav(
            dest_ori, labels_file, graph_file, word)
        print()
    idx += 1
def modify_input_wav_multiple_mics(wav, noise, room_dim, max_order, snr_vals,
                                   mic_array, pos_source, pos_noise):

    fs_s, audio_anechoic = wavfile.read(wav)
    fs_n, noise_anechoic = wavfile.read(noise)

    # Create a room for the signal
    room_signal = pra.ShoeBox(room_dim,
                              absorption=0.2,
                              fs=fs_s,
                              max_order=max_order)

    # Create a room for the noise
    room_noise = pra.ShoeBox(room_dim,
                             absorption=0.2,
                             fs=fs_n,
                             max_order=max_order)

    # place the source of the signal and of the noise in their respective rooms
    room_signal.add_source(pos_source, signal=audio_anechoic)
    room_noise.add_source(pos_noise, signal=noise_anechoic)

    # add the microphone array to both rooms
    room_signal.add_microphone_array(
        pra.MicrophoneArray(mic_array.T, room_signal.fs))
    room_noise.add_microphone_array(
        pra.MicrophoneArray(mic_array.T, room_noise.fs))

    # simulate both rooms
    room_signal.simulate()
    room_noise.simulate()

    # take the mic_array.signals from each room
    audio_reverb = room_signal.mic_array.signals
    noise_reverb = room_noise.mic_array.signals

    shape = np.shape(audio_reverb)
    noise_normalized = np.zeros(shape)

    # the noise must be at least as long as the audio signal
    if len(noise_reverb[0]) < len(audio_reverb[0]):
        raise ValueError(
            'the length of the noise signal is shorter than that of the audio signal!'
        )

    noise_reverb = noise_reverb[:, :len(audio_reverb[0])]
    norm_fact = np.linalg.norm(noise_reverb[0])
    noise_normalized = noise_reverb / norm_fact

    # initialize the array of noisy signals
    noisy_signal = np.zeros([len(snr_vals), shape[0], shape[1]])
    for i, snr in enumerate(snr_vals):
        noise_std = np.linalg.norm(audio_reverb[0]) / (10**(snr / 20.))
        for m in range(shape[0]):
            final_noise = noise_normalized[m] * noise_std
            noisy_signal[i][m] = pra.normalize(audio_reverb[m] + final_noise)

    return noisy_signal
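# --- Why the noise is scaled this way ---
# The loop above fixes the SNR in amplitude form: the noise is first reduced
# to unit norm (norm_fact) and then rescaled so that
# ||signal|| / ||noise|| = 10**(SNR/20). A self-contained numerical check:
import numpy as np

rng = np.random.default_rng(0)
sig = rng.standard_normal(16000)
noise = rng.standard_normal(16000)

snr_db = 10.0
noise_unit = noise / np.linalg.norm(noise)  # the norm_fact step
noise_scaled = noise_unit * np.linalg.norm(sig) / 10**(snr_db / 20.0)

measured = 20 * np.log10(np.linalg.norm(sig) / np.linalg.norm(noise_scaled))
print(measured)  # exactly 10.0 dB, matching snr_db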
    under different SNRs.
    """
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # truncate the beamformed noise
    noise_bf = noise_bf[:len(speech_bf)]

    # compute the score for the different SNR values
    print()
    score_beamformed = np.empty(len(snr_vals))
    score_single = np.empty(len(snr_vals))
    for idx, snr in enumerate(snr_vals):

        noisy_signal = speech_bf + snr_facts[idx] * noise_bf
        noisy_signal = pra.normalize(pra.highpass(noisy_signal, fs_s),
                                     bits=16).astype(np.int16)
        dest = os.path.join(dest_dir, "das_bf_snr_db_%d.wav" % (snr))
        wavfile.write(dest, fs_s, noisy_signal)
        score_beamformed[idx] = label_wav(dest, labels_file, graph_file,
                                          speech_samp.meta.word)

        # compute the score for a single microphone, as a reference
        single_mic = ref_mic_sig + snr_facts[idx] * ref_mic_noise
        single_mic = pra.normalize(pra.highpass(single_mic, fs_s),
                                   bits=16).astype(np.int16)
        dest = os.path.join(dest_dir, "single_mic_snr_db_%d.wav" % (snr))
        wavfile.write(dest, fs_s, single_mic)
        score_single[idx] = label_wav(dest, labels_file, graph_file,
                                      speech_samp.meta.word)

    plt.figure()
idx = 0
for s in speech_samps:
    for i, snr in enumerate(snr_vals):
        word = s.meta.as_dict()['word']

        # destination of the processed signal
        dest_pro = os.path.join(
            dest_dir, "processed_signal%d%s_snr_db_%d" % (idx, word, snr))

        # destination of the processed-with-VAD signal
        dest_pro_vad = os.path.join(
            dest_dir, "processed_signal_VAD%d%s_snr_db_%d" % (idx, word, snr))

        # destination of the original signal
        dest_ori = os.path.join(
            dest_dir, "original_signal%d%s_snr_db_%d" % (idx, word, snr))

        # noisy processed signal
        noisy_pro = pra.normalize(processed_signal[s][i],
                                  bits=16).astype(np.int16)
        wavfile.write(dest_pro, 16000, noisy_pro)

        # noisy VAD+processed signal
        noisy_pro_vad = pra.normalize(processed_signal_VAD[s][i],
                                      bits=16).astype(np.int16)
        wavfile.write(dest_pro_vad, 16000, noisy_pro_vad)

        # noisy original signal
        noisy_ori = pra.normalize(noisy_signal[s][i],
                                  bits=16).astype(np.int16)
        wavfile.write(dest_ori, 16000, noisy_ori)

        # update the score maps
        print("score for original signal: ")
        score_map_original[word][idx][i] = label_wav(
            dest_ori, labels_file, graph_file, word)
        print()
        print("score for denoised signal: ")
def line_createroom(Bird1, Bird2, Bird3, callback_mix):
    roomdim = np.array([20, 20, 10])
    max_order = 17
    absorption = 0.9
    mic_p = [13, 10, 3.5]  # mic center point
    mic_d = 0.015          # mic distance
    sour_p = [7, 10, 6]    # source position
    sour_d = 5             # source distance
    n_mics = 4             # number of mics
    n_sources = 3
    mic_rot = np.pi / 2
    bird_rot = np.pi / 2

    ### params setting ###
    np.random.seed(10)

    # STFT parameters
    framesize = 4096
    win_a = pra.hann(framesize)
    win_s = pra.transform.compute_synthesis_window(win_a, framesize // 2)

    ogive_mu = 0.1
    ogive_update = "switching"
    ogive_iter = 2000

    SIR = 10  # dB
    SNR = 60  # dB
    algo = algo_choices[0]
    no_cb = True
    save = True
    n_iter = 60
    dist = "gauss"  # "gauss" or "laplace"
    fs = 44100

    n_sources_target = 3
    assert n_sources_target <= n_mics, "More sources than microphones is not supported"

    source_std = np.ones(n_sources_target)

    # room size
    room_dim = roomdim

    # microphone positions
    mic_locs = semi_line_layout(mic_p, mic_rot, mic_d, n_mics)

    # target positions
    source_locs = semi_line_layout(sour_p, bird_rot, sour_d, n_sources)
    # push value (offset applied to the first source position)
    source_locs[0][0], source_locs[0][2] = source_locs[0][0] + 0, source_locs[0][2] + 0
    # target_locs = np.transpose([[7, 10, 6], [9, 16, 6]])

    # load the audio
    wav_files = [Bird1, Bird2, Bird3]
    signals = wav_read_center(wav_files, seed=123)

    # create the room
    room = pra.ShoeBox(room_dim,
                       fs=44100,
                       absorption=absorption,
                       max_order=max_order,
                       air_absorption=True,
                       humidity=50)

    # add the sources
    for sig, loc in zip(signals, source_locs.T):
        room.add_source(loc, signal=sig)

    # add the microphone array
    room.add_microphone_array(pra.MicrophoneArray(mic_locs, fs=room.fs))

    callback_mix_kwargs = {
        "snr": SNR,
        "sir": SIR,
        "n_src": n_sources,
        "n_tgt": n_sources_target,
        "src_std": source_std,
        "ref_mic": 0,
    }

    # # draw the scene
    # import matplotlib.pyplot as plt
    # x, y = mic_locs[:2][0], mic_locs[:2][1]
    # plt.scatter(x, y)
    # x1, y1 = source_locs[:2][0], source_locs[:2][1]
    # plt.scatter(x1, y1)
    # plt.xlim([0, 20])
    # plt.ylim([0, 20])
    # plt.axis('equal')
    # plt.show()

    # Run the simulation
    separate_recordings = room.simulate(
        callback_mix=callback_mix,
        callback_mix_kwargs=callback_mix_kwargs,
        return_premix=True,
    )
    mics_signals = room.mic_array.signals
    print("line Simulation done.")

    # Monitor convergence
    ref = np.moveaxis(separate_recordings, 1, 2)
    if ref.shape[0] < n_mics:
        ref = np.concatenate(
            (ref,
             np.random.randn(n_mics - ref.shape[0], ref.shape[1],
                             ref.shape[2])),
            axis=0,
        )

    convergence_callback = None

    # shape: (n_frames, n_freq, n_mics)
    X_all = pra.transform.analysis(mics_signals.T, framesize, framesize // 2,
                                   win=win_a).astype(np.complex128)
    X_mics = X_all[:, :, :n_mics]

    tic = time.perf_counter()

    if algo == "auxiva":
        # Run AuxIVA
        Y = overiva(
            X_mics,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            callback=convergence_callback,
        )
    elif algo == "auxiva_pca":
        # Run AuxIVA with PCA
        Y = auxiva_pca(
            X_mics,
            n_src=n_sources_target,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            callback=convergence_callback,
        )
    elif algo == "overiva":
        # Run OverIVA
        Y = overiva(
            X_mics,
            n_src=n_sources_target,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    elif algo == "ilrma":
        # Run ILRMA
        Y = pra.bss.ilrma(
            X_mics,
            n_iter=n_iter,
            n_components=2,
            proj_back=True,
            callback=convergence_callback,
        )
    elif algo == "ogive":
        # Run OGIVE
        Y = ogive(
            X_mics,
            n_iter=ogive_iter,
            step_size=ogive_mu,
            update=ogive_update,
            proj_back=True,
            model=dist,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    elif algo == "ogive_matlab":
        # Run the MATLAB wrapper of OGIVE
        Y = ogive_matlab_wrapper(
            X_mics,
            n_iter=ogive_iter,
            step_size=ogive_mu,
            update=ogive_update,
            proj_back=True,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    else:
        raise ValueError("No such algorithm {}".format(algo))

    # Run the iSTFT
    if Y.shape[2] == 1:
        y = pra.transform.synthesis(Y[:, :, 0], framesize, framesize // 2,
                                    win=win_s)[:, None]
        y = y.astype(np.float64)
    else:
        y = pra.transform.synthesis(Y, framesize, framesize // 2,
                                    win=win_s).astype(np.float64)

    # If some of the outputs are uniformly zero, just add a bit of noise so
    # they can still be compared
    for k in range(y.shape[1]):
        if np.sum(np.abs(y[:, k])) < 1e-10:
            y[:, k] = np.random.randn(y.shape[0]) * 1e-10

    # For conventional BSS methods, reorder the signals by decreasing power
    if algo != "blinkiva":
        new_ord = np.argsort(np.std(y, axis=0))[::-1]
        y = y[:, new_ord]

    # Compare SIR
    m = np.minimum(y.shape[0] - framesize // 2, ref.shape[1])
    sdr, sir, sar, perm = bss_eval_sources(
        ref[:n_sources_target, :m, 0],
        y[framesize // 2:m + framesize // 2, :n_sources_target].T,
    )

    # reorder the vector of reconstructed signals
    y_hat = y[:, perm]

    mixdata = pra.normalize(mics_signals, bits=16).astype(np.int16).T
    separationdata = []
    for sig in y_hat.T:
        separationdata.append(pra.normalize(sig, bits=16).astype(np.int16).T)

    print("sdr", sdr)
    return sdr, sir, mixdata, separationdata
    # wavefile(mixdata)
    # wavefile(separationdata[0])
    # wavefile(separationdata[1])
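# --- What bss_eval_sources returns ---
# Hedged sketch: bss_eval_sources is assumed to be mir_eval's BSS Eval
# implementation; it returns per-source SDR/SIR/SAR plus the permutation that
# best matches estimates to references (used above to reorder y into y_hat).
import numpy as np
from mir_eval.separation import bss_eval_sources  # assumed origin

rng = np.random.default_rng(0)
ref = rng.standard_normal((2, 16000))                     # (n_src, n_samples)
est = ref[::-1] + 0.05 * rng.standard_normal((2, 16000))  # swapped, noisy

sdr, sir, sar, perm = bss_eval_sources(ref, est)
print(perm)  # e.g. [1 0]; est[perm] re-aligns estimates with the references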
Lg = int(np.ceil(Lg_t * Fs))
Lgp = np.floor(0.4 * Lg)
Lgm = Lg - Lgp
print('Lg=', Lg)

# create a microphone array
if shape == 'Circular':
    R = circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N, Lg=Lg, hop=hop, zpf=zp, zpb=zp)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D([0, 0],
                           room_dim,
                           Fs,
                           t0=t0,
mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
roomPoly.add_microphone_array(mics)
roomPoly.add_source(source, delay=0, signal=xtone)
roomPoly.add_source(interferer, delay=0, signal=silence)
roomPoly.image_source_model(use_libroom=True)
roomPoly.compute_rir()
roomPoly.simulate()

# Rake MVDR simulation
BeamformerType = 'RakeMVDR'
good_sources = roomPoly.sources[0][:max_order_design + 1]
bad_sources = roomPoly.sources[1][:max_order_design + 1]
mics.rake_mvdr_filters(good_sources, bad_sources,
                       sigma2_n * np.eye(mics.Lg * mics.M))

output = mics.process()
out = pra.normalize(pra.highpass(output, Fs))
out = normalize(out)

# Rake Perceptual simulation
# BeamformerType = 'RakePerceptual'
# good_sources = room1.sources[0][:max_order_design+1]
# bad_sources = room1.sources[1][:max_order_design+1]
# mics.rake_perceptual_filters(good_sources,
#                              bad_sources,
#                              sigma2_n*np.eye(mics.Lg*mics.M))
# output = mics.process()
# out = pra.normalize(pra.highpass(output, Fs))

# input_mic = pra.normalize(pra.highpass(mics.signals[mics.M//2], Fs))
# input_mic = normalize(input_mic)
# compute the MaxSINR beamformer
w = [
    la.eigh(rs, b=rn, eigvals=(M - 1, M - 1))[1]
    for rs, rn in zip(Rs[1:], Rn[1:])
]
w = np.squeeze(np.array(w))
w /= la.norm(w, axis=1)[:, None]
w = np.concatenate([np.ones((1, M)) / np.sqrt(M), w], axis=0)

# Compute the gain
ref = X[vad_x, :, 0]
#z = compute_gain(w, X[vad_x,:,:], ref, clip_up=1.0, clip_down=0.1)
z = compute_gain(w, X[vad_x, :, :], ref, clip_up=2.0)
#z = compute_gain(w, X[vad_x,:,:], ref)

sig_in = pra.normalize(mics.signals[0])

mics.weights = w.T
room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 1800, 2000])
plt.title('No matching')

plt.figure()
mics.plot_beam_response()
plt.title('No matching')

sig_out_flat = mics.process()
sig_out_flat = pra.normalize(sig_out_flat)

mics.weights = (z[:, None] * w).T
sig_out_ref0 = mics.process()
sig_out_ref0 = pra.normalize(sig_out_ref0)
def createroom(mic_p, mic_d, sour_p, sour_d, callback_mix, roomdim,
               absorption, max_order, n_mics, angle):

    np.random.seed(10)

    # STFT parameters
    framesize = 4096
    win_a = pra.hann(framesize)
    win_s = pra.transform.compute_synthesis_window(win_a, framesize // 2)

    # algorithm parameters
    # OGIVE parameters
    ogive_mu = 0.1
    ogive_update = "switching"
    ogive_iter = 2000

    SIR = 10  # dB
    SNR = 60  # dB, the SNR with respect to a single target source and microphone self-noise

    ######## separation params ########
    algo = algo_choices[0]
    no_cb = True
    save = True
    n_iter = 60
    dist = "gauss"  # "gauss" or "laplace"

    ######## params set ########
    fs = 44100
    n_sources = 2
    n_mics = n_mics
    n_sources_target = 2
    assert n_sources_target <= n_mics, "More sources than microphones is not supported"

    # set the source powers, the first one is half
    source_std = np.ones(n_sources_target)

    # room size
    room_dim = roomdim

    # microphone positions
    rot = angle
    offset = np.pi - rot / 2
    mic_locs = semi_circle_layout(mic_p, rot, mic_d, n_mics, rot=offset)

    # target positions
    target_locs = np.transpose([[7, 10, 6], [9, 16, 6]])

    # interferer positions
    interferer_locs = random_layout([14, 0, 6],
                                    n_sources - n_sources_target,
                                    offset=[5, 20, 3],
                                    seed=1)
    source_locs = target_locs

    # load the audio
    wav_files = [amBird, saBird]
    signals = wav_read_center(wav_files, seed=123)

    # create the room
    room = pra.ShoeBox(room_dim,
                       fs=44100,
                       absorption=absorption,
                       max_order=max_order,
                       air_absorption=True,
                       humidity=50)

    # add the sources
    for sig, loc in zip(signals, source_locs.T):
        room.add_source(loc, signal=sig)

    # add the microphone array
    room.add_microphone_array(pra.MicrophoneArray(mic_locs, fs=room.fs))

    callback_mix_kwargs = {
        "snr": SNR,
        "sir": SIR,
        "n_src": n_sources,
        "n_tgt": n_sources_target,
        "src_std": source_std,
        "ref_mic": 0,
    }

    # Run the simulation
    separate_recordings = room.simulate(
        callback_mix=callback_mix,
        callback_mix_kwargs=callback_mix_kwargs,
        return_premix=True,
    )
    mics_signals = room.mic_array.signals
    print("Simulation done.")
    # rt60 = room.measure_rt60()
    # print(rt60)

    # Monitor convergence
    ref = np.moveaxis(separate_recordings, 1, 2)
    if ref.shape[0] < n_mics:
        ref = np.concatenate(
            (ref,
             np.random.randn(n_mics - ref.shape[0], ref.shape[1],
                             ref.shape[2])),
            axis=0,
        )

    SDR, SIR, cost_func = [], [], []
    convergence_callback = None

    # START BSS
    # shape: (n_frames, n_freq, n_mics)
    X_all = pra.transform.analysis(mics_signals.T, framesize, framesize // 2,
                                   win=win_a).astype(np.complex128)
    X_mics = X_all[:, :, :n_mics]

    tic = time.perf_counter()

    # Run BSS
    if algo == "auxiva":
        # Run AuxIVA
        Y = overiva(
            X_mics,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            callback=convergence_callback,
        )
    elif algo == "auxiva_pca":
        # Run AuxIVA with PCA
        Y = auxiva_pca(
            X_mics,
            n_src=n_sources_target,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            callback=convergence_callback,
        )
    elif algo == "overiva":
        # Run OverIVA
        Y = overiva(
            X_mics,
            n_src=n_sources_target,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    elif algo == "ilrma":
        # Run ILRMA
        Y = pra.bss.ilrma(
            X_mics,
            n_iter=n_iter,
            n_components=2,
            proj_back=True,
            callback=convergence_callback,
        )
    elif algo == "ogive":
        # Run OGIVE
        Y = ogive(
            X_mics,
            n_iter=ogive_iter,
            step_size=ogive_mu,
            update=ogive_update,
            proj_back=True,
            model=dist,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    elif algo == "ogive_matlab":
        # Run the MATLAB wrapper of OGIVE
        Y = ogive_matlab_wrapper(
            X_mics,
            n_iter=ogive_iter,
            step_size=ogive_mu,
            update=ogive_update,
            proj_back=True,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    else:
        raise ValueError("No such algorithm {}".format(algo))

    toc = time.perf_counter()

    # Run the iSTFT
    if Y.shape[2] == 1:
        y = pra.transform.synthesis(Y[:, :, 0], framesize, framesize // 2,
                                    win=win_s)[:, None]
        y = y.astype(np.float64)
    else:
        y = pra.transform.synthesis(Y, framesize, framesize // 2,
                                    win=win_s).astype(np.float64)

    # If some of the outputs are uniformly zero, just add a bit of noise so
    # they can still be compared
    for k in range(y.shape[1]):
        if np.sum(np.abs(y[:, k])) < 1e-10:
            y[:, k] = np.random.randn(y.shape[0]) * 1e-10

    # For conventional BSS methods, reorder the signals by decreasing power
    if algo != "blinkiva":
        new_ord = np.argsort(np.std(y, axis=0))[::-1]
        y = y[:, new_ord]

    # Compare SIR
    m = np.minimum(y.shape[0] - framesize // 2, ref.shape[1])
    sdr, sir, sar, perm = bss_eval_sources(
        ref[:n_sources_target, :m, 0],
        y[framesize // 2:m + framesize // 2, :n_sources_target].T,
    )

    # reorder the vector of reconstructed signals
    y_hat = y[:, perm]
    print("SDR:", sdr)
    print("SIR:", sir)

    #### save the mix and the separated sources ####
    if save:
        from scipy.io import wavfile

        wavfile.write(
            "birdmix.wav",
            room.fs,
            (pra.normalize(mics_signals, bits=16).astype(np.int16).T)[:, 0],
        )
        for i, sig in enumerate(y_hat.T):
            wavfile.write(
                "birdsep{}.wav".format(i + 1),
                room.fs,
                pra.normalize(sig, bits=16).astype(np.int16).T,
            )
    score_map_original[w] = np.zeros([sub, len(snr_vals)])
    score_map_processing[w] = np.zeros([sub, len(snr_vals)])

# now compute the labelling
idx = 0
for s in speech_samps:
    for i, snr in enumerate(snr_vals):
        word = s.meta.as_dict()['word']

        # destination of the processed signal
        dest_pro = os.path.join(
            dest_dir, "processed_signal%d%s_snr_db_%d" % (idx, word, snr))

        # destination of the original signal
        dest_ori = os.path.join(
            dest_dir, "original_signal%d%s_snr_db_%d" % (idx, word, snr))

        # noisy beamformed signal
        noisy_pro = pra.normalize(beamformed_signal[s][i],
                                  bits=16).astype(np.int16)
        wavfile.write(dest_pro, 16000, noisy_pro)

        # noisy original signal
        noisy_ori = pra.normalize(noisy_signal[s][i],
                                  bits=16).astype(np.int16)
        wavfile.write(dest_ori, 16000, noisy_ori)

        # update the score maps
        print("score for processed signal: ")
        score_map_processing[word][idx][i] = label_wav(
            dest_pro, labels_file, graph_file, word)
        print()
        print("score for original signal: ")
        score_map_original[word][idx][i] = label_wav(
            dest_ori, labels_file, graph_file, word)
        print()
    idx += 1
delay = 0.02

# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D([0, 0],
                           room_dim,
                           Fs,
def make_noisy(args, thread_id, num_make_utts):
    spe_utt_ids, noise_utt_ids, diffuse_utt_ids, text_dict, utt2spk_dict, utt2data_dict = load_data(args)

    audio_parser = AudioParser()

    spe_utt_size = len(spe_utt_ids) if spe_utt_ids is not None else 0
    noise_utt_size = len(noise_utt_ids) if noise_utt_ids is not None else 0
    diffuse_utt_size = len(diffuse_utt_ids) if diffuse_utt_ids is not None else 0

    noisy_scp_list = []
    noisy_utt2spk = []
    noisy_text_dict = []
    mix2info = []

    num_utts = 0
    all_angle = 360.0
    Targ_Ang_Num = args.num_targ_ang
    Targ_Ang_Resolution = all_angle / Targ_Ang_Num if Targ_Ang_Num > 0 else 0.0

    save_mix = args.save_mix
    save_reverb = args.save_reverb
    save_clean = args.save_clean

    while True:
        ## Pick a random room
        room_x = random.uniform(args.min_room_length, args.max_room_length)
        room_y = random.uniform(args.min_room_weidth, args.max_room_weidth)
        room_z = random.uniform(args.min_room_height, args.max_room_height)
        room_dim = [room_x, room_y, room_z]

        ## Create the rooms
        T60 = random.uniform(args.min_T60, args.max_T60)
        absorption, max_order = pra.inverse_sabine(T60, room_dim)

        if save_mix:
            room_mix = pra.ShoeBox(room_dim, fs=args.sample_rate,
                                   materials=pra.Material(absorption),
                                   max_order=max_order, sigma2_awgn=None)
        else:
            room_mix = None
        if save_reverb:
            room_ref = pra.ShoeBox(room_dim, fs=args.sample_rate,
                                   materials=pra.Material(absorption),
                                   max_order=max_order, sigma2_awgn=None)
        else:
            room_ref = None
        if save_clean:
            room_dir = pra.ShoeBox(room_dim, fs=args.sample_rate,
                                   materials=pra.Material(0.99999),
                                   max_order=max_order, sigma2_awgn=None)
        else:
            room_dir = None

        ## Pick a random position for the microphone array
        mic_x = random.uniform(args.min_mic_x, room_x - args.min_mic_x)
        mic_y = random.uniform(args.min_mic_y, room_y - args.min_mic_y)
        mic_z = random.uniform(
            args.min_mic_z,
            max(min(room_z - args.min_mic_z, 2.0), args.min_mic_z + 0.5))

        ## Compute the positions of the microphones
        mic_xyz = []
        for m in range(args.num_mic):
            mic_pos = args.mic_pos[m]
            x = mic_x + mic_pos[0]
            y = mic_y + mic_pos[1]
            z = mic_z
            mic_xyz.append([x, y, z])
        mic_xyz = np.array(mic_xyz)  # (num_mic, 3)
        mic_xyz = mic_xyz.T          # (3, num_mic)

        ## Add the microphone array
        mic_array = pra.MicrophoneArray(mic_xyz, args.sample_rate)
        if room_mix is not None:
            room_mix = room_mix.add_microphone_array(mic_array)
        if room_ref is not None:
            room_ref = room_ref.add_microphone_array(mic_array)
        if room_dir is not None:
            room_dir = room_dir.add_microphone_array(mic_array)
        ##print("room = [%.2f %.2f %.2f], micro = [%.2f %.2f %.2f]" % (room_x, room_y, room_z, mic_x, mic_y, mic_z))

        ## Add the target source to room_mix and room_ref
        target_source = None
        while True:
            if args.num_targ_ang <= 0.0:
                targ_ang = random.randint(0, int(all_angle))
            else:
                targ_ang = int(random.randint(0, Targ_Ang_Num - 1) * Targ_Ang_Resolution)
            targ_theta = np.pi * targ_ang / 180.0
            targ_dist = random.uniform(args.min_targ_distance,
                                       args.max_targ_distance)

            targ_x = mic_x + np.cos(targ_theta) * targ_dist
            targ_y = mic_y + np.sin(targ_theta) * targ_dist
            targ_z = mic_z

            target_source = [targ_x, targ_y, targ_z]
            if (targ_x < (room_x - 0.5) and targ_x > 0.5) and (targ_y < (room_y - 0.5) and targ_y > 0.5):
                break

        if target_source is None or not room_mix.is_inside(target_source):
            continue
        ##print("room = [%.2f %.2f %.2f], target_source = [%.2f %.2f %.2f]" % (room_x, room_y, room_z, target_source[0], target_source[1], target_source[2]))
        ##print("targ_ang = %d, targ_dist %.2f" % (targ_ang, targ_dist))

        targ_tdoa = targ_ang
        if args.is_linear_mic and targ_tdoa > 180:
            targ_tdoa = 360.0 - targ_tdoa
        ## Add interference sources to room_mix
        num_interf = min(random.randint(1, args.max_num_interf), 1)
        interf_angs = []
        interf_dists = []
        interf_source = []
        while True:
            interf_ang = random.randint(0, int(all_angle))
            interf_tdoa = interf_ang
            if args.is_linear_mic and interf_tdoa > 180:
                interf_tdoa = 360.0 - interf_tdoa
            if np.abs(targ_tdoa - interf_tdoa) < args.minAD:
                continue
            interf_theta = np.pi * interf_ang / 180.0
            interf_dist = random.uniform(args.min_interf_distance,
                                         args.max_interf_distance)

            interf_x = mic_x + np.cos(interf_theta) * interf_dist
            interf_y = mic_y + np.sin(interf_theta) * interf_dist
            interf_z = mic_z

            ainterf_source = [interf_x, interf_y, interf_z]
            if (interf_x < (room_x - 0.5) and interf_x > 0.5) and (interf_y < (room_y - 0.5) and interf_y > 0.5):
                interf_angs.append(interf_ang)
                interf_dists.append(interf_dist)
                interf_source.append(ainterf_source)
            if len(interf_source) >= num_interf:
                break
        ##print("interf_ang = %d, interf_dist %.2f, num_interf = %d" % (interf_ang, interf_dist, len(interf_source)))

        for sim in range(args.nutt_per_room):
            if room_mix is not None:
                room_mix.sources = []
            if room_ref is not None:
                room_ref.sources = []
            if room_dir is not None:
                room_dir.sources = []

            ## Add speech to the microphone array
            while True:
                spe_idx = random.randint(0, spe_utt_size - 1)
                spe_key, spe_path = spe_utt_ids[spe_idx]
                spe_wav = audio_parser.WaveData(spe_path,
                                                sample_rate=args.sample_rate)
                if spe_wav is None or spe_wav.shape[0] < args.sample_rate:
                    continue
                spe_wav = np.squeeze(spe_wav)
                if np.mean(np.abs(spe_wav)) > 0:
                    break

            spe_length = spe_wav.shape[0]
            spe_wav = pra.normalize(spe_wav)
            spe_wav = pra.highpass(spe_wav, args.sample_rate, 50)

            if room_mix is not None and room_mix.is_inside(target_source):
                room_mix = room_mix.add_source(target_source, signal=spe_wav, delay=0)
            else:
                print("target_source not in room_mix")
                continue
            if room_ref is not None and room_ref.is_inside(target_source):
                room_ref = room_ref.add_source(target_source, signal=spe_wav, delay=0)
            else:
                print("target_source not in room_ref")
            if room_dir is not None and room_dir.is_inside(target_source):
                room_dir = room_dir.add_source(target_source, signal=spe_wav, delay=0)
            else:
                print("target_source not in room_dir")

            if room_mix is not None and len(room_mix.sources) < 1:
                print("target_source not in room_mix")
                break
            if room_ref is not None and len(room_ref.sources) < 1:
                print("target_source not in room_ref")
                break
            if room_dir is not None and len(room_dir.sources) < 1:
                print("target_source not in room_dir")
                break

            ## Add interference to the microphone array
            for it in range(0, num_interf):
                while True:
                    inf_idx = random.randint(0, noise_utt_size - 1)
                    inf_path = noise_utt_ids[inf_idx]
                    inf_wav = audio_parser.WaveData(inf_path,
                                                    sample_rate=args.sample_rate)
                    if inf_wav is None or inf_wav.shape[0] < args.sample_rate:
                        continue
                    inf_wav = np.squeeze(inf_wav)
                    if np.mean(np.abs(inf_wav)) > 0:
                        break

                inf_length = inf_wav.shape[0]
                inf_wav = pra.normalize(inf_wav)
                inf_wav = pra.highpass(inf_wav, args.sample_rate, 50)
                while inf_length < spe_length:
                    inf_wav = np.concatenate((inf_wav, inf_wav), axis=0)
                    inf_length = inf_wav.shape[0]
                inf_wav = inf_wav[:spe_length]

                if room_mix is not None and room_mix.is_inside(interf_source[it]):
                    room_mix = room_mix.add_source(interf_source[it], signal=inf_wav, delay=0)
                else:
                    print("interf_source not in room_mix")
                    continue

            if room_mix is not None and len(room_mix.sources) < 1:
                break

            ## Make the far-field mixture audio
            iSIR = random.uniform(args.lowSIR, args.upSIR)
            room_mix.simulate(
                callback_mix=callback_mix,
                callback_mix_kwargs={
                    'snr': 30,
                    'sir': iSIR,
                    'n_src': num_interf + 1,
                    'n_tgt': 1,
                    'ref_mic': 0
                })
            mix_wav = room_mix.mic_array.signals.T  # (nsample, nchannel)
            mix_length, num_channel = mix_wav.shape

            ## Read diffuse noise
            if diffuse_utt_ids is not None:
                while True:
                    diff_idx = random.randint(0, diffuse_utt_size - 1)
                    diff_path = diffuse_utt_ids[diff_idx]
                    diff_wav = audio_parser.WaveData(
                        diff_path, sample_rate=args.sample_rate,
                        id_channel=list(range(0, num_channel)))
                    if diff_wav is None or diff_wav.shape[0] < args.sample_rate:
                        continue
                    if np.mean(np.abs(diff_wav)) > 0:
                        break
                dif_length, num_channel = diff_wav.shape
                '''
                for i in range(int(num_channel / 2)):
                    ch_wav = diff_wav[:, i]
                    diff_wav[:, i] = diff_wav[:, num_channel - i - 1]
                    diff_wav[:, num_channel - i - 1] = ch_wav
                '''
                ## Add the diffuse noise into the mix
                while dif_length < mix_length:
                    diff_wav = np.concatenate((diff_wav, diff_wav), axis=0)
                    dif_length = diff_wav.shape[0]
                diff_wav = diff_wav[0:mix_length, :]

                iSNR = random.uniform(args.lowSNR, args.upSNR)
                mix_wav = audio_parser.MixWave(mix_wav, diff_wav, snr=iSNR)

            ## Adapt the gain of the mixture audio to the given gain
            gain = random.uniform(args.lowGain, args.upGain)
            scale = gain / np.max(np.abs(mix_wav))
            mix_wav = mix_wav * scale
            mix_wav = mix_wav * 32767.0
            mix_wav = mix_wav.astype(np.int16)

            if room_dir is not None:
                ## Simulate the directional signals
                room_dir.simulate()
                dir_wav = room_dir.mic_array.signals[0, :].T  # (spe_length,)
                dir_wav = dir_wav * scale
                dir_wav = dir_wav * 32767.0
                dir_wav = dir_wav.astype(np.int16)
            else:
                dir_wav = None

            if room_ref is not None:
                ## Simulate the clean far-field signal used as the reference for metrics
                room_ref.simulate()
                ref_wav = room_ref.mic_array.signals  # (num_channel, spe_length)
                ref_wav = ref_wav * scale             # (num_channel, spe_length)
            else:
                ref_wav = None

            if ref_wav is not None:
                if args.targ_bf is not None:
                    num_block = 1
                    ref_wav = ref_wav[np.newaxis, :, :]   # (num_block, num_channel, spe_length)
                    ref_wav = torch.FloatTensor(ref_wav)  # (num_block, num_channel, spe_length)
                    ref_wav = ref_wav.view(num_block * num_channel, 1, -1)  # (num_block * num_channel, 1, spe_length)

                    input_audio = ref_wav.to(args.device)  # (num_block * num_channel, 1, spe_length)
                    mFFT = args.convstft(input_audio)      # (num_block * num_channel, num_bin * 2, num_frame)

                    num_frame = mFFT.size(2)
                    mFFT = mFFT.view(num_block, num_channel, num_bin * 2, -1)  # (num_block, num_channel, num_bin * 2, num_frame)
                    mFFT_r = mFFT[:, :, :num_bin, :]  # (num_block, num_channel, num_bin, num_frame)
                    mFFT_i = mFFT[:, :, num_bin:, :]  # (num_block, num_channel, num_bin, num_frame)

                    mFFT_r = mFFT_r.permute([0, 3, 2, 1]).contiguous()  # (num_block, num_frame, num_bin, num_channel)
                    mFFT_i = mFFT_i.permute([0, 3, 2, 1]).contiguous()  # (num_block, num_frame, num_bin, num_channel)
                    mFFT_r = mFFT_r.view(num_block * num_frame, num_bin, num_channel)  # (num_block * num_frame, num_bin, num_channel)
                    mFFT_i = mFFT_i.view(num_block * num_frame, num_bin, num_channel)  # (num_block * num_frame, num_bin, num_channel)

                    mFFT = torch.cat([torch.unsqueeze(mFFT_r, 1),
                                      torch.unsqueeze(mFFT_i, 1)],
                                     dim=1)  # (num_block * num_frame, 2, num_bin, num_channel)

                    # Compute the beam index from the target direction
                    targ_tdoa = targ_ang
                    if num_channel == 2 or args.is_linear_mic:
                        if targ_tdoa > 180:
                            targ_tdoa = 360.0 - targ_tdoa
                    bf_beam = targ_tdoa / args.bf_direction_resolution + 0.5
                    bf_beam = int(bf_beam) % args.num_beam
                    print("tdoa = %d, beam = %d" % (targ_ang, bf_beam))

                    rFFT = args.targ_bf(mFFT, bf_beam)  # (num_block * num_frame, 2, num_bin, 1)
                    rFFT = rFFT[:, :, :, 0].view([num_block, -1, 2, num_bin])  # (num_block, num_frame, 2, num_bin)
                    rFFT = rFFT.permute([0, 2, 3, 1]).contiguous()  # (num_block, 2, num_bin, num_frame)
                    est_fft = torch.cat([rFFT[:, 0], rFFT[:, 1]], 1)  # (num_block, num_bin * 2, num_frame)

                    ref_wav = args.convistft(est_fft)  # (num_block, 1, num_sample)
                    ref_wav = torch.squeeze(ref_wav, 1)  # (num_block, num_sample)
                    ref_wav = ref_wav[0, :]  # (num_sample,)
                    ref_wav = ref_wav.data.cpu().numpy()  # (num_sample,)
                else:
                    ref_wav = ref_wav[0, :]  # (num_sample,)
                ref_wav = ref_wav * 32767.0
                ref_wav = ref_wav.astype(np.int16)
            else:
                ref_wav = None

            ## Align mix_wav, ref_wav and dir_wav
            lengths = [mix_wav.shape[0]]
            if ref_wav is not None:
                lengths.append(ref_wav.shape[0])
            if dir_wav is not None:
                lengths.append(dir_wav.shape[0])
            nsample = min(lengths)
            mix_wav = mix_wav[:nsample]
            if ref_wav is not None:
                ref_wav = ref_wav[:nsample]
            if dir_wav is not None:
                dir_wav = dir_wav[:nsample]

            num_utts += 1

            _, spe_name, _ = file_parse.getFileInfo(spe_path)

            out_path = os.path.join(args.out_path, 'wav')
            if not os.path.exists(out_path):
                os.makedirs(out_path)

            if utt2data_dict is not None:
                data_key, data_id = utt2data_dict[spe_idx]
                out_path = os.path.join(out_path, data_id)
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            else:
                data_id = 'data01'

            if utt2spk_dict is not None:
                spk_key, spk_id = utt2spk_dict[spe_idx]
                out_path = os.path.join(out_path, spk_id)
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            else:
                spk_id = 'spk01'

            out_path = os.path.join(out_path, 'wav')
            if not os.path.exists(out_path):
                os.makedirs(out_path)

            spe_key = spe_key.replace('_', '').replace('-', '').replace('.', '')
            spk_id = spk_id.replace('_', '').replace('-', '').replace('.', '')

            #utt_id = spk_id + "_" + spe_key + "%02d%07d" % (thread_id, num_utts)
            utt_id = spk_id + "_" + "%02d%07d" % (thread_id, num_utts)

            if mix_wav is not None:
                ## Write the mixture audio
                filename = "%s_id%02d%07d_Doa%d_SIR%.1f_SNR%.1f" % (
                    spe_key, thread_id, num_utts, targ_ang, iSIR, iSNR)
                mix_path = os.path.join(out_path, '%s.wav' % (filename))
                audio_parser.WriteWave(mix_path, mix_wav, args.sample_rate)
            else:
                mix_path = None

            if dir_wav is not None:
                filename = "%s_id%02d%07d_Doa%d_DS" % (spe_key, thread_id,
                                                       num_utts, targ_ang)
                ds_path = os.path.join(out_path, '%s.wav' % (filename))
                audio_parser.WriteWave(ds_path, dir_wav, args.sample_rate)
            else:
                ds_path = None

            if ref_wav is not None:
                filename = "%s_id%02d%07d_Doa%d_Ref" % (spe_key, thread_id,
                                                        num_utts, targ_ang)
                ref_path = os.path.join(out_path, '%s.wav' % (filename))
                audio_parser.WriteWave(ref_path, ref_wav, args.sample_rate)
            else:
                ref_path = None

            if text_dict is not None:
                text_key, text_value = text_dict[spe_idx]
            else:
                text_value = ' '

            noisy_scp_list.append((utt_id, mix_path, ds_path, ref_path,
                                   targ_ang, targ_dist, iSIR, iSNR, scale))
            noisy_utt2spk.append(spk_id)
            noisy_text_dict.append(text_value)

            info = (utt_id, spe_key, mix_path, ds_path, ref_path, targ_ang,
                    targ_dist, interf_angs, interf_dists, iSIR, iSNR, scale)
            mix2info.append(info)

            print("%d / %d: %s" % (num_utts, num_make_utts, mix_path))

            if num_utts >= num_make_utts:
                return noisy_scp_list, noisy_utt2spk, noisy_text_dict, mix2info
recording = room.mic_array.signals.T

##########################
# Prepare the beamformer #
##########################

output_signal = np.zeros(recording.shape[0], dtype=recording.dtype)

# look direction
look_dir = np.array(source_locations[0]) - np.mean(array, axis=1)
look_dir /= np.linalg.norm(look_dir)

# the matched response beamformer
mrbf = MatchResponse(array, look_dir, 40, 32, nfft, fs, c)

# processing loop
n = 0
while n + shift < recording.shape[0]:
    newframe = recording[n:n + shift, :]
    X = stft_input.analysis(newframe)

    out_frame = mrbf.process(X)

    # synthesize the output signal
    output_signal[n:n + shift] = stft_output.synthesis(out_frame)

    n += shift

wavfile.write('output_mic1.wav', fs, pra.normalize(recording[:, 0]) * 0.85)
wavfile.write('output_mf.wav', fs, pra.normalize(output_signal) * 0.85)
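# --- How stft_input / stft_output might be set up ---
# Hedged sketch: the streaming STFT engines used in the loop above are not
# shown in this fragment; with pyroomacoustics they could be created like
# this (nfft and shift = nfft // 2 are assumptions, as is the window choice):
import pyroomacoustics as pra

nfft = 512
shift = nfft // 2
num_mics = recording.shape[1]

# multi-channel analysis STFT for the incoming microphone frames
stft_input = pra.transform.STFT(nfft, hop=shift,
                                analysis_window=pra.hann(nfft),
                                channels=num_mics, streaming=True)

# single-channel synthesis STFT for the beamformer output
stft_output = pra.transform.STFT(nfft, hop=shift,
                                 synthesis_window=pra.hann(nfft),
                                 streaming=True)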
def play(ch):
    sd.play(pra.normalize(y[ch]) * 0.75, samplerate=room.fs, blocking=True)
         marker='o')
plt.legend()
plt.tight_layout(pad=0.5)

if not args.gui:
    plt.show()
else:
    plt.show(block=False)

if args.save:
    from scipy.io import wavfile

    wavfile.write(
        'bss_iva_mix.wav', room.fs,
        pra.normalize(mics_signals[0, :], bits=16).astype(np.int16))
    for i, sig in enumerate(y):
        wavfile.write('bss_iva_source{}.wav'.format(i + 1), room.fs,
                      pra.normalize(sig, bits=16).astype(np.int16))

if args.gui:

    # Make a simple GUI to listen to the separated samples
    from tkinter import Tk, Button, Label
    import sounddevice as sd

    # Now comes the GUI part
    class PlaySoundGUI(object):
        def __init__(self, master, fs, mix, sources):
            self.master = master
            self.fs = fs
delay = 0.050  # Beamformer delay in seconds

# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D([0, 0],
                           room_dim,
                           Fs,
# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
elif shape == 'Poisson':
    R = pra.poisson2DArray(mic1, M, d)
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D([0, 0],
                           room_dim,
                           Fs,
                           t0=t0,
# Define the FFT length
N = 1024

# Create a microphone array
if shape == "Circular":
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear_2D_array(mic1, M, phi, d)

# path to the samples
path = os.path.dirname(__file__)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read(path + "/input_samples/singing_" + str(Fs) + ".wav")
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.0

# The second signal (interferer) is some german speech
rate2, signal2 = wavfile.read(path + "/input_samples/german_speech_" + str(Fs) + ".wav")
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.0

# Create the room
room_dim = [4, 6]
room1 = pra.ShoeBox(
    room_dim,
    absorption=absorption,
def createroom(amBird, saBird, noises, mic_p, mic_d, sour_p, sour_d,
               callback_mix, roomdim, absorption, max_order, n_mics, angle):

    np.random.seed(10)

    # STFT parameters
    framesize = 4096
    win_a = pra.hann(framesize)
    win_s = pra.transform.compute_synthesis_window(win_a, framesize // 2)

    # algorithm parameters
    # OGIVE parameters
    ogive_mu = 0.1
    ogive_update = "switching"
    ogive_iter = 2000

    ######## separation params ########
    algo = algo_choices[0]
    no_cb = True
    save = True
    n_iter = 60
    dist = "gauss"  # "gauss" or "laplace"

    ######## params set ########
    fs = 44100
    snr = 60
    sinr = 10
    # absorption, max_order = 0.45, 12  # RT60 == 0.2
    # absorption, max_order = 0.9, 17
    n_sources = 2 + 3
    n_mics = n_mics
    n_sources_target = 2
    assert n_sources_target <= n_mics, "More sources than microphones is not supported"

    # set the source powers, the first one is half
    source_std = np.ones(n_sources_target)

    # positions
    # room size
    room_dim = roomdim

    # microphone positions
    rot = angle
    offset = np.pi - rot / 2
    mic_locs = semi_circle_layout(mic_p, rot, mic_d, n_mics, rot=offset)
    # mic_locs = np.transpose([[13, 9.99, 3.5], [13, 10, 3.5], [13, 10.01, 3.5]])

    # target positions
    target_locs = np.transpose([[7, 10, 6], [9, 16, 6]])

    # interferer positions
    interferer_locs = random_layout([16, 2, 6],
                                    n_sources - n_sources_target,
                                    offset=[5, 18, 3],
                                    seed=1)
    source_locs = np.concatenate((target_locs, interferer_locs), axis=1)

    # load the audio
    wav_files = [amBird, saBird, noises[0], noises[1], noises[2]]
    signals = wav_read_center(wav_files, seed=123)

    # create the room
    room = pra.ShoeBox(room_dim,
                       fs=44100,
                       absorption=absorption,
                       max_order=max_order,
                       air_absorption=True,
                       humidity=50)

    # add the sources
    for sig, loc in zip(signals, source_locs.T):
        room.add_source(loc, signal=sig)

    # add the microphone array
    room.add_microphone_array(pra.MicrophoneArray(mic_locs, fs=room.fs))

    # set the powers
    premix = room.simulate(return_premix=True)
    n_samples = premix.shape[2]

    # Normalize the signals so that they all have unit variance at the
    # reference microphone
    ref_mic = 0
    p_mic_ref = np.std(premix[:, ref_mic, :], axis=1)
    premix /= p_mic_ref[:, None, None]
    sources_var = np.ones(n_sources_target)

    # scale to the pre-defined variance
    premix[:n_sources_target, :, :] *= np.sqrt(sources_var[:, None, None])

    # compute the noise variance
    sigma_n = np.sqrt(10**(-snr / 10) * np.sum(sources_var))

    # now compute the power of the interference signals needed to achieve the
    # desired SINR
    sigma_i = np.sqrt(
        np.maximum(0, 10**(-sinr / 10) * np.sum(sources_var) - sigma_n**2) /
        (n_sources - n_sources_target))
    premix[n_sources_target:, :, :] *= sigma_i

    background = np.sum(premix[n_sources_target:, :, :], axis=0)

    # Mix down the recorded signals
    mix = np.sum(premix, axis=0)
    mics_signals = room.mic_array.signals
    print("Simulation done.")
    # rt60 = room.measure_rt60()
    # print(rt60)

    # Monitor convergence
    ref = np.zeros((n_sources_target + 1, premix.shape[2], premix.shape[1]),
                   dtype=premix.dtype)
    ref[:n_sources_target, :, :] = premix[:n_sources_target, :, :].swapaxes(1, 2)
    ref[n_sources_target, :, :] = background.T

    convergence_callback = None

    # START BSS
    # shape: (n_frames, n_freq, n_mics)
    X_all = pra.transform.analysis(mics_signals.T, framesize, framesize // 2,
                                   win=win_a).astype(np.complex128)
    X_mics = X_all[:, :, :n_mics]

    # Run BSS
    if algo == "auxiva":
        # Run AuxIVA
        Y = overiva(
            X_mics,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            callback=convergence_callback,
        )
    elif algo == "auxiva_pca":
        # Run AuxIVA with PCA
        Y = auxiva_pca(
            X_mics,
            n_src=n_sources_target,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            callback=convergence_callback,
        )
    elif algo == "overiva":
        # Run OverIVA
        Y = overiva(
            X_mics,
            n_src=n_sources_target,
            n_iter=n_iter,
            proj_back=True,
            model=dist,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    elif algo == "ilrma":
        # Run ILRMA
        Y = pra.bss.ilrma(
            X_mics,
            n_iter=n_iter,
            n_components=2,
            proj_back=True,
            callback=convergence_callback,
        )
    elif algo == "ogive":
        # Run OGIVE
        Y = ogive(
            X_mics,
            n_iter=ogive_iter,
            step_size=ogive_mu,
            update=ogive_update,
            proj_back=True,
            model=dist,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    elif algo == "ogive_matlab":
        # Run the MATLAB wrapper of OGIVE
        Y = ogive_matlab_wrapper(
            X_mics,
            n_iter=ogive_iter,
            step_size=ogive_mu,
            update=ogive_update,
            proj_back=True,
            init_eig=(init == init_choices[1]),
            callback=convergence_callback,
        )
    else:
        raise ValueError("No such algorithm {}".format(algo))

    # Run the iSTFT
    if Y.shape[2] == 1:
        y = pra.transform.synthesis(Y[:, :, 0], framesize, framesize // 2,
                                    win=win_s)[:, None]
        y = y.astype(np.float64)
    else:
        y = pra.transform.synthesis(Y, framesize, framesize // 2,
                                    win=win_s).astype(np.float64)

    # If some of the outputs are uniformly zero, just add a bit of noise so
    # they can still be compared
    for k in range(y.shape[1]):
        if np.sum(np.abs(y[:, k])) < 1e-10:
            y[:, k] = np.random.randn(y.shape[0]) * 1e-10

    # For conventional BSS methods, reorder the signals by decreasing power
    if algo != "blinkiva":
        new_ord = np.argsort(np.std(y, axis=0))[::-1]
        y = y[:, new_ord]

    # Compare SIR
    m = np.minimum(y.shape[0] - framesize // 2, ref.shape[1])
    sdr, sir, sar, perm = bss_eval_sources(
        ref[:n_sources_target, :m, 0],
        y[framesize // 2:m + framesize // 2, :n_sources_target].T,
    )

    # reorder the vector of reconstructed signals
    y_hat = y[:, perm]

    return pra.normalize(mics_signals,
                         bits=16).astype(np.int16).T, y_hat, sir, sdr
    P_prev = np.roll(P_prev, -1, axis=1)

    n += hop

# we reset the STFT object
stft.reset()

'''
Write to WAV + labelling of our processed noisy signals
'''
# label each single-noise-channel-removal signal and compare its
# classification with the one obtained for the original noisy signal
score_processing = np.zeros(len(snr_vals))
score_original = np.zeros(len(snr_vals))

for i, snr in enumerate(snr_vals):
    print("SNR : %f dB" % snr)

    dest = os.path.join(
        dest_dir, "single_noise_channel_signal_snr_db_%d.wav" % (snr))
    signal = pra.normalize(processed_audio_array[i],
                           bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_processing[i] = label_wav(dest, labels_file, graph_file,
                                    speech.meta.as_dict()['word'])

    dest = os.path.join(dest_dir, "original_signal_snr_db_%d.wav" % (snr))
    signal = pra.normalize(noisy_single_mic[i], bits=16).astype(np.int16)
    wavfile.write(dest, 16000, signal)
    score_original[i] = label_wav(dest, labels_file, graph_file,
                                  speech.meta.as_dict()['word'])
    print()

# plotting the result
plt.plot(snr_vals, score_processing,
         label="single noise channel removal signal")
mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
room1.add_microphone_array(mics)
room1.add_source(source, delay=0, signal=xtone)
room1.add_source(interferer, delay=0, signal=silence)
room1.image_source_model(use_libroom=True)
room1.compute_rir()
room1.simulate()

# Rake MVDR simulation
BeamformerType = 'RakeMVDR'
good_sources = room1.sources[0][:max_order_design + 1]
bad_sources = room1.sources[1][:max_order_design + 1]
mics.rake_mvdr_filters(good_sources, bad_sources,
                       sigma2_n * np.eye(mics.Lg * mics.M))

output = mics.process()
out = pra.normalize(pra.highpass(output, Fs))
out = normalize(out)

# Rake Perceptual simulation
# BeamformerType = 'RakePerceptual'
# good_sources = room1.sources[0][:max_order_design+1]
# bad_sources = room1.sources[1][:max_order_design+1]
# mics.rake_perceptual_filters(good_sources,
#                              bad_sources,
#                              sigma2_n*np.eye(mics.Lg*mics.M))
# output = mics.process()
# out = pra.normalize(pra.highpass(output, Fs))

input_mic = pra.normalize(pra.highpass(mics.signals[mics.M // 2], Fs))
input_mic = normalize(input_mic)