Example #1
        ## Read target speech audio
        while True:
            spe_id = random.randint(start_spe_id, end_spe_id)
            utt_key = sp_utts_scp[spe_id][0]
            spe_path = sp_utts_scp[spe_id][1]
            spe_name = file_name(pathName=spe_path)
            sample_rate, spe_wav = wavfile.read(spe_path)
            if len(spe_wav.shape) > 1:
                spe_wav = np.mean(spe_wav, 1)
            spe_wav = spe_wav.astype(float)
            if np.mean(np.abs(spe_wav)) > 0:
                break

        spe_length = spe_wav.shape[0]
        spe_wav = pra.normalize(spe_wav)
        spe_wav = pra.highpass(spe_wav, Fs, 50)

        room_mix.add_source(target_source, signal=spe_wav, delay=delay)
        room_ref.add_source(target_source, signal=spe_wav, delay=delay)
        #room_dir.add_source(target_source, signal = spe_wav, delay = delay)

        ## Read interfere speech audio
        for it in range(0, interf_num):
            while True:
                while True:
                    inf_id = random.randint(start_spe_id, end_spe_id)
                    if np.abs(spe_id - inf_id) > 500:
                        break
                inf_path = sp_utts_scp[inf_id][1]
                sample_rate, inf_wav = wavfile.read(
                    inf_path)  # (nsample, nchannel)
# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d*M/(2*np.pi)) 
else:
    R = pra.linear2DArray(mic1, M, phi, d) 
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_'+str(Fs)+'.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_'+str(Fs)+'.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D(
    [0,0],
    room_dim,
    Fs,
    t0 = t0,
Example #3
    fs_silence, rec_silence = wavfile.read(rec_folder + 'silence.wav')

    if fs_file != fs_silence:
        raise ValueError('Weird: fs of signals and silence are different...')

    # Resample the files if required
    if fs_file != fs:
        print('Resampling signals')
        from scikits.samplerate import resample

        resampled_signals = []
        resampled_silence = []
        for i in R_flat_I:
            resampled_signals.append(
                pra.highpass(resample(rec_signals[:, i], fs / fs_file,
                                      'sinc_best'),
                             fs,
                             fc=150.))
            resampled_silence.append(
                pra.highpass(resample(rec_silence[:, i], fs / fs_file,
                                      'sinc_best'),
                             fs,
                             fc=150.))
        speech_signals = np.array(resampled_signals, dtype=float).T
        silence = np.array(resampled_silence, dtype=float).T

    else:
        print('No need to resample signals')
        speech_signals = np.array(rec_signals[:, R_flat_I], dtype=np.float32)
        silence = np.array(rec_silence[:, R_flat_I], dtype=np.float32)

        # highpass filter at 150
Example #4
    mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
    roomPoly.add_microphone_array(mics)
    roomPoly.add_source(source, delay=0, signal=xtone)
    roomPoly.add_source(interferer, delay=0, signal=silence)
    roomPoly.image_source_model(use_libroom=True)
    roomPoly.compute_rir()
    roomPoly.simulate()

    # Rake MVDR simulation
    BeamformerType = 'RakeMVDR'
    good_sources = roomPoly.sources[0][:max_order_design + 1]
    bad_sources = roomPoly.sources[1][:max_order_design + 1]
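    # rake_mvdr_filters() designs length-Lg filters from the early image sources of
    # the target ("good") and of the interferer ("bad"); sigma2_n * I is the assumed
    # spatially white sensor-noise covariance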
    mics.rake_mvdr_filters(good_sources, bad_sources,
                           sigma2_n * np.eye(mics.Lg * mics.M))
    output = mics.process()
    out = pra.normalize(pra.highpass(output, Fs))
    out = normalize(out)

    # Rake Perceptual simulation
    # BeamformerType = 'RakePerceptual'
    # good_sources = room1.sources[0][:max_order_design+1]
    # bad_sources = room1.sources[1][:max_order_design+1]
    # mics.rake_perceptual_filters(good_sources,
    #                     bad_sources,
    #                     sigma2_n*np.eye(mics.Lg*mics.M))
    # output          =   mics.process()
    # out             =   pra.normalize(pra.highpass(output, Fs))

    # input_mic       =   pra.normalize(pra.highpass(mics.signals[mics.M//2], Fs))
    # input_mic       =   normalize(input_mic)
Example #5
    room = pra.ShoeBox(
        room_dim,
        absorption=0.2,
        fs=fs_s,
        t0=t0,
        max_order=max_order,
        sigma2_awgn=5e-7)

    #add the sources
    room.add_source(pos_source,signal=audio_anechoic,delay=0.)
    room.add_source(pos_noise,signal=noise_anechoic,delay=1.0)

    #add the microphone array and compute RIR
    mics = pra.Beamformer(R, room.fs,N,Lg=Lg)
    room.add_microphone_array(mics)
    room.compute_rir()
    room.simulate()

    #design the beamforming filters using some of the images sources
    good_sources = room.sources[0][:max_order_design+1]
    bad_sources = room.sources[1][:max_order_design+1]
    mics.rake_mvdr_filters(good_sources,bad_sources,5e-7*np.eye(mics.Lg*mics.M),delay=delay)

    #process the signal
    noisy_signal_beamforming = mics.process()
    out_RakeMVDR = pra.highpass(noisy_signal_beamforming,room.fs).astype(np.int16)
    dest = os.path.join(dest_dir,"beamforming_signal.wav")
    wavfile.write(dest,16000,out_RakeMVDR)
    score_beamformer = label_wav(dest, labels_file, graph_file, speech.meta.as_dict()['word'])
    print(score_beamformer)
Lgp = np.floor(0.4*Lg)
Lgm = Lg - Lgp
print('Lg=', Lg)

# create a microphone array
if shape == 'Circular':
    R = circular2DArray(mic1, M, phi, d*M/(2*np.pi)) 
else:
    R = pra.linear2DArray(mic1, M, phi, d) 
mics = pra.Beamformer(R, Fs, N, Lg=Lg, hop=hop, zpf=zp, zpb=zp)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_'+str(Fs)+'.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_'+str(Fs)+'.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D(
    [0,0],
    room_dim,
    Fs,
    t0 = t0,
Example #7
def make_noisy(args, thread_id, num_make_utts):
    
    spe_utt_ids, noise_utt_ids, diffuse_utt_ids, text_dict, utt2spk_dict, utt2data_dict = load_data(args)

    audio_parser      = AudioParser()

    spe_utt_size     = len(spe_utt_ids) if spe_utt_ids is not None else 0
    noise_utt_size   = len(noise_utt_ids) if noise_utt_ids is not None else 0
    diffuse_utt_size = len(diffuse_utt_ids) if diffuse_utt_ids is not None else 0
    
    noisy_scp_list   = []
    noisy_utt2spk    = []
    noisy_text_dict  = []
    mix2info         = []
    num_utts         = 0

    all_angle           = 360.0
    Targ_Ang_Num        = args.num_targ_ang
    Targ_Ang_Resolution = all_angle / Targ_Ang_Num if Targ_Ang_Num > 0 else 0.0

    save_mix    = args.save_mix
    save_reverb = args.save_reverb
    save_clean  = args.save_clean
    while True:
        ## Random a room
        room_x   = random.uniform(args.min_room_length, args.max_room_length)
        room_y   = random.uniform(args.min_room_weidth, args.max_room_weidth)
        room_z   = random.uniform(args.min_room_height, args.max_room_height)
        room_dim = [room_x, room_y, room_z]

        ## Create the room
        T60                   = random.uniform(args.min_T60, args.max_T60)
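        # pra.inverse_sabine() estimates the wall absorption and the image-source order
        # needed to reach the drawn T60 (Sabine's formula: T60 ~ 0.161 * V / (S * a))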
        absorption, max_order = pra.inverse_sabine(T60, room_dim)
        if save_mix:
            room_mix   = pra.ShoeBox(room_dim, fs = args.sample_rate, materials=pra.Material(absorption), max_order=max_order, sigma2_awgn = None)
        else:
            room_mix   = None
        if save_reverb:
            room_ref   = pra.ShoeBox(room_dim, fs = args.sample_rate, materials=pra.Material(absorption), max_order=max_order, sigma2_awgn = None)
        else:
            room_ref   = None
        if save_clean:
            room_dir   = pra.ShoeBox(room_dim, fs = args.sample_rate, materials=pra.Material(0.99999), max_order=max_order, sigma2_awgn = None)
        else:
            room_dir = None
        
        ## Random the position of microphone array
        mic_x  = random.uniform(args.min_mic_x, room_x - args.min_mic_x)
        mic_y  = random.uniform(args.min_mic_y, room_y - args.min_mic_y)
        mic_z  = random.uniform(args.min_mic_z, max(min(room_z - args.min_mic_z, 2.0), args.min_mic_z + 0.5))

        ## Compute The position of microphones
        mic_xyz = []
        for m in range(args.num_mic):
            mic_pos   = args.mic_pos[m]
            x         = mic_x + mic_pos[0]
            y         = mic_y + mic_pos[1]
            z         = mic_z
            mic_xyz.append([x, y, z])
        mic_xyz = np.array(mic_xyz) # ( 6, 3 )
        mic_xyz = mic_xyz.T			# ( 3, 6 )
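        # pra.MicrophoneArray expects positions as an (n_dim, n_mics) array, hence the transpose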

        ## Add micphone array
        mic_array = pra.MicrophoneArray(mic_xyz, args.sample_rate)
        if room_mix is not None:
            room_mix  = room_mix.add_microphone_array(mic_array)
        if room_ref is not None:
            room_ref  = room_ref.add_microphone_array(mic_array)
        if room_dir is not None:
            room_dir  = room_dir.add_microphone_array(mic_array)

        ##print("room = [%.2f %.2f %.2f], micro = [%.2f %.2f %.2f]" % (room_x, room_y, room_z, mic_x, mic_y, mic_z))
        
        ## Add target sources to room_mix and room_ref
        target_source = None
        while True:
            if args.num_targ_ang <= 0.0:
                targ_ang = random.randint( 0, int(all_angle) )
            else:
                targ_ang = int(random.randint(0, Targ_Ang_Num - 1) * Targ_Ang_Resolution)

            targ_theta  = np.pi * targ_ang / 180.0
            targ_dist   = random.uniform(args.min_targ_distance, args.max_targ_distance)
            
            targ_x      = mic_x + np.cos(targ_theta) * targ_dist
            targ_y      = mic_y + np.sin(targ_theta) * targ_dist
            targ_z      = mic_z

            target_source = [targ_x, targ_y, targ_z]

            if (targ_x < (room_x - 0.5) and targ_x > 0.5) and (targ_y < (room_y - 0.5) and targ_y > 0.5):
                break
            
        if target_source is None or (room_mix is not None and not room_mix.is_inside(target_source)):
            continue
        
        ##print("room = [%.2f %.2f %.2f], target_source = [%.2f %.2f %.2f]" % (room_x, room_y, room_z, target_source[0], target_source[1], target_source[2]))
        ##print("targ_ang = %d, targ_dist %.2f" % (targ_ang, targ_dist))
        targ_tdoa = targ_ang
        if args.is_linear_mic and targ_tdoa > 180:
            targ_tdoa = 360.0 - targ_tdoa
        
        ## Add interference sources to room_mix
        num_interf    = min(random.randint(1, args.max_num_interf), 1)
        interf_angs   = []
        interf_dists  = []
        interf_source = []
        
        while True:
            interf_ang  = random.randint(0, int(all_angle))
            interf_tdoa = interf_ang
            if args.is_linear_mic and interf_tdoa > 180:
                interf_tdoa = 360.0 - interf_tdoa
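            # reject interferer directions closer than minAD degrees to the target
            # (angles above 180 are mirrored first, since a linear array cannot
            # distinguish front from back)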
            if np.abs(targ_tdoa - interf_tdoa) < args.minAD:
                continue
            interf_theta = np.pi * interf_ang / 180.0
            interf_dist  = random.uniform(args.min_interf_distance, args.max_interf_distance)

            interf_x      = mic_x + np.cos(interf_theta) * interf_dist
            interf_y      = mic_y + np.sin(interf_theta) * interf_dist
            interf_z      = mic_z

            ainterf_source = [interf_x, interf_y, interf_z]
            if (interf_x < (room_x - 0.5) and interf_x > 0.5) and (interf_y < (room_y - 0.5) and interf_y > 0.5):
                interf_angs.append(interf_ang)
                interf_dists.append(interf_dist)
                interf_source.append(ainterf_source)
            
            if len(interf_source) >= num_interf:
                break
                
        ##print("interf_ang = %d, interf_dist %.2f, num_interf = %d" % (interf_ang, interf_dist, len(interf_source)))

        for sim in range(args.nutt_per_room):
            if room_mix is not None:
                room_mix.sources = []
            if room_ref is not None:
                room_ref.sources = []
            if room_dir is not None:
                room_dir.sources = []
            
            ## Add Speech to microphone array
            while True:
                spe_idx = random.randint(0, spe_utt_size - 1)
                spe_key, spe_path = spe_utt_ids[spe_idx]

                spe_wav = audio_parser.WaveData(spe_path, sample_rate = args.sample_rate)
                if spe_wav is None or spe_wav.shape[0] < args.sample_rate:
                    continue
                spe_wav = np.squeeze(spe_wav)
                if np.mean(np.abs(spe_wav)) > 0:
                    break
            
            spe_length 	   = spe_wav.shape[0]
            spe_wav        = pra.normalize(spe_wav)
            spe_wav        = pra.highpass(spe_wav, args.sample_rate, 50)
            
            if room_mix is not None and room_mix.is_inside(target_source):
                room_mix = room_mix.add_source(target_source, signal = spe_wav, delay = 0)
            else:
                print("target_source not in room_mix")
                continue
            if room_ref is not None and room_ref.is_inside(target_source):
                room_ref = room_ref.add_source(target_source, signal = spe_wav, delay = 0)
            else:
                print("target_source not in room_ref")
            if room_dir is not None and room_dir.is_inside(target_source):
                room_dir = room_dir.add_source(target_source, signal = spe_wav, delay = 0)
            else:
                print("target_source not in room_dir")
                        
            if room_mix is not None and len(room_mix.sources) < 1:
                print("target_source not in room_mix")
                break
            if room_ref is not None and len(room_ref.sources) < 1:
                print("target_source not in room_ref")
                break
            if room_dir is not None and len(room_dir.sources) < 1:
                print("target_source not in room_dir")
                break
            
            ## Add Interference to microphone array
            for it in range(0, num_interf):
                while True:
                    inf_idx = random.randint(0, noise_utt_size - 1)
                    inf_path = noise_utt_ids[inf_idx]

                    inf_wav = audio_parser.WaveData(inf_path, sample_rate = args.sample_rate)
                    if inf_wav is None or inf_wav.shape[0] < args.sample_rate:
                        continue
                    inf_wav = np.squeeze(inf_wav)
                    if np.mean(np.abs(inf_wav)) > 0:
                        break
                
                inf_length = inf_wav.shape[0]
                inf_wav = pra.normalize(inf_wav)
                inf_wav = pra.highpass(inf_wav, args.sample_rate, 50)

                while(inf_length < spe_length):
                    inf_wav    = np.concatenate((inf_wav, inf_wav), axis = 0)
                    inf_length = inf_wav.shape[0]
                inf_wav = inf_wav[:spe_length]
                
                if room_mix is not None and room_mix.is_inside(interf_source[it]):
                    room_mix = room_mix.add_source(interf_source[it], signal = inf_wav, delay = 0)
                else:
                    print("interf_source not in room_mix")
                    continue

            if room_mix is not None and len(room_mix.sources) < 1:
                break

            ## Make the far-field mixture audio
            iSIR  = random.uniform(args.lowSIR, args.upSIR)
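            # simulate() convolves each source with its RIRs; callback_mix (defined
            # elsewhere) receives the drawn SIR, a fixed 30 dB SNR and the reference
            # channel and is expected to produce the final mixture at that SIR/SNR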
            room_mix.simulate(callback_mix = callback_mix, callback_mix_kwargs = {'snr': 30, 'sir': iSIR, 'n_src': num_interf + 1, 'n_tgt': 1, 'ref_mic': 0})
            
            mix_wav = room_mix.mic_array.signals.T  # (nsample, nchannel)
            mix_length, num_channel = mix_wav.shape
            
            ## Read diffuse noise
            if diffuse_utt_ids is not None:
                while True:
                    diff_idx = random.randint(0, diffuse_utt_size - 1)
                    diff_path = diffuse_utt_ids[diff_idx]

                    diff_wav = audio_parser.WaveData(diff_path, sample_rate = args.sample_rate, id_channel = list(range(0, num_channel)))
                    if diff_wav is None or diff_wav.shape[0] < args.sample_rate:
                        continue
                    if np.mean(np.abs(diff_wav)) > 0:
                        break
                
                dif_length, num_channel = diff_wav.shape
                '''
                for i in range(int(num_channel / 2)):
                    ch_wav = diff_wav[:, i]
                    diff_wav[:, i] = diff_wav[:, num_channel - i -1]
                    diff_wav[:, num_channel - i -1] = ch_wav
                '''
                
                ## Add diffuse noise into mix
                while( dif_length < mix_length ):
                    diff_wav    = np.concatenate((diff_wav, diff_wav), axis = 0)
                    dif_length = diff_wav.shape[0]
                diff_wav = diff_wav[0:mix_length, :]
                
                iSNR    = random.uniform(args.lowSNR, args.upSNR)
                mix_wav = audio_parser.MixWave(mix_wav, diff_wav, snr = iSNR)

            ## Adapt gain of mixture audio by given gain
            gain     = random.uniform(args.lowGain, args.upGain)
            scale	 = gain / np.max(np.abs(mix_wav))
            mix_wav  = mix_wav * scale
            mix_wav  = mix_wav * 32767.0
            mix_wav  = mix_wav.astype(np.int16)

            if room_dir is not None:
                ## Simulate directional signals
                room_dir.simulate()
                dir_wav = room_dir.mic_array.signals[0,:].T # (spe_length)
                dir_wav = dir_wav * scale
                dir_wav = dir_wav * 32767.0
                dir_wav = dir_wav.astype(np.int16)
            else:
                dir_wav = None

            if room_ref is not None:
                ## Simulate the clean far-field signal to make ref signal for compute metrics
                room_ref.simulate()
                ref_wav = room_ref.mic_array.signals 		 # (num_channel, spe_length)
                ref_wav = ref_wav * scale			  		 # (num_channel, spe_length)
            else:
                ref_wav = None
            
            if ref_wav is not None:
                if args.targ_bf is not None:
                    num_block = 1
                    ref_wav   = ref_wav[np.newaxis, :, :]    	 			 # [ num_block, num_channel, spe_length ]
                    ref_wav   = torch.FloatTensor(ref_wav)   	 		     # [ num_block, num_channel, spe_length ]
                    ref_wav   = ref_wav.view(num_block * num_channel, 1, -1) # [ num_block * num_channel, 1, spe_length ]

                    input_audio  = ref_wav.to(args.device)     		 # (num_block * num_channel, 1, spe_length)

                    mFFT  = args.convstft(input_audio)                # (num_block * num_channel, num_bin * 2, num_frame)

                    num_frame = mFFT.size(2)
                    mFFT   = mFFT.view(num_block, num_channel, num_bin * 2, -1) #( num_block, num_channel, num_bin * 2, num_frame)
                    mFFT_r = mFFT[:, :, :num_bin, :] 							#( num_block, num_channel, num_bin, num_frame)
                    mFFT_i = mFFT[:, :, num_bin:, :] 							#( num_block, num_channel, num_bin, num_frame)

                    mFFT_r = mFFT_r.permute([0, 3, 2, 1]).contiguous() 		    #( num_block, num_frame, num_bin, num_channel)
                    mFFT_i = mFFT_i.permute([0, 3, 2, 1]).contiguous()          #( num_block, num_frame, num_bin, num_channel)

                    mFFT_r = mFFT_r.view(num_block * num_frame, num_bin, num_channel) # ( num_block * num_frame, num_bin, num_channel)
                    mFFT_i = mFFT_i.view(num_block * num_frame, num_bin, num_channel) # ( num_block * num_frame, num_bin, num_channel)

                    mFFT = torch.cat([torch.unsqueeze(mFFT_r, 1), torch.unsqueeze(mFFT_i, 1)], dim = 1) # ( num_block * num_frame, 2, num_bin, num_channel )

                    # Map the target DOA onto the beamformer's fixed look directions
                    targ_tdoa = targ_ang
                    if num_channel == 2 or args.is_linear_mic:
                        if targ_tdoa > 180:
                            targ_tdoa = 360.0 - targ_tdoa
                    bf_beam = targ_tdoa / args.bf_direction_resolution + 0.5
                    bf_beam = int(bf_beam) % args.num_beam
                    print("tdoa = %d, beam = %d" % (targ_ang, bf_beam))

                    rFFT = args.targ_bf(mFFT, bf_beam) 				            # (num_block * num_frame, 2, num_bin, 1)
                    rFFT = rFFT[:, :, :, 0].view([num_block, -1, 2, num_bin])   # (num_block, num_frame, 2, num_bin)

                    rFFT    = rFFT.permute([0, 2, 3, 1]).contiguous()    # ( num_block, 2, num_bin, num_frame )
                    est_fft = torch.cat([rFFT[:,0], rFFT[:,1]], 1) 	     # ( num_block, num_bin * 2, num_frame )
                    ref_wav = args.convistft(est_fft)                    # ( num_block, 1, num_sample)
                    ref_wav = torch.squeeze(ref_wav, 1)                  # ( num_block, num_sample)
                    ref_wav = ref_wav[0, :]								 # ( num_sample)
                    ref_wav = ref_wav.data.cpu().numpy() 				 # ( num_sample)
                else:
                    ref_wav = ref_wav[0, :]								 # ( num_sample)
                
                ref_wav = ref_wav * 32767.0
                ref_wav = ref_wav.astype(np.int16)
            else:
                ref_wav = None
            
            ## Align mix_wav, ref_wav and dir_wav
            nsample = min(x.shape[0] for x in (mix_wav, ref_wav, dir_wav) if x is not None)
            mix_wav = mix_wav[:nsample]
            if ref_wav is not None:
                ref_wav = ref_wav[:nsample]
            if dir_wav is not None:
                dir_wav = dir_wav[:nsample]

            num_utts += 1

            _, spe_name, _ = file_parse.getFileInfo(spe_path)

            out_path = os.path.join(args.out_path, 'wav')
            if not os.path.exists(out_path):
                os.makedirs(out_path)
            
            if utt2data_dict is not None:
                data_key, data_id = utt2data_dict[spe_idx]
                out_path = os.path.join(out_path, data_id)
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            else:
                data_id = 'data01'

            if utt2spk_dict is not None:
                spk_key, spk_id = utt2spk_dict[spe_idx]
                out_path = os.path.join(out_path, spk_id)
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            else:
                spk_id = 'spk01'
                out_path = os.path.join(out_path, 'wav')
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
            
            spe_key = spe_key.replace('_', '').replace('-', '').replace('.', '')
            spk_id  = spk_id.replace('_', '').replace('-', '').replace('.', '')
            #utt_id = spk_id + "_" + spe_key + "%02d%07d" % (thread_id, num_utts)
            utt_id = spk_id + "_" + "%02d%07d" % (thread_id, num_utts)
            
            if mix_wav is not None:
                ## Write the mixture audio
                filename = "%s_id%02d%07d_Doa%d_SIR%.1f_SNR%.1f" % (spe_key, thread_id, num_utts, targ_ang, iSIR, iSNR)
                mix_path = os.path.join(out_path, '%s.wav' % (filename) )
                audio_parser.WriteWave(mix_path, mix_wav, args.sample_rate)
            else:
                mix_path = None

            if dir_wav is not None:
                filename = "%s_id%02d%07d_Doa%d_DS" % (spe_key, thread_id, num_utts, targ_ang)
                ds_path = os.path.join(out_path, '%s.wav' % (filename) )
                audio_parser.WriteWave(ds_path, dir_wav, args.sample_rate)
            else:
                ds_path = None
            
            if ref_wav is not None:
                filename = "%s_id%02d%07d_Doa%d_Ref" % (spe_key, thread_id, num_utts, targ_ang)
                ref_path = os.path.join(out_path, '%s.wav' % (filename) )
                audio_parser.WriteWave(ref_path, ref_wav, args.sample_rate)
            else:
                ref_path = None

            if text_dict is not None:
                text_key, text_value = text_dict[spe_idx]
            else:
                text_value = ' '
            
            noisy_scp_list.append((utt_id, mix_path, ds_path, ref_path, targ_ang, targ_dist, iSIR, iSNR, scale))
            noisy_utt2spk.append(spk_id)
            noisy_text_dict.append(text_value)

            info = (utt_id, spe_key, mix_path, ds_path, ref_path, targ_ang, targ_dist, interf_angs, interf_dists, iSIR, iSNR, scale)

            mix2info.append(info)
            
            print("%d / %d: %s" % (num_utts, num_make_utts, mix_path))

            if num_utts >= num_make_utts:
                return noisy_scp_list, noisy_utt2spk, noisy_text_dict, mix2info
def perceptual_quality_evaluation(room_dim, mics, good_pos, good_index,
                                  bad_pos, bad_index, rir_location):
    print('start')

    import numpy as np
    from scipy.io import wavfile
    from os import getpid

    import pyroomacoustics as pra

    # number of sources to  consider
    n_sources = np.arange(1, 8)
    S = n_sources.shape[0]

    # number of mics
    n_mic = mics.shape[1]

    # Set the speed of sound to match that of the measured RIR
    pra.constants.set('c', 345.5)

    Fs = 8000.
    N = 1024
    Lg = int(0.03 * Fs)  # 30 ms long filter
    delay_bf = 0.02
    sigma2_n = 1e-6

    # reflection coefficients from the walls (hand-waving)
    reflection = {
        'ground': 0.8,
        'south': 0.8,
        'west': 0.8,
        'north': 0.8,
        'east': 0.8,
        'ceilling': 0.5
    }

    speech_sample1 = 'samples/fq_sample1_8000.wav'
    speech_sample2 = 'samples/fq_sample2_8000.wav'

    # Create the room
    room = pra.ShoeBox3D(np.zeros(3),
                         room_dim,
                         Fs,
                         max_order=1,
                         absorption=reflection,
                         sigma2_awgn=sigma2_n)

    # Create the beamformer
    bf = pra.Beamformer(mics, Fs, N=N, Lg=Lg)
    room.addMicrophoneArray(bf)

    # data receptacles
    beamformer_names = ['Rake Perceptual', 'Rake MVDR']
    bf_weights_fun = [bf.rakePerceptualFilters, bf.rakeMVDRFilters]
    bf_fnames = ['1', '2']
    NBF = len(beamformer_names)

    # receptacle arrays
    pesq_input = np.zeros(2)
    pesq_bf = np.zeros((2, NBF, S))

    # create a single reference mic at position of microphone 4
    ref_mic_n = 4
    ref_mic = pra.MicrophoneArray(bf.R[:, ref_mic_n, np.newaxis], Fs)

    # since we run multiple thread, we need to uniquely identify filenames
    pid = str(getpid())

    file_ref = 'output_samples/fqref' + pid + '.wav'
    file_suffix = '-' + pid + '.wav'
    files_bf = [
        'output_samples/fq' + str(i + 1) + file_suffix for i in range(NBF)
    ]
    file_raw = 'output_samples/fqraw' + pid + '.wav'

    # index of good and bad sources
    good = good_index
    bad = bad_index

    # Read the two speech samples used
    rate, good_signal = wavfile.read(speech_sample1)
    good_signal = np.array(good_signal, dtype='float64')
    good_signal = pra.normalize(good_signal)
    good_signal = pra.highpass(good_signal, rate)
    good_len = good_signal.shape[0] / float(Fs)

    rate, bad_signal = wavfile.read(speech_sample2)
    bad_signal = np.array(bad_signal, dtype='float64')
    bad_signal = pra.normalize(bad_signal)
    bad_signal = pra.highpass(bad_signal, rate)
    bad_len = bad_signal.shape[0] / float(Fs)

    # variance of good signal
    good_sigma2 = np.mean(good_signal**2)

    # normalize interference signal to have equal power with desired signal
    bad_signal *= good_sigma2 / np.mean(bad_signal**2)

    # distance from the array center to the desired source
    good_distance = np.linalg.norm(bf.center[:, 0] - good_pos)

    # distance from the array center to the interfering source
    bad_distance = np.linalg.norm(bf.center[:, 0] - bad_pos)

    if good_len > bad_len:
        good_delay = 0
        bad_delay = (good_len - bad_len) / 2.
    else:
        bad_delay = 0
        good_delay = (bad_len - good_len) / 2.

    # create the reference room for the free-space, noiseless, interference-free simulation
    ref_room = pra.ShoeBox3D([0, 0, 0], room_dim, Fs, max_order=0)
    ref_room.addSource(good_pos, signal=good_signal, delay=good_delay)
    ref_room.addMicrophoneArray(ref_mic)
    ref_room.compute_RIR()
    ref_room.simulate()
    reference = pra.highpass(ref_mic.signals[0], Fs)
    reference_n = pra.normalize(reference)

    # save the reference desired signal
    #wavfile.write(file_ref, Fs, pra.to_16b(reference_n))

    new_ref = good_signal.copy()
    new_ref = pra.normalize(pra.highpass(new_ref, Fs))
    wavfile.write(file_ref, Fs, pra.to_16b(new_ref))

    # add the sources to the 'real' room
    room.addSource(good_pos, signal=good_signal, delay=good_delay)
    room.addSource(bad_pos, signal=bad_signal, delay=bad_delay)

    # read in the RIR from file
    for r in range(n_mic):
        for s in [good_index, bad_index]:

            # read wav file
            fname_rir = rir_location % (r + 1, s + 1)
            rir_fs, rir = wavfile.read(fname_rir)
            rir = np.array(rir, dtype='float64')

            if rir_fs != Fs:
                raise NameError(
                    'The RIR and the signals do not have the same sampling rate.'
                )
                '''
                import scikits.samplerate as sr
                rir = sr.resample(rir, Fs/float(rir_fs), 'sinc_best')

                # the factor 2 was empirically determined to be necessary to get
                # amplitude of RIR in the correct ballpark.
                rir *= 2.
                '''

            room.rir.append([])
            room.rir[r].append(rir)

    # compute the input signal to the microphones
    room.simulate()

    # save degraded signal at reference microphone
    raw = bf.signals[ref_mic_n]
    raw_n = pra.normalize(pra.highpass(raw, Fs))
    wavfile.write(file_raw, Fs, pra.to_16b(raw_n))

    pesq_input = pra.pesq(file_ref, file_raw, Fs=Fs)

    for src in room.sources:
        src.setOrdering('strongest', ref_point=bf.center)

    for k, s in enumerate(n_sources):

        good_img = room.sources[0][:s]
        bad_img = room.sources[1][:s]

        for i, bfr in enumerate(beamformer_names):

            bf_weights_fun[i](good_img,
                              bad_img,
                              sigma2_n * np.eye(n_mic * Lg),
                              delay=delay_bf)

            # run beamformer
            output = bf.process()
            output = pra.normalize(pra.highpass(output, Fs))
            output = pra.time_align(reference_n, output)

            # save files for PESQ evaluation
            wavfile.write(files_bf[i], Fs, pra.to_16b(output))

            # compute PESQ
            pesq_bf[:, i, k] = pra.pesq(file_ref, files_bf[i], Fs=Fs).T
    ''' This is how you can compare the true RIRs with the image src model generated one
    plt.figure()
    for m in range(n_mic):

        rir_sim = room.sources[0].getRIR(mics[:,m], Fs)
        plt.subplot(3,3,m+1)
        plt.plot(room.rir[m][0][:rir_sim.shape[0]])
        plt.plot(rir_sim)

    plt.show()
    '''

    print('Finished')

    return pesq_input, pesq_bf
def process_experiment_max_sinr(SIR, mic, blinky, args):

    session = args.session
    target = args.target

    with open(metadata_file.format(session=args.session), 'r') as f:
        metadata = json.load(f)

    file_pattern = os.path.join(experiment_folder, metadata['filename_pattern'])

    with open(protocol_file.format(session=args.session), 'r') as f:
        protocol = json.load(f)

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read_in the mix signals
    fs_led, leds   = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=blinky, source='mix', fs=fs))
    fs_snd, audio  = wavfile.read(file_pattern.format(
        session=session, snr=SIR, mic=mic_choices[mic], source='mix', fs=fs))
    assert fs_led == fs_snd

    # read in the ref signals
    sources_ref  = dict(zip(target_choices,
        [ wavfile.read(file_pattern.format(
                session=session, mic=mic_choices[mic], snr=SIR, source=ch, fs=fs))[1]
            for ch in target_choices ]))
    leds_ref  = dict(zip(target_choices,
        [ wavfile.read(file_pattern.format(
                session=session, mic=blinky, snr=SIR, source=ch, fs=fs))[1]
            for ch in target_choices ]))

    # reorder with target in first position
    ref = np.array([sources_ref[target]] + [sources_ref[ch]
                for ch in target_choices if ch != target])

    noise_ref = np.zeros_like(sources_ref[target])
    n_ch = [ch for ch in target_choices if ch != target]
    for ch in n_ch:
        noise_ref += sources_ref[ch]

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = sources_ref[target] + noise_ref

    # get the geometry information to get nice plots.
    mics_geom = {
            'pyramic' : np.array(protocol['geometry']['microphones']['pyramic']['locations']),
            'camera'  : np.array(protocol['geometry']['microphones']['camera']['locations']),
            }

    mics_loc = np.array(protocol['geometry']['microphones'][mic_choices[mic]]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation', theta_noise / np.pi * 180)

    if 'pyramic' in mic:

        if mic == 'pyramic_2':
            I = pyramic_bss_2ch
        elif mic == 'pyramic_4':
            I = pyramic_bss_4ch
        elif mic == 'pyramic_24':
            I = list(range(8,16)) + list(range(24,32)) + list(range(40,48)) # flat part
        elif mic == 'pyramic_48':
            I = list(range(48))
        else:
            raise ValueError('Unsupported configuration')

        audio = audio[:,I]
        noise_ref = noise_ref[:,I].copy()
        ref = ref[:,:,I].copy()

        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None,:]
        mics_positions[:,2] -= np.max(mics_positions[:,2])
        mics_positions += mics_loc

    elif mic == 'camera':
        mics_positions = mics_geom['camera'].copy() + mics_loc


    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # adjust length of led signal if necessary
    if leds.shape[0] < audio.shape[0]:
        z_missing = audio.shape[0] - leds.shape[0]
        leds = np.pad(leds, (0,z_missing), 'constant')
    elif leds.shape[0] > audio.shape[0]:
        leds = leds[:audio.shape[0],]

    # perform VAD
    led_target = leds[:,blinky_source_map[target]]
    vad_snd = led_target > vad_thresh
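    # the LED (Blinky) signal follows the target speaker's sound level, so a simple
    # threshold on it yields a sample-level voice activity mask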

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i,v in enumerate(vad_snd):
            if np.any(vad_snd[max(i - vad_guard, 0):i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################

    print('STFT and stuff')
    sys.stdout.flush()

    a_win = pra.hann(nfft)
    s_win = pra.realtime.compute_synthesis_window(a_win, nfft // 2)

    engine = pra.realtime.STFT(nfft, nfft // 2,
            analysis_window=a_win, synthesis_window=s_win,
            channels=audio.shape[1])

    # Now compute the STFT of the microphone input
    X = engine.analysis(audio)
    X_time = np.arange(1, X.shape[0]+1) * (nfft / 2) / fs_snd

    X_speech = engine.analysis(audio * vad_guarded[:audio.shape[0],None])
    X_noise = engine.analysis(audio * (1 - vad_guarded[:audio.shape[0],None]))
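    # X_speech / X_noise: STFTs of the VAD-gated input, used below to estimate the
    # speech-dominated and noise-only spatial covariance matrices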

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################

    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise)) 
    Rall = Rs + Rn

    # compute the MaxSINR beamformer
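    # per frequency bin (DC excluded), take the principal generalized eigenvector of
    # (Rs + Rn, Rn); it maximizes w^H (Rs + Rn) w / w^H Rn w and hence the output SINR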
    w = [la.eigh(rs, b=rn, eigvals=(n_channels-1,n_channels-1))[1] for rs,rn in zip(Rall[1:], Rn[1:])]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10,:] /= nw[nw > 1e-10,None]
    w = np.concatenate([np.ones((1,n_channels)), w], axis=0)  # add dummy beamformer at DC

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w, X_speech, X_speech[:,:,0], clip_up=args.clip_gain)
        w *= z[:,None]


    ###########
    ## APPLY ##
    ###########

    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:,:2].T, fs=fs_snd, N=nfft, hop=nfft, zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = int(pra.tdoa(out, ref[0,:,0].astype(float), phat=True))
    print(delay)
    delay = np.abs(delay)
    if delay > 0:
        out_trunc = out[delay:delay+ref.shape[1]]
    else:
        out_trunc = np.concatenate((np.zeros(-delay), out[:ref.shape[1]+delay]))
    sig_eval = np.vstack([out_trunc] * len(target_choices))

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:,:,0], sig_eval[:,:,None])

    # we are only interested in SDR and SIR for the speech source
    ret = { 'Max-SINR' : {'SDR' : metric[0][0], 'SIR' : metric[2][0]} }


    #############################
    ## BLIND SOURCE SEPARATION ##
    #############################

    if mic in ['camera', 'pyramic_2', 'pyramic_4']:

        Y = pra.bss.auxiva(X, n_iter=40)
        bss = pra.realtime.synthesis(Y, nfft, nfft // 2, win=s_win)

        match = []
        for col in range(bss.shape[1]):
            xcorr = fast_corr(bss[:,col], ref[0,:,0])
            match.append(np.max(xcorr))
        best_col = np.argmax(match)

        # Not sure why the delay is sometimes negative here... Need to check more
        delay = np.abs(int(pra.tdoa(bss[:,best_col], ref[0,:,0].astype(float), phat=True)))
        if delay > 0:
            bss_trunc = bss[delay:delay+ref.shape[1],]
        elif delay < 0:
            bss_trunc = np.concatenate((np.zeros((-delay, bss.shape[1])), bss[:ref.shape[1]+delay]))
        else:
            bss_trunc = bss[:ref.shape[1],]

        if ref.shape[1] > bss_trunc.shape[0]:
            ref_lim = bss_trunc.shape[0]
        else:
            ref_lim = ref.shape[1]

        if mic in ['camera', 'pyramic_2']:
            bss_trunc = np.hstack([bss_trunc] * 2)

        metric = bss_eval_images(ref[:,:ref_lim,0,None], bss_trunc.T[:,:,None])
        SDR_bss = metric[0][0]
        SIR_bss = metric[2][0]
        ret['BSS'] = { 'SDR' : metric[0][0], 'SIR' : metric[2][0] }

    #################################
    ## Estimate SDR and SIR of mix ##
    #################################

    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(int(pra.tdoa(audio[:,0], ref[0,:,0].astype(float), phat=True)))
    if delay > 0:
        audio_trunc = audio[delay:delay+ref.shape[1],0]
    elif delay < 0:
        audio_trunc = np.concatenate((np.zeros(-delay), audio[:ref.shape[1]+delay,0]))
    else:
        audio_trunc = audio[:ref.shape[1],0]

    if ref.shape[1] > audio_trunc.shape[0]:
        ref_lim = audio_trunc.shape[0]
    else:
        ref_lim = ref.shape[1]

    audio_trunc = np.vstack([audio_trunc] * len(ref))

    metric = bss_eval_images(ref[:,:ref_lim,0,None], audio_trunc[:,:,None])
    SDR_bss = metric[0][0]
    SIR_bss = metric[2][0]
    ret['Mix'] = { 'SDR' : metric[0][0], 'SIR' : metric[2][0] }

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        if not os.path.exists(args.save_sample):
            os.makedirs(args.save_sample)

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            upper = np.max([audio[:,0].max(), out.max(), bss.max(), ref[0,:,0].max()])
        else:
            upper = np.max([audio[:,0].max(), out.max(), ref[0,:,0].max()])


        # Clean signal for reference
        sig_ref = pra.highpass(ref[0,:,0].astype(float) / upper, fs_snd, fc=150)
        f0 = os.path.join(args.save_sample, '{}_ref_SIR_NA_dB.wav'.format(mic))
        wavfile.write(f0, fs_snd, sig_ref)

        # Mix signal for reference
        sig_mix = pra.highpass(audio[:,0].astype(float) / upper, fs_snd, fc=150)
        f1 = os.path.join(args.save_sample, '{}_mix_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_mix)

        # Output of MaxSINR
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)
        f2 = os.path.join(args.save_sample, '{}_maxsinr_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

        # Output of BSS
        if mic in ['camera', 'pyramic_2', 'pyramic_4']:
            sig_bss = pra.highpass(bss[:,best_col] / upper, fs_snd, fc=150)
            f3 = os.path.join(args.save_sample, '{}_bss_SIR_{}_dB.wav'.format(mic, SIR))
            wavfile.write(f3, fs_snd, sig_bss)


    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(ref[0,:,0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(led_target.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, led_target, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(led_target)

        plt.figure()
        plt.plot(audio_time, audio[:,0], 'b') 
        plt.plot(led_time, led_target * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio','VAD'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:,0])
        plt.plot(a_time, out_trunc)
        plt.legend(['channel 0', 'beamformer output', 'speech reference'])

        '''
        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines([180+np.degrees(theta_speech), 180-np.degrees(theta_noise)], 0, nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2], fs=16000, max_order=1)

        room.add_source(noise_loc[:2])   # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(protocol['geometry']['speakers']['locations'][1][:2])  # signal

        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])
        '''

        plt.figure()
        mic_array.plot()

        plt.show()


    # Return SDR and SIR
    return ret
Lgp = np.floor(0.4 * Lg)
Lgm = Lg - Lgp
print('Lg=', Lg)

# create a microphone array
if shape == 'Circular':
    R = circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N, Lg=Lg, hop=hop, zpf=zp, zpb=zp)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D([0, 0],
                           room_dim,
                           Fs,
                           t0=t0,
                           max_order=max_order_sim,
Example #11
def modify_input_wav_beamforming(wav, noise, room_dim, max_order, snr_vals,
                                 mic_array, pos_source, pos_noise, N):

    fs_s, audio_anechoic = wavfile.read(wav)
    fs_n, noise_anechoic = wavfile.read(noise)

    #Create a room for the signal
    room_signal = pra.ShoeBox(room_dim,
                              absorption=0.2,
                              fs=fs_s,
                              max_order=max_order)

    #Create a room for the noise
    room_noise = pra.ShoeBox(room_dim,
                             absorption=0.2,
                             fs=fs_n,
                             max_order=max_order)

    # sources of the signal and of the noise in their respective rooms
    room_signal.add_source(pos_source, signal=audio_anechoic)
    room_noise.add_source(pos_noise, signal=noise_anechoic)

    #add the microphone array
    mics_signal = pra.Beamformer(mic_array, room_signal.fs, N)
    mics_noisy = pra.Beamformer(mic_array, room_noise.fs, N)
    room_signal.add_microphone_array(mics_signal)
    room_noise.add_microphone_array(mics_noisy)

    #simulate both rooms
    room_signal.simulate()
    room_noise.simulate()

    #take the mic_array.signals from each room
    audio_reverb = room_signal.mic_array.signals
    noise_reverb = room_noise.mic_array.signals

    #design beamforming filters
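    # delay-and-sum weights are steered at the direct-path image of the target source;
    # the same steering is applied to the noise-room beamformer so that both outputs
    # go through identical spatial processing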
    mics_signal.rake_delay_and_sum_weights(room_signal.sources[0][:1])
    mics_noisy.rake_delay_and_sum_weights(room_signal.sources[0][:1])

    output_signal = mics_signal.process()
    output_noise = mics_noisy.process()

    #we're going to normalize the noise
    size = np.shape(audio_reverb)
    noise_normalized = np.zeros(size)

    # the noise recording must be at least as long as the audio signal
    if len(noise_reverb[0]) < len(audio_reverb[0]):
        raise ValueError(
            'the noise signal is shorter than the audio signal')
    output_noise = output_noise[:len(output_signal)]

    norm_fact = np.linalg.norm(noise_reverb[-1])
    noise_normalized = output_noise / norm_fact

    # initialize the array of noisy signals
    noisy_signal = np.zeros([len(snr_vals), np.shape(output_signal)[0]])

    for i, snr in enumerate(snr_vals):
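        # rescale the noise so that, relative to the norm of the last reverberant
        # audio channel, it sits at the requested SNR (in dB)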
        noise_std = np.linalg.norm(audio_reverb[-1]) / (10**(snr / 20.))
        final_noise = noise_normalized * noise_std
        noisy_signal[i] = pra.normalize(
            pra.highpass(output_signal + final_noise, fs_s))

    return noisy_signal
Example #12
    under different SNRs.
    """
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # truncate beamformed noise
    noise_bf = noise_bf[:len(speech_bf)]

    # compute score for different SNR vals
    print()
    score_beamformed = np.empty(len(snr_vals))
    score_single = np.empty(len(snr_vals))
    for idx, snr in enumerate(snr_vals):

        noisy_signal = speech_bf + snr_facts[idx] * noise_bf
        noisy_signal = pra.normalize(pra.highpass(noisy_signal, fs_s),
                                     bits=16).astype(np.int16)
        dest = os.path.join(dest_dir, "das_bf_snr_db_%d.wav" % (snr))
        wavfile.write(dest, fs_s, noisy_signal)
        score_beamformed[idx] = label_wav(dest, labels_file, graph_file,
                                          speech_samp.meta.word)

        # compute score for single mic for reference
        single_mic = ref_mic_sig + snr_facts[idx] * ref_mic_noise
        single_mic = pra.normalize(pra.highpass(single_mic, fs_s),
                                   bits=16).astype(np.int16)
        dest = os.path.join(dest_dir, "single_mic_snr_db_%d.wav" % (snr))
        wavfile.write(dest, fs_s, single_mic)
        score_single[idx] = label_wav(dest, labels_file, graph_file,
                                      speech_samp.meta.word)
def perceptual_quality_evaluation(room_dim, mics, good_pos, good_index, bad_pos, bad_index, rir_location):
    print('start')

    import numpy as np
    from scipy.io import wavfile
    from os import getpid

    import pyroomacoustics as pra

    # number of sources to  consider
    n_sources = np.arange(1,8)
    S = n_sources.shape[0]

    # number of mics
    n_mic = mics.shape[1]

    # Set the speed of sound to match that of the measured RIR
    pra.constants.set('c', 345.5)

    Fs = 8000.
    N = 1024
    Lg = int(0.03*Fs)  # 30 ms long filter
    delay_bf = 0.02
    sigma2_n = 1e-6

    # reflection coefficients from the walls (hand-waving)
    reflection = {'ground':0.8, 'south':0.8, 'west':0.8, 'north':0.8, 'east':0.8, 'ceilling':0.5}

    speech_sample1 = 'samples/fq_sample1_8000.wav'
    speech_sample2 = 'samples/fq_sample2_8000.wav'

    # Create the room
    room = pra.ShoeBox3D(np.zeros(3), room_dim, Fs, 
            max_order=1, 
            absorption=reflection,
            sigma2_awgn=sigma2_n)

    # Create the beamformer
    bf = pra.Beamformer(mics, Fs, N=N, Lg=Lg)
    room.addMicrophoneArray(bf)

    # data receptacles
    beamformer_names = ['Rake Perceptual',
                        'Rake MVDR']
    bf_weights_fun   = [bf.rakePerceptualFilters,
                        bf.rakeMVDRFilters]
    bf_fnames = ['1','2']
    NBF = len(beamformer_names)

    # receptacle arrays
    pesq_input = np.zeros(2)
    pesq_bf = np.zeros((2,NBF,S))

    # create a single reference mic at position of microphone 4
    ref_mic_n = 4
    ref_mic = pra.MicrophoneArray(bf.R[:,ref_mic_n,np.newaxis], Fs)

    # since we run multiple thread, we need to uniquely identify filenames
    pid = str(getpid())

    file_ref  = 'output_samples/fqref' + pid + '.wav'
    file_suffix = '-' + pid + '.wav'
    files_bf = ['output_samples/fq' + str(i+1) + file_suffix for i in range(NBF)]
    file_raw  = 'output_samples/fqraw' + pid + '.wav'

    # index of good and bad sources
    good = good_index
    bad =  bad_index

    # Read the two speech samples used
    rate, good_signal = wavfile.read(speech_sample1)
    good_signal = np.array(good_signal, dtype='float64')
    good_signal = pra.normalize(good_signal)
    good_signal = pra.highpass(good_signal, rate)
    good_len = good_signal.shape[0]/float(Fs)

    rate, bad_signal = wavfile.read(speech_sample2)
    bad_signal = np.array(bad_signal, dtype='float64')
    bad_signal = pra.normalize(bad_signal)
    bad_signal = pra.highpass(bad_signal, rate)
    bad_len = bad_signal.shape[0]/float(Fs)

    # variance of good signal
    good_sigma2 = np.mean(good_signal**2)

    # normalize interference signal to have equal power with desired signal
    bad_signal *= good_sigma2/np.mean(bad_signal**2)

    # distance from the array center to the desired source
    good_distance = np.linalg.norm(bf.center[:,0] - good_pos)

    # distance from the array center to the interfering source
    bad_distance = np.linalg.norm(bf.center[:,0] - bad_pos)

    if good_len > bad_len:
        good_delay = 0
        bad_delay = (good_len - bad_len)/2.
    else:
        bad_delay = 0
        good_delay = (bad_len - good_len)/2.


    # create the reference room for the free-space, noiseless, interference-free simulation
    ref_room = pra.ShoeBox3D(
        [0,0,0],
        room_dim,
        Fs,
        max_order=0)
    ref_room.addSource(good_pos, signal=good_signal, delay=good_delay)
    ref_room.addMicrophoneArray(ref_mic)
    ref_room.compute_RIR()
    ref_room.simulate()
    reference = pra.highpass(ref_mic.signals[0], Fs)
    reference_n = pra.normalize(reference)

    # save the reference desired signal
    #wavfile.write(file_ref, Fs, pra.to_16b(reference_n))

    new_ref = good_signal.copy()
    new_ref = pra.normalize(pra.highpass(new_ref, Fs))
    wavfile.write(file_ref, Fs, pra.to_16b(new_ref))

    # add the sources to the 'real' room
    room.addSource(good_pos, signal=good_signal, delay=good_delay)
    room.addSource(bad_pos, signal=bad_signal, delay=bad_delay)

    # read in the RIR from file
    for r in range(n_mic):
        for s in [good_index, bad_index]:

            # read wav file
            fname_rir = rir_location % (r+1,s+1)
            rir_fs,rir = wavfile.read(fname_rir)
            rir = np.array(rir, dtype='float64')

            if rir_fs != Fs:
                raise NameError('The RIR and the signals do not have the same sampling rate.')
                '''
                import scikits.samplerate as sr
                rir = sr.resample(rir, Fs/float(rir_fs), 'sinc_best')

                # the factor 2 was empirically determined to be necessary to get
                # amplitude of RIR in the correct ballpark.
                rir *= 2.
                '''

            room.rir.append([])
            room.rir[r].append(rir)

    # compute the input signal to the microphones
    room.simulate()

    # save degraded signal at reference microphone
    raw = bf.signals[ref_mic_n]
    raw_n = pra.normalize(pra.highpass(raw, Fs))
    wavfile.write(file_raw, Fs, pra.to_16b(raw_n))

    pesq_input = pra.pesq(file_ref, file_raw, Fs=Fs)

    for src in room.sources:
        src.setOrdering('strongest', ref_point=bf.center)

    for k,s in enumerate(n_sources):

        good_img = room.sources[0][:s]
        bad_img = room.sources[1][:s]

        for i, bfr in enumerate(beamformer_names):

            bf_weights_fun[i](good_img, bad_img, sigma2_n*np.eye(n_mic*Lg), delay=delay_bf)

            # run beamformer
            output = bf.process()
            output = pra.normalize(pra.highpass(output, Fs))
            output = pra.time_align(reference_n, output)

            # save files for PESQ evaluation
            wavfile.write(files_bf[i], Fs, pra.to_16b(output))

            # compute PESQ
            pesq_bf[:,i,k] = pra.pesq(file_ref, files_bf[i], Fs=Fs).T

    ''' This is how you can compare the true RIRs with the image src model generated one
    plt.figure()
    for m in range(n_mic):

        rir_sim = room.sources[0].getRIR(mics[:,m], Fs)
        plt.subplot(3,3,m+1)
        plt.plot(room.rir[m][0][:rir_sim.shape[0]])
        plt.plot(rir_sim)

    plt.show()
    '''

    print('Finished')

    return pesq_input, pesq_bf
Example #14
def process_experiment_max_sinr(SIR, mic, args):

    nfft = args.nfft
    vad_guard = args.vad_guard
    if args.thresh is None:
        vad_thresh = thresh_opt[SIR]
    else:
        vad_thresh = args.thresh

    # read_in the mix signals
    fs_led, leds = wavfile.read(
        file_pattern.format('camera_leds_zero_hold', 'mix', SIR))
    fs_snd, audio = wavfile.read(
        file_pattern.format(mic_choices[mic], 'mix', SIR))
    assert fs_led == fs_snd

    # read in the ref signals
    r, noise_ref = wavfile.read(
        file_pattern.format(mic_choices[mic], 'noise_ref', SIR))
    assert r == fs_snd
    r, speech_ref = wavfile.read(file_speech_ref.format(mic_choices[mic]))
    assert r == fs_snd
    r, leds_ref = wavfile.read(file_speech_ref.format('camera_leds_zero_hold'))
    assert r == fs_snd

    # In case of objective evaluation, we do an artificial mix
    if args.synth_mix:
        audio = noise_ref + speech_ref

    # get the geometry information to get nice plots.
    mics_loc = np.array(protocol['geometry']['microphones'][mic]['reference'])
    noise_loc = protocol['geometry']['speakers']['locations'][0]
    speech_loc = protocol['geometry']['speakers']['locations'][1]

    # the directions of arrival
    theta_speech = 0
    p0 = speech_loc - mics_loc
    p1 = noise_loc - mics_loc
    theta_noise = np.arccos(np.inner(p0, p1) / la.norm(p0) / la.norm(p1))
    print('Source separation:', np.degrees(theta_noise), 'degrees')

    if mic == 'pyramic':
        I = list(range(8, 16)) + list(range(24, 32)) + list(range(
            40, 48))  # flat part
        #I = list(range(24,32)) + list(range(40,48)) # flat part
        #I = list(range(8,16))
        #I = list(range(48))
        audio = audio[:, I]
        noise_ref = noise_ref[:, I].copy()
        speech_ref = speech_ref[:, I].copy()
        mics_positions = mics_geom['pyramic'][I].copy()
        # place in room 2-806
        mics_positions -= np.mean(mics_positions, axis=0)[None, :]
        mics_positions[:, 2] -= np.max(mics_positions[:, 2])
        mics_positions += mics_loc

    elif mic == 'olympus':
        mics_positions = mics_geom['olympus'].copy() + mics_loc

    n_samples = audio.shape[0]  # shorthand
    n_channels = audio.shape[1]

    # perform VAD
    vad_snd = leds > vad_thresh

    # Now we want to make sure no speech goes into the estimation of the noise covariance matrix.
    # For that we will remove frames neighbouring the detected speech
    vad_guarded = vad_snd.copy()
    if vad_guard is not None:
        for i, v in enumerate(vad_snd):
            if np.any(vad_snd[max(i - vad_guard, 0):i + vad_guard]):
                vad_guarded[i] = True

    ##############################
    ## STFT and frame-level VAD ##
    ##############################

    print('STFT and stuff')
    sys.stdout.flush()

    engine = pra.realtime.STFT(nfft,
                               nfft // 2,
                               pra.hann(nfft),
                               channels=audio.shape[1])

    def analysis(x):
        engine.analysis(x)
        return np.moveaxis(engine.X, 1, 0)
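    # after the axis swap, analysis() returns the multichannel STFT with shape
    # (n_frames, n_freq, n_channels), the layout assumed by the time axis and
    # the covariance computations below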

    # Now compute the STFT of the microphone input
    X = analysis(audio)
    X_time = np.arange(1, X.shape[0] + 1) * (nfft / 2) / fs_snd

    X_speech = analysis(audio * vad_guarded[:, None])
    X_noise = analysis(audio * (1 - vad_guarded[:, None]))

    S_ref = analysis(speech_ref)
    N_ref = analysis(noise_ref)

    ##########################
    ## MAX SINR BEAMFORMING ##
    ##########################

    print('Max SINR beamformer computation')
    sys.stdout.flush()

    # covariance matrices from noisy signal
    Rs = np.einsum('i...j,i...k->...jk', X_speech, np.conj(X_speech))
    Rn = np.einsum('i...j,i...k->...jk', X_noise, np.conj(X_noise))
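    # each einsum sums the outer products X[t, f, :] X[t, f, :]^H over the
    # frames t, yielding one (n_channels x n_channels) spatial covariance
    # matrix per frequency bin for the speech- and noise-dominated frames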

    # compute covariances with reference signals to check everything is working correctly
    #Rs = np.einsum('i...j,i...k->...jk', S_ref, np.conj(S_ref))
    #Rn = np.einsum('i...j,i...k->...jk', N_ref, np.conj(N_ref))

    # compute the MaxSINR beamformer
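    # for every bin except DC, the weight vector maximizes
    #   SINR(w) = (w^H Rs w) / (w^H Rn w),
    # i.e. it is the generalized eigenvector of (Rs, Rn) with the largest
    # eigenvalue; the weights are normalized below and the DC bin gets a flat
    # weight of one on every channel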
    w = [
        la.eigh(rs, b=rn, eigvals=(n_channels - 1, n_channels - 1))[1]
        for rs, rn in zip(Rs[1:], Rn[1:])
    ]
    w = np.squeeze(np.array(w))
    nw = la.norm(w, axis=1)
    w[nw > 1e-10, :] /= nw[nw > 1e-10, None]
    w = np.concatenate([np.ones((1, n_channels)), w], axis=0)

    if not args.no_norm:
        # normalize with respect to input signal
        z = compute_gain(w,
                         X_speech,
                         X_speech[:, :, 0],
                         clip_up=args.clip_gain)
        w *= z[:, None]

    ###########
    ## APPLY ##
    ###########

    print('Apply beamformer')
    sys.stdout.flush()

    # 2D beamformer
    mic_array = pra.Beamformer(mics_positions[:, :2].T,
                               fs=fs_snd,
                               N=nfft,
                               hop=nfft,
                               zpb=nfft)
    mic_array.signals = audio.T
    mic_array.weights = w.T

    out = mic_array.process()

    # Signal alignment step
    ref = np.vstack([speech_ref[:, 0], noise_ref[:, 0]])
    # Not sure why the delay is sometimes negative here... Need to check more
    delay = np.abs(
        int(pra.tdoa(out, speech_ref[:, 0].astype(np.float), phat=True)))
    if delay > 0:
        out_trunc = out[delay:delay + ref.shape[1]]
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    else:
        out_trunc = np.concatenate(
            (np.zeros(-delay), out[:ref.shape[1] + delay]))
        noise_eval = audio[:ref.shape[1], 0] - out_trunc
    sig_eval = np.vstack([out_trunc, noise_eval])

    # We use the BSS eval toolbox
    metric = bss_eval_images(ref[:, :, None], sig_eval[:, :, None])
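    # bss_eval_images returns (SDR, ISR, SIR, SAR, permutation), each with one
    # entry per source; index 0 corresponds to the speech source here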

    # we are only interested in SDR and SIR for the speech source
    SDR_out = metric[0][0]
    SIR_out = metric[2][0]

    ##################
    ## SAVE SAMPLES ##
    ##################

    if args.save_sample is not None:

        # for informal listening tests, we need to high pass and normalize the
        # amplitude.
        upper = np.maximum(audio[:, 0].max(), out.max())
        sig_in = pra.highpass(audio[:, 0].astype(np.float) / upper,
                              fs_snd,
                              fc=150)
        sig_out = pra.highpass(out / upper, fs_snd, fc=150)

        f1 = os.path.join(args.save_sample,
                          '{}_ch0_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f1, fs_snd, sig_in)
        f2 = os.path.join(args.save_sample,
                          '{}_out_SIR_{}_dB.wav'.format(mic, SIR))
        wavfile.write(f2, fs_snd, sig_out)

    ##########
    ## PLOT ##
    ##########

    if args.plot:

        plt.figure()
        plt.plot(out_trunc)
        plt.plot(speech_ref[:, 0])
        plt.legend(['output', 'reference'])

        # time axis for plotting
        led_time = np.arange(leds.shape[0]) / fs_led + 1 / (2 * fs_led)
        audio_time = np.arange(n_samples) / fs_snd

        plt.figure()
        plt.plot(led_time, leds, 'r')
        plt.title('LED signal')

        # match the scales of VAD and light to sound before plotting
        q_vad = np.max(audio)
        q_led = np.max(audio) / np.max(leds)

        plt.figure()
        plt.plot(audio_time, audio[:, 0], 'b')
        plt.plot(led_time, leds * q_led, 'r')
        plt.plot(audio_time, vad_snd * q_vad, 'g')
        plt.plot(audio_time, vad_guarded * q_vad, 'g--')
        plt.legend(['audio', 'LED', 'VAD', 'VAD guarded'])
        plt.title('LED and audio signals')

        plt.figure()
        a_time = np.arange(audio.shape[0]) / fs_snd
        plt.plot(a_time, audio[:, 0])
        plt.plot(a_time, out_trunc)
        #plt.plot(a_time, speech_ref[:,0])
        plt.legend(['channel 0', 'beamformer output'])  # add 'speech reference' if the plot above is uncommented

        plt.figure()
        mic_array.plot_beam_response()
        plt.vlines(
            [180 + np.degrees(theta_speech), 180 - np.degrees(theta_noise)], 0,
            nfft // 2)

        room = pra.ShoeBox(protocol['geometry']['room'][:2],
                           fs=16000,
                           max_order=1)

        room.add_source(noise_loc[:2])  # noise
        room.add_source(speech_loc[:2])  # speech
        room.add_source(
            protocol['geometry']['speakers']['locations'][1][:2])  # signal

        room.add_microphone_array(mic_array)
        room.plot(img_order=1, freq=[800, 1000, 1200, 1400, 1600, 2500, 4000])

        plt.figure()
        mic_array.plot()

        plt.show()

    # Return SDR and SIR
    return SDR_out, SIR_out
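
# A minimal driver sketch (not part of the original script) showing how
# process_experiment_max_sinr might be invoked; the argument names mirror the
# attributes read inside the function, and the SIR values are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='MaxSINR beamforming experiment')
    parser.add_argument('--mic', choices=['pyramic', 'olympus'], default='pyramic')
    parser.add_argument('--nfft', type=int, default=2048)
    parser.add_argument('--vad_guard', type=int, default=None)
    parser.add_argument('--thresh', type=float, default=None)
    parser.add_argument('--clip_gain', type=float, default=None)
    parser.add_argument('--synth_mix', action='store_true')
    parser.add_argument('--no_norm', action='store_true')
    parser.add_argument('--save_sample', type=str, default=None)
    parser.add_argument('--plot', action='store_true')
    args = parser.parse_args()

    for SIR in [0, 5, 10]:  # hypothetical SIR conditions
        SDR_out, SIR_out = process_experiment_max_sinr(SIR, args.mic, args)
        print('SIR_in = {} dB -> SDR = {:.2f} dB, SIR = {:.2f} dB'.format(
            SIR, SDR_out, SIR_out))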
# define the FFT length
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d*M/(2*np.pi))
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_'+str(Fs)+'.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_'+str(Fs)+'.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D(
    [0,0],
    room_dim,
    Fs,
    t0 = t0,
N = 1024

# create a microphone array
if shape == 'Circular':
    R = pra.circular2DArray(mic1, M, phi, d * M / (2 * np.pi))
elif shape == 'Poisson':
    R = pra.poisson2DArray(mic1, M, d)
else:
    R = pra.linear2DArray(mic1, M, phi, d)
mics = pra.Beamformer(R, Fs, N=N, Lg=Lg)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read('samples/singing_' + str(Fs) + '.wav')
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.

# the second signal (interferer) is some german speech
rate2, signal2 = wavfile.read('samples/german_speech_' + str(Fs) + '.wav')
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.

# create the room with sources and mics
room1 = pra.Room.shoeBox2D([0, 0],
                           room_dim,
                           Fs,
                           t0=t0,
                           max_order=max_order_sim,
Example #17
N = 1024

# Create a microphone array
if shape == "Circular":
    R = pra.circular_2D_array(mic1, M, phi, d * M / (2 * np.pi))
else:
    R = pra.linear_2D_array(mic1, M, phi, d)

# path to samples
path = os.path.dirname(__file__)

# The first signal (of interest) is singing
rate1, signal1 = wavfile.read(path + "/input_samples/singing_" + str(Fs) + ".wav")
signal1 = np.array(signal1, dtype=float)
signal1 = pra.normalize(signal1)
signal1 = pra.highpass(signal1, Fs)
delay1 = 0.0

# The second signal (interferer) is some german speech
rate2, signal2 = wavfile.read(path + "/input_samples/german_speech_" + str(Fs) + ".wav")
signal2 = np.array(signal2, dtype=float)
signal2 = pra.normalize(signal2)
signal2 = pra.highpass(signal2, Fs)
delay2 = 1.0

# Create the room
room_dim = [4, 6]
room1 = pra.ShoeBox(
    room_dim,
    absorption=absorption,
    fs=Fs,
Example #18
def parallel_loop(filename, algo_names, pmt):
    '''
    This is one loop of the computation
    extracted for parallelization
    '''

    # We need to do a bunch of imports
    import pyroomacoustics as pra
    import os
    import numpy as np
    from scipy.io import wavfile
    import mkl as mkl_service
    import copy

    import doa
    from tools import rfft

    # for such parallel processing, it is better 
    # to deactivate multithreading in mkl
    mkl_service.set_num_threads(1)

    # extract the speaker names from the filename
    name = os.path.splitext(os.path.basename(filename))[0]
    sources = name.split('-')

    # number of sources
    K = len(sources)

    # Import speech signal
    fs_file, rec_signals = wavfile.read(filename)

    # sanity check
    if pmt['fs'] != fs_file:
        raise ValueError("The sampling frequency of the files doesn't match that of the script")
    
    speech_signals = np.array(rec_signals[:,pmt['mic_select']], dtype=np.float32)

    # Remove the DC bias
    for s in speech_signals.T:
        s[:] = pra.highpass(s, pmt['fs'], 100.)

    if pmt['stft_win']:
        stft_win = np.hanning(pmt['nfft'])
    else:
        stft_win = None

    # Normalize the amplitude
    speech_signals *= pmt['scaling']

    # Compute STFT of signal
    # -------------------------
    y_mic_stft = []
    for k in range(speech_signals.shape[1]):
        y_stft = pra.stft(speech_signals[:, k], pmt['nfft'], pmt['stft_hop'],
                          transform=rfft, win=stft_win).T / np.sqrt(pmt['nfft'])
        y_mic_stft.append(y_stft)
    y_mic_stft = np.array(y_mic_stft)
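    # y_mic_stft stacks one STFT per channel; assuming pra.stft returns frames
    # along the first axis, the stacked array has shape
    # (n_channels, n_freq, n_frames)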

    # estimate SNR in dB (on 1st microphone)
    sig_var = np.var(speech_signals)
    SNR = 10*np.log10( (sig_var - pmt['noise_var']) / pmt['noise_var'] )
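    # the recording variance is treated as signal power plus noise power, so the
    # speech power is estimated by subtracting the pre-measured noise variance
    # before forming the ratio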

    freq_bins = copy.copy(pmt['freq_bins'][K-1])

    # dict for output
    phi_recon = {}

    for alg in algo_names:

        # Use the convenient dictionary of algorithms defined
        d = doa.algos[alg](
                L=pmt['mic_array'], 
                fs=pmt['fs'], 
                nfft=pmt['nfft'], 
                num_src=K, 
                c=pmt['c'], 
                theta=pmt['phi_grid'], 
                max_four=pmt['M'], 
                num_iter=pmt['num_iter'],
                G_iter = pmt['G_iter']
                )

        # perform localization
        d.locate_sources(y_mic_stft, freq_bins=freq_bins[alg])

        # store result
        phi_recon[alg] = d.phi_recon

    return SNR, sources, phi_recon
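
# A minimal sketch (an assumption, not part of the original script) of how
# parallel_loop might be mapped over many recordings with multiprocessing;
# 'filenames', 'algo_names' and the parameter dict 'pmt' are assumed to be
# prepared by the caller.
if __name__ == '__main__':
    import functools
    import multiprocessing as mp

    with mp.Pool(processes=4) as pool:
        results = pool.map(
            functools.partial(parallel_loop, algo_names=algo_names, pmt=pmt),
            filenames)

    for SNR, sources, phi_recon in results:
        print(sources, 'SNR = {:.1f} dB'.format(SNR), phi_recon)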
Example #19
    mics = pra.Beamformer(echo, Fs, N=fft_len, Lg=Lg)
    room1.add_microphone_array(mics)
    room1.add_source(source, delay=0, signal=xtone)
    room1.add_source(interferer, delay=0, signal=silence)
    room1.image_source_model(use_libroom=True)
    room1.compute_rir()
    room1.simulate()

    # Rake MVDR simulation
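    # rake MVDR combines the direct path and the early echoes (image sources up
    # to max_order_design) of the desired source, while treating the
    # interferer's image sources and the white-noise term sigma2_n as
    # interference to be minimized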
    BeamformerType = 'RakeMVDR'
    good_sources = room1.sources[0][:max_order_design + 1]
    bad_sources = room1.sources[1][:max_order_design + 1]
    mics.rake_mvdr_filters(good_sources, bad_sources,
                           sigma2_n * np.eye(mics.Lg * mics.M))
    output = mics.process()
    out = pra.normalize(pra.highpass(output, Fs))
    out = normalize(out)

    # Rake Perceptual simulation
    # BeamformerType = 'RakePerceptual'
    # good_sources = room1.sources[0][:max_order_design+1]
    # bad_sources = room1.sources[1][:max_order_design+1]
    # mics.rake_perceptual_filters(good_sources,
    #                     bad_sources,
    #                     sigma2_n*np.eye(mics.Lg*mics.M))
    # output          =   mics.process()
    # out             =   pra.normalize(pra.highpass(output, Fs))

    input_mic = pra.normalize(pra.highpass(mics.signals[mics.M // 2], Fs))
    input_mic = normalize(input_mic)