def calc_pesq(ref_sig, deg_sig, samplerate, is_file=False):
    '''
    Compute the PESQ (Perceptual Evaluation of Speech Quality) score.

    Runs the external executable at ``PESQ_PATH`` on a reference/degraded
    pair and parses the score that follows the
    ``'Prediction : PESQ_MOS = '`` marker in its standard output.

    Args:
        ref_sig: reference signal. A path to an audio file when ``is_file``
            is true; otherwise audio data handed to
            ``audio_tool.write_audio``.
        deg_sig: degraded signal, same convention as ``ref_sig``.
        samplerate: sample rate, passed to the executable as ``+<rate>``.
        is_file: whether ``ref_sig`` / ``deg_sig`` are already file paths.

    Returns:
        The parsed score as a float (a higher score is better).
        ``0`` on Windows, where the computation is not supported yet.
        ``2.0`` when no score can be read from the executable's output.
    '''
    import shlex  # local import: the file-level import block is not touched

    if 'Windows' in platform.system():
        # PESQ computation is not supported on Windows yet.
        return 0

    def run_pesq(ref_path, deg_path):
        # Quote the file paths so spaces / shell metacharacters in them
        # cannot break (or inject into) the command line.
        command = '%s +%d %s %s' % (PESQ_PATH, samplerate,
                                    shlex.quote(str(ref_path)),
                                    shlex.quote(str(deg_path)))
        # The context manager closes the pipe even if read() fails.
        with os.popen(command) as pipe:
            return pipe.read()

    if is_file:
        msg = run_pesq(ref_sig, deg_sig)
    else:
        # The `with` block removes both temporary files even when writing
        # the audio or running the executable raises.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=True) as tmp_ref, \
                tempfile.NamedTemporaryFile(suffix='.wav', delete=True) as tmp_deg:
            audio_tool.write_audio(tmp_ref.name, ref_sig, samplerate)
            audio_tool.write_audio(tmp_deg.name, deg_sig, samplerate)
            msg = run_pesq(tmp_ref.name, tmp_deg.name)

    score = msg.split('Prediction : PESQ_MOS = ')
    if len(score) <= 1:
        print('calculate error.')
        return 2.0

    # Take the first whitespace-delimited token after the marker instead of
    # blindly dropping the last character, which would truncate the score
    # when the output does not end with a newline.
    tokens = score[1].split()
    try:
        return float(tokens[0])
    except (IndexError, ValueError):
        print('calculate error.')
        return 2.0
def _addnoise_and_decoder_one_batch(i_p, speaker_id, sub_process_speaker_num,
                                    waves_dir, noise_dir, sess, model):
    """
    Add noise to every wav under ``waves_dir``, enhance it, and save both.

    For each file in ``waves_dir`` a randomly chosen file from ``noise_dir``
    is mixed in, the noisy wav is written, the model estimates the enhanced
    magnitude spectrum one utterance at a time, and the enhanced wav is
    written. Output paths are derived from the input path by replacing the
    first ``'wav'`` with the module-level ``addnoise_dir_name`` /
    ``enhanced_dir_name`` and ``root_dir`` with ``new_root_dir``.

    Args:
        i_p: worker index (printed as ``i_p + 1``).
        speaker_id: progress counter printed in the report.
        sub_process_speaker_num: progress total printed in the report.
        waves_dir: directory holding the clean wav files.
        noise_dir: directory holding the noise wav files.
        sess: session object whose ``run`` evaluates the model.
        model: object exposing ``x_mag``, ``x_theta``, ``lengths`` and
            ``y_mag_estimation``.

    Raises:
        ValueError: if ``PARAM.RESTORE_PHASE`` is not ``'MIXED'``.
        AssertionError: if a clean/noise pair is not sampled at 16000.
    """
    s_time = time.time()

    # Loop-invariant configuration check: fail before any file is written or
    # any inference is run, instead of after the first utterance.
    if PARAM.RESTORE_PHASE != 'MIXED':
        raise ValueError('Please set PARAM.RESTORE_PHASE=MIXED.')

    noise_dir_list = [
        os.path.join(noise_dir, _dir) for _dir in os.listdir(noise_dir)
    ]
    n_noise = len(noise_dir_list)
    wave_dir_list = [
        os.path.join(waves_dir, _dir) for _dir in os.listdir(waves_dir)
    ]

    # Longest utterance (in frames) seen so far. Starts at 0 so the report
    # below also works when waves_dir is empty.
    max_len = 0
    for wav_dir in wave_dir_list:
        y_wave, sr_y = audio_tool.read_audio(wav_dir)
        if y_wave.ndim != 1:  # aishell has 2 channel wav
            y_wave = y_wave.T[0] + y_wave.T[1]
        noise_id = np.random.randint(n_noise)
        noise_wave, sr_n = audio_tool.read_audio(noise_dir_list[noise_id])
        # Validate the sample rates before the signals are combined.
        assert sr_y == sr_n and sr_y == 16000, 'sr error sr_y:%d, sr_n %d' % (
            sr_y, sr_n)
        noise_wave = audio_tool.repeat_to_len(noise_wave, len(y_wave))
        x_wave, _ = audio_tool._mix_wav_by_randomSNR(y_wave, noise_wave)

        # Save the noisy mixture.
        x_wav_dir = wav_dir.replace('wav', addnoise_dir_name, 1)
        x_wav_dir = x_wav_dir.replace(root_dir, new_root_dir, 1)
        # exist_ok avoids the check-then-create race between workers.
        os.makedirs(os.path.dirname(x_wav_dir), exist_ok=True)
        audio_tool.write_audio(x_wav_dir, x_wave, sr_y)

        # Model input for this single utterance (batch of one).
        x_spec_t = spectrum_tool.magnitude_spectrum_librosa_stft(
            x_wave,  # [time, 257]
            PARAM.NFFT,
            PARAM.OVERLAP)
        x_phase_t = spectrum_tool.phase_spectrum_librosa_stft(
            x_wave, PARAM.NFFT, PARAM.OVERLAP)
        n_frames = np.shape(x_spec_t)[0]
        max_len = max(max_len, n_frames)

        x_batch = np.array([x_spec_t], dtype=np.float32)
        x_theta_batch = np.array([x_phase_t], dtype=np.float32)
        x_lengths = np.array([n_frames], dtype=np.int32)

        # enhance
        y_mag_est = sess.run(model.y_mag_estimation,
                             feed_dict={
                                 model.x_mag: x_batch,
                                 model.x_theta: x_theta_batch,
                                 model.lengths: x_lengths,
                             })

        # istft with the phase of the noisy mixture, then save
        y_mag_est = y_mag_est * np.exp(1j * x_phase_t)
        reY = spectrum_tool.librosa_istft(y_mag_est, PARAM.NFFT,
                                          PARAM.OVERLAP)
        y_wav_dir = wav_dir.replace('wav', enhanced_dir_name, 1)
        y_wav_dir = y_wav_dir.replace(root_dir, new_root_dir, 1)
        os.makedirs(os.path.dirname(y_wav_dir), exist_ok=True)
        audio_tool.write_audio(y_wav_dir, reY, PARAM.FS)

    batch_size = len(wave_dir_list)
    e_time = time.time()
    print("\n----------------\n"
          "%d workers\n"
          "%s\n"
          "Worker_id %03d, rate of progress: %d/%d\n"
          "time_step_max_len: %d\n"
          "batch_sie: %d\n"
          'batch_cost_time: %ds\n' %
          (num_process, time.ctime(), i_p + 1, speaker_id,
           sub_process_speaker_num, max_len, batch_size, e_time - s_time),
          flush=True)
def decode_and_getMeature(mixed_file_list, ref_list, sess, model,
                          decode_ans_file, save_audio, ans_file):
    '''
    Enhance each mixed wav and report PESQ / SDR / STOI against references.

    Example arguments:
    (mixed_dir,ref_dir,sess,model,'decode_nnet_C001_8_2',False,'xxxans.txt')

    Args:
        mixed_file_list: paths of the mixed (noisy) wav files to decode.
        ref_list: paths of reference wav files; ``ref_list[i]`` is compared
            with ``mixed_file_list[i]``. Mixed files beyond the end of
            ``ref_list`` are decoded but not scored.
        sess: passed through to ``decode_one_wav``.
        model: passed through to ``decode_one_wav``.
        decode_ans_file: output directory for the result file and, when
            ``save_audio`` is true, the enhanced wavs and mask pictures.
        save_audio: whether to save the enhanced audio and mask picture.
        ans_file: name of the result text file inside ``decode_ans_file``;
            an existing file of that name is removed first.

    Returns:
        None. Per-utterance metrics and their averages are printed and
        appended to the result file. When nothing was scored, no averages
        are written.
    '''
    ans_path = os.path.join(decode_ans_file, ans_file)
    if os.path.exists(ans_path):
        os.remove(ans_path)

    pesq_raw_sum = 0
    pesq_en_sum = 0
    stoi_raw_sum = 0
    stoi_en_sum = 0
    sdr_raw_sum = 0
    sdr_en_sum = 0
    # Number of utterances that actually contributed to the sums above.
    n_scored = 0

    for i, mixed_dir in enumerate(mixed_file_list):
        print('\n', i + 1, mixed_dir)
        waveData, sr = audio_tool.read_audio(mixed_dir)
        reY, mask = decode_one_wav(sess, model, waveData)

        # Clamp the enhanced signal to the symmetric range implied by
        # the configured bit depth.
        abs_max = (2**(MIXED_AISHELL_PARAM.AUDIO_BITS - 1) - 1)
        reY = np.where(reY > abs_max, abs_max, reY)
        reY = np.where(reY < -abs_max, -abs_max, reY)

        base_name = os.path.basename(mixed_dir)
        # splitext keeps the full name when there is no extension (manual
        # rfind('.') slicing would drop the last character in that case).
        file_name = os.path.splitext(base_name)[0]

        if save_audio:
            prefix = ckpt + '_%03d_' % (i + 1)
            audio_tool.write_audio(
                os.path.join(decode_ans_file, prefix + base_name), reY, sr)
            spectrum_tool.picture_spec(
                mask, os.path.join(decode_ans_file, prefix + file_name))

        if i >= len(ref_list):
            # No reference for this utterance: nothing to score.
            continue

        ref, sr = audio_tool.read_audio(ref_list[i])
        print(' refer: ', ref_list[i])

        # Truncate all three signals to the shortest length before scoring.
        len_small = min(len(ref), len(waveData), len(reY))
        ref = np.array(ref[:len_small])
        waveData = np.array(waveData[:len_small])
        reY = np.array(reY[:len_small])

        # sdr
        sdr_raw = audio_tool.cal_SDR(np.array([ref]), np.array([waveData]))
        # NOTE(review): unlike the raw call above, reY is passed without the
        # extra leading axis - confirm against audio_tool.cal_SDR.
        sdr_en = audio_tool.cal_SDR(np.array([ref]), np.array(reY))
        sdr_raw_sum += sdr_raw
        sdr_en_sum += sdr_en

        # pesq
        pesq_raw = pesqexe.calc_pesq(ref, waveData, sr)
        pesq_en = pesqexe.calc_pesq(ref, reY, sr)
        pesq_raw_sum += pesq_raw
        pesq_en_sum += pesq_en

        # stoi
        stoi_raw = stoi.stoi(ref, waveData, sr)
        stoi_en = stoi.stoi(ref, reY, sr)
        stoi_raw_sum += stoi_raw
        stoi_en_sum += stoi_en

        n_scored += 1

        print("SR = %d" % sr)
        print("PESQ_raw: %.3f, PESQ_en: %.3f, PESQimp: %.3f. " %
              (pesq_raw, pesq_en, pesq_en - pesq_raw))
        print("SDR_raw: %.3f, SDR_en: %.3f, SDRimp: %.3f. " %
              (sdr_raw, sdr_en, sdr_en - sdr_raw))
        print("STOI_raw: %.3f, STOI_en: %.3f, STOIimp: %.3f. " %
              (stoi_raw, stoi_en, stoi_en - stoi_raw))
        sys.stdout.flush()

        with open(ans_path, 'a+') as f:
            f.write(file_name + '\r\n')
            f.write(
                " |-PESQ_raw: %.3f, PESQ_en: %.3f, PESQimp: %.3f. \r\n" %
                (pesq_raw, pesq_en, pesq_en - pesq_raw))
            f.write(
                " |-SDR_raw: %.3f, SDR_en: %.3f, SDRimp: %.3f. \r\n" %
                (sdr_raw, sdr_en, sdr_en - sdr_raw))
            f.write(
                " |-STOI_raw: %.3f, STOI_en: %.3f, STOIimp: %.3f. \r\n" %
                (stoi_raw, stoi_en, stoi_en - stoi_raw))

    if n_scored == 0:
        # Nothing to average; dividing would raise ZeroDivisionError.
        print('No utterance was scored, skip average metrics.')
        sys.stdout.flush()
        return

    # Average over the utterances that were scored, not over len(ref_list),
    # so a shorter mixed_file_list does not deflate the averages.
    summary_lines = [
        'PESQ_raw:%.3f, PESQ_en:%.3f, PESQi_avg:%.3f. \r\n' %
        (pesq_raw_sum / n_scored, pesq_en_sum / n_scored,
         (pesq_en_sum - pesq_raw_sum) / n_scored),
        'SDR_raw:%.3f, SDR_en:%.3f, SDRi_avg:%.3f. \r\n' %
        (sdr_raw_sum / n_scored, sdr_en_sum / n_scored,
         (sdr_en_sum - sdr_raw_sum) / n_scored),
        'STOI_raw:%.3f, STOI_en:%.3f, STOIi_avg:%.3f. \r\n' %
        (stoi_raw_sum / n_scored, stoi_en_sum / n_scored,
         (stoi_en_sum - stoi_raw_sum) / n_scored),
    ]

    with open(ans_path, 'a+') as f:
        for line in summary_lines:
            f.write(line)

    print('\n\n\n-----------------------------------------')
    for line in summary_lines:
        print(line)
    sys.stdout.flush()
def addnoise_and_decoder_one_batch(waves_dir, noise_dir, sess, model):
    """
    Add noise to every wav under ``waves_dir`` and enhance them as one batch.

    Each file in ``waves_dir`` is mixed with a randomly chosen file from
    ``noise_dir`` and the noisy wav is written. All spectra are then
    zero-padded to the longest utterance, run through the model in a single
    call, and the enhanced wavs are written. Output paths are derived from
    the input path by replacing the first ``'wav'`` with the module-level
    ``addnoise_dir_name`` / ``enhanced_dir_name``.

    Increments the module-level ``speaker_n`` counter and prints it against
    ``all_speaker`` as a progress indicator.

    Args:
        waves_dir: directory holding the clean wav files.
        noise_dir: directory holding the noise wav files.
        sess: session object whose ``run`` evaluates the model.
        model: object exposing ``x_mag``, ``x_theta``, ``lengths`` and
            ``y_mag_estimation``.

    Raises:
        ValueError: if ``PARAM.RESTORE_PHASE`` is not ``'MIXED'``.
        AssertionError: if a clean/noise pair is not sampled at 16000.
    """
    global speaker_n
    s_time = time.time()
    speaker_n += 1
    print("\n----------------\n", "%d/%d" % (speaker_n, all_speaker))
    sys.stdout.flush()

    # Configuration check that does not depend on the data: fail before any
    # file is written or the batch is run through the model.
    if PARAM.RESTORE_PHASE != 'MIXED':
        raise ValueError('Please set PARAM.RESTORE_PHASE=MIXED.')

    noise_dir_list = [os.path.join(noise_dir, _dir)
                      for _dir in os.listdir(noise_dir)]
    n_noise = len(noise_dir_list)
    wave_dir_list = [os.path.join(waves_dir, _dir)
                     for _dir in os.listdir(waves_dir)]

    if not wave_dir_list:
        # Nothing to process; np.max over an empty list would raise.
        print('batch_cost_time: %ds' % (time.time() - s_time), flush=True)
        return

    # mix && get input
    x_batch = []  # [n_wav, time, 257]
    x_theta_batch = []  # [n_wav, time, 257]
    x_lengths = []  # [n_wav]
    for wav_dir in wave_dir_list:
        y_wave, sr_y = audio_tool.read_audio(wav_dir)
        if y_wave.ndim != 1:  # aishell has 2 channel wav
            y_wave = y_wave.T[0] + y_wave.T[1]
        noise_id = np.random.randint(n_noise)
        noise_wave, sr_n = audio_tool.read_audio(noise_dir_list[noise_id])
        # Validate the sample rates before the signals are combined.
        assert sr_y == sr_n and sr_y == 16000, 'sr error sr_y:%d, sr_n %d' % (
            sr_y, sr_n)
        noise_wave = audio_tool.repeat_to_len(noise_wave, len(y_wave))
        x_wave, _ = audio_tool._mix_wav_by_randomSNR(y_wave, noise_wave)

        x_wav_dir = wav_dir.replace('wav', addnoise_dir_name, 1)
        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation.
        os.makedirs(os.path.dirname(x_wav_dir), exist_ok=True)
        audio_tool.write_audio(x_wav_dir, x_wave, sr_y)

        x_spec_t = spectrum_tool.magnitude_spectrum_librosa_stft(
            x_wave,  # [time, 257]
            PARAM.NFFT,
            PARAM.OVERLAP)
        x_phase_t = spectrum_tool.phase_spectrum_librosa_stft(
            x_wave, PARAM.NFFT, PARAM.OVERLAP)
        x_batch.append(x_spec_t)
        x_theta_batch.append(x_phase_t)
        x_lengths.append(np.shape(x_spec_t)[0])

    max_len = np.max(x_lengths)
    print("time_step_max_len:", max_len)
    sys.stdout.flush()

    # Zero-pad every utterance along the time axis up to max_len so the
    # batch can be stacked into one array.
    x_batch = np.array(
        [np.pad(x_spec, ((0, max_len - length), (0, 0)), 'constant')
         for x_spec, length in zip(x_batch, x_lengths)],
        dtype=np.float32)
    x_theta_batch = np.array(
        [np.pad(x_theta, ((0, max_len - length), (0, 0)), 'constant')
         for x_theta, length in zip(x_theta_batch, x_lengths)],
        dtype=np.float32)
    x_lengths = np.array(x_lengths, dtype=np.int32)

    # enhance
    y_mag_est_batch = sess.run(
        model.y_mag_estimation,
        feed_dict={
            model.x_mag: x_batch,
            model.x_theta: x_theta_batch,
            model.lengths: x_lengths,
        })

    # istft && save
    print(np.shape(y_mag_est_batch), np.shape(x_theta_batch),
          np.shape(x_lengths))
    sys.stdout.flush()
    for y_mag_est, x_theta, length, wav_dir in zip(
            y_mag_est_batch, x_theta_batch, x_lengths, wave_dir_list):
        # cut padding
        y_mag_est = y_mag_est[:length, :]
        x_theta = x_theta[:length, :]

        # istft with the phase of the noisy mixture
        y_mag_est = y_mag_est * np.exp(1j * x_theta)
        reY = spectrum_tool.librosa_istft(y_mag_est, PARAM.NFFT,
                                          PARAM.OVERLAP)
        y_wav_dir = wav_dir.replace('wav', enhanced_dir_name, 1)
        os.makedirs(os.path.dirname(y_wav_dir), exist_ok=True)
        audio_tool.write_audio(y_wav_dir, reY, PARAM.FS)

    e_time = time.time()
    print('batch_cost_time: %ds' % (e_time - s_time), flush=True)
if MIXED_AISHELL_PARAM.FS == 8000: decode_file_list = decode_file_list_8k elif MIXED_AISHELL_PARAM.FS == 16000: decode_file_list = decode_file_list_16k else: print('PARAM.FS error, exit.'), exit(-1) for i, mixed_dir in enumerate(decode_file_list): print(i + 1, mixed_dir) waveData, sr = audio_tool.read_audio(mixed_dir) reY, mask = decode_one_wav(sess, model, waveData) print(np.max(reY)) abs_max = (2**(MIXED_AISHELL_PARAM.AUDIO_BITS - 1) - 1) reY = np.where(reY > abs_max, abs_max, reY) reY = np.where(reY < -abs_max, -abs_max, reY) audio_tool.write_audio( os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) + mixed_dir[mixed_dir.rfind('/') + 1:]), reY, sr) file_name = mixed_dir[mixed_dir.rfind('/') + 1:mixed_dir.rfind('.')] spectrum_tool.picture_spec( mask, os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) + file_name)) elif int(sys.argv[1]) == 0: # decode exp/test_oc mixed_dir = 'exp/test_oc/mixed_wav' decode_file_list = os.listdir(mixed_dir) decode_file_list = [ os.path.join(mixed_dir, mixed) for mixed in decode_file_list ] decode_file_list.sort()