def dnn1_colors(input):
    # Load the scaler fitted on the packed test features.
    scaler_path = os.path.join(conf1.packed_feature_dir, "test", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))
    # n_pad = (conf1.n_concat - 1) // 2
    # enh_pad[0] = pp.pad_with_border(enh_pad[0], n_pad)
    prova = pp.log_sp(input)
    prova = pp.scale_on_2d(np.abs(prova), scaler)
    prova = pp.inverse_scale_on_2d(prova, scaler)
    return -prova
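# A minimal usage sketch for dnn1_colors (assumptions: pp.read_audio and
# pp.calc_sp behave as in predict_file below, and "mixture.wav" is a
# hypothetical input file, not part of the original code):
def demo_dnn1_colors(file_path="mixture.wav"):
    (a, _) = pp.read_audio(file_path)
    sp = pp.calc_sp(a, 'complex')
    # Returns the negated, scale-roundtripped log spectrogram.
    return dnn1_colors(np.abs(sp))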
def predict_file(file_path, model, scaler):
    (a, _) = pp.read_audio(file_path)
    mixed_complex = pp.calc_sp(a, 'complex')
    mixed_x = np.abs(mixed_complex)

    # Process data.
    n_pad = (conf1.n_concat - 1) // 2
    mixed_x = pp.pad_with_border(mixed_x, n_pad)
    mixed_x = pp.log_sp(mixed_x)
    # speech_x = dnn1_train.log_sp(speech_x)

    # Scale data.
    # if scale:
    mixed_x = pp.scale_on_2d(mixed_x, scaler)
    # speech_x = pp.scale_on_2d(speech_x, scaler)

    # Cut input spectrogram to 3D segments with n_concat.
    mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

    # Predict.
    pred = model.predict(mixed_x_3d)

    if visualize_plot:
        visualize(mixed_x, pred)

    # Inverse scale.
    # if scale:
    mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
    # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
    pred = pp.inverse_scale_on_2d(pred, scaler)

    # Recover enhanced wav.
    pred_sp = np.exp(pred)
    s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
    # Scale to compensate the amplitude change after spectrogram and IFFT.
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

    # Write out enhanced wav.
    # audio_path = os.path.dirname(file_path)
    # pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_complex, pred, s
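# Hypothetical driver for predict_file: the model/scaler paths mirror the
# conventions used in predict_folder below, but the input file name and the
# output path here are illustrative assumptions.
def demo_predict_file(file_path="mixture.wav"):
    model = load_model(os.path.join(conf1.model_dir,
                                    "md_%diters.h5" % conf1.iterations))
    scaler = pickle.load(open(os.path.join(conf1.packed_feature_dir, "test",
                                           "scaler.p"), 'rb'))
    mixed_complex, pred, s = predict_file(file_path, model, scaler)
    pp.write_audio("enh_" + os.path.basename(file_path), s, conf1.sample_rate)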
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iteration = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iteration)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "%ddb" % int(tr_snr), "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Scale to compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
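# A sketch of the argparse wiring this args-style inference expects; the
# parser itself is an assumption (only the attribute names are taken from
# the function body above).
def demo_run_inference():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--workspace', type=str, required=True)
    parser.add_argument('--tr_snr', type=float, required=True)
    parser.add_argument('--te_snr', type=float, required=True)
    parser.add_argument('--n_concat', type=int, default=7)
    parser.add_argument('--iteration', type=int, required=True)
    parser.add_argument('--visualize', action='store_true', default=False)
    inference(parser.parse_args())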
def inference(workspace, tr_snr, te_snr, n_concat, iteration, model_name=None,
              visualize=False, force=False):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: iterable of float, training SNRs.
      te_snr: iterable of float, testing SNRs.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    if model_name is None:
        model_name = '_'.join([str(snr) for snr in tr_snr]) + 'ddbs'

    # Load model.
    model_path = os.path.join(workspace, "models", model_name,
                              "md_%diters.h5" % iteration)
    print('GPU available: ', tf.test.is_gpu_available())
    model = load_model(model_path)

    # Load scaler.
    scaler = read_combined_scaler(workspace, tr_snr)

    for snr in te_snr:
        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                                "%ddb" % int(snr))
        feat_paths = all_file_paths(feat_dir)

        for (cnt, feat_path) in tqdm(enumerate(feat_paths),
                                     'Inference (creating enhanced speech)'):
            # Skip if the enhanced audio has already been inferred.
            na = str(PurePath(feat_path).relative_to(feat_dir).with_suffix(''))
            out_path = os.path.join(workspace, "enh_wavs", "test", model_name,
                                    "%ddb" % int(snr), "%s.enh.wav" % na)
            if os.path.isfile(out_path) and not force:
                print(f'Enhanced audio {out_path} is already made')
                continue

            # Load feature.
            data = pickle.load(open(feat_path, 'rb'))
            [mixed_cmplx_x, speech_x, noise_x, ir_mask, alpha, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data.
            n_pad = (n_concat - 1) // 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

            # Predict.
            pred = model.predict(mixed_x_3d)
            # print(cnt, na)

            # Inverse scale.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
                # pred = pp_data.inverse_scale_on_2d(pred, scaler)

            # Debug plot.
            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("%ddb mixture log spectrogram" % int(snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav (the model predicts a ratio mask here).
            s = recover_wav(pred, mixed_cmplx_x, n_overlap, np.hamming,
                            irr_mask=True)
            # Scale to compensate the amplitude change after spectrogram and IFFT.
            s *= np.sqrt((np.hamming(n_window) ** 2).sum())

            # Write out enhanced wav.
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)
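# Hypothetical call to the list-based inference above; with these values the
# default model_name becomes "0_5ddbs". The workspace path, SNRs, and
# iteration count are illustrative only.
def demo_inference_multi_snr():
    inference(workspace='workspace', tr_snr=[0, 5], te_snr=[0, 5],
              n_concat=7, iteration=100000, force=False)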
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iteration = args.iteration
    data_type = 'IRM'

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model (iteration counts are hard-coded per model type).
    if data_type == "DM":
        model_path = os.path.join(workspace, "models", "mixdb",
                                  "md_%diters.h5" % 120000)
    else:
        model_path = os.path.join(workspace, "models", "mask_mixdb",
                                  "md_%diters.h5" % 265000)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "mixdb", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "mixdb")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        if data_type == "IRM":
            mixed_x = speech_x + noise_x
            mixed_x1 = speech_x + noise_x

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        if data_type == "IRM":
            # The predicted mask scales the (unpadded) mixture magnitude.
            pred_sp = pred * mixed_x1
        print(cnt, na)

        # Inverse scale.
        if data_type == "DM":
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
            pred_sp = np.exp(pred)

        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Scale to compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())

        # Write out enhanced wav.
        if data_type == "DM":
            out_path = os.path.join(workspace, "enh_wavs", "test", "mixdb",
                                    "%s.enh.wav" % na)
        else:
            out_path = os.path.join(workspace, "enh_wavs", "test", "mask_mixdb",
                                    "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
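# The IRM branch above multiplies the predicted mask by the mixture
# magnitude bin-by-bin; a minimal stand-alone sketch of that step (the
# clipping to [0, 1] is an extra safeguard, not in the original):
def demo_apply_irm(mask, mixture_mag):
    return np.clip(mask, 0.0, 1.0) * mixture_mag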
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iteration = args.iteration
    calc_log = args.calc_log
    model_file = args.model_file

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Build model.
    n_concat = 7
    n_freq = 257
    n_hid = 2048
    lr = 1e-3

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    if calc_log:
        model.add(Dense(n_freq, activation='linear'))
    else:
        model.add(Dense(n_freq, activation='relu'))
    model.summary()
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Load model weights.
    if model_file == "null":
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                                  "md_%diters.h5" % iteration)
        # model = load_model(model_path)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler.
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            # speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if calc_log:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max
            speech_x_max = np.max(speech_x)
            print("max of speech_x:", speech_x_max)
            speech_x = speech_x / speech_x_max

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        if False:
            print(mixed_x_3d)
        pred = model.predict(mixed_x_3d)
        print(cnt, na)
        if False:
            print("pred")
            print(pred)
            print("speech")
            print(speech_x)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            # speech_x = speech_x * 16384
            pred = pred * mixed_x_max

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            # axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        if calc_log:
            pred_sp = np.exp(pred)
        else:
            # gv = 0.025
            # pred_sp = np.maximum(0, pred - gv)
            pred_sp = pred
        if False:
            pred_sp = mixed_x[3:-3]
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Scale to compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

        # Write out enhanced pcm (8 kHz, s16le).
        out_pcm_path = os.path.join(workspace, "enh_wavs", "test",
                                    "%ddb" % int(te_snr), "%s.enh.pcm" % na)
        cmd = ' '.join(["./ffmpeg -y -i", out_path,
                        "-f s16le -ar 8000 -ac 1 -acodec pcm_s16le",
                        out_pcm_path])
        os.system(cmd)

        # Write out webrtc-denoised enhanced pcm (8 kHz, s16le).
        ns_out_pcm_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr), "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr), "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i",
                        ns_out_pcm_path, ns_out_wav_path])
        os.system(cmd)
        os.system(' '.join(["rm", out_pcm_path]))
        os.system(' '.join(["rm", ns_out_pcm_path]))
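# The os.system calls above could equivalently use subprocess.run, which
# avoids shell-quoting problems with paths containing spaces; a minimal
# sketch with the same ffmpeg flags (the ./ffmpeg binary location is assumed
# as in the original):
def demo_resample_to_pcm(in_wav, out_pcm):
    import subprocess
    subprocess.run(["./ffmpeg", "-y", "-i", in_wav,
                    "-f", "s16le", "-ar", "8000", "-ac", "1",
                    "-acodec", "pcm_s16le", out_pcm], check=True)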
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    # tr_snr = args.tr_snr
    # te_snr = args.te_snr
    n_concat = args.n_concat
    # iter = args.iteration
    TF = args.TF
    model_name = args.model_name

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    # snr = cfg.SNR
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load the magnitude model: pick the checkpoint with the highest epoch.
    t1 = time.time()
    # model_path = os.path.join(workspace, "saved_models", "%s" % model_name,
    #                           "weights-checkpoint-25-0.41.h5")
    mag_model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    mag_model_files = find_models(mag_model_root)
    epoch_num = []
    for i in range(len(mag_model_files)):
        epoch_num.append(int(mag_model_files[i].split("/")[-1].split('-')[2]))
    mag_model_index = epoch_num.index(max(epoch_num))
    mag_model_path = mag_model_files[mag_model_index]
    print("The selected model path is %s :" % mag_model_path)
    mag_model = load_model(mag_model_path)

    '''
    # Loading phase model.
    phase_model_root = os.path.join(workspace, "phase_saved_models", "%s" % model_name)
    phase_model_files = find_models(phase_model_root)
    epoch_num1 = []
    for i in range(len(phase_model_files)):
        epoch_num1.append(int(phase_model_files[i].split("/")[-1].split('-')[2]))
    phase_model_index = epoch_num1.index(max(epoch_num1))
    phase_model_path = phase_model_files[phase_model_index]
    print("The selected model path is %s :" % phase_model_path)
    phase_model = load_model(phase_model_path)
    '''

    # Load scaler.
    mag_scaler_path = os.path.join(workspace, "packed_features", "train",
                                   "mag_scaler.p")
    mag_scaler = pickle.load(open(mag_scaler_path, 'rb'))
    # phase_scaler_path = os.path.join(workspace, "packed_features", "train",
    #                                  "phase_scaler.p")
    # phase_scaler = pickle.load(open(phase_scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x] = data

        n_pad = (n_concat - 1) // 2
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # mixed_phase = np.angle(mixed_cmplx_x)
            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            # mixed_phase = pp_data.pad_with_border(mixed_phase, n_pad)
            # speech_x = pp_data.log_sp(np.abs(speech_cmplx_x))
            # speech_phase = np.angle(speech_cmplx_x)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, mag_scaler)
            # mixed_phase = pp_data.scale_on_2d(mixed_phase, phase_scaler)
            # speech_phase = pp_data.scale_on_2d(speech_phase, phase_scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # mixed_phase_3d = pp_data.mat_2d_to_3d(mixed_phase, agg_num=n_concat, hop=1)
        # print("loading data time: %s s" % (time.time() - t1,))

        '''
        # Feature-map visualization: model.layers[0] is the input and stays
        # unchanged; change model.layers[<wanted>] to the index of the layer
        # whose output you want to inspect. Only the input image needs to be
        # replaced. The output of the first conv layer is (1, 149, 149, 32):
        # (n_samples, map height, map width, n_feature_maps).
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''

        # Predict.
        t2 = time.time()
        mag_pred = mag_model.predict(mixed_x_3d)
        # phase_pred = phase_model.predict(mixed_phase_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))

        # Inverse scale.
        if scale:
            # mixed_x = pp_data.inverse_scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, mag_scaler)
            mag_pred = pp_data.inverse_scale_on_2d(mag_pred, mag_scaler)
            # mixed_phase = pp_data.inverse_scale_on_2d(mixed_phase, phase_scaler)
            # speech_phase = pp_data.inverse_scale_on_2d(speech_phase, phase_scaler)
            # phase_pred = pp_data.inverse_scale_on_2d(phase_pred, phase_scaler)

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        if TF == "spectrogram":
            # Invert the dB-scale log spectrogram back to linear magnitude.
            pred_sp = (10 ** (mag_pred / 10)) - 1e-10
        # pred_ph = np.exp(1j * phase_pred)
        '''
        R = np.multiply(pred_sp, pred_ph)
        result = librosa.istft(R.T, hop_length=n_hop, win_length=cfg.n_window,
                               window=scipy.signal.hamming, center=False)
        result /= abs(result).max()
        y_out = result * 0.8
        '''
        # s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # s *= np.sqrt((np.hamming(n_window)**2).sum())  # Compensate amplitude.
        s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_flipphase", "test",
                                "%s" % model_name,
                                "{}_fft_dnn_map.wav".format(na.split('.')[0]))
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))
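# The inversion pred_sp = (10 ** (mag_pred / 10)) - 1e-10 above implies a
# dB-style log_sp with a 1e-10 floor; a sketch of the assumed forward/inverse
# pair (this is inferred from the code, not the repo's documented definition):
def _log_sp_db(x):
    return 10.0 * np.log10(x + 1e-10)

def _inv_log_sp_db(y):
    return 10.0 ** (y / 10.0) - 1e-10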
def inference1111(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    # tr_snr = args.tr_snr
    # te_snr = args.te_snr
    n_concat = args.n_concat
    # iter = args.iteration
    TF = args.TF
    model_name = args.model_name

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    # snr = cfg.SNR
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load model: pick the checkpoint with the highest epoch number.
    t1 = time.time()
    # model_path = os.path.join(workspace, "saved_models", "%s" % model_name,
    #                           "weights-checkpoint-25-0.41.h5")
    model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    model_files = find_models(model_root)
    epoch_num = []
    for i in range(len(model_files)):
        epoch_num.append(int(model_files[i].split("/")[-1].split('-')[2]))
    model_index = epoch_num.index(max(epoch_num))
    model_path = model_files[model_index]
    print("The selected model path is %s :" % model_path)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "train", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, na] = data

        n_pad = (n_concat - 1) // 2
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)
        elif TF == "timedomain":
            mixed_x = pp_data.pad_with_border(mixed_cmplx_x, n_pad)
        elif TF == "fftmagnitude":
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # print("loading data time: %s s" % (time.time() - t1,))

        '''
        # Feature-map visualization: model.layers[0] is the input and stays
        # unchanged; change model.layers[<wanted>] to the index of the layer
        # whose output you want to inspect. Only the input image needs to be
        # replaced. The output of the first conv layer is (1, 149, 149, 32):
        # (n_samples, map height, map width, n_feature_maps).
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''

        # Predict.
        t2 = time.time()
        pred = model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        # (frames, frame_length) = pred.shape
        # print("pred dimensions %d and %d : " % (frames, frame_length))

        # Debug plot.
        if args.visualize:
            if TF == "spectrogram":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("Mixture log spectrogram")
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_spectra.png')
                plt.show()
            elif TF == "timedomain":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("Mixture time domain")
                axs[1].set_title("Clean speech time domain")
                axs[2].set_title("Enhanced speech time domain")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_time.png')
                plt.show()
            else:
                raise Exception("TF must be spectrogram or timedomain!")

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        if TF == "spectrogram":
            # Invert the dB-scale log spectrogram back to linear magnitude.
            pred_sp = (10 ** (pred / 20)) - 1e-10
            # s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            # s *= np.sqrt((np.hamming(n_window)**2).sum())  # Compensate amplitude.
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
        elif TF == "timedomain":
            s = time_recover_wav(pred, n_window, n_hop, 'hamming')
            # s *= np.sqrt((np.hamming(n_window)**2).sum())
        elif TF == "fftmagnitude":
            s = spectra_to_wav(pred, mixed_cmplx_x, n_window, n_hop, 'hamming')
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%s" % model_name, "%s.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))
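# Every inference path above relies on pp_data.mat_2d_to_3d to frame the 2D
# (time, freq) matrix into overlapping n_concat-frame segments; a minimal
# sketch of the assumed behavior, returning (n_segments, agg_num, n_freq)
# (a hypothetical stand-in, not the repo's implementation):
def demo_mat_2d_to_3d(x, agg_num, hop):
    segs = [x[i:i + agg_num] for i in range(0, len(x) - agg_num + 1, hop)]
    return np.array(segs)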
def predict_folder(input_file_folder, output_file_folder):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    # if scale:
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    # names = os.listdir(input_file_folder)
    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]
    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')
        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)
        # speech_x = dnn1_train.log_sp(speech_x)

        # Scale data.
        # if scale:
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        # if scale:
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        # Scale to compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
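# Hypothetical usage of predict_folder: enhance every "mix*" wav in a folder
# and write "enh_*" files next to the returned spectrograms (both folder
# names below are illustrative, not part of the original code):
# mixed_all, pred_all = predict_folder("data/test_mixtures", "data/enhanced")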