def separate(args, bgn_iter, fin_iter, interval):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list,
     te_x, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(
        feature_dir=feature_dir,
        yaml_dir=yaml_dir,
        te_fold=te_fold,
        snr=snr,
        is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and average segmentation masks over the selected model iterations.
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events,
                             "fold=%d" % te_fold,
                             "snr=%d" % snr)

    at_probs_list, seg_masks_list = [], []
    for iter in xrange(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                      "seg_masks.p")
        seg_masks = cPickle.load(open(seg_masks_path, 'rb'))
        seg_masks_list.append(seg_masks)
    seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)
    print(seg_masks.shape)

    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events,
                           "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win ** 2).sum())
    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64,
                               fmin=0., fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513); sketched after this function.

    seg_stats = {}
    for e in events:
        seg_stats[e] = {'fvalue': [], 'auc': [], 'iou': [], 'hit': [],
                        'fa': [], 'tp': [], 'fn': [], 'fp': []}

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window, n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]                        # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)   # (n_classes, n_time, 513)

        print(na)

        # Write out separated events.
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(
                    sm_upsampled[j1], event_sp, noise_sp,
                    sed_y[i1, :, j1], seg_thres, inside_only=True)
                (hit, fa) = hit_fa(
                    sm_upsampled[j1], event_sp, noise_sp,
                    sed_y[i1, :, j1], seg_thres, inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp, mixed_cmplx_sp, n_overlap=n_overlap,
                    winfunc=np.hamming, wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler

                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise (the residual of all event masks).
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(
            sep_noise_sp, mixed_cmplx_sp, n_overlap=n_overlap,
            winfunc=np.hamming, wav_len=int(fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break

    # Aggregate and log per-event statistics.
    fvalues, aucs, ious, hits, fas, tps, fns, fps = [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" % ("".ljust(16)))
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" % (
        "*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
        np.mean(ious), np.mean(hits), np.mean(fas),
        np.mean(hits) - np.mean(fas), np.mean(tps), np.mean(fns), np.mean(fps)))
    for i1 in xrange(len(events)):
        logging.info("%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" % (
            events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1],
            hits[i1], fas[i1], hits[i1] - fas[i1],
            tps[i1], fns[i1], fps[i1]))
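# get_inverse_W() is called above but defined elsewhere in this repo. Below is
# a minimal sketch of one common way to build such a mel-to-linear
# mask-upsampling matrix: normalize each linear-frequency column of the mel
# filterbank to sum to one, so a mel-domain mask value maps to a weighted
# average over the mel bands covering that bin. This is an assumption for
# illustration, not the repo's exact implementation.
def get_inverse_W_sketch(melW, eps=1e-10):
    """melW: (n_mels, n_freq) mel filterbank. Returns (n_mels, n_freq)."""
    # Columns sum to 1, so np.dot(mask_mel, inv_W) interpolates an
    # (..., n_mels) mask up to (..., n_freq) linear bins.
    return melW / np.maximum(melW.sum(axis=0, keepdims=True), eps)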
def inference(args): """Inference all test data, write out recovered wavs to disk. Args: workspace: str, path of workspace. tr_snr: float, training SNR. te_snr: float, testing SNR. n_concat: int, number of frames to concatenta, should equal to n_concat in the training stage. iter: int, iteration of model to load. visualize: bool, plot enhanced spectrogram for debug. """ print(args) workspace = args.workspace tr_snr = args.tr_snr te_snr = args.te_snr n_concat = args.n_concat iter = args.iteration calc_log = args.calc_log model_file = args.model_file n_window = cfg.n_window n_overlap = cfg.n_overlap fs = cfg.sample_rate scale = True # Build model n_concat = 7 n_freq = 257 n_hid = 2048 lr = 1e-3 model = Sequential() model.add(Flatten(input_shape=(n_concat, n_freq))) model.add(Dropout(0.1)) model.add(Dense(n_hid, activation='relu')) model.add(Dense(n_hid, activation='relu')) model.add(Dense(n_hid, activation='relu')) model.add(BatchNormalization()) model.add(Dropout(0.2)) model.add(Dense(n_hid, activation='relu')) model.add(Dense(n_hid, activation='relu')) model.add(Dense(n_hid, activation='relu')) model.add(Dropout(0.2)) model.add(Dense(n_hid, activation='relu')) model.add(Dense(n_hid, activation='relu')) model.add(Dense(n_hid, activation='relu')) model.add(Dropout(0.2)) if calc_log: model.add(Dense(n_freq, activation='linear')) else: model.add(Dense(n_freq, activation='relu')) model.summary() model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr)) # Load model. if (model_file == "null"): model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iter) #model = load_model(model_path) model.load_weights(model_path) else: model.load_weights(model_file) # Load scaler. if calc_log: scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p") scaler = pickle.load(open(scaler_path, 'rb')) # Load test data. feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr)) names = os.listdir(feat_dir) for (cnt, na) in enumerate(names): # Load feature. feat_path = os.path.join(feat_dir, na) data = cPickle.load(open(feat_path, 'rb')) [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data mixed_x = np.abs(mixed_cmplx_x) # Process data. n_pad = (n_concat - 1) / 2 mixed_x = pp_data.pad_with_border(mixed_x, n_pad) if calc_log: mixed_x = pp_data.log_sp(mixed_x) #speech_x = pp_data.log_sp(speech_x) else: mixed_x = mixed_x #speech_x = speech_x # Scale data. if calc_log: mixed_x = pp_data.scale_on_2d(mixed_x, scaler) #speech_x = pp_data.scale_on_2d(speech_x, scaler) else: mixed_x_max = np.max(mixed_x) print("max of tr_x:", mixed_x_max) mixed_x = mixed_x / mixed_x_max speech_x_max = np.max(speech_x) print("max of speech_x:", speech_x_max) speech_x = speech_x / speech_x_max # Cut input spectrogram to 3D segments with n_concat. mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1) # Predict. if False: print(mixed_x_3d) pred = model.predict(mixed_x_3d) print(cnt, na) if False: print("pred") print(pred) print("speech") print(speech_x) # Inverse scale. if calc_log: mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler) #speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler) pred = pp_data.inverse_scale_on_2d(pred, scaler) else: mixed_x = mixed_x * mixed_x_max #speech_x = speech_x * 16384 pred = pred * mixed_x_max # Debug plot. 
if args.visualize: fig, axs = plt.subplots(3, 1, sharex=False) axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet') #axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet') axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet') axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr)) axs[1].set_title("Clean speech log spectrogram") axs[2].set_title("Enhanced speech log spectrogram") for j1 in xrange(3): axs[j1].xaxis.tick_bottom() plt.tight_layout() plt.show() # Recover enhanced wav. if calc_log: pred_sp = np.exp(pred) else: #gv = 0.025 #pred_sp = np.maximum(0,pred - gv) pred_sp = pred if False: pred_sp = mixed_x[3:-3] s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming) s *= np.sqrt((np.hamming(n_window)**2 ).sum()) # Scaler for compensate the amplitude # change after spectrogram and IFFT. # Write out enhanced wav. out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na) pp_data.create_folder(os.path.dirname(out_path)) pp_data.write_audio(out_path, s, fs) # Write out enhanced pcm 8K pcm_s16le. out_pcm_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.pcm" % na) cmd = ' '.join([ "./ffmpeg -y -i ", out_path, " -f s16le -ar 8000 -ac 1 -acodec pcm_s16le ", out_pcm_path ]) os.system(cmd) # Write out webrtc-denoised enhanced pcm 8K pcm_s16le. ns_out_pcm_path = os.path.join(workspace, "ns_enh_wavs", "test", "%ddb" % int(te_snr), "%s.ns_enh.pcm" % na) ns_out_wav_path = os.path.join(workspace, "ns_enh_wavs", "test", "%ddb" % int(te_snr), "%s.ns_enh.wav" % na) pp_data.create_folder(os.path.dirname(ns_out_pcm_path)) cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path]) os.system(cmd) cmd = ' '.join([ "./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i ", ns_out_pcm_path, " ", ns_out_wav_path ]) os.system(cmd) cmd = ' '.join(["rm ", out_pcm_path]) os.system(cmd) cmd = ' '.join(["rm ", ns_out_pcm_path]) os.system(cmd)
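# pp_data.mat_2d_to_3d(), used above, cuts a (n_time, n_freq) spectrogram into
# overlapping (agg_num, n_freq) context windows so each frame is predicted from
# its neighbours. A minimal sketch of that behaviour (the repo's own version
# may differ in edge handling):
def mat_2d_to_3d_sketch(x, agg_num, hop):
    """x: (n_time, n_freq). Returns (n_segs, agg_num, n_freq)."""
    n_time, n_freq = x.shape
    # Zero-pad short inputs so at least one full segment exists.
    if n_time < agg_num:
        x = np.concatenate((x, np.zeros((agg_num - n_time, n_freq))), axis=0)
    segs = []
    i = 0
    while i + agg_num <= len(x):
        segs.append(x[i: i + agg_num])
        i += hop
    return np.array(segs)
# With hop=1 and the (n_concat - 1) // 2 border padding used above, this
# yields exactly one segment centred on every original frame.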
def inference(args): """Inference all test data, write out recovered wavs to disk. 推测所有的测试数据,并将恢复的wavs写入磁盘 Args: workspace: str, path of workspace. tr_snr: float, training SNR. te_snr: float, testing SNR. n_concat: int, number of frames to concatenta, should equal to n_concat in the training stage. iter: int, iteration of model to load. visualize: bool, plot enhanced spectrogram for debug. """ print(args) workspace = args.workspace tr_snr = args.tr_snr te_snr = args.te_snr n_concat = args.n_concat iter = args.iteration n_window = cfg.n_window n_overlap = cfg.n_overlap fs = cfg.sample_rate scale = True # Load model. 加载模型 model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iter) model = load_model(model_path) # Load scaler. 加载缩放器 scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p") scaler = pickle.load(open(scaler_path, 'rb')) # Load test data. 加载测试数据 feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr)) names = os.listdir(feat_dir) for (cnt, na) in enumerate(names): # Load feature. 加载特征 feat_path = os.path.join(feat_dir, na) data = cPickle.load(open(feat_path, 'rb')) [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data mixed_x = np.abs(mixed_cmplx_x) # Process data. 处理数据 n_pad = (n_concat - 1) / 2 mixed_x = pp_data.pad_with_border(mixed_x, n_pad) mixed_x = pp_data.log_sp(mixed_x) speech_x = pp_data.log_sp(speech_x) # Scale data. 缩放数据 if scale: mixed_x = pp_data.scale_on_2d(mixed_x, scaler) speech_x = pp_data.scale_on_2d(speech_x, scaler) # Cut input spectrogram to 3D segments with n_concat. 使用n_concat将输入频谱图切割为3D段。 mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1) # Predict. 预测 pred = model.predict(mixed_x_3d) print(cnt, na) # Inverse scale. 反预测 if scale: mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler) speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler) pred = pp_data.inverse_scale_on_2d(pred, scaler) # Debug plot. 调试图 if args.visualize: fig, axs = plt.subplots(3,1, sharex=False) axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet') axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet') axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet') axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr)) axs[1].set_title("Clean speech log spectrogram") axs[2].set_title("Enhanced speech log spectrogram") for j1 in xrange(3): axs[j1].xaxis.tick_bottom() plt.tight_layout() plt.show() # Recover enhanced wav. 恢复增强的wav。 pred_sp = np.exp(pred) s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming) s *= np.sqrt((np.hamming(n_window)**2).sum()) # Scaler for compensate the amplitude # change after spectrogram and IFFT. 缩放器用于补偿频谱图和IFFT后的幅度变化。 # Write out enhanced wav. 写出增强的wav。 out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na) pp_data.create_folder(os.path.dirname(out_path)) pp_data.write_audio(out_path, s, fs)
def enhance_audio(workspace, speech_to_enhance_dir, train_snr, test_snr,
                  n_concat, iteration):
    """Run inference on all test data and write the recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      speech_to_enhance_dir: str, directory of speech to enhance.
      train_snr: float, training SNR.
      test_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should be equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
    """
    begin_time = time.time()

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models",
                              "{}db".format(int(train_snr)),
                              "md_{}iters.h5".format(iteration))
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "{}db".format(int(train_snr)),
                               "scaler.pickle")
    scaler = pickle.load(open(scaler_path, "rb"))

    # Load test data.
    features_dir = os.path.join(workspace, "data", "speech_to_enhance",
                                "features", "spectrogram",
                                "{}db".format(int(test_snr)))

    for sample_id, feature_filename in enumerate(os.listdir(features_dir)):
        # Load feature.
        feature_path = os.path.join(features_dir, feature_filename)
        feature_data = pickle.load(open(feature_path, "rb"))
        mixed_audio_complex_spectrogram, audio_name = feature_data
        mixed_audio_spectrogram = np.abs(mixed_audio_complex_spectrogram)

        # Process data.
        n_pad = (n_concat - 1) // 2
        mixed_audio_spectrogram = audio_utils.pad_with_border(
            mixed_audio_spectrogram, n_pad)
        mixed_audio_spectrogram = audio_utils.log_sp(mixed_audio_spectrogram)

        # Scale data.
        if scale:
            mixed_audio_spectrogram = audio_utils.scale_on_2d(
                mixed_audio_spectrogram, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_audio_spectrogram_3d = audio_utils.mat_2d_to_3d(
            mixed_audio_spectrogram, agg_num=n_concat, hop=1)

        # Predict.
        prediction = model.predict(mixed_audio_spectrogram_3d)
        print("Sample id: {}. Sample name: {}".format(sample_id, audio_name))

        # Inverse scale.
        if scale:
            prediction = audio_utils.inverse_scale_on_2d(prediction, scaler)

        # Recover enhanced wav (recover_wav is sketched after this function).
        prediction_spectrogram = np.exp(prediction)
        recovered_wave = recover_wav(prediction_spectrogram,
                                     mixed_audio_complex_spectrogram,
                                     n_overlap, np.hamming)
        # Scaler to compensate the amplitude change after spectrogram and IFFT.
        recovered_wave *= np.sqrt((np.hamming(n_window) ** 2).sum())

        # Write out enhanced wav.
        enhanced_audio_filename = "{}.enh.wav".format(audio_name)
        enhanced_audio_dir = os.path.join(workspace, "data",
                                          "speech_to_enhance", "enhanced_wavs",
                                          "{}db".format(int(test_snr)))
        create_directory(enhanced_audio_dir)
        audio_utils.write_audio(
            os.path.join(enhanced_audio_dir, enhanced_audio_filename),
            recovered_wave, fs)

    print()
    print("Inference time: {}".format(time.time() - begin_time))
    print()
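# recover_wav(), used throughout, is the inverse of the analysis step: it
# attaches the mixture phase to the predicted magnitude, inverts each frame
# with an IFFT, and overlap-adds the frames back into a waveform. A simplified
# sketch; the repo's spectrogram_to_wave module also applies a synthesis
# window (winfunc) and trims to an exact wav length, which is omitted here:
def recover_wav_sketch(pred_mag, mixed_cmplx, n_overlap):
    """pred_mag, mixed_cmplx: (n_frames, n_freq), n_freq = n_window / 2 + 1."""
    # Reuse the mixture phase, since only the magnitude was predicted.
    phase = mixed_cmplx / np.maximum(np.abs(mixed_cmplx), 1e-10)
    full = pred_mag * phase
    # Mirror the one-sided spectrum to a full spectrum, invert frame by frame.
    full = np.concatenate((full, np.conj(full[:, -2:0:-1])), axis=1)
    frames = np.real(np.fft.ifft(full, axis=1))
    # Overlap-add with the frame hop implied by n_overlap.
    n_window = frames.shape[1]
    hop = n_window - n_overlap
    s = np.zeros((len(frames) - 1) * hop + n_window)
    for i, frame in enumerate(frames):
        s[i * hop: i * hop + n_window] += frame
    return s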
def inference(args): """Inference all test data, write out recovered wavs to disk. Args: workspace: str, path of workspace. tr_snr: float, training SNR. te_snr: float, testing SNR. n_concat: int, number of frames to concatenta, should equal to n_concat in the training stage. iter: int, iteration of model to load. visualize: bool, plot enhanced spectrogram for debug. """ print(args) workspace = args.workspace n_concat = args.n_concat iter = args.iteration dir_name = args.dir_name model_name = args.model_name n_window = cfg.n_window n_overlap = cfg.n_overlap fs = cfg.sample_rate scale = True tr_enh = args.tr_enh # Load model. model_dir = os.path.join(workspace, "models", model_name) with tf.Session() as sess: model = DNN(sess, 0.0, 1, (n_concat, int(n_window / 2 + 1)), int(n_window / 2 + 1)) model.build() saver = tf.train.Saver() ckpt = tf.train.latest_checkpoint(model_dir) saver.restore(sess, ckpt) # saver.restore(sess, ckpt.model_checkpoint_path) # model_path = os.path.join(model_dir, "md_%diters.h5" % iter) # model = load_model(model_path) # Load scaler. scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "REVERB_tr_cut", "scaler.p") scaler = pickle.load(open(scaler_path, 'rb')) # Load test data. feat_dir = os.path.join(workspace, "features", "spectrogram", tr_enh, dir_name) names = os.listdir(feat_dir) for (cnt, na) in enumerate(names): # Load feature. feat_path = os.path.join(feat_dir, na) data = pickle.load(open(feat_path, 'rb')) [mixed_cmplx_x, speech_x, na] = data mixed_x = np.abs(mixed_cmplx_x) # Process data. n_pad = (n_concat - 1) / 2 mixed_x = pp_data.pad_with_border(mixed_x, n_pad) mixed_x = pp_data.log_sp(mixed_x) speech_x = pp_data.log_sp(speech_x) # Scale data. if scale: mixed_x = pp_data.scale_on_2d(mixed_x, scaler) speech_x = pp_data.scale_on_2d(speech_x, scaler) # Cut input spectrogram to 3D segments with n_concat. mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1) # Predict. pred = sess.run([model.enhanced_outputs], feed_dict={model.x_noisy: mixed_x_3d }) # model.predict(mixed_x_3d) pred = np.reshape(pred, (-1, int(n_window / 2 + 1))) print(cnt, na) # Inverse scale. if scale: mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler) speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler) pred = pp_data.inverse_scale_on_2d(pred, scaler) # Debug plot. if args.visualize: fig, axs = plt.subplots(3, 1, sharex=False) axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet') axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet') axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet') # axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr)) axs[1].set_title("Clean speech log spectrogram") axs[2].set_title("Enhanced speech log spectrogram") for j1 in xrange(3): axs[j1].xaxis.tick_bottom() plt.tight_layout() plt.show() # Recover enhanced wav. pred_sp = np.exp(pred) s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming) s *= np.sqrt((np.hamming(n_window)**2 ).sum()) # Scaler for compensate the amplitude # change after spectrogram and IFFT. # Write out enhanced wav. out_path = os.path.join(workspace, "enh_wavs", "test", dir_name, "%s.enh.wav" % na) pp_data.create_folder(os.path.dirname(out_path)) pp_data.write_audio(out_path, s, fs)
test_padding = np.concatenate((test_spec_norm, padding))
# print(test_padding.shape)
test_padding_var = torch.from_numpy(test_padding).type(torch.FloatTensor)

# Run the PyTorch model and time the forward pass.
start = timeit.default_timer()
predictions = model(test_padding_var)
print(predictions)
stop = timeit.default_timer()
print('Time: ', stop - start)
# print(predictions.shape)

# Drop the padded frames, then undo the scaling and the log.
pre_removed = predictions[:test_spec_norm.shape[0]]
# print(pre_removed.shape)
pre_inversed = inverse_scale_on_2d(pre_removed.cpu().detach().numpy(),
                                   scaler_label)
# print(np.exp(pre_inversed))
pre_inversed = np.exp(pre_inversed)

# Recover and write the GPU-enhanced wav
# (write_audio is sketched after this block).
s = recover_wav(pre_inversed, mixed_complx_x, n_overlap, np.hamming)
s *= np.sqrt((np.hamming(n_window) ** 2).sum())
write_audio("gpu_enh.wav", s, 16000)

# Recover and write the FPGA-enhanced wav from the dumped FPGA output.
pre_fpga = np.loadtxt('fpga_output')
print(pre_fpga)
pre_fpga_inversed = inverse_scale_on_2d(pre_fpga, scaler_label)
pre_fpga_inversed = np.exp(pre_fpga_inversed)
s_fpga = recover_wav(pre_fpga_inversed, mixed_complx_x, n_overlap, np.hamming)
s_fpga *= np.sqrt((np.hamming(n_window) ** 2).sum())
write_audio("fpga_enh.wav", s_fpga, 16000)
print(pre_fpga.shape)

# pre_inversed = pre_inversed.T
# plt.matshow(pre_inversed)
# plt.savefig('test.png')
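# write_audio() above is assumed to be a thin wrapper over a sound I/O
# library; with the soundfile package it would be roughly:
import soundfile

def write_audio_sketch(path, audio, sample_rate):
    # Write a float waveform to a WAV container at the given sample rate.
    soundfile.write(file=path, data=audio, samplerate=sample_rate)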
def predict_folder(input_file_folder, output_file_folder):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data: only the mixture files.
    names = [f for f in sorted(os.listdir(input_file_folder))
             if f.startswith("mix")]
    mixed_all = []
    pred_all = []

    for (cnt, na) in enumerate(names):
        # Load audio and compute the complex spectrogram.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')
        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)

        # Scale data.
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        # Collect the complex spectrograms for the caller
        # (real_to_complex is sketched after this function).
        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        # Scaler to compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
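# real_to_complex() pairs a predicted (real-valued) spectrogram with the
# mixture phase so callers of predict_folder() receive complex spectrograms.
# A minimal sketch under that assumption:
def real_to_complex_sketch(pred, mixed_cmplx):
    # Keep the mixture's phase angle, replace its magnitude with the prediction.
    return pred * np.exp(1j * np.angle(mixed_cmplx))

# Example call of predict_folder (paths are illustrative):
#   mixed_all, pred_all = predict_folder("data/test_mixes", "data/enhanced")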