def predict_file(file_path, model, scaler):
    (a, _) = pp.read_audio(file_path)
    mixed_complex = pp.calc_sp(a, 'complex')
    mixed_x = np.abs(mixed_complex)

    # Process data.
    n_pad = (conf1.n_concat - 1) // 2
    mixed_x = pp.pad_with_border(mixed_x, n_pad)
    mixed_x = pp.log_sp(mixed_x)
    # speech_x = dnn1_train.log_sp(speech_x)

    # Scale data.
    # if scale:
    mixed_x = pp.scale_on_2d(mixed_x, scaler)
    # speech_x = pp.scale_on_2d(speech_x, scaler)

    # Cut input spectrogram to 3D segments with n_concat.
    mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

    # Predict.
    pred = model.predict(mixed_x_3d)
    if visualize_plot:
        visualize(mixed_x, pred)

    # Inverse scale.
    # if scale:
    mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
    # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
    pred = pp.inverse_scale_on_2d(pred, scaler)

    # Recover enhanced wav.
    pred_sp = np.exp(pred)
    s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

    # Write out enhanced wav.
    # audio_path = os.path.dirname(file_path)
    # pp.write_audio(audio_path, s, conf1.sample_rate)
    return mixed_complex, pred, s

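# ---------------------------------------------------------------------------
# Hedged sketch (not from the original codebase): why recover_wav()'s output
# is multiplied by sqrt(sum(hamming(n_window)**2)). Assuming calc_sp()
# divides each windowed frame by the window's L2 norm before the FFT, the
# inverse FFT returns frames attenuated by that same norm, so the factor
# below restores the original amplitude. All names here are local.
import numpy as np

n_window = 512
window = np.hamming(n_window)
frame = np.random.randn(n_window)

norm = np.sqrt((window ** 2).sum())        # the compensation factor
spec = np.fft.rfft(frame * window / norm)  # assumed forward transform
recovered = np.fft.irfft(spec)             # comes back smaller by `norm`

assert np.allclose(recovered * norm, frame * window)
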
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load one model per target type.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e, "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()
        model_dict[e] = model

    # Load scaler.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0 : len(audio)]

            # Write out wav.
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T, origin='lower', aspect='auto', cmap='jet')
                plt.show()

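# ---------------------------------------------------------------------------
# Hedged sketch of the Wiener-style masking used above (shapes and values
# are made up; only the arithmetic matches the code). Given magnitude
# estimates for speech and noise, the mask speech / (speech + noise) lies in
# [0, 1]; it scales the observed mixture magnitude, and the mixture phase is
# re-attached, which is the role of stft.real_to_complex above.
import numpy as np

(n_frames, n_freq) = (100, 257)
mix_cmplx = np.random.randn(n_frames, n_freq) + 1j * np.random.randn(n_frames, n_freq)
s_hat = np.abs(np.random.randn(n_frames, n_freq))   # predicted speech magnitude
n_hat = np.abs(np.random.randn(n_frames, n_freq))   # predicted noise magnitude

mask = s_hat / (s_hat + n_hat + 1e-8)               # eps avoids division by zero
enh_mag = mask * np.abs(mix_cmplx)
enh_cmplx = enh_mag * np.exp(1j * np.angle(mix_cmplx))
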
def inference(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    audio_type = 'speech'
    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)
    speech_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/timit_wavs/subtest"

    # Load model.
    model_path = os.path.join(workspace, "models", filename, audio_type, "md_%d_iters.tar" % iter)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Load scaler.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    dft = pp_data.DFT(fft_size, cuda)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio0, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio0)

            # Enframe.
            frames = stft.enframe(audio, fft_size, hop_size)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(frames, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            pred_frames = forward(model, x, mean_, std_, cuda)
            pred_frames = pred_frames.data.cpu().numpy()

            # cola_constant = 0.5
            # seq = stft.overlap_add(pred_frames, hop_size, cola_constant)

            pred_frames *= window
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            seq = seq[0 : len(audio)]

            # Write out wav.
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                clean_audio_path = os.path.join(speech_dir, name.split('.')[0] + ".WAV")
                (clean_audio, _) = pp_data.read_audio(clean_audio_path, sample_rate)
                clean_audio = pp_data.normalize(clean_audio)
                clean_frames = stft.enframe(clean_audio, fft_size, hop_size)

                mix_sp = np.abs(np.fft.rfft(frames * window, norm='ortho'))
                enh_sp = np.abs(np.fft.rfft(pred_frames * window, norm='ortho'))
                clean_sp = np.abs(np.fft.rfft(clean_frames * window, norm='ortho'))

                K = 10
                fig, axs = plt.subplots(K / 2, 2, sharex=True)
                for k in range(K):
                    axs[k / 2, k % 2].plot(frames[k + 100], color='y')
                    axs[k / 2, k % 2].plot(clean_frames[k + 100], color='r')
                    axs[k / 2, k % 2].plot(pred_frames[k + 100], color='b')
                plt.show()

                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(mix_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[1].matshow(np.log(np.abs(clean_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[2].matshow(np.log(np.abs(enh_sp)).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                plt.show()

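# ---------------------------------------------------------------------------
# Hedged sketch of windowed overlap-add (a stand-in for stft.overlap_add and
# stft.get_cola_constant, whose real implementations are not shown here).
# Frames shifted by `hop` are summed and divided by the roughly constant
# window overlap, which rebuilds the sequence up to edge effects.
import numpy as np

def overlap_add_demo(frames, hop, cola_constant):
    """frames: (n_frames, frame_len). Returns the overlap-added 1-D signal."""
    (n_frames, frame_len) = frames.shape
    out = np.zeros((n_frames - 1) * hop + frame_len)
    for (i, frame) in enumerate(frames):
        out[i * hop : i * hop + frame_len] += frame
    return out / cola_constant

window = np.hamming(512)
hop = 256
frames = np.tile(window, (4, 1))           # each "frame" is just the window
cola_constant = window[0] + window[hop]    # sum of two overlapping windows
seq = overlap_add_demo(frames, hop, cola_constant)
# Away from the edges, seq is ~1: shifted Hamming windows at 50% overlap sum
# to a nearly constant value, which get_cola_constant presumably returns.
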
def inference(args):
    cuda = args.use_cuda and torch.cuda.is_available()
    workspace = args.workspace
    model_name = args.model_name
    feat_type = args.feat_type
    script_na = args.script_na

    # Load data.
    te_packed_feat_path = os.path.join(workspace, "packed_features", feat_type, "test.p")
    [te_x_list, te_y_list, te_na_list] = cPickle.load(open(te_packed_feat_path, 'rb'))

    # Scale.
    if True:
        scale_path = os.path.join(workspace, "scalers", feat_type, "scaler.p")
        scaler = pickle.load(open(scale_path, 'rb'))
        te_x_list = pp_data.scale_on_x_list(te_x_list, scaler)

    # Construct model topology.
    n_concat = 3
    te_n_hop = 1
    n_freq = te_x_list[0].shape[-1]
    n_out = te_y_list[0].shape[-1]
    model = Net(n_concat, n_freq, n_out)

    # Init the weights of the model using trained weights.
    model_path = os.path.join(workspace, "models", script_na, feat_type, model_name)
    if os.path.isfile(model_path):
        print("Loading checkpoint '%s'" % model_path)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])
    else:
        raise Exception("Model path %s does not exist!" % model_path)

    # Move model to GPU.
    if cuda:
        model.cuda()

    # Directory to write out transcribed midi files.
    out_midi_dir = os.path.join(workspace, "out_midis", pp_data.get_filename(__file__), feat_type)
    pp_data.create_folder(out_midi_dir)

    # Data to 3d.
    n_half = (n_concat - 1) / 2
    for i1 in xrange(len(te_x_list)):
        x = te_x_list[i1]    # (n_time, n_freq)
        y = te_y_list[i1]    # (n_time, n_out)
        bare_na = os.path.splitext(te_na_list[i1])[0]
        (n_time, n_freq) = x.shape

        zero_pad = np.zeros((n_half, n_freq))
        x = np.concatenate((zero_pad, x, zero_pad), axis=0)
        x3d = pp_data.mat_2d_to_3d(x, n_concat, te_n_hop)    # (n_time, n_concat, n_freq)

        # Move data to GPU.
        x3d = torch.Tensor(x3d)
        x3d = Variable(x3d)
        if cuda:
            x3d = x3d.cuda()

        # Inference.
        model.eval()
        pred = model(x3d)    # (n_time, n_out)

        # Convert data type to numpy.
        pred = pred.data.cpu().numpy()

        # Threshold and write out predicted piano roll to midi file.
        mid_roll = pp_data.prob_to_midi_roll(pred, 0.5)
        out_path = os.path.join(out_midi_dir, "%s.mid" % bare_na)
        print("Write out to: %s" % out_path)
        pp_data.write_midi_roll_to_midi(mid_roll, out_path)

        # Debug plot.
        if True:
            fig, axs = plt.subplots(3, 1, sharex=True)
            axs[0].matshow(y.T, origin='lower', aspect='auto')
            axs[1].matshow(pred.T, origin='lower', aspect='auto')
            binary_pred = (np.sign(pred - 0.5) + 1) / 2
            axs[2].matshow(binary_pred.T, origin='lower', aspect='auto')
            axs[0].set_title("Ground truth")
            axs[1].set_title("DNN output probability")
            axs[2].set_title("DNN output probability after thresholding")
            for j1 in xrange(3):
                axs[j1].set_ylabel('note index')
                axs[j1].set_xlabel('frames')
                axs[j1].xaxis.set_label_coords(1.06, -0.01)
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

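# ---------------------------------------------------------------------------
# Hedged sketch of the thresholding step (prob_to_midi_roll's real behavior
# may differ; sizes here are made up). The network outputs per-frame note
# probabilities, and a fixed 0.5 threshold turns them into a binary piano
# roll, matching the sign arithmetic in the debug plot above.
import numpy as np

pred = np.random.rand(100, 88)               # (n_time, n_notes) probabilities
binary_roll = (pred > 0.5).astype(np.int32)  # simple threshold

binary_pred = (np.sign(pred - 0.5) + 1) / 2  # the debug-plot formulation
assert np.array_equal(binary_roll, binary_pred.astype(np.int32))
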
def inference(args):
    workspace = "workspace"
    n_concat = 11
    iter = 50000
    n_window = 320
    n_overlap = 160
    fs = 16000

    # Load model.
    model_path = os.path.join(workspace, "models", "crn_mixdb", "md_%diters.h5" % iter)
    model = load_model(model_path, custom_objects={'keras': keras})

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "crn_mixdb")
    # feat_dir = os.path.join(workspace, "features", "spectrogram", "train", "office_mixdb")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1)
        # mixed_x = pad_with_border(mixed_x, n_pad)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=11)    # [100, 7, 257]
        # mixed_x = pad_with_border(mixed_x, n_pad)
        # mixed_x_3d = mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        w, h, l = mixed_x_3d.shape
        pred = model.predict(mixed_x_3d)
        pred_sp = np.reshape(pred, [w * h, l])
        mixed_cmplx_x = mixed_cmplx_x[:w * h, :]
        # pred_sp = pred[:, -1, :]
        print(cnt, na)

        if False:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred_sp.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(1))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        # pred_sp = pred
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "crn_mixdb", "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

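# ---------------------------------------------------------------------------
# Hedged sketch of the segment/flatten round trip above (mat_2d_to_3d is
# assumed to cut non-overlapping blocks when hop == agg_num). The CRN
# predicts whole segments at once, so the 3-D output is flattened back to
# (n_frames, n_freq) before waveform recovery, and the complex spectrogram
# is truncated to the same length.
import numpy as np

def mat_2d_to_3d_demo(x, agg_num, hop):
    """Cut (n_time, n_freq) into (n_segs, agg_num, n_freq) segments."""
    (i, segs) = (0, [])
    while i + agg_num <= len(x):
        segs.append(x[i : i + agg_num])
        i += hop
    return np.array(segs)

x = np.random.rand(110, 257)                        # (n_time, n_freq)
x3d = mat_2d_to_3d_demo(x, agg_num=11, hop=11)      # (10, 11, 257)
(w, h, l) = x3d.shape
flat = np.reshape(x3d, [w * h, l])                  # back to (110, 257)
assert np.allclose(flat, x[:w * h])
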
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) / 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

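# ---------------------------------------------------------------------------
# Hedged sketch of the framing used above (pad_with_border and mat_2d_to_3d
# are assumed to behave as below). Each output frame is predicted from
# n_concat consecutive input frames; repeating the border frames
# (n_concat - 1) / 2 times at each end keeps one segment per original frame.
import numpy as np

def pad_with_border_demo(x, n_pad):
    """Repeat the first and last frames n_pad times at each end."""
    return np.concatenate([x[0:1]] * n_pad + [x] + [x[-1:]] * n_pad, axis=0)

n_concat = 7
x = np.random.rand(100, 257)                            # (n_time, n_freq)
x_pad = pad_with_border_demo(x, (n_concat - 1) // 2)    # (106, 257)

# With hop=1, sliding windows of n_concat frames give one segment per
# original frame: (100, 7, 257).
n_segs = len(x_pad) - n_concat + 1
x3d = np.array([x_pad[i : i + n_concat] for i in range(n_segs)])
assert x3d.shape == (100, n_concat, 257)
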
def inference(workspace, tr_snr, te_snr, n_concat, iteration, model_name=None, visualize=False, force=False):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: list of float, training SNRs.
      te_snr: list of float, testing SNRs.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    if model_name is None:
        model_name = '_'.join([str(snr) for snr in tr_snr]) + 'ddbs'

    # Load model.
    model_path = os.path.join(workspace, "models", model_name, "md_%diters.h5" % iteration)
    print('GPU available: ', tf.test.is_gpu_available())
    model = load_model(model_path)

    # Load scaler.
    scaler = read_combined_scaler(workspace, tr_snr)

    for snr in te_snr:
        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(snr))
        feat_paths = all_file_paths(feat_dir)

        for (cnt, feat_path) in tqdm(enumerate(feat_paths), 'Inference (creating enhanced speech)'):
            # Check whether the enhanced audio has already been inferred.
            na = str(PurePath(feat_path).relative_to(feat_dir).with_suffix(''))
            out_path = os.path.join(workspace, "enh_wavs", "test", model_name, "%ddb" % int(snr), "%s.enh.wav" % na)
            if os.path.isfile(out_path) and not force:
                print(f'Enhanced audio {out_path} is already made')
                continue

            # Load feature.
            data = pickle.load(open(feat_path, 'rb'))
            [mixed_cmplx_x, speech_x, noise_x, ir_mask, alpha, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data.
            n_pad = (n_concat - 1) // 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

            # Predict.
            pred = model.predict(mixed_x_3d)
            # print(cnt, na)

            # Inverse scale.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
                # pred = pp_data.inverse_scale_on_2d(pred, scaler)

            # Debug plot.
            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("%ddb mixture log spectrogram" % int(snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav.
            s = recover_wav(pred, mixed_cmplx_x, n_overlap, np.hamming, irr_mask=True)
            s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

            # Write out enhanced wav.
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)

def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    data_type = 'IRM'

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    if data_type == "DM":
        model_path = os.path.join(workspace, "models", "mixdb", "md_%diters.h5" % 120000)
    else:
        model_path = os.path.join(workspace, "models", "mask_mixdb", "md_%diters.h5" % 265000)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "mixdb", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "mixdb")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        if data_type == "IRM":
            mixed_x = speech_x + noise_x
            mixed_x1 = speech_x + noise_x

        # Process data.
        n_pad = (n_concat - 1) / 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        if data_type == "IRM":
            pred_sp = pred * mixed_x1
        print(cnt, na)

        # Inverse scale.
        if data_type == "DM":
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
            pred_sp = np.exp(pred)

        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        if data_type == "DM":
            out_path = os.path.join(workspace, "enh_wavs", "test", "mixdb", "%s.enh.wav" % na)
        else:
            out_path = os.path.join(workspace, "enh_wavs", "test", "mask_mixdb", "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

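# ---------------------------------------------------------------------------
# Hedged sketch contrasting the two target types above (values are made up).
# A direct-mapping ("DM") model predicts the scaled log magnitude, so it
# needs inverse scaling plus exp(); an IRM model predicts a ratio mask in
# [0, 1] that multiplies the mixture magnitude directly, with no exp().
import numpy as np

speech_x = np.abs(np.random.randn(100, 257))
noise_x = np.abs(np.random.randn(100, 257))
mixed_x1 = speech_x + noise_x                  # mixture magnitude, as above

irm = speech_x / (speech_x + noise_x + 1e-8)   # ideal ratio mask
pred_sp_irm = irm * mixed_x1                   # enhanced magnitude ("IRM" path)

# The "DM" path instead recovers magnitude with pred_sp = np.exp(pred).
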
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    calc_log = args.calc_log
    model_file = args.model_file

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Build model.
    n_concat = 7
    n_freq = 257
    n_hid = 2048
    lr = 1e-3

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    if calc_log:
        model.add(Dense(n_freq, activation='linear'))
    else:
        model.add(Dense(n_freq, activation='relu'))
    model.summary()
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Load model weights.
    if model_file == "null":
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iter)
        # model = load_model(model_path)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler.
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) / 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            # speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if calc_log:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max
            speech_x_max = np.max(speech_x)
            print("max of speech_x:", speech_x_max)
            speech_x = speech_x / speech_x_max

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        if False:
            print(mixed_x_3d)
        pred = model.predict(mixed_x_3d)
        print(cnt, na)
        if False:
            print("pred")
            print(pred)
            print("speech")
            print(speech_x)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            # speech_x = speech_x * 16384
            pred = pred * mixed_x_max

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            # axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        if calc_log:
            pred_sp = np.exp(pred)
        else:
            # gv = 0.025
            # pred_sp = np.maximum(0, pred - gv)
            pred_sp = pred

        if False:
            pred_sp = mixed_x[3:-3]

        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

        # Write out enhanced pcm (8 kHz, pcm_s16le).
        out_pcm_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.pcm" % na)
        cmd = ' '.join(["./ffmpeg -y -i ", out_path, " -f s16le -ar 8000 -ac 1 -acodec pcm_s16le ", out_pcm_path])
        os.system(cmd)

        # Write out webrtc-denoised enhanced pcm (8 kHz, pcm_s16le).
        ns_out_pcm_path = os.path.join(workspace, "ns_enh_wavs", "test", "%ddb" % int(te_snr), "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(workspace, "ns_enh_wavs", "test", "%ddb" % int(te_snr), "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i ", ns_out_pcm_path, " ", ns_out_wav_path])
        os.system(cmd)
        cmd = ' '.join(["rm ", out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["rm ", ns_out_pcm_path])
        os.system(cmd)

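# ---------------------------------------------------------------------------
# Hedged sketch of the shell pipeline above using subprocess instead of
# os.system (the local ./ffmpeg and ./ns binaries are the same ones the
# original assumes; this is illustrative, not a drop-in replacement).
import subprocess

def wav_to_pcm_8k_demo(wav_path, pcm_path):
    # 8 kHz, mono, signed 16-bit little-endian raw PCM, as in the cmd above.
    subprocess.call(["./ffmpeg", "-y", "-i", wav_path,
                     "-f", "s16le", "-ar", "8000", "-ac", "1",
                     "-acodec", "pcm_s16le", pcm_path])

def webrtc_ns_demo(pcm_in_path, pcm_out_path):
    # The ./ns tool applies WebRTC noise suppression to raw PCM.
    subprocess.call(["./ns", pcm_in_path, pcm_out_path])
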
def inference(args):
    workspace = args.workspace
    model_name = args.model_name
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    model_path = os.path.join(workspace, "models", filename, model_name)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Load scaler.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(sp)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            output = forward(model, x, mean_, std_, cuda)
            output = output.data.cpu().numpy()
            print(output.shape)

            if visualize:
                fig, axs = plt.subplots(2, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(sp)).T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(output)).T, origin='lower', aspect='auto', cmap='jet')
                plt.show()

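# ---------------------------------------------------------------------------
# Hedged sketch of what forward(model, x, mean_, std_, cuda) presumably does
# (the real helper is not shown above): move the stacked frames to the GPU,
# standardize with the training statistics, and run the network without
# gradients. torch.no_grad() is the modern replacement for the volatile=True
# flag passed to move_data_to_gpu above.
import torch

def forward_demo(model, x, mean_, std_, cuda):
    x = torch.Tensor(x)          # (n_segs, stack_num, n_freq)
    if cuda:
        x = x.cuda()
    x = (x - mean_) / std_       # per-bin standardization
    with torch.no_grad():
        return model(x)
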
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      TF: str, input representation ("spectrogram" here).
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    # tr_snr = args.tr_snr
    # te_snr = args.te_snr
    n_concat = args.n_concat
    # iter = args.iteration
    TF = args.TF
    model_name = args.model_name

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    # snr = cfg.SNR
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load magnitude model: pick the checkpoint with the largest epoch number.
    t1 = time.time()
    # model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    mag_model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    # model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    mag_model_files = find_models(mag_model_root)
    epoch_num = []
    for i in range(len(mag_model_files)):
        epoch_num.append(int(mag_model_files[i].split("/")[-1].split('-')[2]))
    mag_model_index = epoch_num.index(max(epoch_num))
    mag_model_path = mag_model_files[mag_model_index]
    print("The selected model path is: %s" % mag_model_path)
    mag_model = load_model(mag_model_path)

    '''
    # Load the phase model in the same way.
    phase_model_root = os.path.join(workspace, "phase_saved_models", "%s" % model_name)
    phase_model_files = find_models(phase_model_root)
    epoch_num1 = []
    for i in range(len(phase_model_files)):
        epoch_num1.append(int(phase_model_files[i].split("/")[-1].split('-')[2]))
    phase_model_index = epoch_num1.index(max(epoch_num1))
    phase_model_path = phase_model_files[phase_model_index]
    print("The selected model path is: %s" % phase_model_path)
    phase_model = load_model(phase_model_path)
    '''

    # Load scaler.
    mag_scaler_path = os.path.join(workspace, "packed_features", "train", "mag_scaler.p")
    mag_scaler = pickle.load(open(mag_scaler_path, 'rb'))
    # phase_scaler_path = os.path.join(workspace, "packed_features", "train", "phase_scaler.p")
    # phase_scaler = pickle.load(open(phase_scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x] = data
        n_pad = (n_concat - 1) / 2

        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # mixed_phase = np.angle(mixed_cmplx_x)

            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            # mixed_phase = pp_data.pad_with_border(mixed_phase, n_pad)
            # speech_x = pp_data.log_sp(np.abs(speech_cmplx_x))
            # speech_phase = np.angle(speech_cmplx_x)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, mag_scaler)
            # mixed_phase = pp_data.scale_on_2d(mixed_phase, phase_scaler)
            # speech_phase = pp_data.scale_on_2d(speech_phase, phase_scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # mixed_phase_3d = pp_data.mat_2d_to_3d(mixed_phase, agg_num=n_concat, hop=1)
        # print("loading data time: %s s" % (time.time() - t1,))

        '''
        # Visualize intermediate feature maps.
        # The first model.layers[0] stays as-is and denotes the input data;
        # change model.layers[2] to the index of the layer whose output you want.
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]    # only the input changes
        # Show the feature maps after the first conv layer; the output is
        # (1, 149, 149, 32): (n_samples, height, width, n_feature_maps).
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''

        # Predict.
        t2 = time.time()
        mag_pred = mag_model.predict(mixed_x_3d)
        # phase_pred = phase_model.predict(mixed_phase_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))
        # print(pred)

        # Inverse scale.
        if scale:
            # mixed_x = pp_data.inverse_scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, mag_scaler)
            mag_pred = pp_data.inverse_scale_on_2d(mag_pred, mag_scaler)
            # mixed_phase = pp_data.inverse_scale_on_2d(mixed_phase, phase_scaler)
            # speech_phase = pp_data.inverse_scale_on_2d(speech_phase, phase_scaler)
            # phase_pred = pp_data.inverse_scale_on_2d(phase_pred, phase_scaler)

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10 ** (mag_pred / 10)) - 1e-10
            # pred_ph = np.exp(1j * phase_pred)
            '''
            R = np.multiply(pred_sp, pred_ph)
            result = librosa.istft(R.T, hop_length=n_hop, win_length=cfg.n_window,
                                   window=scipy.signal.hamming, center=False)
            result /= abs(result).max()
            y_out = result * 0.8
            '''

        # s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # s *= np.sqrt((np.hamming(n_window)**2).sum())    # Scale factor to compensate the amplitude.
        s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_flipphase", "test", "%s" % model_name,
                                "{}_fft_dnn_map.wav".format(na.split('.')[0]))
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))

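# ---------------------------------------------------------------------------
# Hedged sketch of the log-magnitude inversion above. If the training
# features were computed as 10 * log10(mag + 1e-10), then
# (10 ** (pred / 10)) - 1e-10 inverts them exactly; the divisor (10 here,
# 20 in the amplitude-dB variant below) must match the feature extraction.
import numpy as np

mag = np.abs(np.random.randn(100, 257))
log_mag = 10 * np.log10(mag + 1e-10)         # assumed forward transform
recovered = (10 ** (log_mag / 10)) - 1e-10   # the inversion used above
assert np.allclose(recovered, mag)
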
def inference1111(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      TF: str, input representation ("spectrogram", "timedomain" or
          "fftmagnitude").
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    # tr_snr = args.tr_snr
    # te_snr = args.te_snr
    n_concat = args.n_concat
    # iter = args.iteration
    TF = args.TF
    model_name = args.model_name

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    # snr = cfg.SNR
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load model: pick the checkpoint with the largest epoch number.
    t1 = time.time()
    # model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    # model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    model_files = find_models(model_root)
    epoch_num = []
    for i in range(len(model_files)):
        epoch_num.append(int(model_files[i].split("/")[-1].split('-')[2]))
    model_index = epoch_num.index(max(epoch_num))
    model_path = model_files[model_index]
    print("The selected model path is: %s" % model_path)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "train", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, na] = data
        n_pad = (n_concat - 1) / 2

        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)
        elif TF == "timedomain":
            mixed_x = pp_data.pad_with_border(mixed_cmplx_x, n_pad)
        elif TF == "fftmagnitude":
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # print("loading data time: %s s" % (time.time() - t1,))

        '''
        # Visualize intermediate feature maps.
        # The first model.layers[0] stays as-is and denotes the input data;
        # change model.layers[2] to the index of the layer whose output you want.
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]    # only the input changes
        # Show the feature maps after the first conv layer; the output is
        # (1, 149, 149, 32): (n_samples, height, width, n_feature_maps).
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''

        # Predict.
        t2 = time.time()
        pred = model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))
        # print(pred)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        # (frames, frame_length) = pred.shape
        # print("pred dimensions %d and %d : " % (frames, frame_length))

        # Debug plot.
        if args.visualize:
            if TF == "spectrogram":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("Mixture log spectrogram")
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in xrange(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_spectra.png')
                plt.show()
            elif TF == "timedomain":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                axs[0].set_title("Mixture time domain")
                axs[1].set_title("Clean speech time domain")
                axs[2].set_title("Enhanced speech time domain")
                for j1 in xrange(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_time.png')
                plt.show()
            else:
                raise Exception("TF must be spectrogram or timedomain!")

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10 ** (pred / 20)) - 1e-10
            # s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            # s *= np.sqrt((np.hamming(n_window)**2).sum())    # Scale factor to compensate the
            #                                                  # amplitude change after spectrogram and IFFT.
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
        elif TF == "timedomain":
            s = time_recover_wav(pred, n_window, n_hop, 'hamming')
            # s *= np.sqrt((np.hamming(n_window)**2).sum())
        elif TF == "fftmagnitude":
            s = spectra_to_wav(pred, mixed_cmplx_x, n_window, n_hop, 'hamming')
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "%s" % model_name, "%s.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))

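# ---------------------------------------------------------------------------
# Hedged sketch of the checkpoint selection done with find_models/epoch_num
# above, assuming filenames like "weights-checkpoint-25-0.41.h5" where the
# third dash-separated field is the epoch. The newest epoch wins.
import os

def latest_checkpoint_demo(model_root):
    paths = [os.path.join(model_root, f)
             for f in os.listdir(model_root) if f.endswith('.h5')]
    return max(paths, key=lambda p: int(os.path.basename(p).split('-')[2]))
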
def predict_folder(input_file_folder, output_file_folder):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    # if scale:
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    # names = os.listdir(input_file_folder)
    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]

    mixed_all = []
    pred_all = []

    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')
        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)
        # speech_x = dnn1_train.log_sp(speech_x)

        # Scale data.
        # if scale:
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        # if scale:
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # Scale factor to compensate the amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all

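# ---------------------------------------------------------------------------
# Hedged usage sketch for predict_folder (paths are placeholders). Every
# "mix*" wav in the input folder is enhanced and written to the output
# folder as "enh_<name>"; the mixture and predicted spectrograms are also
# returned for inspection.
if __name__ == '__main__':
    mixed_all, pred_all = predict_folder("data/test_mixtures", "data/enhanced")
    print("Enhanced %d files." % len(pred_all))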