def train(args):
    """Load the packed spectrogram features of a workspace and scale them.

    Args:
      args: object with a `workspace` attribute (str, path of workspace).

    NOTE(review): the visible body ends right after unpacking the data shape;
    the model-building part is presumably elsewhere -- confirm.
    """
    workspace = args.workspace
    feature_dir = os.path.join(workspace, "packed_features", "spectrogram")

    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(feature_dir, "data.h5")
    tr_y = pp_data.load_hdf5(tr_hdf5_path)
    # Keep only element 1 of whatever load_hdf5 returned.
    tr_y = np.array(tr_y)[1]
    print("data shape is {}".format(tr_y.shape))
    print("Load data time: %s s" % (time.time() - t1, ))

    # Scale data.
    t1 = time.time()
    scaler_path = os.path.join(feature_dir, "scaler.p")
    # Use a context manager so the scaler file is closed deterministically.
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    tr_y = pp_data.scale_on_2d(tr_y, scaler)
    print("Scale data time: %s s" % (time.time() - t1, ))

    # Batch.
    batch_size = 500
    print("%d iterations / epoch" % (tr_y.shape[0] / batch_size))

    # Data shape and input shape.
    (n_segs, n_freq) = tr_y.shape
def dnn1_colors(input):
    """Return the negated log-spectrogram of `input` after a scaler round trip.

    The input goes through `pp.log_sp`, its absolute value is scaled with the
    scaler stored under the "test" packed-feature directory, the scaling is
    inverted again and the result is negated.

    Args:
      input: spectrogram-like array accepted by `pp.log_sp`.

    Returns:
      The negated, inverse-scaled array.
    """
    scaler_path = os.path.join(conf1.packed_feature_dir, "test", "scaler.p")
    # Close the scaler file once it has been read instead of leaking the handle.
    with open(scaler_path, 'rb') as scaler_file:
        scaler = dnn1.pickle.load(scaler_file)

    prova = pp.log_sp(input)
    prova = pp.scale_on_2d(np.abs(prova), scaler)
    prova = pp.inverse_scale_on_2d(prova, scaler)
    return -prova
def predict_file(file_path, model, scaler):
    """Enhance a single audio file with a trained model.

    Args:
      file_path: path of the audio file, read with `pp.read_audio`.
      model: trained model exposing `predict`.
      scaler: scaler passed to `pp.scale_on_2d` / `pp.inverse_scale_on_2d`.

    Returns:
      Tuple `(mixed_complex, pred, s)`: the complex spectrogram of the input,
      the inverse-scaled prediction, and the recovered waveform.
    """
    (a, _) = pp.read_audio(file_path)
    mixed_complex = pp.calc_sp(a, 'complex')
    mixed_x = np.abs(mixed_complex)

    # Process data. The pad width is a frame count, so it must be an integer:
    # `/` would produce a float on Python 3.
    n_pad = (conf1.n_concat - 1) // 2
    mixed_x = pp.pad_with_border(mixed_x, n_pad)
    mixed_x = pp.log_sp(mixed_x)

    # Scale data.
    mixed_x = pp.scale_on_2d(mixed_x, scaler)

    # Cut input spectrogram to 3D segments with n_concat.
    mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

    # Predict.
    pred = model.predict(mixed_x_3d)

    # Debug plot (note: drawn on the still-scaled data).
    if visualize_plot:
        visualize(mixed_x, pred)

    # Inverse scale.
    mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
    pred = pp.inverse_scale_on_2d(pred, scaler)

    # Recover enhanced wav.
    pred_sp = np.exp(pred)
    s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
    # Compensate the amplitude change after spectrogram and IFFT.
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

    return mixed_complex, pred, s
def generate(self, path_list):
    """Yield scaled mini-batches read from a list of HDF5 files.

    The files are visited in order and wrap around to the first one. Each
    file must provide the datasets 'x' and 'y'.

    Args:
      path_list: list of str, paths of the HDF5 files.

    Yields:
      Tuple `(x_batch, y_batch)` scaled with `self._scaler_` through
      `pp_data.scale_on_3d` and `pp_data.scale_on_2d`.

    NOTE(review): in 'test' mode the generator stops as soon as the last file
    of `path_list` has been opened, before any batch of that file is yielded.
    This ordering is inherited from the original code -- confirm it is
    intended.
    """
    n_iter = 0
    epoch = 0
    pointer = 0
    n_file = len(path_list)
    batch_size = self._batch_size_

    # Open read-only: the generator never writes to the feature files.
    data = h5py.File(path_list[epoch], 'r')
    try:
        x = data['x']
        y = data['y']
        n_samples = len(x)
        index = np.arange(n_samples)
        np.random.shuffle(index)

        while True:
            if (self._type_ == 'test') and (self._te_max_iter_ is not None):
                if n_iter == self._te_max_iter_:
                    break
            n_iter += 1

            if pointer >= n_samples:
                # Current file exhausted: move on to the next one.
                epoch += 1
                if epoch == n_file:
                    epoch = 0
                path = path_list[epoch]
                print("start %s" % path)
                # Release the previous file before opening the next.
                data.close()
                data = h5py.File(path, 'r')
                x = data['x']
                y = data['y']
                if (self._type_ == 'test') and (epoch == n_file - 1):
                    break
                pointer = 0
                # The new file may hold a different number of samples, so the
                # index permutation has to be rebuilt for it.
                n_samples = len(x)
                index = np.arange(n_samples)
                np.random.shuffle(index)

            # Slicing clamps at the end of `index`; the indices are sorted
            # once and reused for both datasets.
            batch_idx = sorted(index[pointer:pointer + batch_size])
            pointer += batch_size
            yield (pp_data.scale_on_3d(x[batch_idx], self._scaler_),
                   pp_data.scale_on_2d(y[batch_idx], self._scaler_))
    finally:
        data.close()
def train(args):
    """Train the neural network on data pooled over several SNRs.

    Write out training stats and the model every several iterations.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR (not used: every SNR in `snr_arr` is loaded).
      te_snr: float, testing SNR (not used: every SNR in `snr_arr` is loaded).
      lr: float, learning rate.

    Each SNR is scaled with its own scaler before pooling. Training and
    evaluation run on the pooled arrays; the previous code built the pooled
    arrays but then trained on the data of the last SNR only.
    """
    print(args)
    workspace = args.workspace
    lr = args.lr
    snr_arr = [0, 5, 10, 15]
    feat_dir = os.path.join(workspace, "packed_features", "spectrogram")

    # Load and scale the data of every SNR.
    t1 = time.time()
    tr_x_parts, tr_y_parts, te_x_parts, te_y_parts = [], [], [], []
    for snr in snr_arr:
        snr_dir = "%ddb" % int(snr)
        tr_hdf5_path = os.path.join(feat_dir, "train", snr_dir, "data.h5")
        te_hdf5_path = os.path.join(feat_dir, "test", snr_dir, "data.h5")
        # The third element returned by load_hdf5 is not used here.
        (tr_x, tr_y, _) = pp_data.load_hdf5(tr_hdf5_path)
        (te_x, te_y, _) = pp_data.load_hdf5(te_hdf5_path)
        print(tr_x.shape, tr_y.shape)

        # Scale data.
        t2 = time.time()
        scaler_path = os.path.join(feat_dir, "train", snr_dir, "scaler.p")
        with open(scaler_path, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)
        tr_x_parts.append(pp_data.scale_on_3d(tr_x, scaler))
        tr_y_parts.append(pp_data.scale_on_2d(tr_y, scaler))
        te_x_parts.append(pp_data.scale_on_3d(te_x, scaler))
        te_y_parts.append(pp_data.scale_on_2d(te_y, scaler))
        print("Scale data(%sdb) time: %s s" % (snr, time.time() - t2,))

    # Pool all SNRs with a single concatenation per array.
    tr_x = np.concatenate(tr_x_parts, axis=0)
    tr_y = np.concatenate(tr_y_parts, axis=0)
    te_x = np.concatenate(te_x_parts, axis=0)
    te_y = np.concatenate(te_y_parts, axis=0)
    del tr_x_parts, tr_y_parts, te_x_parts, te_y_parts
    print(tr_x.shape, tr_y.shape)
    print(te_x.shape, te_y.shape)
    print("Load data time: %s s" % (time.time() - t1,))

    batch_size = 100
    print("%d iterations / epoch" % (tr_x.shape[0] // batch_size))

    # Build model.
    (_, n_concat, n_freq) = tr_x.shape

    # 1. Load the pre-model.
    # NOTE(review): pre_model is loaded but not wired into the network below
    # (its uses were commented out). The load is kept because it may influence
    # the auto-generated Keras layer names -- confirm before removing it.
    model_path = os.path.join("premodel", "sednn_keras_logMag_Relu2048layer1_1outFr_7inFr_dp0.2_weights.75-0.00.hdf5")
    pre_model = load_model(model_path)

    # 2. Build the train model.
    n_hid = 2048
    main_input = Input(shape=(n_concat, n_freq), name='main_input')
    x = Flatten(input_shape=(n_concat, n_freq))(main_input)
    x = (Dense(n_hid, activation='linear'))(x)
    # hidden1
    x = (Dense(n_hid, name='hidden_1'))(x)
    x = LeakyReLU(alpha=0.3)(x)
    x = Dropout(0.3)(x)
    x = (Dense(n_hid, activation='linear'))(x)
    # hidden2
    x = (Dense(n_hid, name='hidden_2'))(x)
    x = LeakyReLU(alpha=0.3)(x)
    x = Dropout(0.3)(x)
    # Output: estimated speech.
    output_y = Dense(n_freq, activation='linear', name='out_y')(x)

    model = Model(inputs=main_input, outputs=output_y)
    model.compile(optimizer=Adam(lr=lr), loss='mae', metrics=['accuracy'])
    model.summary()

    # Data generators.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)

    # Directories for saving models and training stats.
    model_dir = os.path.join(workspace, "models")
    pp_data.create_folder(model_dir)
    stats_dir = os.path.join(workspace, "training_stats")
    pp_data.create_folder(stats_dir)

    def _log_stats(iteration):
        # Evaluate on train/test data, print the losses and pickle them.
        tr_loss = eval(model, eval_tr_gen, tr_x, tr_y)
        te_loss = eval(model, eval_te_gen, te_x, te_y)
        print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iteration, tr_loss, te_loss))
        stat_dict = {'iter': iteration,
                     'tr_loss': tr_loss,
                     'te_loss': te_loss, }
        stat_path = os.path.join(stats_dir, "%diters.p" % iteration)
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Loss before training.
    iteration = 0
    _log_stats(iteration)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
        model.train_on_batch(batch_x, batch_y)
        iteration += 1

        # Validate and save training stats.
        if iteration % 50 == 0:
            _log_stats(iteration)

        # Save model.
        if iteration % 3000 == 0:
            model_path = os.path.join(model_dir, "md_dnn2_%diters.h5" % iteration)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iteration == 3001:
            break

    # Final evaluation; the values correspond to model.metrics_names.
    resultz = model.evaluate(tr_x, tr_y)
    print("\nTrain Acc:")
    print(resultz)
    resultz = model.evaluate(te_x, te_y)
    print("\nTest Acc:")
    print(resultz)
    print(model.metrics_names)

    print("Training time: %s s" % (time.time() - t1,))
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to n_concat
          in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iteration = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iteration)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature. `na` is replaced by the name stored inside the file.
        feat_path = os.path.join(feat_dir, na)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data. The pad width is a frame count, so use integer
        # division (`/` gives a float on Python 3).
        n_pad = (n_concat - 1) // 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
def inference(workspace, tr_snr, te_snr, n_concat, iteration, model_name=None, visualize=False, force=False):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: iterable of training SNRs; used for the default model name and
          to read the combined scaler.
      te_snr: iterable of testing SNRs; every one of them is processed.
      n_concat: int, number of frames to concatenate, should equal to n_concat
          in the training stage.
      iteration: int, iteration of model to load.
      model_name: str or None, sub-directory of the model. Defaults to the
          training SNRs joined with '_' followed by 'ddbs'.
      visualize: bool, plot enhanced spectrogram for debug.
      force: bool, re-create enhanced wavs that already exist on disk.
    """
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    if model_name is None:
        model_name = '_'.join([str(snr) for snr in tr_snr]) + 'ddbs'

    # Load model.
    model_path = os.path.join(workspace, "models", model_name, "md_%diters.h5" % iteration)
    print('GPU available: ', tf.test.is_gpu_available())
    model = load_model(model_path)

    # Load scaler.
    scaler = read_combined_scaler(workspace, tr_snr)

    for snr in te_snr:
        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(snr))
        feat_paths = all_file_paths(feat_dir)

        for feat_path in tqdm(feat_paths, 'Inference (creating enhanced speech)'):
            # Check if the enhanced audio is already inferred.
            na = str(PurePath(feat_path).relative_to(feat_dir).with_suffix(''))
            out_path = os.path.join(workspace, "enh_wavs", "test", model_name, "%ddb" % int(snr), "%s.enh.wav" % na)
            if os.path.isfile(out_path) and not force:
                print(f'Enhanced audio {out_path} is already made')
                continue

            # Load feature.
            with open(feat_path, 'rb') as feat_file:
                data = pickle.load(feat_file)
            [mixed_cmplx_x, speech_x, noise_x, ir_mask, alpha, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data. The pad width is a frame count: `/` would yield a
            # float on Python 3, so use integer division.
            n_pad = (n_concat - 1) // 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

            # Predict.
            pred = model.predict(mixed_x_3d)

            # Inverse scale. The prediction itself is left untouched and is
            # handed to recover_wav with irr_mask=True below.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)

            # Debug plot.
            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
                # Title uses the SNR being processed; `te_snr` is the whole
                # collection and cannot be converted with int().
                axs[0].set_title("%ddb mixture log spectrogram" % int(snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                # `xrange` does not exist on Python 3.
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav.
            s = recover_wav(pred, mixed_cmplx_x, n_overlap, np.hamming, irr_mask=True)
            # Compensate the amplitude change after spectrogram and IFFT.
            s *= np.sqrt((np.hamming(n_window) ** 2).sum())

            # Write out enhanced wav.
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)
def train_noise(args):
    """Train the neural network to predict the third array of the packed data.

    Write out training stats and the model every several iterations. The
    targets are `tr_n` / `te_n`, the third array returned by
    `pp_data.load_hdf5` (presumably the noise spectrogram -- confirm).

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      lr: float, learning rate.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    feat_dir = os.path.join(workspace, "packed_features", "spectrogram")

    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(feat_dir, "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(feat_dir, "test", "%ddb" % int(te_snr), "data.h5")
    (tr_x, tr_y, tr_n) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x, te_y, te_n) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x.shape, tr_y.shape, tr_n.shape)
    print(te_x.shape, te_y.shape, te_n.shape)
    print("Load data time: %s s" % (time.time() - t1,))

    batch_size = 500
    print("%d iterations / epoch" % (tr_x.shape[0] // batch_size))

    # Scale data. Only the inputs and the `*_n` targets are scaled.
    t1 = time.time()
    scaler_path = os.path.join(feat_dir, "train", "%ddb" % int(tr_snr), "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    tr_x = pp_data.scale_on_3d(tr_x, scaler)
    tr_n = pp_data.scale_on_2d(tr_n, scaler)
    te_x = pp_data.scale_on_3d(te_x, scaler)
    te_n = pp_data.scale_on_2d(te_n, scaler)
    print("Scale data time: %s s" % (time.time() - t1,))

    # Build model.
    (_, n_concat, n_freq) = tr_x.shape

    # 1. Load the pre-model.
    model_path = os.path.join("premodel", "sednn_keras_logMag_Relu2048layer1_1outFr_7inFr_dp0.2_weights.75-0.00.hdf5")
    pre_model = load_model(model_path)

    # 2. Build the train model.
    n_hid = 2048
    main_input = Input(shape=(n_concat, n_freq), name='main_input')
    x = Flatten(input_shape=(n_concat, n_freq))(main_input)
    # 2.1 Pass the input through the pre-model.
    x = pre_model(x)
    # Three identical hidden blocks: Dense -> LeakyReLU -> Dropout.
    for _ in range(3):
        x = (Dense(n_hid))(x)
        x = LeakyReLU(alpha=0.3)(x)
        x = Dropout(0.3)(x)
    # Output layer.
    output_y = Dense(n_freq, activation='linear', name='out_y')(x)

    model = Model(inputs=main_input, outputs=output_y)
    model.compile(optimizer=Adam(lr=lr), loss='mae', metrics=['accuracy'])
    model.summary()

    # Data generators.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)

    # Directories for saving models and training stats.
    model_dir = os.path.join(workspace, "models", "%ddb_n" % int(tr_snr))
    pp_data.create_folder(model_dir)
    stats_dir = os.path.join(workspace, "training_stats", "%ddb_n" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    def _log_stats(iteration):
        # Evaluate on train/test data, print the losses and pickle them.
        tr_loss = eval(model, eval_tr_gen, tr_x, tr_n)
        te_loss = eval(model, eval_te_gen, te_x, te_n)
        print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iteration, tr_loss, te_loss))
        stat_dict = {'iter': iteration,
                     'tr_loss': tr_loss,
                     'te_loss': te_loss, }
        stat_path = os.path.join(stats_dir, "%diters.p" % iteration)
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Loss before training.
    iteration = 0
    _log_stats(iteration)

    # Train.
    t1 = time.time()
    for (batch_x, batch_n) in tr_gen.generate(xs=[tr_x], ys=[tr_n]):
        model.train_on_batch(batch_x, batch_n)
        iteration += 1

        # Validate and save training stats.
        if iteration % 100 == 0:
            _log_stats(iteration)

        # Save model.
        if iteration % 1000 == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % iteration)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iteration == 3001:
            break

    # Final evaluation; the values correspond to model.metrics_names.
    resultz = model.evaluate(tr_x, tr_n)
    print("\nTrain Acc:")
    print(resultz)
    resultz = model.evaluate(te_x, te_n)
    print("\nTest Acc:")
    print(resultz)
    print(model.metrics_names)

    print("Training time: %s s" % (time.time() - t1,))
def continue_train(args):
    """Resume training from a saved model.

    The model saved at `args.iteration` is loaded and recompiled, then trained
    further. Stats and models of the resumed run are written to a "continue"
    sub-directory and the iteration counter restarts at 0.

    Args:
      workspace: str, path of workspace.
      lr: float, learning rate.
      iteration: int, iteration of the saved model to resume from.
    """
    workspace = args.workspace
    lr = args.lr
    start_iteration = args.iteration
    data_type = "IRM"
    is_dm = (data_type == "DM")
    # Sub-directory that holds the data and the saved model for this data type.
    data_name = "mixdb" if is_dm else "mask_mixdb"
    feat_dir = os.path.join(workspace, "packed_features", "spectrogram")

    # Load model.
    model_path = os.path.join(workspace, "models", data_name, "md_%diters.h5" % start_iteration)
    model = load_model(model_path)
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr, beta_1=0.2))

    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(feat_dir, "train", data_name, "data.h5")
    te_hdf5_path = os.path.join(feat_dir, "test", data_name, "data.h5")
    (tr_x, tr_y) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x, te_y) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x.shape, tr_y.shape)
    print(te_x.shape, te_y.shape)
    print("Load data time: %s s" % (time.time() - t1, ))

    batch_size = 2048
    print("%d iterations / epoch" % (tr_x.shape[0] // batch_size))

    # Scale data. The scaler always comes from the "mixdb" training set; the
    # targets are scaled for the "DM" data type only.
    t1 = time.time()
    scaler_path = os.path.join(feat_dir, "train", "mixdb", "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    tr_x = pp_data.scale_on_3d(tr_x, scaler)
    te_x = pp_data.scale_on_3d(te_x, scaler)
    if is_dm:
        tr_y = pp_data.scale_on_2d(tr_y, scaler)
        te_y = pp_data.scale_on_2d(te_y, scaler)
    print("Scale data time: %s s" % (time.time() - t1, ))

    # Data generators.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)

    # Directories for saving models and training stats.
    out_name = "chinese_mixdb" if is_dm else "mask_mixdb"
    model_dir = os.path.join(workspace, "models", out_name, "continue")
    stats_dir = os.path.join(workspace, "training_stats", out_name, "continue")
    pp_data.create_folder(model_dir)
    pp_data.create_folder(stats_dir)

    def _log_stats(iteration):
        # Evaluate on train/test data, print the losses and pickle them.
        tr_loss = eval(model, eval_tr_gen, tr_x, tr_y)
        te_loss = eval(model, eval_te_gen, te_x, te_y)
        print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iteration, tr_loss, te_loss))
        stat_dict = {'iter': iteration,
                     'tr_loss': tr_loss,
                     'te_loss': te_loss, }
        stat_path = os.path.join(stats_dir, "%diters.p" % iteration)
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Loss before training. The counter of the resumed run starts at 0.
    iteration = 0
    _log_stats(iteration)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
        model.train_on_batch(batch_x, batch_y)
        iteration += 1

        # Validate and save training stats.
        if iteration % 500 == 0:
            _log_stats(iteration)

        # Save model.
        if iteration % 5000 == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % iteration)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iteration == 100001:
            break

    print("Training time: %s s" % (time.time() - t1, ))
def continue_train_tfrecord():
    """Resume training from a saved model, reading training batches from tfrecords.

    Workspace, learning rate, start iteration and data type are hard-coded.
    Only the test loss is evaluated; the reported training loss is always 0.
    The iteration counter continues from the loaded model's iteration.
    """
    workspace = "workspace"
    lr = 1e-5
    iteration = 220000
    data_type = "IRM"
    is_dm = (data_type == "DM")
    # Names of the model directory and of the data directory per data type.
    model_name = "elu_mixdb" if is_dm else "mask_mixdb"
    data_name = "mixdb" if is_dm else "mask_mixdb"
    feat_dir = os.path.join(workspace, "packed_features", "spectrogram")

    # Load model.
    model_path = os.path.join(workspace, "models", model_name, "md_%diters.h5" % iteration)
    model = load_model(model_path)
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr, beta_1=0.2))

    # Load data: every file of the tfrecord directory is a training shard.
    tr_hdf5_dir = os.path.join(workspace, "tfrecords", "train", data_name)
    tr_hdf5_names = os.listdir(tr_hdf5_dir)
    tr_path_list = [os.path.join(tr_hdf5_dir, i) for i in tr_hdf5_names]
    te_hdf5_path = os.path.join(feat_dir, "test", data_name, "data.h5")
    (te_x, te_y) = pp_data.load_hdf5(te_hdf5_path)

    # Scale data. The scaler always comes from the "mixdb" training set.
    t1 = time.time()
    scaler_path = os.path.join(feat_dir, "train", "mixdb", "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    te_x = pp_data.scale_on_3d(te_x, scaler)
    if is_dm:
        # Only the test targets exist in memory here. The previous code also
        # scaled `tr_y1`, which is never defined and raised a NameError.
        te_y = pp_data.scale_on_2d(te_y, scaler)
    print("Scale data time: %s s" % (time.time() - t1, ))

    # Directories for saving models and training stats.
    model_dir = os.path.join(workspace, "models", model_name, "continue")
    stats_dir = os.path.join(workspace, "training_stats", model_name, "continue")
    pp_data.create_folder(model_dir)
    pp_data.create_folder(stats_dir)

    batch_size = 1024 * 4
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)
    # The training loss is not evaluated in this variant.
    tr_loss = 0

    def _log_stats(step):
        # Evaluate on test data, print the losses and pickle them.
        te_loss = eval(model, eval_te_gen, te_x, te_y)
        print("Iteration: %d, tr_loss: %f, te_loss: %f" % (step, tr_loss, te_loss))
        stat_dict = {'iter': step,
                     'tr_loss': tr_loss,
                     'te_loss': te_loss, }
        stat_path = os.path.join(stats_dir, "%diters.p" % step)
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Loss before training.
    _log_stats(iteration)

    # Train. The session is closed when the loop ends or raises.
    with tf.Session() as sess:
        x, y = load_tfrecord(batch=batch_size, repeat=100000, data_path=tr_path_list)
        t1 = time.time()
        for count in range(1000000000):
            [tr_x, tr_y] = sess.run([x, y])
            model.train_on_batch(tr_x, tr_y)
            iteration += 1

            # Validate and save training stats.
            if iteration % 1000 == 0:
                _log_stats(iteration)

            # Save model.
            if iteration % 5000 == 0:
                model_path = os.path.join(model_dir, "md_%diters.h5" % iteration)
                model.save(model_path)
                print("Saved model to %s" % model_path)

            # NOTE(review): the counter starts at 220000, so this condition can
            # never be met and the loop only ends when `range` or the tfrecord
            # input is exhausted. Confirm the intended stop point.
            if iteration == 100001:
                break

    print("Training time: %s s" % (time.time() - t1, ))
# Load every test HDF5 file and stack the pieces along axis 0.
# NOTE(review): `h5_test_list`, `h5_train_list` and `t1` are defined outside
# this fragment -- verify against the enclosing code.
te_x = []
te_y = []
for i in h5_test_list:
    te_x_t, te_y_t = pp.load_hdf5(os.path.join(conf1.data_test_dir, i))
    te_x.append(te_x_t)
    te_y.append(te_y_t)
te_x = np.concatenate(te_x, axis=0)
te_y = np.concatenate(te_y, axis=0)
# Scale the test data with the scaler stored under the 'test' feature folder.
# NOTE(review): the file object passed to pickle.load is never closed.
scaler = pickle.load(
    open(os.path.join(conf1.packed_feature_dir, 'test', 'scaler.p'), 'rb'))
te_x = pp.scale_on_3d(te_x, scaler)
te_y = pp.scale_on_2d(te_y, scaler)
# Both timings below are measured from the same start time `t1`.
print("Scale data time: %s s" % (time.time() - t1, ))
print("Load data time: %s s" % (time.time() - t1, ))
# conf.batch_size = 512
# print("%d iterations / epoch" % int(tr_x.shape[0] / conf1.batch_size))
# Only the first training file is loaded here.
tr_x, tr_y = pp.load_hdf5(os.path.join(conf1.data_train_dir, h5_train_list[0]))
# Debug plot.
# if False:
#     plt.matshow(tr_x[0: 1000, 0, :].T, origin='lower', aspect='auto', cmap='jet')
#     plt.show()
#     pause
def predict_folder(input_file_folder: object, output_file_folder: object) -> object:
    """Enhance every file of a folder whose name starts with "mix".

    The model saved at `conf1.iterations` and the scaler of the "test" packed
    features are loaded once. Each enhanced waveform is written to
    `output_file_folder` as "enh_<original name>".

    Args:
      input_file_folder: path of the folder holding the mixture audio files.
      output_file_folder: path of the folder receiving the enhanced files.

    Returns:
      Tuple `(mixed_all, pred_all)`: per file, the complex spectrogram of the
      mixture and the result of `real_to_complex(pred, mixed_complex)`.
    """
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler; the context manager closes the file after reading.
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Load test data.
    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]

    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')
        mixed_x = np.abs(mixed_complex)

        # Process data. The pad width is a frame count: `/` yields a float on
        # Python 3, so use integer division.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)

        # Scale data.
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        # Compensate the amplitude change after spectrogram and IFFT.
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
def train(args):
    """Train the neural network. Write out model every several iterations.

    Training runs for 10001 iterations with a batch size of 500. Train and
    test losses are evaluated and pickled every 1000 iterations, and the
    model is saved every 5000 iterations.

    Args:
      args: namespace providing
        workspace: str, path of workspace.
        tr_snr: float, training SNR.
        te_snr: float, testing SNR.
        lr: float, learning rate.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr

    # Load data.
    t1 = time.time()
    feature_root = os.path.join(workspace, "packed_features", "spectrogram")
    tr_hdf5_path = os.path.join(feature_root, "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(feature_root, "test", "%ddb" % int(te_snr), "data.h5")
    (tr_x, tr_y) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x, te_y) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x.shape, tr_y.shape)
    print(te_x.shape, te_y.shape)
    print("Load data time: %s s" % (time.time() - t1, ))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x.shape[0] / batch_size))

    # Scale data with the scaler stored next to the training features.
    t1 = time.time()
    scaler_path = os.path.join(feature_root, "train", "%ddb" % int(tr_snr), "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    tr_x = pp_data.scale_on_3d(tr_x, scaler)
    tr_y = pp_data.scale_on_2d(tr_y, scaler)
    te_x = pp_data.scale_on_3d(te_x, scaler)
    te_y = pp_data.scale_on_2d(te_y, scaler)
    print("Scale data time: %s s" % (time.time() - t1, ))

    # Build model
    (_, n_concat, n_freq) = tr_x.shape
    n_hid = 2048

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    for _ in range(3):
        model.add(Dense(n_hid, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense(n_freq, activation='linear'))
    model.summary()

    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)

    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    def evaluate_and_log(step):
        """Evaluate train/test loss at `step`, print it and pickle it."""
        # NOTE: `eval` is the project's evaluation function, not the builtin.
        tr_loss = eval(model, eval_tr_gen, tr_x, tr_y)
        te_loss = eval(model, eval_te_gen, te_x, te_y)
        print("Iteration: %d, tr_loss: %f, te_loss: %f" % (step, tr_loss, te_loss))

        # Save out training stats.
        stat_dict = {'iter': step,
                     'tr_loss': tr_loss,
                     'te_loss': te_loss, }
        stat_path = os.path.join(stats_dir, "%diters.p" % step)
        # Close the file explicitly so the stats are flushed to disk.
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Print loss before training.
    step = 0
    evaluate_and_log(step)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
        model.train_on_batch(batch_x, batch_y)
        step += 1

        # Validate and save training stats.
        if step % 1000 == 0:
            evaluate_and_log(step)

        # Save model.
        if step % 5000 == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % step)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if step == 10001:
            break

    print("Training time: %s s" % (time.time() - t1, ))
def train(args):
    """Train the TensorFlow DNN. Write out checkpoints every several iterations.

    Only the training set is used; the validation set loading is currently
    disabled. Training loss is evaluated and pickled every 1000 iterations and
    a checkpoint is written every 5000 iterations.

    Args:
      args: namespace providing
        workspace: str, path of workspace.
        model_name: str, name used for the model and stats directories and
          for the checkpoint file.
        lr: float, learning rate.
        tr_dir_name: str, sub-directory of the packed training features.
        iteration: int, training stops once iteration + 1 batches were run.
        dropout: dropout setting forwarded to DNN(dropouts=...).
    """
    print(args)
    workspace = args.workspace
    model_name = args.model_name
    lr = args.lr
    tr_dir_name = args.tr_dir_name
    iter_training = args.iteration
    dropout = args.dropout

    # Load data.
    t1 = time.time()
    tr_feature_dir = os.path.join(workspace, "packed_features", "spectrogram", "train", tr_dir_name)
    tr_hdf5_path = os.path.join(tr_feature_dir, "data.h5")
    (tr_x, tr_y) = pp_data.load_hdf5(tr_hdf5_path)
    print(tr_x.shape, tr_y.shape)
    print("Load data time: %s s" % (time.time() - t1, ))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x.shape[0] / batch_size))

    # Scale data.
    t1 = time.time()
    scaler_path = os.path.join(tr_feature_dir, "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    tr_x = pp_data.scale_on_3d(tr_x, scaler)
    tr_y = pp_data.scale_on_2d(tr_y, scaler)
    print("Scale data time: %s s" % (time.time() - t1, ))

    # Build model
    (_, n_concat, n_freq) = tr_x.shape

    with tf.Session() as sess:
        model = DNN(sess, lr, batch_size, (n_concat, n_freq), n_freq,
                    dropouts=dropout, training=True)
        model.build()
        sess.run(tf.global_variables_initializer())
        merge_op = tf.summary.merge_all()
        # Build the Saver once: constructing one per checkpoint would add a
        # fresh set of save/restore ops to the graph every time.
        saver = tf.train.Saver()

        # Data generator.
        tr_gen = DataGenerator(batch_size=batch_size, type='train')
        eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100)

        # Directories for saving models and training stats
        model_dir = os.path.join(workspace, "models", model_name)
        pp_data.create_folder(model_dir)

        stats_dir = os.path.join(workspace, "training_stats", model_name)
        pp_data.create_folder(stats_dir)

        def evaluate_and_log(step):
            """Evaluate the training loss at `step`, print it and pickle it."""
            # NOTE: `eval` is the project's evaluation function, not the builtin.
            tr_loss = eval(sess, model, eval_tr_gen, tr_x, tr_y)
            print("Iteration: %d, tr_loss: %f" % (step, tr_loss))

            # Save out training stats.
            stat_dict = {'iter': step,
                         'tr_loss': tr_loss, }
            stat_path = os.path.join(stats_dir, "%diters.p" % step)
            with open(stat_path, 'wb') as stat_file:
                pickle.dump(stat_dict, stat_file, protocol=pickle.HIGHEST_PROTOCOL)

        # Print loss before training.
        step = 0
        evaluate_and_log(step)

        # Train.
        t1 = time.time()
        for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
            feed_dict = {model.x_noisy: batch_x, model.y_clean: batch_y}
            # The loss and merged summary are still evaluated (as before) but
            # their values are not used here.
            sess.run([model.optimizer, model.loss, merge_op], feed_dict=feed_dict)
            step += 1

            # Validate and save training stats.
            if step % 1000 == 0:
                evaluate_and_log(step)

            # Save model.
            if step % 5000 == 0:
                ckpt_file_path = os.path.join(model_dir, model_name)
                saver.save(sess, ckpt_file_path, write_meta_graph=True)
                print("Saved model to %s" % ckpt_file_path)

            if step == iter_training + 1:
                break

        print("Training time: %s s" % (time.time() - t1, ))
def train(args):
    """Train the selected Keras model and log/plot the loss curves.

    The model is trained for at most 100 epochs with batch size 150 on the
    packed "mag.h5" features, validated on the "val" split, checkpointed
    whenever the validation loss improves, and the per-epoch losses are
    written to ``<workspace>/log/out_<model_name>.csv`` and plotted to
    ``<workspace>/log/<model_name>_train_test_loss.png``.

    Args:
      args: namespace providing
        workspace: str, path of workspace.
        lr: float, initial learning rate (decayed by 0.9 per epoch).
        model_name: str, which network builder to use (see the chain below).
    """

    class MetricsHistory(Callback):
        """Append the train/validation loss of every epoch to the CSV log."""

        def on_epoch_end(self, epoch, logs=None):
            logs = logs or {}
            file_logger.write([str(epoch),
                               str(logs['loss']),
                               str(logs['val_loss'])])

    print(args)
    workspace = args.workspace
    lr = args.lr
    model_name = args.model_name

    # Load data
    t1 = time.time()
    print("Loading the train and vallidation dataset")
    tr_hdf5_path = os.path.join(workspace, "packed_features", "train", "mag.h5")
    te_hdf5_path = os.path.join(workspace, "packed_features", "val", "mag.h5")
    (tr_x, tr_y) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x, te_y) = pp_data.load_hdf5(te_hdf5_path)
    print('train_x shape:')
    print(tr_x.shape, tr_y.shape)
    print('test_x shape:')
    print(te_x.shape, te_y.shape)
    print("Load data time: %f s" % (time.time() - t1))
    print('\n')

    # Scale data
    print("Scaling train and test dataset. This will take some time, please wait patiently...")
    t1 = time.time()
    scaler_path = os.path.join(workspace, "packed_features", "train", "mag_scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    tr_x = pp_data.scale_on_3d(tr_x, scaler)
    tr_y = pp_data.scale_on_2d(tr_y, scaler)
    te_x = pp_data.scale_on_3d(te_x, scaler)
    te_y = pp_data.scale_on_2d(te_y, scaler)
    print("Scale data time: %f s" % (time.time() - t1))

    # Build model
    batch_size = 150
    epoch = 100
    print("The neural networks you have chosed is %s" % model_name)
    print("The training batch is set to %d and the %s will be training for at most %d epoches"
          % (batch_size, model_name.upper(), epoch))
    print("======iteration of one epoch======" )
    iter_each_epoch = int(tr_x.shape[0] / batch_size)
    print("There are %d iterations / epoch" % iter_each_epoch)

    log_save_dir = os.path.join(workspace, 'log')
    if not os.path.isdir(log_save_dir):
        os.makedirs(log_save_dir)
    log_path = os.path.join(log_save_dir, 'out_{}.csv'.format(model_name))
    file_logger = FileLogger(log_path, ['epoch', 'train_loss', 'val_loss'])

    (_, n_concat, n_freq) = tr_x.shape
    n_hid = 2048

    print('Model selected:', model_name.lower())
    # Start from None so that an unknown name reaches the explicit check
    # below instead of failing with UnboundLocalError.
    model = None
    if model_name == 'dnn':
        model = dnn(n_hid, n_concat, n_freq)
    elif model_name == 'sdnn1':
        model = sdnn1(n_hid, n_concat, n_freq)
    elif model_name == 'sdnn2':
        model = sdnn2(n_hid, n_concat, n_freq)
    elif model_name == 'sdnn3':
        model = sdnn3(n_hid, n_concat, n_freq)
    elif model_name == 'fcn':
        model = fcn(n_concat, n_freq)
    elif model_name == 'fcn1':
        model = fcn1(n_concat, n_freq)
    elif model_name == 'fcn1_re':
        # This branch used to test 'fcn1' a second time and was therefore
        # unreachable. NOTE(review): 'fcn1_re' is assumed to be the intended
        # name - confirm.
        model = fcn1_re(n_concat, n_freq)
    elif model_name == 'fcn2':
        model = fcn2(n_concat, n_freq)
    elif model_name == 'fcn3':
        model = fcn3(n_concat, n_freq)
    elif model_name == 'fcn4':
        model = fcn4(n_concat, n_freq)
    elif model_name == 'm_vgg':
        model = m_vgg(n_concat, n_freq)
    elif model_name == 'm_vgg1':
        model = m_vgg1(n_concat, n_freq)
    elif model_name == 'm_vgg2':
        model = m_vgg2(n_concat, n_freq)
    elif model_name == 'm_vgg3':
        model = m_vgg3(n_concat, n_freq)
    elif model_name == 'm_vgg4':
        # NOTE(review): builds m_vgg3, not an m_vgg4 network - looks like a
        # copy/paste slip; left unchanged because no m_vgg4 builder is
        # visible here. Confirm.
        model = m_vgg3(n_concat, n_freq)
    elif model_name == 'CapsNet':
        model = CapsNet(n_concat, n_freq, 3)
    elif model_name == 'brnn':
        recur_layers = 7
        unit = 256
        output_dim = n_freq
        model = brnn(n_concat, n_freq, unit, recur_layers, output_dim)
    elif model_name == 'rnn':
        output_dim = n_freq
        model = rnn(n_concat, n_freq, output_dim)
    elif model_name == 'tcn':
        input_dim = n_freq
        model = tcn(n_concat, input_dim)

    if model is None:
        exit('Please choose a valid model: [dnn, sdnn, sdnn1, cnn, scnn1]')

    model.compile(loss='mean_squared_error', optimizer=Adam(lr=lr))
    print(model.summary())

    # Save model and weights
    model_save_dir = os.path.join(workspace, 'saved_models', "%s" % model_name)
    model_save_name = "weights-checkpoint-{epoch:02d}-{val_loss:.2f}.h5"
    if not os.path.isdir(model_save_dir):
        os.makedirs(model_save_dir)
    model_path = os.path.join(model_save_dir, model_save_name)
    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    print('Saved trained model at %s' % model_save_dir)

    # Exponential decay: lr * 0.9 ** epoch_index.
    lr_decay = LearningRateScheduler(schedule=lambda epoch_idx: lr * (0.9 ** epoch_idx))
    metrics_history = MetricsHistory()

    try:
        hist = model.fit(x=tr_x, y=tr_y,
                         batch_size=batch_size,
                         epochs=epoch,
                         verbose=1,
                         shuffle=True,
                         validation_data=(te_x, te_y),
                         callbacks=[metrics_history, checkpoint, lr_decay])
        print(hist.history.keys())

        # summarize history for loss
        model_png = "train_test_loss"
        loss_fig_dir = os.path.join(log_save_dir, '%s_%s.png' % (model_name, model_png))
        plt.plot(hist.history['loss'])
        plt.plot(hist.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper right')
        plt.savefig(loss_fig_dir)
    finally:
        # Make sure the CSV log is closed even if training or plotting fails.
        file_logger.close()

    # t1 was reset when scaling started, so this includes the scaling time.
    print("Training time: %s s" % (time.time() - t1,))
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    The checkpoint with the highest epoch number found under
    ``<workspace>/saved_models/<model_name>`` is loaded, every pickled feature
    file in ``<workspace>/features/test`` is enhanced, and the result is
    written to ``<workspace>/enh_flipphase/test/<model_name>/``.

    Args:
      args: namespace providing
        workspace: str, path of workspace.
        n_concat: int, number of frames to concatenate, should equal
          n_concat in the training stage.
        TF: str, feature type; only "spectrogram" is accepted here.
        model_name: str, name of the sub-directory holding the checkpoints.

    Raises:
      ValueError: if no saved model is found.
      Exception: if TF is not "spectrogram" (raised while processing the
        first utterance).
    """
    print(args)
    workspace = args.workspace
    n_concat = args.n_concat
    TF = args.TF
    model_name = args.model_name
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load model
    t1 = time.time()
    mag_model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    mag_model_files = find_models(mag_model_root)
    if not mag_model_files:
        raise ValueError("No saved model found in %s" % mag_model_root)

    def checkpoint_epoch(path):
        # File names look like "weights-checkpoint-<epoch>-<val_loss>.h5",
        # so the third dash-separated field is the epoch number.
        return int(os.path.basename(path).split('-')[2])

    # Pick the checkpoint of the latest epoch.
    mag_model_path = max(mag_model_files, key=checkpoint_epoch)
    print("The selected model path is %s :" % mag_model_path)
    mag_model = load_model(mag_model_path)

    # Load scaler
    mag_scaler_path = os.path.join(workspace, "packed_features", "train", "mag_scaler.p")
    with open(mag_scaler_path, 'rb') as scaler_file:
        mag_scaler = pickle.load(scaler_file)

    # Floor division keeps the pad width an integer on Python 3 as well.
    n_pad = (n_concat - 1) // 2

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_cmplx_x] = data

        if TF != "spectrogram":
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Process data.
        mixed_x = np.abs(mixed_cmplx_x)
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, mag_scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        t2 = time.time()
        mag_pred = mag_model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))

        # Inverse scale.
        if scale:
            mag_pred = pp_data.inverse_scale_on_2d(mag_pred, mag_scaler)

        # Recover enhanced wav: undo the 10*log10 compression
        # (presumably the inverse of pp_data.log_sp - TODO confirm).
        pred_sp = (10 ** (mag_pred / 10)) - 1e-10
        s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_flipphase", "test", "%s" % model_name,
                                "{}_fft_dnn_map.wav".format(na.split('.')[0]))
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))
def inference1111(args):
    """Inference all test data, write out recovered wavs to disk.

    The checkpoint with the highest epoch number found under
    ``<workspace>/saved_models/<model_name>`` is loaded, every pickled feature
    file in ``<workspace>/features/test`` is enhanced, and the result is
    written to ``<workspace>/enh_wavs/test/<model_name>/<name>.wav``.

    Args:
      args: namespace providing
        workspace: str, path of workspace.
        n_concat: int, number of frames to concatenate, should equal
          n_concat in the training stage.
        TF: str, one of "spectrogram", "timedomain" or "fftmagnitude".
        model_name: str, name of the sub-directory holding the checkpoints.
        visualize: bool, plot enhanced spectrogram for debug.
        te_snr: float, optional; testing SNR, only used in plot titles.

    Raises:
      ValueError: if no saved model is found.
      Exception: if TF is not one of the supported values, or if visualize
        is set while TF is "fftmagnitude".
    """
    print(args)
    workspace = args.workspace
    n_concat = args.n_concat
    TF = args.TF
    model_name = args.model_name
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True
    # The plot titles mention the test SNR; read it defensively because the
    # argument is optional for this entry point (it used to be an undefined
    # name, which made --visualize crash with NameError).
    te_snr = getattr(args, "te_snr", None)

    # Load model
    t1 = time.time()
    model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    model_files = find_models(model_root)
    if not model_files:
        raise ValueError("No saved model found in %s" % model_root)

    def checkpoint_epoch(path):
        # File names look like "weights-checkpoint-<epoch>-<val_loss>.h5".
        return int(os.path.basename(path).split('-')[2])

    # Pick the checkpoint of the latest epoch.
    model_path = max(model_files, key=checkpoint_epoch)
    print("The selected model path is %s :" % model_path)
    model = load_model(model_path)

    # Load scaler
    scaler_path = os.path.join(workspace, "packed_features", "train", "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Floor division keeps the pad width an integer on Python 3 as well.
    n_pad = (n_concat - 1) // 2

    # (title suffix, output file name) of the debug figure per feature type.
    debug_plots = {
        "spectrogram": ("log spectrogram", 'debug_model_spectra.png'),
        "timedomain": ("time domain", 'debug model_time.png'),
    }

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature. The utterance name stored in the file replaces the
        # directory entry name from here on.
        feat_path = os.path.join(feat_dir, na)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_x, na] = data

        # Process data.
        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)
        elif TF == "timedomain":
            mixed_x = pp_data.pad_with_border(mixed_cmplx_x, n_pad)
        elif TF == "fftmagnitude":
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        t2 = time.time()
        pred = model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            if TF not in debug_plots:
                raise Exception("TF must be spectrogram or timedomain!")
            (suffix, fig_name) = debug_plots[TF]
            if te_snr is None:
                mixture_title = "mixture %s" % suffix
            else:
                mixture_title = "%ddb mixture %s" % (int(te_snr), suffix)

            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title(mixture_title)
            axs[1].set_title("Clean speech %s" % suffix)
            axs[2].set_title("Enhanced speech %s" % suffix)
            # range() instead of the Python-2-only xrange().
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.savefig(fig_name)
            plt.show()

        # Recover enhanced wav.
        if TF == "spectrogram":
            # Undo the 20*log10 compression
            # (presumably the inverse of pp_data.log_sp - TODO confirm).
            pred_sp = (10 ** (pred / 20)) - 1e-10
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')
        elif TF == "timedomain":
            s = time_recover_wav(pred, n_window, n_hop, 'hamming')
        elif TF == "fftmagnitude":
            s = spectra_to_wav(pred, mixed_cmplx_x, n_window, n_hop, 'hamming')
        else:
            raise Exception("TF must be spectrogram timedomain or fftmagnitude!")

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "%s" % model_name, "%s.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))
def train(args):
    """Train the two-stage adaptive network. Write out model every several iterations.

    Losses are evaluated and pickled every 100 iterations, and the model is
    saved about 20 times over the run (every ``iteration // 20`` iterations).

    Args:
      args: namespace providing
        workspace: str, path of workspace.
        tr_snr: float, training SNR.
        te_snr: float, testing SNR.
        lr: float, learning rate.
        iter: int, training stops once iter + 1 batches were run.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    iteration = args.iter

    def load_pickle(path):
        """Unpickle `path`, closing the file afterwards."""
        with open(path, 'rb') as pickle_file:
            return cPickle.load(pickle_file)

    # Load data.
    t1 = time.time()
    feature_root = os.path.join(workspace, "packed_features", "spectrogram")
    tr_hdf5_path = os.path.join(feature_root, "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(feature_root, "test", "%ddb" % int(te_snr), "data.h5")

    adapt_root = os.path.join(workspace, "adaptive_utterance")
    tr_adapt_utt = load_pickle(os.path.join(adapt_root, "train", "adaptive_utterance_spec.p"))
    te_adapt_utt = load_pickle(os.path.join(adapt_root, "test", "adaptive_utterance_spec.p"))
    tr_adapt_utt_len = load_pickle(os.path.join(adapt_root, "train", "adaptive_utterance_max_len.p"))
    te_adapt_utt_len = load_pickle(os.path.join(adapt_root, "test", "adaptive_utterance_max_len.p"))
    max_len = max(tr_adapt_utt_len, te_adapt_utt_len)

    (tr_x1, tr_x2, tr_y1, tr_y2, tr_name) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x1, te_x2, te_y1, te_y2, te_name) = pp_data.load_hdf5(te_hdf5_path)

    print(tr_x1.shape, tr_y1.shape, tr_x2.shape, tr_y2.shape)
    print(te_x1.shape, te_y1.shape, te_x2.shape, te_y2.shape)
    print("Load data time: %s s" % (time.time() - t1,))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x1.shape[0] / batch_size))

    # Scale data. Currently switched off, exactly as in the original code.
    scale_data = False
    if scale_data:
        t1 = time.time()
        scaler_path = os.path.join(feature_root, "train", "%ddb" % int(tr_snr), "scaler.p")
        with open(scaler_path, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)
        tr_x1 = pp_data.scale_on_3d(tr_x1, scaler)
        tr_y1 = pp_data.scale_on_2d(tr_y1, scaler)
        te_x1 = pp_data.scale_on_3d(te_x1, scaler)
        te_y1 = pp_data.scale_on_2d(te_y1, scaler)
        tr_x2 = pp_data.scale_on_2d(tr_x2, scaler)
        tr_y2 = pp_data.scale_on_2d(tr_y2, scaler)
        te_x2 = pp_data.scale_on_2d(te_x2, scaler)
        te_y2 = pp_data.scale_on_2d(te_y2, scaler)
        print("Scale data time: %s s" % (time.time() - t1,))

    # Build model
    (_, n_concat, _) = tr_x1.shape
    n_hid = 2048
    input_dim1 = (257 + 40 + 30) * 2
    input_dim2 = (257 + 40 + 30)
    out_dim1 = (257 + 40 + 30) * 2
    out_dim1_irm = 257 + 40 + 64
    out_dim2 = (257 + 40 + 30)
    out_dim2_irm = (257 + 40 + 64)
    num_factorize = 30

    def multiplication(pair_tensors):
        '''
        :param pair_tensors: x: (num_factorize,)
                             y: (num_factorize, n_hid)
        :return: (n_hid,) sum(x[i]*y[i,:],axis=1)
        '''
        x, y = pair_tensors
        return K.sum(tf.multiply(y, K.expand_dims(x, -1)), axis=1)

    # Adaptation branch: maps the adaptation utterance to mixing weights.
    adapt_input = Input(shape=(None,), name='adapt_input')
    layer = Reshape((-1, 257), name='reshape')(adapt_input)
    layer = Dense(512, activation='relu', name='adapt_dense1')(layer)
    layer = Dense(512, activation='relu', name='adapt_dense2')(layer)
    layer = Dense(num_factorize, activation='softmax', name='adapt_out')(layer)
    alpha = Lambda(lambda x: K.sum(x, axis=1), output_shape=(num_factorize,),
                   name='sequence_sum')(layer)

    # First stage: factorized layer weighted by alpha.
    input1 = Input(shape=(n_concat, input_dim1), name='input1')
    layer = Flatten(name='flatten')(input1)
    layer = Dense(n_hid * num_factorize, name='dense0')(layer)
    layer = Reshape((num_factorize, n_hid), name='reshape2')(layer)
    layer = Lambda(multiplication, name='multiply')([alpha, layer])
    layer = Dense(n_hid, activation='relu', name='dense1')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense2')(layer)
    layer = Dropout(0.2)(layer)
    partial_out1 = Dense(out_dim1, name='1_out_linear')(layer)
    partial_out1_irm = Dense(out_dim1_irm, name='1_out_irm', activation='sigmoid')(layer)
    out1 = concatenate([partial_out1, partial_out1_irm], name='out1')

    # Second stage: consumes input2 together with the first-stage output.
    input2 = Input(shape=(input_dim2,), name='input2')
    layer = concatenate([input2, out1], name='merge')
    layer = Dense(n_hid, activation='relu', name='dense3')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense4')(layer)
    layer = Dropout(0.2)(layer)
    partial_out2 = Dense(out_dim2, name='2_out_linear')(layer)
    partial_out2_irm = Dense(out_dim2_irm, name='2_out_irm', activation='sigmoid')(layer)
    out2 = concatenate([partial_out2, partial_out2_irm], name='out2')

    model = Model(inputs=[input1, input2, adapt_input], outputs=[out1, out2])
    model.summary()
    sys.stdout.flush()
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr, epsilon=1e-03))

    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train', max_len=max_len)
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100, max_len=max_len)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100, max_len=max_len)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)

    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    def evaluate_and_log(step):
        """Evaluate train/test loss at `step`, print it and pickle it."""
        # NOTE: `eval` is the project's evaluation function, not the builtin.
        tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2, tr_name, tr_adapt_utt)
        te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2, te_name, te_adapt_utt)
        print("Iteration: %d, tr_loss: %f, te_loss: %f" % (step, tr_loss, te_loss))
        sys.stdout.flush()

        # Save out training stats.
        stat_dict = {'iter': step,
                     'tr_loss': tr_loss,
                     'te_loss': te_loss, }
        stat_path = os.path.join(stats_dir, "%diters.p" % step)
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Save about 20 checkpoints per run. Floor division plus the lower bound
    # of 1 avoids a zero modulus for short runs and a fractional interval
    # (which would never match) on Python 3.
    save_every = max(1, iteration // 20)

    # Print loss before training.
    step = 0
    evaluate_and_log(step)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate([tr_x1, tr_x2, tr_name], [tr_y1, tr_y2], tr_adapt_utt):
        model.train_on_batch(batch_x, batch_y)
        step += 1

        # Validate and save training stats.
        if step % 100 == 0:
            evaluate_and_log(step)

        # Save model.
        if step % save_every == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % step)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if step == iteration + 1:
            break

    print("Training time: %s s" % (time.time() - t1,))
def _build_enhancement_dnn(n_concat, n_freq, n_hid, lr, linear_output):
    """Build and compile the fully connected enhancement network.

    The layer sequence is: Flatten, Dropout(0.1), then three stacks of three
    ReLU Dense layers each followed by Dropout(0.2); only the first stack has
    a BatchNormalization layer before its Dropout.

    Args:
      n_concat: int, number of context frames per input segment.
      n_freq: int, number of frequency bins per frame (also the output size).
      n_hid: int, width of every hidden Dense layer.
      lr: float, learning rate handed to Adam.
      linear_output: bool, use a linear output layer when true, ReLU otherwise.

    Returns:
      The compiled Keras model.
    """
    # NOTE(review): keep the layer order unchanged; saved weights are loaded
    # into this architecture with load_weights() - presumably matched by
    # topology, confirm against the Keras version in use.
    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    for stack_idx in range(3):
        for _ in range(3):
            model.add(Dense(n_hid, activation='relu'))
        if stack_idx == 0:
            model.add(BatchNormalization())
        model.add(Dropout(0.2))
    model.add(Dense(n_freq, activation='linear' if linear_output else 'relu'))
    model.summary()
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))
    return model


def _run_tool(argv):
    """Run an external command given as an argument list, without a shell.

    Passing a list keeps paths containing spaces or shell metacharacters
    intact. Failures are reported but never raised, matching the best-effort
    behaviour of the os.system() calls this replaces.

    Returns:
      The command's exit status, or -1 if it could not be started.
    """
    import subprocess  # Local import: not guaranteed to be imported at file top.
    try:
        return subprocess.call(argv)
    except OSError as err:
        print("Failed to run %s: %s" % (argv[0], err))
        return -1


def _remove_quietly(path):
    """Delete `path`, reporting (not raising) if it cannot be removed."""
    try:
        os.remove(path)
    except OSError as err:
        print("Could not remove %s: %s" % (path, err))


def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    For every feature file of the test set the mixture magnitude spectrogram
    is enhanced by the DNN, converted back to a waveform and written as
    "<name>.enh.wav". The enhanced wav is then converted to 8 kHz s16le PCM
    with ./ffmpeg, passed through the external ./ns tool, and converted back
    to "<name>.ns_enh.wav"; the intermediate PCM files are deleted.

    Args:
      args: namespace with the following attributes.
        workspace: str, path of workspace.
        tr_snr: float, training SNR (selects model and scaler directories).
        te_snr: float, testing SNR (selects feature and output directories).
        iteration: int, iteration of the model to load when model_file is
            "null".
        calc_log: bool, work on scaled log spectrograms (linear output layer)
            instead of max-normalised magnitudes (ReLU output layer).
        model_file: str, explicit weights file, or "null" to derive the path
            from workspace / tr_snr / iteration.
        visualize: bool, plot mixture and enhanced spectrograms for debug.

    Note:
      args.n_concat is ignored; the network is always built for a context of
      7 frames and 257 frequency bins.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    calc_log = args.calc_log
    model_file = args.model_file
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate

    # Build model.
    n_concat = 7
    n_freq = 257
    model = _build_enhancement_dnn(n_concat, n_freq, n_hid=2048, lr=1e-3,
                                   linear_output=calc_log)

    # Load model weights.
    if model_file == "null":
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                                  "md_%diters.h5" % args.iteration)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler (only needed in the log-spectrogram mode).
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    scaler = None
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        with open(scaler_path, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)

    # Loop invariants. Floor division keeps the pad width an integer.
    n_pad = (n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    window_gain = np.sqrt((np.hamming(n_window) ** 2).sum())
    enh_dir = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr))
    ns_enh_dir = os.path.join(workspace, "ns_enh_wavs", "test",
                              "%ddb" % int(te_snr))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    for (cnt, file_name) in enumerate(os.listdir(feat_dir)):
        # Load feature. The name stored in the file is used for the outputs.
        feat_path = os.path.join(feat_dir, file_name)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_x, _noise_x, _alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process and scale data.
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max
            print("max of speech_x:", np.max(speech_x))

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            pred = pred * mixed_x_max

        # Debug plot (the middle panel is titled but left empty).
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for ax in axs:
                ax.xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred) if calc_log else pred
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= window_gain

        # Write out enhanced wav.
        out_path = os.path.join(enh_dir, "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

        # Write out enhanced pcm 8K pcm_s16le.
        out_pcm_path = os.path.join(enh_dir, "%s.enh.pcm" % na)
        _run_tool(["./ffmpeg", "-y", "-i", out_path,
                   "-f", "s16le", "-ar", "8000", "-ac", "1",
                   "-acodec", "pcm_s16le", out_pcm_path])

        # Write out webrtc-denoised enhanced pcm 8K pcm_s16le.
        ns_out_pcm_path = os.path.join(ns_enh_dir, "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(ns_enh_dir, "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        _run_tool(["./ns", out_pcm_path, ns_out_pcm_path])
        _run_tool(["./ffmpeg", "-y", "-f", "s16le", "-ar", "8000", "-ac", "1",
                   "-acodec", "pcm_s16le", "-i", ns_out_pcm_path,
                   ns_out_wav_path])

        # Drop the intermediate raw PCM files.
        _remove_quietly(out_pcm_path)
        _remove_quietly(ns_out_pcm_path)
def train(args):
    """Train the neural network. Write out model every several iterations.

    The network has two chained stages: stage one maps the flattened
    (n_concat, input_dim1) input to `out1` (a linear part concatenated with a
    sigmoid part); stage two takes `input2` concatenated with `out1` and
    produces `out2` in the same linear + sigmoid layout. Both outputs are
    trained with mean absolute error.

    Training and test losses are evaluated before training and then every 100
    iterations, and pickled to <workspace>/training_stats/<tr_snr>db/. The
    model is saved to <workspace>/models/<tr_snr>db/ every `iter // 20`
    iterations (at least every iteration).

    Args:
      args: namespace with the following attributes.
        workspace: str, path of workspace.
        tr_snr: float, training SNR.
        te_snr: float, testing SNR.
        lr: float, learning rate.
        iter: int, number of training iterations; the loop stops once
            `iter + 1` batches have been processed.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    iteration = args.iter

    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                                "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                                "test", "%ddb" % int(te_snr), "data.h5")
    (tr_x1, tr_x2, tr_y1, tr_y2) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x1, te_x2, te_y1, te_y2) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x1.shape, tr_y1.shape, tr_x2.shape, tr_y2.shape)
    print(te_x1.shape, te_y1.shape, te_x2.shape, te_y2.shape)
    print("Load data time: %s s" % (time.time() - t1, ))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x1.shape[0] / batch_size))

    # NOTE: the packed features are fed to the network as loaded; the
    # scaler-based normalisation branch was disabled (`if not True`) in the
    # original code and has been dropped.

    # Build model. tr_x1 must be 3-D; only its context length is used.
    (_, n_concat, _) = tr_x1.shape
    n_hid = 2048
    input_dim1 = (257 + 40 + 30) * 2
    input_dim2 = (257 + 40 + 30)
    out_dim1 = (257 + 40 + 30) * 2
    out_dim1_irm = 257 + 40 + 64
    out_dim2 = (257 + 40 + 30)
    out_dim2_irm = (257 + 40 + 64)

    # Stage one.
    input1 = Input(shape=(n_concat, input_dim1), name='input1')
    layer = Flatten(name='flatten')(input1)
    layer = Dense(n_hid, activation='relu', name='dense1')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense2')(layer)
    layer = Dropout(0.2)(layer)
    partial_out1 = Dense(out_dim1, name='1_out_linear')(layer)
    partial_out1_irm = Dense(out_dim1_irm, name='1_out_irm',
                             activation='sigmoid')(layer)
    out1 = concatenate([partial_out1, partial_out1_irm], name='out1')

    # Stage two, fed with the second input and the stage-one output.
    input2 = Input(shape=(input_dim2, ), name='input2')
    layer = concatenate([input2, out1], name='merge')
    layer = Dense(n_hid, activation='relu', name='dense3')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense4')(layer)
    layer = Dropout(0.2)(layer)
    partial_out2 = Dense(out_dim2, name='2_out_linear')(layer)
    partial_out2_irm = Dense(out_dim2_irm, name='2_out_irm',
                             activation='sigmoid')(layer)
    out2 = concatenate([partial_out2, partial_out2_irm], name='out2')

    model = Model(inputs=[input1, input2], outputs=[out1, out2])
    model.summary()
    sys.stdout.flush()
    model.compile(loss='mean_absolute_error',
                  optimizer=Adam(lr=lr, epsilon=1e-03))

    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test',
                                te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test',
                                te_max_iter=100)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)
    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    def _evaluate_and_record(step):
        """Evaluate train/test loss at `step`, print it and pickle the stats."""
        # `eval` is the evaluation function defined in this file, not the builtin.
        tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2)
        te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2)
        print("Iteration: %d, tr_loss: %f, te_loss: %f"
              % (step, tr_loss, te_loss))
        sys.stdout.flush()

        # Save out training stats.
        stat_dict = {
            'iter': step,
            'tr_loss': tr_loss,
            'te_loss': te_loss,
        }
        stat_path = os.path.join(stats_dir, "%diters.p" % step)
        with open(stat_path, 'wb') as stat_file:
            cPickle.dump(stat_dict, stat_file,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    # Print loss before training.
    step = 0
    _evaluate_and_record(step)

    # Floor division plus the lower bound keeps the save interval a positive
    # integer; `iteration / 20` is 0 (modulo by zero) for fewer than 20
    # iterations under integer division and fractional under true division.
    save_every = max(1, iteration // 20)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x1, tr_x2],
                                              ys=[tr_y1, tr_y2]):
        model.train_on_batch(batch_x, batch_y)
        step += 1

        # Validate and save training stats.
        if step % 100 == 0:
            _evaluate_and_record(step)

        # Save model.
        if step % save_every == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % step)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        # Stops after `iteration + 1` batches (kept from the original loop).
        if step == iteration + 1:
            break

    print("Training time: %s s" % (time.time() - t1, ))
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Loads a fixed, previously trained model and enhances every feature file
    found in <workspace>/features/spectrogram/test/mixdb. The mode is
    hard-coded through `data_type`:

      * "IRM": the network output is multiplied element-wise with the
        `speech_x + noise_x` spectrogram to obtain the enhanced spectrogram.
      * "DM": the network output is inverse-scaled and exponentiated.

    Args:
      args: namespace with the following attributes.
        workspace: str, path of workspace.
        n_concat: int, number of frames to concatenate, should equal
            n_concat in the training stage.

    Note:
      args.tr_snr, args.te_snr and args.iteration are not used here; the
      model, scaler and data paths are fixed to the "mixdb" / "mask_mixdb"
      directories and hard-coded iteration counts.
    """
    print(args)
    workspace = args.workspace
    n_concat = args.n_concat
    data_type = 'IRM'
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    if data_type == "DM":
        model_path = os.path.join(workspace, "models", "mixdb",
                                  "md_%diters.h5" % 120000)
    else:
        model_path = os.path.join(workspace, "models", "mask_mixdb",
                                  "md_%diters.h5" % 265000)
    model = load_model(model_path)

    # Load scaler.
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "mixdb", "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Loop invariants. Floor division keeps the pad width an integer.
    n_pad = (n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    window_gain = np.sqrt((np.hamming(n_window) ** 2).sum())
    out_subdir = "mixdb" if data_type == "DM" else "mask_mixdb"

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "mixdb")
    for (cnt, file_name) in enumerate(os.listdir(feat_dir)):
        # Load feature. The name stored in the file is used for the output.
        feat_path = os.path.join(feat_dir, file_name)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_x, noise_x, _alpha, na] = data

        if data_type == "IRM":
            mixed_x = speech_x + noise_x
            # Kept as a separate array: the network input below is padded,
            # log-compressed and scaled, while the mask is applied to this one.
            mixed_x1 = speech_x + noise_x
        else:
            mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        if data_type == "IRM":
            pred_sp = pred * mixed_x1
        print(cnt, na)

        # Inverse scale.
        if data_type == "DM":
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
            pred_sp = np.exp(pred)

        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= window_gain

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", out_subdir,
                                "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)