def main():
    """Synthesize LJSpeech samples from PPG files with the TTS model.

    For each PPG path listed in ``ppgs_paths``, predicts mel/linear
    spectrograms, writes the reconstructed wavs, and dumps .npy/.png files
    into ``ljspeech_log_dir``.  Relies on module-level ``model``,
    ``ckpt_path_ljspeech``, ``ppgs_paths`` and ``ljspeech_log_dir``.
    """
    global model
    model = tts_load(model=model, ckpt_path=ckpt_path_ljspeech)
    # Fix: close the list file deterministically instead of leaking the
    # handle; also give tqdm a total so the progress bar is meaningful.
    with open(ppgs_paths, 'r') as list_file:
        ppgs_list = [line.strip() for line in list_file]
    for idx, ppg_path in tqdm(enumerate(ppgs_list), total=len(ppgs_list)):
        ppg = np.load(ppg_path)
        mel_pred, spec_pred, mel_pred_audio, spec_pred_audio = tts_predict(model, ppg)
        write_wav(os.path.join(ljspeech_log_dir, "{}_sample_mel.wav".format(idx)), mel_pred_audio)
        write_wav(os.path.join(ljspeech_log_dir, "{}_sample_spec.wav".format(idx)), spec_pred_audio)
        np.save(os.path.join(ljspeech_log_dir, "{}_sample_mel.npy".format(idx)), mel_pred)
        np.save(os.path.join(ljspeech_log_dir, "{}_sample_spec.npy".format(idx)), spec_pred)
        draw_spec(os.path.join(ljspeech_log_dir, "{}_sample_mel.png".format(idx)), mel_pred)
        draw_spec(os.path.join(ljspeech_log_dir, "{}_sample_spec.png".format(idx)), spec_pred)
def process_file(audio_path, output, spkr_to_spkr, lstnr_to_spkr, ear_to_ear):
    """
    Read stereo binaural audio file and write wav file with crosstalk 'removed'
    """
    logger.info('Loading file into memory: {}'.format(audio_path))
    y, sr = audio.load(audio_path, mono=False, sr=44100)
    left, right = y[0], y[1]
    logger.info('Computing distance from speaker to each ear')
    d1, d2, theta = compute_geometry(spkr_to_spkr, lstnr_to_spkr, ear_to_ear)
    logger.debug('d1: {}'.format(d1))
    logger.debug('d2: {}'.format(d2))
    logger.debug('theta: {}'.format(theta))
    # half the ear-to-ear distance stands in for the head radius
    headshadow = headshadow_filter_coefficients(theta, ear_to_ear / 2, sr)
    logger.debug('headshadow b: {} a: {}'.format(*headshadow))
    logger.info('Computing recursive crosstalk cancellation for left channel')
    cancel_from_left = cancel_crosstalk(left, d1, d2, headshadow, sr)
    logger.info('Computing recursive crosstalk cancellation for right channel')
    cancel_from_right = cancel_crosstalk(right, d1, d2, headshadow, sr)
    # each cancellation returns (same-side, opposite-side) components
    l_left, l_right = cancel_from_left
    r_right, r_left = cancel_from_right
    out_left = audio.sum_signals([l_left, r_left, left])
    out_right = audio.sum_signals([l_right, r_right, right])
    y = audio.channel_merge([out_left, out_right])
    logger.info('Writing output to: {}'.format(output))
    audio.write_wav(output, y, sr, norm=True)
def do_convert(logdir1, logdir2, input_path, output_path):
    """Convert the wav at ``input_path`` with the net1+net2 models.

    Restores the latest checkpoints from ``logdir1``/``logdir2`` (net1's
    ``global_step`` is skipped to avoid clashing with net2's), runs offline
    prediction and writes the converted wav to ``output_path``.

    Raises:
        ValueError: if ``input_path`` is empty.
    """
    # Fix: validate the input *before* any file access; previously the
    # check ran after librosa had already tried to open the file, and used
    # `assert`, which is stripped under -O.
    if not input_path:
        raise ValueError("must be non-empty input path")
    # Load graph
    model = Net2()
    model.actual_duration = librosa.core.get_duration(filename=input_path, sr=hp.default.sr)
    # TODO isolate out logdirs, uhh and how to pre-dl from s3?
    df = Net2DataFlow(data_path=input_path, batch_size=1)
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    ckpt2 = tf.train.latest_checkpoint(logdir2)
    session_inits = []
    # restore net2 first, then net1 without its global_step
    session_inits.append(SaverRestore(ckpt2))
    session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)
    audio, y_audio, ppgs = convert(predictor, df)
    write_wav(audio[0], hp.default.sr, output_path)
def main():
    """Project each English utterance's PPG frames onto their nearest
    Chinese frames (per the precomputed ``en_final_cn_idx`` map) and write
    both the projected and the original linear-spectrogram wavs.

    Relies on module-level paths: ``en_final_cn_idx_path``,
    ``en_raw_list_path``, ``en_raw_ppg_path``, ``en_raw_linear_dir`` and
    ``projected_wav_dir``.
    """
    # Fix: the assignments of cn_linear_l, en_final_cn_idx and en_file_list
    # were commented out while still being used below, which made the whole
    # function raise NameError.  Restore the required loads; the unused
    # `all_ppg_l = en_ppg_l + cn_ppg_l` line (also broken) is dropped.
    cn_ppg_l, cn_linear_l = for_loop_cn()  # per-frame Chinese PPGs: [cn_ppg1, cn_ppg2, ...]
    en_final_cn_idx = np.load(en_final_cn_idx_path)  # English frame -> Chinese frame index map
    en_file_list = en_text2list(file=en_raw_list_path)
    now = 0
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(f,
                                                 ppgs_dir=en_raw_ppg_path,
                                                 linears_dir=en_raw_linear_dir)
        # Global English frame ids start at 0 and keep counting across files.
        e_ppg_id = []
        for i in range(wav_ppgs.shape[0]):
            e_ppg_id.append(now)
            now += 1
        print('en id from 0:', e_ppg_id[:10])
        # map to Chinese frame ids (zero-based in the Chinese list)
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)
        # gather the projected Chinese linear frames
        c_lineas_projected = np.asarray([cn_linear_l[i] for i in c_ppg_id_projected])
        save_linear_name = f + '_cn_linear_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_name),
                  normalized_db_spec2wav(c_lineas_projected))
        save_linear_original_name = f + '_en_linear_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_original_name),
                  normalized_db_spec2wav(linears))
def main():
    """Extract MFCC/mel/linear features for an LJSpeech-style corpus.

    Reads file ids from ``meta_path`` (one id per two lines, first 6 chars),
    extracts features per wav, sanity-checks frame counts against the
    precomputed PPGs, writes reconstruction wavs plus .npy feature files,
    and records good/bad file ids.  Relies on module-level path constants
    and the open ``f_good_meta`` handle.
    """
    # This section handles datasets laid out in LJSpeech metadata format.
    with open(meta_path, 'r') as meta_f:  # fix: close the metadata file
        a = meta_f.readlines()
    # ids sit on every other line; the id is the first 6 characters
    b = []
    i = 0
    while i < len(a):
        t = a[i][0:6]
        b.append(t)
        i += 2
    print(b[:2])
    a = b
    # a = [i.strip().split('|')[0] for i in a]
    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(a):
        try:
            # extract acoustic features
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)
            # verify the extracted features line up with the existing PPGs
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            # this load occasionally fails for unknown reasons (possibly
            # transient server-side changes)
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))
            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] and mel_feats.shape[0] == spec_feats.shape[0]
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name),
                      normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name),
                      normalized_db_spec2wav(spec_feats))
            # save the acoustic features
            mfcc_save_name = os.path.join(mfcc_dir, save_name)
            mel_save_name = os.path.join(mel_dir, save_name)
            spec_save_name = os.path.join(spec_dir, save_name)
            np.save(mfcc_save_name, mfcc_feats)
            np.save(mel_save_name, mel_feats)
            np.save(spec_save_name, spec_feats)
            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the loop uninterruptible.
            bad_list.append(fname)
            bad_cnt += 1
        # print(cnt)
        # break
    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)
    return
def process_file(audio_path, output, spkr_to_spkr, lstnr_to_spkr, ear_to_ear):
    """Remove crosstalk from a stereo binaural recording.

    Loads ``audio_path`` at 44.1 kHz, derives the listening geometry and a
    head-shadow filter, cancels each channel's crosstalk recursively, mixes
    the cancellation signals back with the originals and writes ``output``.
    """
    logger.info('Loading file into memory: {}'.format(audio_path))
    stereo, rate = audio.load(audio_path, mono=False, sr=44100)
    chan_l = stereo[0]
    chan_r = stereo[1]
    logger.info('Computing distance from speaker to each ear')
    d1, d2, theta = compute_geometry(spkr_to_spkr, lstnr_to_spkr, ear_to_ear)
    logger.debug('d1: {}'.format(d1))
    logger.debug('d2: {}'.format(d2))
    logger.debug('theta: {}'.format(theta))
    headshadow = headshadow_filter_coefficients(theta, ear_to_ear / 2, rate)
    logger.debug('headshadow b: {} a: {}'.format(*headshadow))
    logger.info('Computing recursive crosstalk cancellation for left channel')
    ll, lr = cancel_crosstalk(chan_l, d1, d2, headshadow, rate)
    logger.info('Computing recursive crosstalk cancellation for right channel')
    rr, rl = cancel_crosstalk(chan_r, d1, d2, headshadow, rate)
    # mix cancellation components back into each channel, then interleave
    merged = audio.channel_merge([
        audio.sum_signals([ll, rl, chan_l]),
        audio.sum_signals([lr, rr, chan_r]),
    ])
    logger.info('Writing output to: {}'.format(output))
    audio.write_wav(output, merged, rate, norm=True)
def do_task(nthreads, audio):
    """Worker loop: trim wavs pulled from a shared iterator and re-write them.

    Args:
        nthreads: worker id, used only in the log messages.
        audio: iterator yielding ``(src_path, tar_path)`` pairs; the loop
            runs until the iterator is exhausted.
    """
    # Fix: Python-2 `print` statements and `.next()` would be SyntaxError /
    # AttributeError under Python 3; the rest of the file uses print().
    print('Thread-{} start.\n'.format(nthreads))
    try:
        while True:
            src_path, tar_path = next(audio)
            wav, sr = librosa.load(src_path)
            wav = trim_wav(wav)
            write_wav(wav, sr, tar_path)
    except StopIteration:
        print('Thread-{} done.\n'.format(nthreads))
def hrtf_file(audio_path, azimuth, elevation=0, distance=1, ear_distance=0.215, output=None):
    """Read a mono audio file and render it binaurally via HRTF.

    Writes a binaural wav to ``output`` when one is given; always returns
    the rendered signal.
    """
    logger.info('Loading signal into memory: {}'.format(audio_path))
    mono, sr = audio.load(audio_path)
    rendered = hrtf(mono, sr, azimuth, elevation, distance, ear_distance)
    if output:
        audio.write_wav(output, rendered, sr, norm=True)
    return rendered
def generate_pair_wav(spec, spec_pred, log_dir, global_step, suffix_name):
    """Dump predicted and original validation spectrograms as wav + npy.

    File names are ``step_<global_step>_<suffix_name>_predvalidation.*`` for
    the prediction and ``..._original.*`` for the ground truth.
    """
    prefix = "step_" + str(global_step) + "_" + suffix_name
    # (tag, spectrogram) pairs, prediction first to match the original order
    for tag, spectrogram in (("_predvalidation", spec_pred), ("_original", spec)):
        waveform = normalized_db_spec2wav(spectrogram)
        write_wav(os.path.join(log_dir, prefix + tag + ".wav"), waveform)
        np.save(os.path.join(log_dir, prefix + tag + ".npy"), spectrogram)
def do_convert(args, logdir1, logdir2):
    """Convert ``args.file`` with the net1+net2 models and log summaries.

    Restores net2's checkpoint, then net1's (skipping its ``global_step``),
    runs offline prediction, writes the converted wav under
    ``args.savepath`` and dumps audio/PPG summaries for TensorBoard.
    """
    # Load graph
    model = Net2()
    data = get_mfccs_and_spectrogram(args.file)
    # an explicit checkpoint name from the CLI wins over the latest checkpoint
    ckpt1 = '{}/{}'.format(logdir1, args.net1) if args.net1 else tf.train.latest_checkpoint(logdir1)
    ckpt2 = '{}/{}'.format(logdir2, args.net2) if args.net2 else tf.train.latest_checkpoint(logdir2)
    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        # net1's global_step would clash with net2's, so it is not restored
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)
    audio, y_audio, ppgs = convert(predictor, data)
    # keep the input's basename, swap its extension for .wav
    target_file = args.file.split('/')[-1]
    portion = os.path.splitext(target_file)
    # converted_file = target_file.split('.')[0] + '_converted.wav'
    converted_file = portion[0] + '.wav'
    # NOTE(review): assumes args.savepath ends with a path separator — verify.
    write_wav(audio[0], hp.Default.sr, args.savepath + converted_file)
    # Write the result
    tf.summary.audio('A', y_audio, hp.Default.sr, max_outputs=hp.Convert.batch_size)
    tf.summary.audio('B', audio, hp.Default.sr, max_outputs=hp.Convert.batch_size)
    # Visualize PPGs
    heatmap = np.expand_dims(ppgs, 3)  # channel=1
    tf.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])
    writer = tf.summary.FileWriter(args.savepath)
    with tf.Session() as sess:
        summ = sess.run(tf.summary.merge_all())
        writer.add_summary(summ)
        writer.close()
def eval_model_generate(spec, spec_pred, length, log_dir, global_step):
    """Dump predicted and ground-truth eval spectrograms as wav + npy files
    named after ``global_step``."""
    print("EVAL LENGTH:", length)
    print("EVAL SPEC PRED SHAPE:", spec_pred.shape)
    predicted_wav = normalized_db_spec2wav(spec_pred)
    write_wav(os.path.join(log_dir, "checkpoint_step_{}_pred.wav".format(global_step)),
              predicted_wav)
    np.save(os.path.join(log_dir, "checkpoint_step_{}_pred_spec.npy".format(global_step)),
            spec_pred)
    print("EVAL LENGTH:", length)
    print("EVAL SPEC SHAPE:", spec.shape)
    original_wav = normalized_db_spec2wav(spec)
    write_wav(os.path.join(log_dir, "checkpoint_step_{}_original.wav".format(global_step)),
              original_wav)
    np.save(os.path.join(log_dir, "checkpoint_step_{}_orig_spec.npy".format(global_step)),
            spec)
def main():
    """Synthesize paired (direct / findA) samples with the multi-speaker model.

    Each line of ``ppgs_paths`` is ``ppg_path|findA_ppg_path|speaker_id``.
    For both PPG sequences the model predicts mel and linear spectrograms;
    wav/npy/png dumps are written to ``Multi_log_dir``.  Relies on
    module-level ``ckpt_path_Multi``, ``ppgs_paths``, ``PPG_DIM`` and
    ``Multi_log_dir``.
    """
    # pure inference — no autograd bookkeeping needed
    with torch.no_grad():
        model = DCBHG()
        model = tts_load(model=model, ckpt_path=ckpt_path_Multi)
        # NOTE(review): file handle is never closed — consider a `with` block.
        ppgs_list = open(ppgs_paths, 'r')
        ppgs_list = [i.strip() for i in ppgs_list]
        for idx, ppg_path_and_findA_ppg_path_and_speaker in tqdm(enumerate(ppgs_list)):
            # line format: <ppg_path>|<findA_ppg_path>|<speaker_id>
            ppg_path, findA_ppg_path, speaker_id = ppg_path_and_findA_ppg_path_and_speaker.split('|')
            ppg = np.load(ppg_path)
            findA_ppg = np.load(findA_ppg_path)
            # both PPG matrices must have PPG_DIM columns (frames x PPG_DIM)
            assert ppg.shape[1] == PPG_DIM and findA_ppg.shape[1] == PPG_DIM
            speaker_id = int(speaker_id)
            # direct synthesis from the original PPG
            mel_pred, spec_pred, mel_pred_audio, spec_pred_audio = tts_predict(model, ppg, speaker_id)
            # synthesis from the findA-projected PPG, for comparison
            findA_mel_pred, findA_spec_pred, findA_mel_pred_audio, findA_spec_pred_audio = tts_predict(model, findA_ppg, speaker_id)
            # consistency-error comparison, currently disabled:
            # CE_fromWav, CE_seq_fromWav = consistencyError_fromWav(spec_pred_audio, ppg)
            # findA_CE_fromWav, findA_CE_seq_fromWav = consistencyError_fromWav(findA_spec_pred_audio, ppg)
            # with open(CE_fromWav_compare_path, 'w') as f:
            #     f.write(str(CE_fromWav) + '\n')
            #     f.write(str(findA_CE_fromWav) + '\n')
            write_wav(os.path.join(Multi_log_dir, "{}_sample_mel.wav".format(idx)), mel_pred_audio)
            write_wav(os.path.join(Multi_log_dir, "{}_sample_spec.wav".format(idx)), spec_pred_audio)
            np.save(os.path.join(Multi_log_dir, "{}_sample_mel.npy".format(idx)), mel_pred)
            np.save(os.path.join(Multi_log_dir, "{}_sample_spec.npy".format(idx)), spec_pred)
            draw_spec(os.path.join(Multi_log_dir, "{}_sample_mel.png".format(idx)), mel_pred)
            draw_spec(os.path.join(Multi_log_dir, "{}_sample_spec.png".format(idx)), spec_pred)
            write_wav(os.path.join(Multi_log_dir, "{}_sample_mel_findA.wav".format(idx)), findA_mel_pred_audio)
            write_wav(os.path.join(Multi_log_dir, "{}_sample_spec_findA.wav".format(idx)), findA_spec_pred_audio)
            np.save(os.path.join(Multi_log_dir, "{}_sample_mel_findA.npy".format(idx)), findA_mel_pred)
            np.save(os.path.join(Multi_log_dir, "{}_sample_spec_findA.npy".format(idx)), findA_spec_pred)
            draw_spec(os.path.join(Multi_log_dir, "{}_sample_mel_findA.png".format(idx)), findA_mel_pred)
            draw_spec(os.path.join(Multi_log_dir, "{}_sample_spec_findA.png".format(idx)), findA_spec_pred)
def main():
    """Map every English PPG frame to its nearest Chinese PPG frame.

    Pipeline: load all English and Chinese per-frame PPGs, k-means cluster
    the combined set into ``K_small`` clusters, build one KDTree per cluster
    over its Chinese members, then for each English frame query the KDTree
    of its own cluster for the closest Chinese frame.  The resulting index
    map is saved, and finally per-utterance projected/original PPGs,
    linears and wavs are written ("findA").
    """
    print('start program')
    program_time = time.time()
    last_time = time.time()
    en_ppg_l, en_linear_l = for_loop_en()  # English per-frame PPG list: en_l = [en_ppg1, en_ppg2, ...]
    cn_ppg_l, cn_linear_l = for_loop_cn()  # Chinese per-frame PPG list: cn_l = [cn_ppg1, cn_ppg2, ...]
    all_ppg_l = en_ppg_l + cn_ppg_l  # combined English+Chinese PPG list
    print('end put ppg in memory, use:', time.time() - last_time)
    last_time = time.time()
    print('start cluster...')
    # fast clustering is needed here
    # all_l = [en_ppg1, en_ppg2, ..., cn_ppg1, cn_ppg2, ...]
    all_class = cluster_kmeans(all_ppg_l, K_small)  # all_class=[en_label,en_label,...,cn_label,cn_label,...]
    print('end cluster..., k-means use:', time.time() - last_time)
    last_time = time.time()
    # per-cluster bookkeeping: K_small empty lists each
    class_cn_ppgs = list()  # indices into cn_ppg_l, per cluster: [[],[],...]
    class_cn_ppgs_value = list()  # the PPG vectors themselves, per cluster
    class_cn_ppgs_value_kdtree = list()  # one KDTree per (non-empty) cluster
    for i in range(K_small):
        l = list()
        class_cn_ppgs.append(l)
        l_value = list()
        class_cn_ppgs_value.append(l_value)
    # Build cluster membership: which Chinese PPGs fall in each cluster
    # (~100 Chinese frames per cluster on average).  Chinese labels start
    # at offset len(en_ppg_l) inside all_class.
    en_ppg_l_len = len(en_ppg_l)
    for i in range(len(cn_ppg_l)):
        idx = i + en_ppg_l_len
        now_class = all_class[idx]  # Chinese frame's cluster label, in [0, K_small)
        # e.g. class_cn_ppgs = [[2,8,19,...],[3,48,79,...],...]: frame ids into cn_ppg_l
        class_cn_ppgs[now_class].append(i)
        class_cn_ppgs_value[now_class].append(cn_ppg_l[i])
    print('prepare for class infomation use:', time.time() - last_time)
    print('start construct kdtree')
    all_last_time = time.time()
    have_cnt = 0
    for i in tqdm(range(K_small)):
        l = len(class_cn_ppgs[i])
        if l > 0:
            have_cnt += 1
            print('cluster', i, 'len', l, 'start construct kd-tree')
            last_time = time.time()
            class_cn_ppgs_value[i] = np.asarray(class_cn_ppgs_value[i])
            # NOTE(review): trees are only appended for non-empty clusters, yet
            # they are indexed by cluster id below — correct only if every
            # cluster is non-empty ('have class' should equal K_small); verify.
            class_cn_ppgs_value_kdtree.append(KDTree(class_cn_ppgs_value[i], leaf_size=40))
            print('end cluster', i, 'kd-tree use:', time.time() - last_time)
    print('have class:', have_cnt)
    print('end construct all kdtrees, tot use:', time.time() - all_last_time)
    # For each English frame, find the closest Chinese PPG inside its cluster.
    print('start get cloest map array for all en ppg')
    last_time = time.time()
    en_final_cn_idx = np.zeros((en_ppg_l_len))  # zero-filled map array (float dtype)
    for i in tqdm(range(en_ppg_l_len)):  # iterate over English PPG frames
        now_class = all_class[i]  # English frame's cluster label, in [0, K_small)
        # brute-force search (kept for cross-checking):
        # ans1, ans_id1 = bruce_find_closest(i, now_class, en_ppg_l, cn_ppg_l, class_cn_ppgs)
        # k-d tree search
        ans, ans_id = kdtree_find_closest(i, en_ppg_l,
                                          class_cn_ppgs_value_kdtree[now_class],
                                          class_cn_ppgs[now_class])
        # assert np.absolute(ans1 - ans) < eps and ans_id1 == ans_id
        en_final_cn_idx[i] = ans_id
    np.save(en_final_cn_idx_path, en_final_cn_idx)
    print('end write map array, all use:', time.time() - last_time)
    # findA part: write projected and original ppg/linear/wav per utterance
    print('start findA')
    last_time = time.time()
    en_file_list = en_text2list(file=en_raw_list_path)
    now = 0
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(f,
                                                 ppgs_dir=en_raw_ppg_path,
                                                 linears_dir=en_raw_linear_dir)
        e_ppg_id = []
        # global English frame ids start at 0 and keep counting across files
        for i in range(wav_ppgs.shape[0]):
            e_ppg_id.append(now)
            now += 1
        print('en id from 0:', e_ppg_id[:10])
        # map to Chinese frame ids (zero-based in the Chinese list)
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)
        # gather and save the projected PPGs
        c_ppgs_projected = list()
        for i in c_ppg_id_projected:
            c_ppgs_projected.append(cn_ppg_l[i])
        c_ppgs_projected = np.asarray(c_ppgs_projected)
        save_ppg_name_projected = f + '_cn_ppg_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_projected), c_ppgs_projected)
        # gather and save the projected linears
        c_lineas_projected = list()
        for i in c_ppg_id_projected:
            c_lineas_projected.append(cn_linear_l[i])
        c_lineas_projected = np.asarray(c_lineas_projected)
        save_linear_name_projected = f + '_cn_linear_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_projected), c_lineas_projected)
        # reconstruct and save the projected wav
        save_wav_name_projected = f + '_cn_wav_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_projected),
                  normalized_db_spec2wav(c_lineas_projected))
        # ---------- save the original ppg/linear/wav for comparison ----------
        save_ppg_name_original = f + '_en_ppg_original.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_original), wav_ppgs)
        save_linear_name_original = f + '_en_linear_original.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_original), linears)
        # reconstruct and save the original wav
        save_wav_name_original = f + '_en_wav_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_original),
                  normalized_db_spec2wav(linears))
    print('end findA, use:', time.time() - last_time)
    print('program use:', time.time() - program_time)
def main():
    """Train the PPG -> linear-spectrogram regression model.

    Builds train/test tf.data pipelines, the AcousticCBHGRegression model,
    TensorBoard summaries and an Adadelta optimizer, then runs the training
    loop with periodic evaluation, checkpointing and audio dumps.
    """
    # command-line argument parsing / help
    args = get_arguments()
    # values fixed via the command-line arguments
    # logdir = args.log_dir
    # model_dir = args.output_model_path
    # restore_dir = args.output_model_path
    # NOTE(review): `logdir`, `model_dir` and `restore_dir` are used below but
    # their assignments above are commented out — presumably module-level
    # globals; confirm before running.
    train_dir = os.path.join(logdir, STARTED_DATESTRING, 'train')
    dev_dir = os.path.join(logdir, STARTED_DATESTRING, 'dev')
    # directories = validate_directories(args.restore_from, args.overwrite)
    # restore_dir = directories['restore_from']
    # logdir = directories['logdir']
    # dev_dir = directories['dev_dir']
    # dataset: each element is (ppg frames, linear frames, sequence length)
    train_set = tf.data.Dataset.from_generator(train_generator,
                                               output_types=(tf.float32, tf.float32, tf.int32),
                                               output_shapes=([None, my_hp.num_ppgs], [None, my_hp.num_freq], []))
    # padding train data(why?how?)
    train_set = train_set.padded_batch(args.batch_size,
                                       padded_shapes=([None, my_hp.num_ppgs], [None, my_hp.num_freq], [])).repeat()
    # Any unknown dimensions will be padded to the maximum size of that dimension in each batch.
    train_iterator = train_set.make_initializable_iterator()
    test_set = tf.data.Dataset.from_generator(test_generator,
                                              output_types=(tf.float32, tf.float32, tf.int32),
                                              output_shapes=([None, my_hp.num_ppgs], [None, my_hp.num_freq], []))
    # repeat(): get_next() cycles forever instead of raising OutOfRange
    test_set = test_set.padded_batch(args.batch_size,
                                     padded_shapes=([None, my_hp.num_ppgs], [None, my_hp.num_freq], [])).repeat()
    test_iterator = test_set.make_initializable_iterator()
    # A string-handle placeholder: feeding a concrete iterator handle in
    # feed_dict at sess.run time selects which dataset get_next() draws from.
    dataset_handle = tf.placeholder(tf.string, shape=[])
    dataset_iter = tf.data.Iterator.from_string_handle(dataset_handle,
                                                       train_set.output_types,
                                                       train_set.output_shapes)
    batch_data = dataset_iter.get_next()
    # debug print of the symbolic batch tensors
    with tf.Session() as sess:
        print(batch_data)
    # time.sleep(8)
    # classifier = DNNClassifier(out_dims=PPG_DIM, hiddens=[256, 256, 256],
    #                            drop_rate=0.2, name='dnn_classifier')
    # classifier = CnnDnnClassifier(out_dims=PPG_DIM, n_cnn=5,
    #                               cnn_hidden=64, dense_hiddens=[256, 256, 256])
    decoderRegression = AcousticCBHGRegression(my_hp)
    results_dict = decoderRegression(inputs=batch_data[0], labels=batch_data[1], lengths=batch_data[2])  # inputs labels lengths
    # results_dict['logits']= np.zeros([10])
    predicted = results_dict['out']
    # mask = tf.sequence_mask(batch_data[2], dtype=tf.float32)
    # batch_data[2] is (None,): the per-example frame count; needed because
    # sequences are padded to the batch maximum — mask is (None, max(batch[2])).
    # accuracy = tf.reduce_sum(
    #     tf.cast(  # bool -> float
    #         tf.equal(tf.argmax(predicted, axis=-1),  # row-wise argmax comparison
    #                  tf.argmax(batch_data[1], axis=-1)),
    #         tf.float32) * mask  # padded frames always "match", so mask them out
    # ) / tf.reduce_sum(tf.cast(batch_data[2], dtype=tf.float32))
    # tf.summary.scalar('accuracy', accuracy)
    tf.summary.image('predicted_linear',
                     tf.expand_dims(tf.transpose(predicted, [0, 2, 1]), axis=-1),
                     max_outputs=1)
    tf.summary.image('groundtruth_linear',
                     tf.expand_dims(tf.cast(tf.transpose(batch_data[1], [0, 2, 1]), tf.float32), axis=-1),
                     max_outputs=1)
    tf.summary.image('groundtruth_PPG',
                     tf.expand_dims(tf.cast(tf.transpose(batch_data[0], [0, 2, 1]), tf.float32), axis=-1),
                     max_outputs=1)
    loss = results_dict['loss']
    learning_rate_pl = tf.placeholder(tf.float32, None, 'learning_rate')
    tf.summary.scalar('mse_weighted_loss', loss)
    tf.summary.scalar('learning_rate', learning_rate_pl)
    optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate_pl)
    optim = optimizer.minimize(loss)
    # run batch-norm (etc.) update ops together with the train op
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    optim = tf.group([optim, update_ops])
    # Set up logging for TensorBoard.
    train_writer = tf.summary.FileWriter(train_dir)
    train_writer.add_graph(tf.get_default_graph())
    dev_writer = tf.summary.FileWriter(dev_dir)
    summaries = tf.summary.merge_all()  # running this op refreshes every summary
    saver = tf.train.Saver(max_to_keep=args.max_ckpts)
    # set up session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run([train_iterator.initializer, test_iterator.initializer])
    train_handle, test_handle = sess.run([train_iterator.string_handle(),
                                          test_iterator.string_handle()])
    sess.run(init)
    # try to load saved model
    try:
        saved_global_step = load_model(saver, sess, restore_dir)
        if saved_global_step is None:
            saved_global_step = -1
    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise
    last_saved_step = saved_global_step
    step = None
    try:
        for step in range(saved_global_step + 1, args.steps):
            # learning-rate schedule currently disabled; constant args.lr
            # if step <= int(1e3):
            lr = args.lr
            # elif step <= int(2e3):
            #     lr = 0.8 * args.lr
            # elif step <= int(4e3):
            #     lr = 0.5 * args.lr
            # elif step <= int(8e3):
            #     lr = 0.25 * args.lr
            # else:
            #     lr = 0.125 * args.lr
            start_time = time.time()
            if step % args.ckpt_every == 0:
                # evaluation step: no optimizer run, feed the test handle
                summary, loss_value, mag_spec, label_spec = sess.run(
                    [summaries, loss, predicted, batch_data[1]],
                    feed_dict={dataset_handle: test_handle, learning_rate_pl: lr})
                dev_writer.add_summary(summary, step)
                duration = time.time() - start_time
                print('step {:d} - eval loss = {:.3f}, ({:.3f} sec/step)'
                      .format(step, loss_value, duration))
                save_model(saver, sess, model_dir, step)
                last_saved_step = step
                # no mask applied yet — dump as-is and listen
                np.save(os.path.join(dev_dir, 'dev' + str(step) + '.npy'), mag_spec[0])
                np.save(os.path.join(dev_dir, 'groundtruth_dev' + str(step) + '.npy'), label_spec[0])
                y = normSTFT2wav(mag_spec[0])
                dev_path = os.path.join(dev_dir, 'dev' + str(step) + '.wav')
                write_wav(dev_path, y, sr=16000)
                y = normSTFT2wav(label_spec[0])
                dev_path = os.path.join(dev_dir, 'groundtruth_dev' + str(step) + '.wav')
                write_wav(dev_path, y, sr=16000)
            else:
                # training step: run the optimizer on the train handle
                summary, loss_value, _, mag_spec, label_spec = sess.run(
                    [summaries, loss, optim, predicted, batch_data[1]],
                    feed_dict={dataset_handle: train_handle, learning_rate_pl: lr})
                train_writer.add_summary(summary, step)
                if step % 10 == 0:
                    duration = time.time() - start_time
                    print('step {:d} - training loss = {:.3f}, ({:.3f} sec/step)'
                          .format(step, loss_value, duration))
                if step % save_train_audio == 0:
                    # no mask applied yet — dump as-is and listen
                    np.save(os.path.join(train_dir, 'train' + str(step) + '.npy'), mag_spec[0])
                    np.save(os.path.join(train_dir, 'groundtruth_train' + str(step) + '.npy'), label_spec[0])
                    y = normSTFT2wav(mag_spec[0])
                    train_path = os.path.join(train_dir, 'train' + str(step) + '.wav')
                    write_wav(train_path, y, sr=16000)
                    y = normSTFT2wav(label_spec[0])
                    trian_path = os.path.join(train_dir, 'groundtruth_train' + str(step) + '.wav')
                    write_wav(trian_path, y, sr=16000)
    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        if step > last_saved_step:
            save_model(saver, sess, model_dir, step)
        sess.close()
import numpy as np
import matplotlib.pyplot as plt

from audio import spec2wav, wav2spec, read_wav, write_wav

if __name__ == '__main__':
    # STFT / resynthesis parameters
    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80
    duration = 2  # sec

    # read a fixed-length excerpt and round-trip it through the spectrogram
    raw = read_wav("H:\\cs230\\wav_x\\1_1.wav", sr, duration)
    spec, _ = wav2spec(raw, n_fft, win_length, hop_length, False)
    rebuilt = spec2wav(spec, n_fft, win_length, hop_length, 600)
    write_wav(rebuilt, sr, 'a.wav')

    # save a quick spectrogram plot
    plt.pcolormesh(spec)
    plt.ylabel('Frequency')
    plt.xlabel('Time')
    plt.savefig("a.png")