def main():
    global model
    model = tts_load(model=model, ckpt_path=ckpt_path_ljspeech)

    with open(ppgs_paths, 'r') as f:
        ppgs_list = [i.strip() for i in f]
    for idx, ppg_path in tqdm(enumerate(ppgs_list)):
        ppg = np.load(ppg_path)
        mel_pred, spec_pred, mel_pred_audio, spec_pred_audio = tts_predict(
            model, ppg)

        write_wav(
            os.path.join(ljspeech_log_dir, "{}_sample_mel.wav".format(idx)),
            mel_pred_audio)
        write_wav(
            os.path.join(ljspeech_log_dir, "{}_sample_spec.wav".format(idx)),
            spec_pred_audio)

        np.save(
            os.path.join(ljspeech_log_dir, "{}_sample_mel.npy".format(idx)),
            mel_pred)
        np.save(
            os.path.join(ljspeech_log_dir, "{}_sample_spec.npy".format(idx)),
            spec_pred)

        draw_spec(
            os.path.join(ljspeech_log_dir, "{}_sample_mel.png".format(idx)),
            mel_pred)
        draw_spec(
            os.path.join(ljspeech_log_dir, "{}_sample_spec.png".format(idx)),
            spec_pred)
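
Every example on this page depends on a project-specific write_wav helper, and its signature varies between examples (some call write_wav(path, wav), others write_wav(wav, sr, path)). A minimal sketch of such a helper, assuming 16-bit PCM output via the soundfile package; the argument order and sample-rate default here are assumptions:

import numpy as np
import soundfile as sf

def write_wav(path, wav, sr=16000):
    # Sketch only: clip to [-1, 1] and write 16-bit PCM. Check the local
    # helper's actual signature before reusing any of the examples above.
    wav = np.clip(np.asarray(wav), -1.0, 1.0)
    sf.write(path, wav, sr, subtype='PCM_16')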
Example 2
def process_file(audio_path, output, spkr_to_spkr, lstnr_to_spkr, ear_to_ear):
    """
    Read stereo binaural audio file and write wav file with crosstalk 'removed'
    """
    logger.info('Loading file into memory: {}'.format(audio_path))
    y, sr = audio.load(audio_path, mono=False, sr=44100)
    left = y[0]
    right = y[1]

    logger.info('Computing distance from speaker to each ear')
    d1, d2, theta = compute_geometry(spkr_to_spkr, lstnr_to_spkr, ear_to_ear)
    logger.debug('d1: {}'.format(d1))
    logger.debug('d2: {}'.format(d2))
    logger.debug('theta: {}'.format(theta))

    headshadow = headshadow_filter_coefficients(theta, ear_to_ear / 2, sr)
    logger.debug('headshadow b: {} a: {}'.format(*headshadow))

    logger.info('Computing recursive crosstalk cancellation for left channel')
    l_left, l_right = cancel_crosstalk(left, d1, d2, headshadow, sr)
    logger.info('Computing recursive crosstalk cancellation for right channel')
    r_right, r_left = cancel_crosstalk(right, d1, d2, headshadow, sr)

    left = audio.sum_signals([l_left, r_left, left])
    right = audio.sum_signals([l_right, r_right, right])

    y = audio.channel_merge([left, right])
    logger.info('Writing output to: {}'.format(output))
    audio.write_wav(output, y, sr, norm=True)
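
A hypothetical call of process_file, assuming the three geometry arguments are distances in meters (the 0.215 m ear spacing mirrors the default in hrtf_file further down; file names and values are illustrative only):

# Hypothetical usage; geometry values (in meters) are illustrative only.
process_file('binaural_in.wav', 'xtalk_cancelled.wav',
             spkr_to_spkr=0.30, lstnr_to_spkr=0.50, ear_to_ear=0.215)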
Example 3
def do_convert(logdir1, logdir2, input_path, output_path):
    # Load graph
    model = Net2()
    model.actual_duration = librosa.core.get_duration(filename=input_path,
                                                      sr=hp.default.sr)

    # TODO: isolate out the logdirs, and decide how to pre-download checkpoints from S3

    assert len(input_path) > 0, "must be non-empty input path"

    df = Net2DataFlow(data_path=input_path, batch_size=1)

    ckpt1 = tf.train.latest_checkpoint(logdir1)
    ckpt2 = tf.train.latest_checkpoint(logdir2)
    session_inits = []
    session_inits.append(SaverRestore(ckpt2))
    session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)

    audio, y_audio, ppgs = convert(predictor, df)

    write_wav(audio[0], hp.default.sr, output_path)
Example 4
def main():
    # Load the per-frame PPGs into memory
    en_ppg_l, en_linear_l = for_loop_en()  # list of per-frame English PPGs: en_l = [en_ppg1, en_ppg2, ...]
    cn_ppg_l, cn_linear_l = for_loop_cn()  # list of per-frame Chinese PPGs: cn_l = [cn_ppg1, cn_ppg2, ...]
    all_ppg_l = en_ppg_l + cn_ppg_l  # English and Chinese PPGs concatenated

    # Load the precomputed English-to-Chinese nearest-frame index map
    en_final_cn_idx = np.load(en_final_cn_idx_path)

    # Iterate over the English utterance list
    en_file_list = en_text2list(file=en_raw_list_path)
    # en_ppgs_ls = []
    now = 0
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(f, ppgs_dir=en_raw_ppg_path, linears_dir=en_raw_linear_dir)
        e_ppg_id = []  # English frame ids, numbered from zero
        for i in range(wav_ppgs.shape[0]):
            e_ppg_id.append(now)
            now += 1
        print('en id from 0:', e_ppg_id[:10])
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)  # projected ids index into the Chinese lists, from zero

        # Look up the corresponding Chinese linear spectrogram frames
        c_linears_projected = list()
        for i in c_ppg_id_projected:
            c_linears_projected.append(cn_linear_l[i])
        c_linears_projected = np.asarray(c_linears_projected)
        save_linear_name = f + '_cn_linear_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_name), normalized_db_spec2wav(c_linears_projected))
        save_linear_original_name = f + '_en_linear_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_original_name), normalized_db_spec2wav(linears))
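
ppg_project is not defined on this page; judging from how its output is used to index cn_linear_l, it is presumably a lookup through the en_final_cn_idx array built in a later example. A hypothetical reconstruction:

def ppg_project(en_ids, en_final_cn_idx):
    # Hypothetical reconstruction: map each English frame id to its nearest
    # Chinese frame id via the precomputed en_final_cn_idx lookup array.
    # The int() cast is needed because en_final_cn_idx is a float array.
    return [int(en_final_cn_idx[i]) for i in en_ids]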
Example 5
def main():
    # This section processes a dataset in LJSpeech format:
    # take the 6-character file id from every other line of the metadata file
    a = open(meta_path, 'r').readlines()
    b = []
    i = 0
    while i < len(a):
        t = a[i][0:6]
        b.append(t)
        i += 2
    print(b[:2])
    a = b
    # a = [i.strip().split('|')[0] for i in a]
    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(a):
        try:
            # Extract acoustic features
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)
            
            # Sanity-check the extracted features
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            # This load can occasionally fail; the cause is unclear, possibly transient server issues
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))

            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] and mel_feats.shape[0] == spec_feats.shape[0]
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name), normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name), normalized_db_spec2wav(spec_feats))
            
            # Save the acoustic features
            mfcc_save_name = os.path.join(mfcc_dir, save_name)
            mel_save_name = os.path.join(mel_dir, save_name)
            spec_save_name = os.path.join(spec_dir, save_name)
            np.save(mfcc_save_name, mfcc_feats)
            np.save(mel_save_name, mel_feats)
            np.save(spec_save_name, spec_feats)

            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except Exception:
            bad_list.append(fname)
            bad_cnt += 1
        
        # print(cnt)
        # break

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)

    return
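
wav2normalized_db_mel and its sibling feature extractors are project helpers not shown on this page. A hypothetical stand-in using librosa, with every parameter value an illustrative assumption:

import librosa
import numpy as np

def wav2normalized_db_mel_sketch(wav, sr=16000, n_fft=1024, hop_length=256,
                                 n_mels=80, min_db=-100.0, ref_db=20.0):
    # Hypothetical stand-in for wav2normalized_db_mel: mel spectrogram in dB,
    # scaled to [0, 1]. All parameter values here are illustrative assumptions.
    mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length, n_mels=n_mels)
    db = 20.0 * np.log10(np.maximum(1e-5, mel)) - ref_db
    return np.clip((db - min_db) / -min_db, 0.0, 1.0).T  # frames x n_mels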
Example 7
def do_task(nthreads, audio):
    print('Thread-{} start.\n'.format(nthreads))
    try:
        while True:
            src_path, tar_path = next(audio)
            wav, sr = librosa.load(src_path)
            wav = trim_wav(wav)
            write_wav(wav, sr, tar_path)
    except StopIteration:
        print('Thread-{} done.\n'.format(nthreads))
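
A hypothetical way to drive do_task from several threads, assuming audio is a shared iterator of (src, tar) path pairs; the wrapper serializes next() calls so the threads can share one iterator, and all names and paths are illustrative:

import threading

# Hypothetical driver for do_task; the path pairs are illustrative only.
pairs = iter([('in/a.wav', 'out/a.wav'), ('in/b.wav', 'out/b.wav')])

class LockedIter:
    # Serialize next() calls so several threads can share one iterator.
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()
    def __iter__(self):
        return self
    def __next__(self):
        with self.lock:
            return next(self.it)

audio = LockedIter(pairs)
threads = [threading.Thread(target=do_task, args=(n, audio)) for n in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()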
Example 8
def hrtf_file(audio_path, azimuth, elevation=0, distance=1, ear_distance=0.215, output=None):
    """
    Read mono audio file and write binaural wav file to output
    """
    logger.info('Loading signal into memory: {}'.format(audio_path))
    y, sr = audio.load(audio_path)
    y = hrtf(y, sr, azimuth, elevation, distance, ear_distance)
    if output:
        audio.write_wav(output, y, sr, norm=True)
    return y
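
A hypothetical call, assuming azimuth is in degrees and distance in meters (values and file names are illustrative only):

# Hypothetical usage; parameter values and file names are illustrative only.
binaural = hrtf_file('voice_mono.wav', azimuth=30, elevation=0,
                     distance=1.0, output='voice_binaural.wav')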
Example 9
def generate_pair_wav(spec, spec_pred, log_dir, global_step, suffix_name):
  y_pred = normalized_db_spec2wav(spec_pred)
  pred_wav_path = os.path.join(log_dir, "step_" + str(global_step) + "_" + suffix_name + "_predvalidation.wav")
  write_wav(pred_wav_path, y_pred)
  pred_spec_path = os.path.join(log_dir, "step_" + str(global_step) + "_" + suffix_name + "_predvalidation.npy")
  np.save(pred_spec_path, spec_pred)


  y = normalized_db_spec2wav(spec)
  orig_wav_path = os.path.join(log_dir, "step_" + str(global_step) + "_" + suffix_name + "_original.wav")
  write_wav(orig_wav_path, y)
  orig_spec_path = os.path.join(log_dir, "step_" + str(global_step) + "_" + suffix_name + "_original.npy")
  np.save(orig_spec_path, spec)
Example 10
def do_convert(args, logdir1, logdir2):
    # Load graph
    model = Net2()

    data = get_mfccs_and_spectrogram(args.file)

    ckpt1 = '{}/{}'.format(
        logdir1,
        args.net1) if args.net1 else tf.train.latest_checkpoint(logdir1)
    ckpt2 = '{}/{}'.format(
        logdir2,
        args.net2) if args.net2 else tf.train.latest_checkpoint(logdir2)
    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)

    audio, y_audio, ppgs = convert(predictor, data)

    target_file = args.file.split('/')[-1]
    portion = os.path.splitext(target_file)
    # converted_file = target_file.split('.')[0] + '_converted.wav'
    converted_file = portion[0] + '.wav'
    write_wav(audio[0], hp.Default.sr, args.savepath + converted_file)

    # Write the result
    tf.summary.audio('A',
                     y_audio,
                     hp.Default.sr,
                     max_outputs=hp.Convert.batch_size)
    tf.summary.audio('B',
                     audio,
                     hp.Default.sr,
                     max_outputs=hp.Convert.batch_size)

    # Visualize PPGs
    heatmap = np.expand_dims(ppgs, 3)  # channel=1
    tf.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])

    writer = tf.summary.FileWriter(args.savepath)
    with tf.Session() as sess:
        summ = sess.run(tf.summary.merge_all())
    writer.add_summary(summ)
    writer.close()
Example 11
def eval_model_generate(spec, spec_pred, length, log_dir, global_step):
  print("EVAL LENGTH:", length)
  print("EVAL SPEC PRED SHAPE:", spec_pred.shape)
  
  y_pred = normalized_db_spec2wav(spec_pred)
  pred_wav_path = os.path.join(log_dir, "checkpoint_step_{}_pred.wav".format(global_step))
  write_wav(pred_wav_path, y_pred)
  pred_spec_path = os.path.join(log_dir, "checkpoint_step_{}_pred_spec.npy".format(global_step))
  np.save(pred_spec_path, spec_pred)


  print("EVAL LENGTH:", length)
  print("EVAL SPEC SHAPE:", spec.shape)
  y = normalized_db_spec2wav(spec)
  orig_wav_path = os.path.join(log_dir, "checkpoint_step_{}_original.wav".format(global_step))
  write_wav(orig_wav_path, y)
  orig_spec_path = os.path.join(log_dir, "checkpoint_step_{}_orig_spec.npy".format(global_step))
  np.save(orig_spec_path, spec)
Example 12
def main():
    with torch.no_grad():
        model = DCBHG()
        model = tts_load(model=model, ckpt_path=ckpt_path_Multi)

        with open(ppgs_paths, 'r') as f:
            ppgs_list = [i.strip() for i in f]
        for idx, ppg_path_and_findA_ppg_path_and_speaker in tqdm(
                enumerate(ppgs_list)):
            ppg_path, findA_ppg_path, speaker_id = ppg_path_and_findA_ppg_path_and_speaker.split(
                '|')
            ppg = np.load(ppg_path)
            findA_ppg = np.load(findA_ppg_path)
            assert ppg.shape[1] == PPG_DIM and findA_ppg.shape[1] == PPG_DIM

            speaker_id = int(speaker_id)
            mel_pred, spec_pred, mel_pred_audio, spec_pred_audio = tts_predict(
                model, ppg, speaker_id)
            findA_mel_pred, findA_spec_pred, findA_mel_pred_audio, findA_spec_pred_audio = tts_predict(
                model, findA_ppg, speaker_id)
            # CE_fromWav, CE_seq_fromWav = consistencyError_fromWav(spec_pred_audio, ppg)
            # findA_CE_fromWav, findA_CE_seq_fromWav = consistencyError_fromWav(findA_spec_pred_audio, ppg)

            # with open(CE_fromWav_compare_path, 'w') as f:
            #     f.write(str(CE_fromWav) + '\n')
            #     f.write(str(findA_CE_fromWav) + '\n')

            write_wav(
                os.path.join(Multi_log_dir, "{}_sample_mel.wav".format(idx)),
                mel_pred_audio)
            write_wav(
                os.path.join(Multi_log_dir, "{}_sample_spec.wav".format(idx)),
                spec_pred_audio)

            np.save(
                os.path.join(Multi_log_dir, "{}_sample_mel.npy".format(idx)),
                mel_pred)
            np.save(
                os.path.join(Multi_log_dir, "{}_sample_spec.npy".format(idx)),
                spec_pred)

            draw_spec(
                os.path.join(Multi_log_dir, "{}_sample_mel.png".format(idx)),
                mel_pred)
            draw_spec(
                os.path.join(Multi_log_dir, "{}_sample_spec.png".format(idx)),
                spec_pred)

            write_wav(
                os.path.join(Multi_log_dir,
                             "{}_sample_mel_findA.wav".format(idx)),
                findA_mel_pred_audio)
            write_wav(
                os.path.join(Multi_log_dir,
                             "{}_sample_spec_findA.wav".format(idx)),
                findA_spec_pred_audio)

            np.save(
                os.path.join(Multi_log_dir,
                             "{}_sample_mel_findA.npy".format(idx)),
                findA_mel_pred)
            np.save(
                os.path.join(Multi_log_dir,
                             "{}_sample_spec_findA.npy".format(idx)),
                findA_spec_pred)

            draw_spec(
                os.path.join(Multi_log_dir,
                             "{}_sample_mel_findA.png".format(idx)),
                findA_mel_pred)
            draw_spec(
                os.path.join(Multi_log_dir,
                             "{}_sample_spec_findA.png".format(idx)),
                findA_spec_pred)
Example 13
def main():
    print('start program')
    program_time = time.time()
    last_time = time.time()

    en_ppg_l, en_linear_l = for_loop_en()  # list of per-frame English PPGs: en_l = [en_ppg1, en_ppg2, ...]
    cn_ppg_l, cn_linear_l = for_loop_cn()  # list of per-frame Chinese PPGs: cn_l = [cn_ppg1, cn_ppg2, ...]
    all_ppg_l = en_ppg_l + cn_ppg_l  # English and Chinese PPGs concatenated
    print('end put ppg in memory, use:', time.time() - last_time)
    last_time = time.time()

    print('start cluster...')
    # Fast clustering is needed here; all_ppg_l = [en_ppg1, en_ppg2, ..., cn_ppg1, cn_ppg2, ...]
    all_class = cluster_kmeans(
        all_ppg_l,
        K_small)  # all_class = [en_label, en_label, ..., cn_label, cn_label, ...]
    print('end cluster..., k-means use:', time.time() - last_time)
    last_time = time.time()

    # Build K_small empty buckets: class_cn_ppgs = [[], [], [], ...]
    class_cn_ppgs = list()
    class_cn_ppgs_value = list()
    class_cn_ppgs_value_kdtree = list()
    for i in range(K_small):
        class_cn_ppgs.append(list())
        class_cn_ppgs_value.append(list())

    # Build per-cluster info: which Chinese PPG frames fall into each cluster
    # (on average about 100 Chinese PPGs per cluster)
    en_ppg_l_len = len(en_ppg_l)
    for i in range(len(cn_ppg_l)):
        idx = i + en_ppg_l_len
        now_class = all_class[idx]  # cluster label of this Chinese frame, in [0, K_small)
        # class_cn_ppgs = [[2, 8, 19, ...], [3, 48, 79, ...], ...]: for each of
        # the K_small clusters, the frame indices into cn_ppg_l assigned to it
        class_cn_ppgs[now_class].append(i)
        class_cn_ppgs_value[now_class].append(cn_ppg_l[i])

    print('prepare for class information use:', time.time() - last_time)
    print('start construct kdtree')
    all_last_time = time.time()
    have_cnt = 0
    for i in tqdm(range(K_small)):
        l = len(class_cn_ppgs[i])
        if l > 0:
            have_cnt += 1
            print('cluster', i, 'len', l, 'start construct kd-tree')
            last_time = time.time()

            class_cn_ppgs_value[i] = np.asarray(class_cn_ppgs_value[i])
            class_cn_ppgs_value_kdtree.append(
                KDTree(class_cn_ppgs_value[i], leaf_size=40))

            print('end cluster', i, 'kd-tree use:', time.time() - last_time)
        else:
            # Keep the list indexable by cluster id: without a placeholder,
            # class_cn_ppgs_value_kdtree[now_class] below would be misaligned
            # whenever any cluster is empty.
            class_cn_ppgs_value_kdtree.append(None)
    print('have class:', have_cnt)
    print('end construct all kdtrees, tot use:', time.time() - all_last_time)

    # For each English PPG, find the nearest Chinese PPG within its cluster
    print('start get closest map array for all en ppg')
    last_time = time.time()
    en_final_cn_idx = np.zeros((en_ppg_l_len))  # one Chinese frame id per English frame
    for i in tqdm(range(en_ppg_l_len)):  # iterate over the English PPG list
        now_class = all_class[i]  # cluster label of this English frame, in [0, K_small)

        # Brute-force search (kept for reference):
        # ans1, ans_id1 = bruce_find_closest(i, now_class, en_ppg_l, cn_ppg_l, class_cn_ppgs)

        # k-d tree search
        ans, ans_id = kdtree_find_closest(
            i, en_ppg_l, class_cn_ppgs_value_kdtree[now_class],
            class_cn_ppgs[now_class])
        # assert np.absolute(ans1 - ans) < eps and ans_id1 == ans_id
        en_final_cn_idx[i] = ans_id

    np.save(en_final_cn_idx_path, en_final_cn_idx)
    print('end write map array, all use:', time.time() - last_time)

    # The findA stage
    print('start findA')
    last_time = time.time()

    en_file_list = en_text2list(file=en_raw_list_path)
    now = 0
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(f,
                                                 ppgs_dir=en_raw_ppg_path,
                                                 linears_dir=en_raw_linear_dir)
        e_ppg_id = []  # English frame ids, numbered from zero
        for i in range(wav_ppgs.shape[0]):
            e_ppg_id.append(now)
            now += 1
        print('en id from 0:', e_ppg_id[:10])
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)  # projected ids index into the Chinese lists, from zero

        # Look up the projected Chinese PPGs and save them
        c_ppgs_projected = list()
        for i in c_ppg_id_projected:
            c_ppgs_projected.append(cn_ppg_l[i])
        c_ppgs_projected = np.asarray(c_ppgs_projected)
        save_ppg_name_projected = f + '_cn_ppg_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_projected),
                c_ppgs_projected)

        # Look up the projected Chinese linear spectrogram frames and save them
        c_linears_projected = list()
        for i in c_ppg_id_projected:
            c_linears_projected.append(cn_linear_l[i])
        c_linears_projected = np.asarray(c_linears_projected)
        save_linear_name_projected = f + '_cn_linear_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_projected),
                c_linears_projected)

        # Synthesize the projected audio and save it
        save_wav_name_projected = f + '_cn_wav_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_projected),
                  normalized_db_spec2wav(c_linears_projected))

        # ---------- Save the original English PPG, linear spectrogram, and wav for comparison ----------

        # Save the original PPGs
        save_ppg_name_original = f + '_en_ppg_original.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_original),
                wav_ppgs)

        # Save the original linear spectrograms
        save_linear_name_original = f + '_en_linear_original.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_original),
                linears)

        # Synthesize the original audio and save it
        save_wav_name_original = f + '_en_wav_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_original),
                  normalized_db_spec2wav(linears))

    print('end findA, use:', time.time() - last_time)
    print('program use:', time.time() - program_time)
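
kdtree_find_closest is likewise not shown on this page. From its call site it receives the English frame index, the English PPG list, the cluster's KDTree, and that cluster's Chinese frame ids, and returns (distance, global Chinese id). A hypothetical reconstruction, assuming sklearn.neighbors.KDTree:

import numpy as np

def kdtree_find_closest(i, en_ppg_l, tree, cluster_ids):
    # Hypothetical reconstruction: query the per-cluster KDTree with the i-th
    # English PPG and map the local hit back to a global Chinese frame id.
    dist, idx = tree.query(np.asarray(en_ppg_l[i]).reshape(1, -1), k=1)
    return dist[0][0], cluster_ids[idx[0][0]]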
Example 14
def main():
    # Parse command-line arguments
    args = get_arguments()
    # These were meant to come from the command line:
    # logdir = args.log_dir
    # model_dir = args.output_model_path
    # restore_dir = args.output_model_path

    train_dir = os.path.join(logdir, STARTED_DATESTRING, 'train')
    dev_dir = os.path.join(logdir, STARTED_DATESTRING, 'dev')
    # directories = validate_directories(args.restore_from, args.overwrite)
    # restore_dir = directories['restore_from']
    # logdir = directories['logdir']
    # dev_dir = directories['dev_dir']


    # dataset
    train_set = tf.data.Dataset.from_generator(train_generator,
                                               output_types=(
                                                   tf.float32, tf.float32, tf.int32),
                                               output_shapes=(
                                                   [None, my_hp.num_ppgs], [None, my_hp.num_freq], []))
    # Pad each batch: any unknown dimension is padded to the maximum size of
    # that dimension within the batch
    train_set = train_set.padded_batch(args.batch_size,
                                       padded_shapes=([None, my_hp.num_ppgs],
                                                      [None, my_hp.num_freq],
                                                      [])).repeat()

    train_iterator = train_set.make_initializable_iterator()

    test_set = tf.data.Dataset.from_generator(test_generator,
                                              output_types=(
                                                  tf.float32, tf.float32, tf.int32),
                                              output_shapes=(
                                                  [None, my_hp.num_ppgs], [None, my_hp.num_freq], []))

    # repeat() makes the iterator cycle indefinitely, so get_next() never runs out of data
    test_set = test_set.padded_batch(args.batch_size,
                                     padded_shapes=([None, my_hp.num_ppgs],
                                                    [None, my_hp.num_freq],
                                                    [])).repeat()
    test_iterator = test_set.make_initializable_iterator()

    # Create a string-handle placeholder; feeding a specific iterator's handle to sess.run selects which dataset get_next() draws from
    dataset_handle = tf.placeholder(tf.string, shape=[])
    dataset_iter = tf.data.Iterator.from_string_handle(
        dataset_handle,
        train_set.output_types,
        train_set.output_shapes
    )
    batch_data = dataset_iter.get_next()
    with tf.Session() as sess:
        print(batch_data)
        #time.sleep(8)
    # classifier = DNNClassifier(out_dims=PPG_DIM, hiddens=[256, 256, 256],
    #                            drop_rate=0.2, name='dnn_classifier')
    # classifier = CnnDnnClassifier(out_dims=PPG_DIM, n_cnn=5,
    #                               cnn_hidden=64, dense_hiddens=[256, 256, 256])


    decoderRegression = AcousticCBHGRegression(my_hp)

    results_dict = decoderRegression(inputs=batch_data[0], labels=batch_data[1], lengths=batch_data[2])
    #inputs labels lengths
    #results_dict['logits']= np.zeros([10])
    predicted = results_dict['out']
    # mask = tf.sequence_mask(batch_data[2], dtype=tf.float32)
    # batch_data[2] has shape (None,): the frame count of each example. A mask
    # is needed because sequences are padded to the longest in the batch; its
    # shape would be (None, max(batch_data[2])).



    # batch_data[2] is the number of MFCC frames, which varies per example.
    # accuracy = tf.reduce_sum(
    #     tf.cast(  # bool -> float
    #         tf.equal(tf.argmax(predicted, axis=-1),  # row-wise argmax
    #                  tf.argmax(batch_data[1], axis=-1)),
    #         tf.float32) * mask  # padded positions always compare equal,
    #         # so they must be masked out
    # ) / tf.reduce_sum(tf.cast(batch_data[2], dtype=tf.float32))

    # tf.summary.scalar('accuracy', accuracy)
    tf.summary.image('predicted_linear',
                     tf.expand_dims(
                         tf.transpose(predicted, [0, 2, 1]),
                         axis=-1), max_outputs=1)
    tf.summary.image('groundtruth_linear',
                     tf.expand_dims(
                         tf.cast(
                             tf.transpose(batch_data[1], [0, 2, 1]),
                             tf.float32),
                         axis=-1), max_outputs=1)
    tf.summary.image('groundtruth_PPG',
                     tf.expand_dims(
                         tf.cast(
                             tf.transpose(batch_data[0], [0, 2, 1]),
                             tf.float32),
                         axis=-1), max_outputs=1)

    loss = results_dict['loss']
    learning_rate_pl = tf.placeholder(tf.float32, None, 'learning_rate')
    tf.summary.scalar('mse_weighted_loss', loss)
    tf.summary.scalar('learning_rate', learning_rate_pl)
    optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate_pl)
    optim = optimizer.minimize(loss)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    optim = tf.group([optim, update_ops])

    # Set up logging for TensorBoard.
    train_writer = tf.summary.FileWriter(train_dir)
    train_writer.add_graph(tf.get_default_graph())
    dev_writer = tf.summary.FileWriter(dev_dir)
    summaries = tf.summary.merge_all()
    # Merge all summaries; running this op evaluates and refreshes every summary
    saver = tf.train.Saver(max_to_keep=args.max_ckpts)

    # set up session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run([train_iterator.initializer, test_iterator.initializer])
    train_handle, test_handle = sess.run([train_iterator.string_handle(),
                                         test_iterator.string_handle()])
    sess.run(init)
    # try to load saved model
    try:
        saved_global_step = load_model(saver, sess, restore_dir)
        if saved_global_step is None:
            saved_global_step = -1
    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    last_saved_step = saved_global_step
    step = None
    try:
        for step in range(saved_global_step + 1, args.steps):
            # if step <= int(1e3):
            lr = args.lr
            # elif step <= int(2e3):
            #     lr = 0.8 * args.lr
            # elif step <= int(4e3):
            #     lr = 0.5 * args.lr
            # elif step <= int(8e3):
            #     lr = 0.25 * args.lr
            # else:
            #     lr = 0.125 * args.lr
            start_time = time.time()
            if step % args.ckpt_every == 0:
                summary, loss_value, mag_spec, label_spec = sess.run(
                    [summaries, loss, predicted, batch_data[1]],
                    feed_dict={dataset_handle: test_handle,
                               learning_rate_pl: lr})
                dev_writer.add_summary(summary, step)
                duration = time.time() - start_time
                print('step {:d} - eval loss = {:.3f}, ({:.3f} sec/step)'
                      .format(step, loss_value, duration))
                save_model(saver, sess, model_dir, step)
                last_saved_step = step
                # No mask was applied; just render it and listen for now
                np.save(os.path.join(dev_dir, 'dev' + str(step) + '.npy'), mag_spec[0])
                np.save(os.path.join(dev_dir, 'groundtruth_dev' + str(step) + '.npy'), label_spec[0])
                y = normSTFT2wav(mag_spec[0])
                dev_path = os.path.join(dev_dir, 'dev' + str(step) + '.wav')
                write_wav(dev_path, y, sr=16000)
                y = normSTFT2wav(label_spec[0])
                dev_path = os.path.join(dev_dir, 'groundtruth_dev' + str(step) + '.wav')
                write_wav(dev_path, y, sr=16000)
            else:
                summary, loss_value, _, mag_spec, label_spec = sess.run([summaries, loss, optim, predicted, batch_data[1]],
                                                  feed_dict={dataset_handle: train_handle,
                                                             learning_rate_pl: lr})
                train_writer.add_summary(summary, step)
                if step % 10 == 0:
                    duration = time.time() - start_time
                    print('step {:d} - training loss = {:.3f}, ({:.3f} sec/step)'
                          .format(step, loss_value, duration))
                if step % save_train_audio == 0:
                    # No mask was applied; just render it and listen for now
                    np.save(os.path.join(train_dir, 'train' + str(step) + '.npy'), mag_spec[0])
                    np.save(os.path.join(train_dir, 'groundtruth_train' + str(step) + '.npy'), label_spec[0])
                    y = normSTFT2wav(mag_spec[0])
                    train_path = os.path.join(train_dir, 'train' + str(step) + '.wav')
                    write_wav(train_path, y, sr=16000)
                    y = normSTFT2wav(label_spec[0])
                    train_path = os.path.join(train_dir, 'groundtruth_train' + str(step) + '.wav')
                    write_wav(train_path, y, sr=16000)

    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        if step > last_saved_step:
            save_model(saver, sess, model_dir, step)
    sess.close()
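
The dataset_handle pattern above (tf.data.Iterator.from_string_handle) lets a single graph switch between the train and test iterators at sess.run time. A minimal standalone sketch of the same TF1-style pattern, with toy datasets:

import tensorflow as tf  # TF1-style API, matching the example above

train_ds = tf.data.Dataset.range(10).repeat()
test_ds = tf.data.Dataset.range(100, 110).repeat()

handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, train_ds.output_types, train_ds.output_shapes)
next_elem = iterator.get_next()

train_iter = train_ds.make_initializable_iterator()
test_iter = test_ds.make_initializable_iterator()

with tf.Session() as sess:
    sess.run([train_iter.initializer, test_iter.initializer])
    train_h, test_h = sess.run([train_iter.string_handle(),
                                test_iter.string_handle()])
    # The handle fed at run time decides which dataset get_next() draws from.
    print(sess.run(next_elem, feed_dict={handle: train_h}))  # -> 0 (train set)
    print(sess.run(next_elem, feed_dict={handle: test_h}))   # -> 100 (test set)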
Example 15
import numpy as np
import matplotlib.pyplot as plt

from audio import spec2wav, wav2spec, read_wav, write_wav


if __name__ == '__main__':

    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80
    duration = 2  # sec

    wav = read_wav("H:\\cs230\\wav_x\\1_1.wav", sr, duration)
    spec, _ = wav2spec(wav, n_fft, win_length, hop_length, False)

    converted_wav = spec2wav(spec, n_fft, win_length, hop_length, 600)

    write_wav(converted_wav, sr, 'a.wav')


    plt.pcolormesh(spec)
    plt.ylabel('Frequency')
    plt.xlabel('Time')
    plt.savefig("a.png")
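
spec2wav is imported from the project's audio module and not shown here. A hypothetical stand-in using librosa's Griffin-Lim, reading the trailing 600 in the call above as an iteration count (an assumption):

import librosa
import numpy as np

def spec2wav_sketch(mag, n_fft, win_length, hop_length, num_iters):
    # Hypothetical stand-in for the imported spec2wav: Griffin-Lim phase
    # reconstruction from a linear magnitude spectrogram, assumed to be
    # (freq x time). n_fft is kept only to mirror the original signature.
    return librosa.griffinlim(np.asarray(mag), n_iter=num_iters,
                              hop_length=hop_length, win_length=win_length)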