示例#1
0
def main():
    """Project each English utterance's frames onto their nearest Chinese
    frames (via a precomputed index map) and write comparison wav files."""
    # Per-frame feature lists for both languages:
    # en_ppg_l = [en_ppg1, en_ppg2, ...], one entry per English frame.
    en_ppg_l, en_linear_l = for_loop_en()
    cn_ppg_l, cn_linear_l = for_loop_cn()

    # Mapping from global English frame index -> closest Chinese frame index,
    # produced and saved by the clustering/KD-tree stage.
    en_final_cn_idx = np.load(en_final_cn_idx_path)

    en_file_list = en_text2list(file=en_raw_list_path)
    now = 0  # running global frame counter across all English utterances
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(
            f, ppgs_dir=en_raw_ppg_path, linears_dir=en_raw_linear_dir)
        # Global 0-based frame ids of this utterance within the English corpus.
        e_ppg_id = list(range(now, now + wav_ppgs.shape[0]))
        now += wav_ppgs.shape[0]
        print('en id from 0:', e_ppg_id[:10])
        # Map English frame ids onto (0-based) Chinese frame ids.
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)

        # Gather the projected Chinese linear-spectrogram frames.  int() guards
        # against float indices: the stage that builds en_final_cn_idx stores it
        # in a default-dtype (float) np.zeros array.
        c_lineas_projected = np.asarray(
            [cn_linear_l[int(i)] for i in c_ppg_id_projected])
        save_linear_name = f + '_cn_linear_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_name),
                  normalized_db_spec2wav(c_lineas_projected))
        # Original English audio, reconstructed for side-by-side comparison.
        save_linear_original_name = f + '_en_linear_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_original_name),
                  normalized_db_spec2wav(linears))
示例#2
0
def _save_spec_and_wav(spec, log_dir, global_step, suffix_name, tag):
  """Vocode *spec* to audio and save both the .wav and the raw .npy spectrogram."""
  base = "step_" + str(global_step) + "_" + suffix_name + "_" + tag
  audio = normalized_db_spec2wav(spec)
  write_wav(os.path.join(log_dir, base + ".wav"), audio)
  np.save(os.path.join(log_dir, base + ".npy"), spec)


def generate_pair_wav(spec, spec_pred, log_dir, global_step, suffix_name):
  """Save the predicted and ground-truth spectrograms for one step, each as a
  vocoded .wav plus the raw .npy array, under *log_dir*.

  The two branches were duplicated in the original; they now share one helper.
  """
  _save_spec_and_wav(spec_pred, log_dir, global_step, suffix_name,
                     "predvalidation")
  _save_spec_and_wav(spec, log_dir, global_step, suffix_name, "original")
def tts_predict(model, ppg, id_speaker):
    """Run TTS inference for one utterance and vocode the results.

    Args:
        model: trained network mapping (ppg, speaker id) -> (mel, linear spec).
        ppg: numpy array of per-frame PPG features; a batch axis is added here.
             (Presumably (frames, dims) — confirm against the model's input.)
        id_speaker: integer speaker index.

    Returns:
        (mel_pred, spec_pred, mel_pred_audio, spec_pred_audio): predicted
        features as numpy arrays plus their vocoded waveforms.
    """
    # Prepare inputs: add a batch dimension and move to GPU when available.
    # (Removed the block of leftover debug prints around the tensor shapes.)
    ppg = Variable(torch.from_numpy(ppg)).unsqueeze(0).float()
    id_speaker = torch.LongTensor([id_speaker])
    if use_cuda:
        ppg = ppg.cuda()
        id_speaker = id_speaker.cuda()

    # Forward pass; pull the single batch element back to CPU numpy arrays.
    mel_pred, spec_pred = model(ppg, id_speaker)
    mel_pred = mel_pred[0].cpu().data.numpy()
    spec_pred = spec_pred[0].cpu().data.numpy()

    # Vocoder: convert normalized-dB features to waveforms.
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)

    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio
示例#4
0
def main():
    """Extract MFCC/mel/linear features for an LJSpeech-style dataset,
    sanity-check them against precomputed PPGs, and save everything."""
    # Every other line of the metadata file holds an utterance id in its
    # first 6 characters.  `with` fixes the original's leaked file handle.
    with open(meta_path, 'r') as meta_f:
        meta_lines = meta_f.readlines()
    a = [line[0:6] for line in meta_lines[::2]]
    print(a[:2])

    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(a):
        try:
            # Extract the acoustic features.
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)

            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            # This load occasionally fails for unknown reasons (possibly a
            # transient server/filesystem issue) — hence the per-file except.
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))

            # Frame counts of every feature stream must agree.
            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] and mel_feats.shape[0] == spec_feats.shape[0]
            # Reconstruct audio from the features as an audible sanity check.
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name), normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name), normalized_db_spec2wav(spec_feats))

            # Persist the features.
            np.save(os.path.join(mfcc_dir, save_name), mfcc_feats)
            np.save(os.path.join(mel_dir, save_name), mel_feats)
            np.save(os.path.join(spec_dir, save_name), spec_feats)

            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C/SystemExit still work.
            # Failures are collected and reported below, best-effort as before.
            bad_list.append(fname)
            bad_cnt += 1

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)

    return
示例#5
0
def eval_model_generate(spec, spec_pred, length, log_dir, global_step):
  """Dump one evaluation step: vocode the predicted and original linear
  spectrograms to .wav and save the raw arrays as .npy under *log_dir*."""
  print("EVAL LENGTH:", length)
  print("EVAL SPEC PRED SHAPE:", spec_pred.shape)

  # Predicted side: waveform then raw spectrogram.
  predicted_audio = normalized_db_spec2wav(spec_pred)
  write_wav(
      os.path.join(log_dir, "checkpoint_step_{}_pred.wav".format(global_step)),
      predicted_audio)
  np.save(
      os.path.join(log_dir,
                   "checkpoint_step_{}_pred_spec.npy".format(global_step)),
      spec_pred)

  print("EVAL LENGTH:", length)
  print("EVAL SPEC SHAPE:", spec.shape)
  # Ground-truth side: waveform then raw spectrogram.
  original_audio = normalized_db_spec2wav(spec)
  write_wav(
      os.path.join(log_dir,
                   "checkpoint_step_{}_original.wav".format(global_step)),
      original_audio)
  np.save(
      os.path.join(log_dir,
                   "checkpoint_step_{}_orig_spec.npy".format(global_step)),
      spec)
def tts_predict(model, ppg):
    """Predict mel and linear spectrograms from a PPG sequence and vocode both.

    Returns (mel_pred, spec_pred, mel_pred_audio, spec_pred_audio).
    """
    # Add a batch axis and move the input to the GPU when one is in use.
    ppg_tensor = Variable(torch.from_numpy(ppg)).unsqueeze(0).float()
    if use_cuda:
        ppg_tensor = ppg_tensor.cuda()

    # Forward pass; take the single batch element back to CPU numpy arrays.
    mel_batch, spec_batch = model(ppg_tensor)
    mel_pred = mel_batch[0].cpu().data.numpy()
    spec_pred = spec_batch[0].cpu().data.numpy()

    # Vocoder: normalized-dB features -> waveforms.
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)

    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio
def main():
    """Cluster English+Chinese PPG frames, build per-cluster KD-trees over the
    Chinese frames, map every English frame to its nearest Chinese frame, then
    dump projected and original ppg/linear/wav files for each English utterance."""
    print('start program')
    program_time = time.time()
    last_time = time.time()

    # Per-frame feature lists: en_ppg_l = [en_ppg1, en_ppg2, ...], one entry
    # per frame; same layout for the Chinese lists.
    en_ppg_l, en_linear_l = for_loop_en()
    cn_ppg_l, cn_linear_l = for_loop_cn()
    # Mixed list [en_ppg1, ..., cn_ppg1, ...] — English frames come first.
    all_ppg_l = en_ppg_l + cn_ppg_l
    print('end put ppg in memory, use:', time.time() - last_time)
    last_time = time.time()

    print('start cluster...')
    # k-means over every frame; all_class[i] is the cluster label
    # (0..K_small-1) of all_ppg_l[i].
    all_class = cluster_kmeans(all_ppg_l, K_small)
    print('end cluster..., k-means use:', time.time() - last_time)
    last_time = time.time()

    # One bucket per cluster: class_cn_ppgs[c] holds the Chinese frame ids in
    # cluster c; class_cn_ppgs_value[c] the corresponding PPG vectors.
    class_cn_ppgs = [[] for _ in range(K_small)]
    class_cn_ppgs_value = [[] for _ in range(K_small)]
    class_cn_ppgs_value_kdtree = list()

    # Bucket the Chinese frames by cluster; Chinese frames follow the English
    # ones in all_class, hence the en_ppg_l_len offset.
    en_ppg_l_len = len(en_ppg_l)
    for i in range(len(cn_ppg_l)):
        idx = i + en_ppg_l_len
        now_class = all_class[idx]
        class_cn_ppgs[now_class].append(i)
        class_cn_ppgs_value[now_class].append(cn_ppg_l[i])

    print('prepare for class infomation use:', time.time() - last_time)
    print('start construct kdtree')
    all_last_time = time.time()
    have_cnt = 0
    for i in tqdm(range(K_small)):
        l = len(class_cn_ppgs[i])
        if l > 0:
            have_cnt += 1
            print('cluster', i, 'len', l, 'start construct kd-tree')
            last_time = time.time()

            class_cn_ppgs_value[i] = np.asarray(class_cn_ppgs_value[i])
            class_cn_ppgs_value_kdtree.append(
                KDTree(class_cn_ppgs_value[i], leaf_size=40))

            print('end cluster', i, 'kd-tree use:', time.time() - last_time)
        else:
            # BUGFIX: keep the kd-tree list aligned with cluster ids.  The
            # original appended trees only for non-empty clusters, so one empty
            # cluster shifted every later index and
            # class_cn_ppgs_value_kdtree[now_class] below silently queried the
            # wrong cluster's tree.  A None placeholder preserves alignment
            # (and fails loudly if an English frame lands in an empty cluster).
            class_cn_ppgs_value_kdtree.append(None)
    print('have class:', have_cnt)
    print('end construct all kdtrees, tot use:', time.time() - all_last_time)

    # For each English frame, find the closest Chinese frame in its cluster.
    print('start get cloest map array for all en ppg')
    last_time = time.time()
    # Integer dtype: these values are used as list indices by the findA stage
    # (the original default float dtype forced casting at every use site).
    en_final_cn_idx = np.zeros((en_ppg_l_len), dtype=np.int64)
    for i in tqdm(range(en_ppg_l_len)):
        now_class = all_class[i]

        # k-d tree nearest-neighbour search inside the frame's own cluster
        # (brute-force fallback removed; it only served as a cross-check).
        ans, ans_id = kdtree_find_closest(
            i, en_ppg_l, class_cn_ppgs_value_kdtree[now_class],
            class_cn_ppgs[now_class])
        en_final_cn_idx[i] = ans_id

    np.save(en_final_cn_idx_path, en_final_cn_idx)
    print('end write map array, all use:', time.time() - last_time)

    # findA: project every English utterance onto Chinese frames and dump
    # projected + original ppg/linear/wav files for comparison.
    print('start findA')
    last_time = time.time()

    en_file_list = en_text2list(file=en_raw_list_path)
    now = 0  # running global English frame counter across utterances
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(f,
                                                 ppgs_dir=en_raw_ppg_path,
                                                 linears_dir=en_raw_linear_dir)
        # Global 0-based English frame ids of this utterance.
        e_ppg_id = list(range(now, now + wav_ppgs.shape[0]))
        now += wav_ppgs.shape[0]
        print('en id from 0:', e_ppg_id[:10])
        # Map onto (0-based) Chinese frame ids.
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)

        # Gather and save the projected Chinese PPG frames.
        c_ppgs_projected = np.asarray(
            [cn_ppg_l[i] for i in c_ppg_id_projected])
        save_ppg_name_projected = f + '_cn_ppg_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_projected),
                c_ppgs_projected)

        # Gather and save the projected Chinese linear-spectrogram frames.
        c_lineas_projected = np.asarray(
            [cn_linear_l[i] for i in c_ppg_id_projected])
        save_linear_name_projected = f + '_cn_linear_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_projected),
                c_lineas_projected)

        # Vocode the projected linears and save the waveform.
        save_wav_name_projected = f + '_cn_wav_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_projected),
                  normalized_db_spec2wav(c_lineas_projected))

        # ---- original English ppg/linear/wav, saved for comparison ----

        save_ppg_name_original = f + '_en_ppg_original.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_original),
                wav_ppgs)

        save_linear_name_original = f + '_en_linear_original.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_original),
                linears)

        save_wav_name_original = f + '_en_wav_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_original),
                  normalized_db_spec2wav(linears))

    print('end findA, use:', time.time() - last_time)
    print('program use:', time.time() - program_time)