Example #1
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {
                        line.split(",")[0]
                        for line in sources_file
                    }
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            try:
                wav = audio.preprocess_wav(in_fpath)
            except ValueError as e:
                # When loading VoxCeleb2, this error can be raised:
                # ValueError("frames must be specified for non-seekable files")
                print(f"Skipping {in_fpath}: {e}")
                continue
            if len(wav) == 0:
                continue

            print(f"Processing {in_fpath}...")
            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()
Example #2
def load_preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray]):
    """
    Loads an audio file in memory and applies the same preprocessing operations used in trained 
    the Speaker Encoder. Using this function is not mandatory but recommended.
    """
    if isinstance(fpath_or_wav, (str, Path)):
        wav = audio.load(fpath_or_wav)
    else:
        wav = fpath_or_wav
    wav = audio.preprocess_wav(wav)
    return wav
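A minimal usage sketch for the function above (the file name is a placeholder): the same call accepts either a path on disk or a waveform that is already in memory.

import numpy as np

# From a file on disk (str and pathlib.Path both work):
wav = load_preprocess_wav("speech.wav")

# From an in-memory waveform (e.g. one you loaded yourself):
wav = load_preprocess_wav(np.asarray(wav, dtype=np.float32))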
Example #3
def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path,
                        skip_existing: bool):
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier, check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = set()
    else:
        existing_fnames = set()

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    audio_durs = []
    for extension in _AUDIO_EXTENSIONS:
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
            audio_durs.append(len(wav) / sampling_rate)

    sources_file.close()

    return audio_durs
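Because _preprocess_speaker receives all of its context as arguments and returns the per-utterance durations, it fans out cleanly across processes. A hedged driver sketch follows; preprocess_dataset, the speaker-directory listing, and n_processes are illustrative assumptions, not part of the example above.

from functools import partial
from multiprocessing import Pool
from pathlib import Path

def preprocess_dataset(datasets_root: Path, dataset_name: str, out_dir: Path,
                       skip_existing: bool = False, n_processes: int = 4):
    # Treat every immediate subdirectory of the dataset as one speaker.
    speaker_dirs = [d for d in datasets_root.joinpath(dataset_name).glob("*") if d.is_dir()]

    work_fn = partial(_preprocess_speaker, datasets_root=datasets_root,
                      out_dir=out_dir, skip_existing=skip_existing)
    with Pool(n_processes) as pool:
        all_durs = pool.map(work_fn, speaker_dirs)

    total_hours = sum(sum(durs) for durs in all_durs) / 3600
    print("Done: %d speakers, %.1f hours of audio." % (len(all_durs), total_hours))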
Example #4
    def preprocess_speaker(args):
        # speaker_dir, datasets_root, out_dir, skip_existing and logger are
        # taken from the enclosing scope.
        speaker, files = args
        # Give a name to the speaker that includes its dataset
        parts = list(speaker_dir.relative_to(datasets_root).parts)
        parts.append(speaker)
        speaker_name = "_".join(parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath, out_fname in files:
            # Check if the target output file already exists
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            try:
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue
            except Exception as e:
                print("wav preprocess error for %s: %s" % (in_fpath, e))
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()
Example #5
def preprocess(in_fpath, out_fpath, parent_path):

    # Append so that repeated calls for the same parent directory keep
    # the entries written by earlier calls.
    source_text = parent_path / "_sources.txt"
    sources_file = source_text.open("a")

    # Load and preprocess the waveform
    wav = audio.preprocess_wav(in_fpath)
    if len(wav) == 0:
        print("empty audio file")
        sources_file.close()
        return None

    # Create the mel spectrogram, discard those that are too short
    frames = audio.wav_to_mel_spectrogram(wav)
    if len(frames) < partials_n_frames:
        print("{} < {}, number of frames is less than partials_n_frames".format(
            len(frames), partials_n_frames))
        sources_file.close()
        return None

    # np.save appends the .npy suffix to out_fpath by itself
    np.save(out_fpath, frames)
    sources_file.write("%s,%s\n" % (out_fpath.name + '.npy', in_fpath.name))

    sources_file.close()

    return frames
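A hedged call sketch for preprocess (all paths are placeholders). Note that np.save appends the .npy suffix to out_fpath itself, which is why the sources line records out_fpath.name + '.npy'.

from pathlib import Path

in_fpath = Path("speaker1/utterance_001.wav")   # placeholder input file
out_dir = Path("processed/speaker1")            # placeholder output directory
out_dir.mkdir(parents=True, exist_ok=True)

frames = preprocess(in_fpath, out_dir / "utterance_001", out_dir)
if frames is not None:
    print("mel spectrogram shape:", frames.shape)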
Example #6
def run_eval_part1(args):
    speaker_enc_ckpt = args.speaker_encoder_checkpoint
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    speaker_audio_dirs = {
        "speaker_name": ["speaker_audio_1.wav", "speaker_audio_2.wav"],
        "biaobei_speaker": [
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000005.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000006.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000007.wav",
        ],
        "SLR68_DEV_3756_22": [
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203118.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203322.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203824.wav"
        ],
        "SLR38_P00001A": [
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0004.wav",
        ],
        "aishell_C0002": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0004.wav",
        ],
        "aishell_C0896": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0004.wav",
        ],
    }[speaker_name]
    sentences = [
        # '美国主持人听到“中国”就插话',
        # '勉励乡亲们为过上更加幸福美好的生活继续团结奋斗。',
        # '中国基建领域又来了一款“神器”, 哪里不平平哪里',
        # '违反中央八项规定精神和廉洁纪律,违规出入私人会所和打高尔夫球',
        # '陪审团未能就其盗窃和藏匿文物罪名作出裁决',
        # '于美国首都华盛顿国家记者俱乐部召开的新闻发布会上说',
        # '杭州市卫健委某直属单位一名拟提副处级干部刘某公示期间,纪检监察组照例对其个人重大事项进行抽查',
        # '我国森林面积、森林蓄积分别增长一倍左右,人工林面积居全球第一',
        # '打打打打打打打打打打打',
        # '卡尔普陪外孙玩滑梯。',
        # '假语村言,别再拥抱我。',
        # '宝马配挂跛骡鞍,貂蝉怨枕董翁榻。',
        # '中国地震台网速报,'
        # '中国地震台网正式测定,',
        # '06月04日17时46分在台湾台东县海域(北纬22.82度,东经121.75度)发生5.8级地震',
        # '中国地震台网速报,中国地震台网正式测定:06月04日17时46分在台湾台东县海域(北纬22.82度,东经121.75度)发生5.8级地震',
        # '震源深度9千米,震中位于海中,距台湾岛最近约47公里。',
        # '刚刚,台湾发生5.8级地震,与此同时,泉州厦门漳州震感明显,',
        # '此次台湾地震发生后,许多网友为同胞祈福,愿平安,',
        '新世界百货望京店',
        '全聚德烤鸭店王府井店',
        '麻烦帮我把空调温度调整到二十四',
        '请帮我显示中央一套',  # aishell IC0896W0001.wav
        '确定下载三帝狂野飙车',  # aishell IC0896W0002.wav
        '请帮我开启深圳卫视国际频道',  # aishell IC0896W0003.wav
        '您吃饭了吗,我今天吃的太撑了',
        '您吃饭了吗?',
        '你多大了,你到底多大了,我猜你三十了,他多大了,他到底多大了,他猜你三十了',
        '二毛你今天沒课嘛还和李霞聊天',
    ]

    text2pinyin = partial(get_pinyin, std=True, pb=True)
    sentences = [' '.join(text2pinyin(sent)) for sent in sentences]

    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    encoder_infer.load_model(speaker_enc_ckpt)

    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-" + str(ckpt_step.group(1)) if ckpt_step else syn_ckpt

    speaker_audio_wav_list = [
        encoder_audio.preprocess_wav(wav_dir) for wav_dir in speaker_audio_dirs
    ]
    speaker_audio_wav = np.concatenate(speaker_audio_wav_list)
    print(os.path.join(eval_results_dir, '000_refer_speaker_audio.wav'))
    audio.save_wav(
        speaker_audio_wav,
        os.path.join(eval_results_dir, '000_refer_speaker_audio.wav'),
        hparams.sample_rate)
    speaker_embed = encoder_infer.embed_utterance(speaker_audio_wav)
    for i, text in enumerate(sentences):
        path = os.path.join(eval_results_dir,
                            "%s-eval-%03d.wav" % (ckpt_step, i))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms(
            [text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        # print(np.shape(mel_spec))
        wav = syner.griffin_lim(mel_spec)
        audio.save_wav(wav, path, hparams.sample_rate)
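run_eval_part1 only reads four attributes from args. A hedged sketch of a matching argparse setup follows; the attribute names come from the function body, while the help strings and the default are assumptions.

import argparse

def build_eval_parser():
    parser = argparse.ArgumentParser(
        description="Synthesize evaluation sentences for one reference speaker.")
    parser.add_argument("--speaker_encoder_checkpoint", required=True,
                        help="Path to the trained speaker encoder checkpoint")
    parser.add_argument("--syn_checkpoint", required=True,
                        help="Path to the synthesizer checkpoint, e.g. model.ckpt-123456")
    parser.add_argument("--speaker_name", required=True,
                        help="Key into the speaker_audio_dirs table, e.g. aishell_C0896")
    parser.add_argument("--eval_results_dir", default="eval_results",
                        help="Directory that receives the synthesized wavs")
    return parser

# run_eval_part1(build_eval_parser().parse_args())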
Example #7
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {
                        line.split(",")[0]
                        for line in sources_file
                    }
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            # frames = audio.wav_to_mel_spectrogram(wav)

            # Extract raw audio patches for fCNN
            win = np.hamming(int(sampling_rate * 0.02))
            inc = int(win.shape[0] / 2)
            frames = get_frame_from_file(wav,
                                         win=win,
                                         inc=inc,
                                         sr=sampling_rate,
                                         n_channels=1,
                                         duration=None)
            frames = np.transpose(frames)

            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()
Example #8
def run_eval_part1(args):
    speaker_enc_ckpt = args.speaker_encoder_checkpoint
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    speaker_audio_dirs = {
        "speaker_name": ["speaker_audio_1.wav", "speaker_audio_2.wav"],
        "vctk_p225": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_005.wav",
        ],
        "vctk_p226": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_005.wav",
        ],
        "vctk_p227": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_005.wav",
        ],
        "vctk_p228": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_005.wav",
        ],
        "biaobei_speaker": [
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000005.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000006.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000007.wav",
        ],
        "aishell_C0002": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0004.wav",
        ],
        "aishell_C0896": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0004.wav",
        ],
    }[speaker_name]
    sentences = [
        "THAT MATTER OF TROY AND ACHILLES WRATH ONE TWO THREE RATS",
        "ENDED THE QUEST OF THE HOLY GRAAL JERUSALEM A HANDFUL OF ASHES BLOWN BY THE WIND EXTINCT",
        "She can scoop these things into three red bags",
        "and we will go meet her Wednesday at the train station",
        "This was demonstrated in a laboratory experiment with rats."
    ]

    sentences = [sen.upper() for sen in sentences]

    sentences.append(
        "This was demonstrated in a laboratory experiment with rats")

    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    encoder_infer.load_model(speaker_enc_ckpt)

    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-" + str(ckpt_step.group(1)) if ckpt_step else syn_ckpt

    speaker_audio_wav_list = [
        encoder_audio.preprocess_wav(wav_dir) for wav_dir in speaker_audio_dirs
    ]
    speaker_audio_wav = np.concatenate(speaker_audio_wav_list)
    print(
        os.path.join(eval_results_dir,
                     '%s-000_refer_speaker_audio.wav' % speaker_name))
    audio.save_wav(
        speaker_audio_wav,
        os.path.join(eval_results_dir,
                     '%s-000_refer_speaker_audio.wav' % speaker_name),
        hparams.sample_rate)
    speaker_embed = encoder_infer.embed_utterance(speaker_audio_wav)
    for i, text in enumerate(sentences):
        path = os.path.join(
            eval_results_dir,
            "%s-%s-eval-%03d.wav" % (speaker_name, ckpt_step, i))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms(
            [text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        # print(np.shape(mel_spec))
        wav = syner.griffin_lim(mel_spec)
        audio.save_wav(wav, path, hparams.sample_rate)
Example #9
def load_voice(self, voice_dir):
    original_wav, sampling_rate = librosa.load(voice_dir)
    preprocessed_wav = preprocess_wav(original_wav, sampling_rate)
    voice_embed = encoder.embed_utterance(preprocessed_wav)
    return voice_embed
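A hedged usage sketch: comparing two voices through their embeddings. It assumes an object exposing load_voice, and that encoder.embed_utterance returns L2-normalized vectors; if it does not, divide the dot product by the two vector norms.

import numpy as np

embed_a = voice_loader.load_voice("voice_a.wav")   # voice_loader and both paths are placeholders
embed_b = voice_loader.load_voice("voice_b.wav")

# For unit-length embeddings, the inner product is the cosine similarity.
similarity = float(np.inner(embed_a, embed_b))
print("cosine similarity: %.3f" % similarity)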