예제 #1
0
def build_phoneme_dicts(text_lang_pairs):
    """Create dictionaries (possibly more languages) of words (from a list of texts) with IPA equivalents."""
    dictionaries = {}
    Logger.progress(0 / len(text_lang_pairs),
                    prefix='Building phoneme dictionary:')
    for i, (t, l) in enumerate(text_lang_pairs):
        if not (l in dictionaries):
            dictionaries[l] = {}
        clear_words = remove_punctuation(t).split()
        for w in clear_words:
            if w in dictionaries[l]: continue
            dictionaries[l][w] = _phonemize(w, l)[:-1]
        Logger.progress((i + 1) / len(text_lang_pairs),
                        prefix='Building phoneme dictionary:')
    return dictionaries
    def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs, spectrograms=True, phonemes=True):
        """Create the meta-file and spectrograms (mel and linear, optionally) or phonemized utterances (optionally).
        
        Format details:
            Every line of the metadata file contains info about one dataset item.
            The line has following format 
                'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
            And the following must hold
                'audio_file_path' can be empty if loading just spectrograms
                'text' should be carefully normalized and should contain interpunction
                'phonemized_text' can be empty if loading just raw text  
        
        Arguments:
            dataset_name (string): Name of the dataset, loaders.py should contain a function for loading with a corresponding name.
            dataset_root_dir (string): Root directory from which is the dataset build and to which are spectrograms and the meta-file saved..
            output_metafile_name (string): Name of the output meta-file.
            audio_sample_rate (int): Sample rate of audios, used if spectrograms is set True.
            num_fft_freqs (int): Number of frequency bands used during spectrogram computation, used if spectrograms is set True.
        Keyword arguments:
            spectrograms (boolean, default True): If true, spetrograms (both mel and linear) are computed and saved.
            phonemes (boolean, default True): If true, phonemized variants of utterances are computed and saved.
        """

        # save current sample rate and fft freqs hyperparameters, as we may process dataset with different sample rate
        if spectrograms:
            old_sample_rate = hp.sample_rate
            hp.sample_rate = audio_sample_rate
            old_fft_freqs = hp.num_fft
            hp.num_fft = num_fft_freqs

        # load metafiles, an item is a list like: [text, audiopath, speaker_id, language_code]
        items = loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

        # build dictionaries for translation to IPA from source languages, see utils.text for details
        if phonemes:
            text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
            phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

        # prepare directories which will store spectrograms
        if spectrograms:
            spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'), 
                                os.path.join(dataset_root_dir, 'linear_spectrograms')]
            for x in spectrogram_dirs:
                if not os.path.exists(x): os.makedirs(x)

        # iterate through items and build the meta-file
        metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
        with open(metafile_path, 'w', encoding='utf-8') as f:
            Logger.progress(0, prefix='Building metafile:')
            for i in range(len(items)):
                raw_text, audio_path, speaker, language = items[i]   
                if language == "": language = hp.languages[0]
                phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""     
                spectrogram_paths = "|"
                if spectrograms:
                    spec_name = f'{str(i).zfill(6)}.npy'                 
                    audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                    np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                    np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                    spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)
                print(f'{str(i).zfill(6)}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
                Logger.progress((i + 1) / len(items), prefix='Building metafile:')
        
        # restore the original sample rate and fft freq values
        if spectrograms:
            hp.sample_rate = old_sample_rate
            hp.num_fft = old_fft_freqs