    def __init__(
        self,
        manifest_filepath: Union[str, Path, List[str], List[Path]],
        sample_rate: int,
        n_segments: Optional[int] = None,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        ignore_file: Optional[Union[str, Path]] = None,
        trim: Optional[bool] = False,
        load_precomputed_mel: bool = False,
        hop_length: Optional[int] = None,
    ):
        """Dataset which can be used for training and fine-tuning a vocoder with pre-computed mel-spectrograms.

        Args:
            manifest_filepath (Union[str, Path, List[str], List[Path]]): Path(s) to the .json manifests containing
                information on the dataset. Each line in the .json file should be valid json. Note: the .json file
                itself is not valid json. Each line should contain the following:
                    "audio_filepath": <PATH_TO_WAV>,
                    "duration": <Duration of audio clip in seconds> (Optional),
                    "mel_filepath": <PATH_TO_LOG_MEL_PT> (Optional)
            sample_rate (int): The sample rate of the audio. Or the sample rate that we will resample all files to.
            n_segments (Optional[int]): The length of audio in samples to load. For example, given a sample rate of
                16kHz and n_segments=16000, a random 1-second section of audio from the clip will be loaded. The
                section will be randomly sampled every time the audio is batched. Can be set to None to load the
                entire audio. Must be specified if load_precomputed_mel is True.
            max_duration (Optional[float]): Max duration of audio clips in seconds. All samples exceeding this will
                be pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not
                load audio to compute duration. Defaults to None which does not prune.
            min_duration (Optional[float]): Min duration of audio clips in seconds. All samples below this will be
                pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not
                load audio to compute duration. Defaults to None which does not prune.
            ignore_file (Optional[Union[str, Path]]): The location of a pickle-saved list of audio paths that will
                be pruned prior to training. Defaults to None which does not prune.
            trim (Optional[bool]): Whether to apply librosa.effects.trim to the audio file. Defaults to False.
            load_precomputed_mel (bool): Whether to load precomputed mels (useful for fine-tuning).
                Note: Requires "mel_filepath" to be set in the manifest file.
            hop_length (Optional[int]): The hop length between FFT computations. Must be specified if
                load_precomputed_mel is True.
        """
        super().__init__()

        if load_precomputed_mel:
            if hop_length is None:
                raise ValueError("hop_length must be specified when load_precomputed_mel is True")

            if n_segments is None:
                raise ValueError("n_segments must be specified when load_precomputed_mel is True")

        # Initialize and read manifest file(s), filter out data by duration and ignore_file
        if isinstance(manifest_filepath, str):
            manifest_filepath = [manifest_filepath]
        self.manifest_filepath = manifest_filepath

        data = []
        total_duration = 0
        for manifest_file in self.manifest_filepath:
            with open(Path(manifest_file).expanduser(), 'r') as f:
                logging.info(f"Loading dataset from {manifest_file}.")
                for line in tqdm(f):
                    item = json.loads(line)

                    if "mel_filepath" not in item and load_precomputed_mel:
                        raise ValueError(f"mel_filepath is missing in {manifest_file}")

                    file_info = {
                        "audio_filepath": item["audio_filepath"],
                        "mel_filepath": item["mel_filepath"] if "mel_filepath" in item else None,
                        "duration": item["duration"] if "duration" in item else None,
                    }

                    data.append(file_info)

                    if file_info["duration"] is None:
                        logging.info(
                            "Not all audio files have duration information. Duration logging will be disabled."
                        )
                        total_duration = None

                    if total_duration is not None:
                        total_duration += item["duration"]

        logging.info(f"Loaded dataset with {len(data)} files.")
        if total_duration is not None:
            logging.info(f"Dataset contains {total_duration / 3600:.2f} hours.")

        self.data = TTSDataset.filter_files(data, ignore_file, min_duration, max_duration, total_duration)
        self.base_data_dir = get_base_dir([item["audio_filepath"] for item in self.data])

        # Initialize audio and mel related parameters
        self.load_precomputed_mel = load_precomputed_mel
        self.featurizer = WaveformFeaturizer(sample_rate=sample_rate)
        self.sample_rate = sample_rate
        self.n_segments = n_segments
        self.hop_length = hop_length
        self.trim = trim
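    # A minimal usage sketch for this dataset (the manifest path and values below are
    # hypothetical; the class name is assumed to be VocoderDataset, as it is exposed in
    # NeMo). Each manifest line is a standalone JSON object, e.g.:
    #
    #   {"audio_filepath": "wavs/utt_0001.wav", "duration": 2.41, "mel_filepath": "mels/utt_0001.pt"}
    #
    #   dataset = VocoderDataset(
    #       manifest_filepath="train_manifest.json",
    #       sample_rate=22050,
    #       n_segments=16384,            # ~0.74 s segments, resampled randomly per batch
    #       load_precomputed_mel=True,   # requires "mel_filepath" in every manifest line
    #       hop_length=256,              # must match the hop used to compute the mels
    #   )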
    def __init__(
        self,
        manifest_filepath: Union[str, Path, List[str], List[Path]],
        sample_rate: int,
        text_tokenizer: Union[BaseTokenizer, Callable[[str], List[int]]],
        tokens: Optional[List[str]] = None,
        text_normalizer: Optional[Union[Normalizer, Callable[[str], str]]] = None,
        text_normalizer_call_kwargs: Optional[Dict] = None,
        text_tokenizer_pad_id: Optional[int] = None,
        sup_data_types: Optional[List[str]] = None,
        sup_data_path: Optional[Union[Path, str]] = None,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        ignore_file: Optional[Union[str, Path]] = None,
        trim: bool = False,
        n_fft: int = 1024,
        win_length: Optional[int] = None,
        hop_length: Optional[int] = None,
        window: str = "hann",
        n_mels: int = 80,
        lowfreq: int = 0,
        highfreq: Optional[int] = None,
        **kwargs,
    ):
        """Dataset which can be used for training spectrogram generators and end-to-end TTS models.

        It loads the main data types (audio, text) and the specified supplementary data types (log mel, durations,
        align prior matrix, pitch, energy, speaker id). Some of the supplementary data types are computed on the
        fly and saved in sup_data_path if they did not exist before. The save folder can be changed for some
        supplementary data types (see the keyword args section). Arguments for supplementary data types should
        also be specified in this class; they are read from kwargs (see the keyword args section).

        Args:
            manifest_filepath (Union[str, Path, List[str], List[Path]]): Path(s) to the .json manifests containing
                information on the dataset. Each line in the .json file should be valid json. Note: the .json file
                itself is not valid json. Each line should contain the following:
                    "audio_filepath": <PATH_TO_WAV>,
                    "text": <THE_TRANSCRIPT>,
                    "normalized_text": <NORMALIZED_TRANSCRIPT> (Optional),
                    "mel_filepath": <PATH_TO_LOG_MEL_PT> (Optional),
                    "duration": <Duration of audio clip in seconds> (Optional)
            sample_rate (int): The sample rate of the audio. Or the sample rate that we will resample all files to.
            text_tokenizer (Union[BaseTokenizer, Callable[[str], List[int]]]): BaseTokenizer or callable which
                represents the text tokenizer.
            tokens (Optional[List[str]]): Tokens from text_tokenizer. Should be specified if text_tokenizer is not
                a BaseTokenizer.
            text_normalizer (Optional[Union[Normalizer, Callable[[str], str]]]): Normalizer or callable which
                represents the text normalizer.
            text_normalizer_call_kwargs (Optional[Dict]): Additional arguments for the text_normalizer function.
            text_tokenizer_pad_id (Optional[int]): Index of padding. Should be specified if text_tokenizer is not
                a BaseTokenizer.
            sup_data_types (Optional[List[str]]): List of supplementary data types.
            sup_data_path (Optional[Union[Path, str]]): A folder that contains or will contain supplementary data
                (e.g. pitch).
            max_duration (Optional[float]): Max duration of audio clips in seconds. All samples exceeding this will
                be pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not
                load audio to compute duration. Defaults to None which does not prune.
            min_duration (Optional[float]): Min duration of audio clips in seconds. All samples below this will be
                pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not
                load audio to compute duration. Defaults to None which does not prune.
            ignore_file (Optional[Union[str, Path]]): The location of a pickle-saved list of audio paths that will
                be pruned prior to training. Defaults to None which does not prune.
            trim (bool): Whether to apply librosa.effects.trim to the audio file. Defaults to False.
            n_fft (int): The number of fft samples. Defaults to 1024.
            win_length (Optional[int]): The length of the stft windows. Defaults to None which uses n_fft.
            hop_length (Optional[int]): The hop length between fft computations. Defaults to None which uses
                n_fft // 4.
            window (str): One of 'hann', 'hamming', 'blackman', 'bartlett' or 'none', corresponding to the
                equivalent torch window function.
            n_mels (int): The number of mel filters. Defaults to 80.
            lowfreq (int): The lowfreq input to the mel filter calculation. Defaults to 0.
            highfreq (Optional[int]): The highfreq input to the mel filter calculation. Defaults to None.

        Keyword Args:
            log_mel_folder (Optional[Union[Path, str]]): The folder that contains or will contain log mel
                spectrograms.
            align_prior_matrix_folder (Optional[Union[Path, str]]): The folder that contains or will contain
                align prior matrices.
            pitch_folder (Optional[Union[Path, str]]): The folder that contains or will contain pitch.
            energy_folder (Optional[Union[Path, str]]): The folder that contains or will contain energy.
            durs_file (Optional[str]): String path to pickled durations location.
            durs_type (Optional[str]): Type of durations. Currently, only "aligner-based" is supported.
            use_beta_binomial_interpolator (Optional[bool]): Whether to use the beta-binomial interpolator for
                calculating the alignment prior matrix. Defaults to False.
            pitch_fmin (Optional[float]): The fmin input to librosa.pyin. Defaults to librosa.note_to_hz('C2').
            pitch_fmax (Optional[float]): The fmax input to librosa.pyin. Defaults to librosa.note_to_hz('C7').
            pitch_mean (Optional[float]): The mean that we use to normalize the pitch.
            pitch_std (Optional[float]): The std that we use to normalize the pitch.
            pitch_norm (Optional[bool]): Whether to normalize pitch (via pitch_mean and pitch_std) or not.
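
        Example:
            A minimal instantiation sketch (hypothetical paths and values; EnglishCharsTokenizer is one of the
            tokenizers available in NeMo, and the sup_data_types names follow the supplementary data types listed
            above)::

                tokenizer = EnglishCharsTokenizer()
                dataset = TTSDataset(
                    manifest_filepath="train_manifest.json",
                    sample_rate=22050,
                    text_tokenizer=tokenizer,
                    sup_data_types=["align_prior_matrix", "pitch"],
                    sup_data_path="sup_data",
                    pitch_fmin=65.4,    # ~C2, the documented default
                    pitch_fmax=2093.0,  # ~C7, the documented default
                )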
""" super().__init__() # Initialize text tokenizer self.text_tokenizer = text_tokenizer if isinstance(self.text_tokenizer, BaseTokenizer): self.text_tokenizer_pad_id = text_tokenizer.pad self.tokens = text_tokenizer.tokens else: if text_tokenizer_pad_id is None: raise ValueError( f"text_tokenizer_pad_id must be specified if text_tokenizer is not BaseTokenizer" ) if tokens is None: raise ValueError( f"tokens must be specified if text_tokenizer is not BaseTokenizer" ) self.text_tokenizer_pad_id = text_tokenizer_pad_id self.tokens = tokens # Initialize text normalizer is specified self.text_normalizer = text_normalizer self.text_normalizer_call = ( self.text_normalizer.normalize if isinstance( self.text_normalizer, Normalizer) else self.text_normalizer) self.text_normalizer_call_kwargs = (text_normalizer_call_kwargs if text_normalizer_call_kwargs is not None else {}) # Initialize and read manifest file(s), filter out data by duration and ignore_file, compute base dir if isinstance(manifest_filepath, str): manifest_filepath = [manifest_filepath] self.manifest_filepath = manifest_filepath data = [] total_duration = 0 for manifest_file in self.manifest_filepath: with open(Path(manifest_file).expanduser(), 'r') as f: logging.info(f"Loading dataset from {manifest_file}.") for line in tqdm(f): item = json.loads(line) file_info = { "audio_filepath": item["audio_filepath"], "original_text": item["text"], "mel_filepath": item["mel_filepath"] if "mel_filepath" in item else None, "duration": item["duration"] if "duration" in item else None, "speaker_id": item["speaker"] if "speaker" in item else None, } if "normalized_text" not in item: text = item["text"] if self.text_normalizer is not None: text = self.text_normalizer_call( text, **self.text_normalizer_call_kwargs) file_info["normalized_text"] = text file_info["text_tokens"] = self.text_tokenizer(text) else: file_info["normalized_text"] = item["normalized_text"] file_info["text_tokens"] = self.text_tokenizer( item["normalized_text"]) data.append(file_info) if file_info["duration"] is None: logging.info( "Not all audio files have duration information. Duration logging will be disabled." 
) total_duration = None if total_duration is not None: total_duration += item["duration"] logging.info(f"Loaded dataset with {len(data)} files.") if total_duration is not None: logging.info( f"Dataset contains {total_duration / 3600:.2f} hours.") self.data = TTSDataset.filter_files(data, ignore_file, min_duration, max_duration, total_duration) self.base_data_dir = get_base_dir( [item["audio_filepath"] for item in self.data]) # Initialize audio and mel related parameters self.sample_rate = sample_rate self.featurizer = WaveformFeaturizer(sample_rate=self.sample_rate) self.trim = trim self.n_fft = n_fft self.n_mels = n_mels self.lowfreq = lowfreq self.highfreq = highfreq self.window = window self.win_length = win_length or self.n_fft self.hop_length = hop_length self.hop_len = self.hop_length or self.n_fft // 4 self.fb = torch.tensor( librosa.filters.mel(self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.lowfreq, fmax=self.highfreq), dtype=torch.float, ).unsqueeze(0) window_fn = { 'hann': torch.hann_window, 'hamming': torch.hamming_window, 'blackman': torch.blackman_window, 'bartlett': torch.bartlett_window, 'none': None, }.get(self.window, None) self.stft = lambda x: torch.stft( input=x, n_fft=self.n_fft, hop_length=self.hop_len, win_length=self.win_length, window=window_fn(self.win_length, periodic=False).to(torch.float) if window_fn else None, ) # Initialize sup_data_path, sup_data_types and run preprocessing methods for every supplementary data type if sup_data_path is not None: Path(sup_data_path).mkdir(parents=True, exist_ok=True) self.sup_data_path = sup_data_path self.sup_data_types = ([ DATA_STR2DATA_CLASS[d_as_str] for d_as_str in sup_data_types ] if sup_data_types is not None else []) self.sup_data_types_set = set(self.sup_data_types) for data_type in self.sup_data_types: if data_type not in VALID_SUPPLEMENTARY_DATA_TYPES: raise NotImplementedError( f"Current implementation doesn't support {data_type} type." ) getattr(self, f"add_{data_type.name}")(**kwargs)
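    # A minimal sketch of how the pieces configured above combine into a log-mel
    # spectrogram (an illustration of the intended pipeline, not necessarily the
    # class's exact feature-extraction method; `audio` is assumed to be a 1-D float
    # tensor of samples at self.sample_rate):
    #
    #   spec = self.stft(audio)                           # STFT, complex or real/imag pairs
    #   if spec.dtype in [torch.cfloat, torch.cdouble]:
    #       spec = torch.view_as_real(spec)
    #   spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)     # magnitude, [n_fft // 2 + 1, frames]
    #   mel = torch.matmul(self.fb.to(torch.float), spec)  # apply the mel filterbank
    #   log_mel = torch.log(torch.clamp(mel, min=torch.finfo(torch.float).tiny))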