def __init__(
    self,
    manifest_path=None,
    min_snr_db=10,
    max_snr_db=50,
    max_gain_db=300.0,
    rng=None,
    audio_tar_filepaths=None,
    shuffle_n=100,
    orig_sr=16000,
):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)
    self._audiodataset = None
    self._tarred_audio = False
    self._orig_sr = orig_sr
    self._data_iterator = None

    if audio_tar_filepaths:
        self._tarred_audio = True
        self._audiodataset = AugmentationDataset(manifest_path, audio_tar_filepaths, shuffle_n)
        self._data_iterator = iter(self._audiodataset)

    self._rng = random.Random() if rng is None else rng
    self._min_snr_db = min_snr_db
    self._max_snr_db = max_snr_db
    self._max_gain_db = max_gain_db
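# Hedged sketch (not from the source) of how the SNR bounds above are typically
# consumed during perturbation: draw a target SNR uniformly, then derive the
# noise gain in dB, clipped at max_gain_db. `data` and `noise` are hypothetical
# AudioSegment-like objects exposing an rms_db attribute.
def _example_noise_gain(self, data, noise):
    snr_db = self._rng.uniform(self._min_snr_db, self._max_snr_db)
    return min(data.rms_db - noise.rms_db - snr_db, self._max_gain_db)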
def parser(self):
    if self._parser is not None:
        return self._parser

    if self.learn_alignment:
        ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1]

        # TODO(Oktai15): remove it in 1.8.0 version
        if ds_class_name == "AudioToCharWithPriorAndPitchDataset" or ds_class_name == "TTSDataset":
            self._parser = self.vocab.encode
        else:
            raise ValueError(f"Unknown dataset class: {ds_class_name}")
    else:
        # TODO(Oktai15): remove it in 1.8.0 version
        # ds_class_name == "FastPitchDataset"
        self._parser = parsers.make_parser(
            labels=self._cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )

    return self._parser
def parser(self):
    if self._parser is not None:
        return self._parser

    if self.learn_alignment:
        ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1]

        if ds_class_name == "TTSDataset":
            self._parser = self.vocab.encode
        elif ds_class_name == "AudioToCharWithPriorAndPitchDataset":
            if self.vocab is None:
                tokenizer_conf = self._get_default_text_tokenizer_conf()
                self._setup_tokenizer(tokenizer_conf)
            self._parser = self.vocab.encode
        else:
            raise ValueError(f"Unknown dataset class: {ds_class_name}")
    else:
        self._parser = parsers.make_parser(
            labels=self._cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )

    return self._parser
def parser(self):
    if self._parser is not None:
        return self._parser

    ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1]

    if ds_class_name == "TTSDataset":
        self._parser = None
    elif hasattr(self._cfg, "labels"):
        self._parser = parsers.make_parser(
            labels=self._cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )
    elif ds_class_name == "AudioToCharWithPriorAndPitchDataset":
        # Assign to the backing attribute, not the `parser` property itself,
        # so the value is actually cached and returned below.
        self._parser = self.vocab.encode
    else:
        raise ValueError("Wanted to setup parser, but model does not have necessary parameters")

    return self._parser
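# Hedged usage sketch for the lazy `parser` properties above: the cached parser
# is a callable mapping normalized text to token ids. The vocabulary below is
# illustrative, not taken from any config in these snippets, and `parsers` is
# the same module imported by the surrounding code.
example_labels = [' '] + list("abcdefghijklmnopqrstuvwxyz'")
char_parser = parsers.make_parser(labels=example_labels, name='en', unk_id=-1, blank_id=-1, do_normalize=True)
token_ids = char_parser("Hello world")  # normalized (e.g. lowercased), then mapped to indices into example_labels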
def __init__(self, manifest_path: str, tar_filepaths: Union[str, List[str]], shuffle_n: int = 128):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)

    if isinstance(tar_filepaths, str):
        # Replace any opening-brace alias ('(', '[', '<', '_OP_') with '{'
        brace_keys_open = ['(', '[', '<', '_OP_']
        for bkey in brace_keys_open:
            if bkey in tar_filepaths:
                tar_filepaths = tar_filepaths.replace(bkey, "{")

        # Replace any closing-brace alias (')', ']', '>', '_CL_') with '}'
        brace_keys_close = [')', ']', '>', '_CL_']
        for bkey in brace_keys_close:
            if bkey in tar_filepaths:
                tar_filepaths = tar_filepaths.replace(bkey, "}")

    if not HAVE_OMEGACONG_WEBDATASET:
        raise LightningNotInstalledException(self)
    self.audio_dataset = wd.WebDataset(urls=tar_filepaths, nodesplitter=None)

    if shuffle_n > 0:
        self.audio_dataset = self.audio_dataset.shuffle(shuffle_n)
    else:
        logging.info("WebDataset will not shuffle files within the tar files.")

    self.audio_dataset = self.audio_dataset.rename(audio='wav', key='__key__').to_tuple('audio', 'key')
    self.audio_iter = iter(self.audio_dataset)
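# Illustrative only: what the brace-alias rewriting above produces. '_OP_'/'_CL_'
# (and the bracket characters) are shell-safe stand-ins for WebDataset's brace
# notation. The tar filename is hypothetical.
url = "audio__OP_0..127_CL_.tar"
for bkey in ['(', '[', '<', '_OP_']:
    url = url.replace(bkey, "{")
for bkey in [')', ']', '>', '_CL_']:
    url = url.replace(bkey, "}")
assert url == "audio_{0..127}.tar"  # WebDataset expands this to audio_0.tar ... audio_127.tar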
def __init__(
    self,
    manifest_filepath: str,
    device: str,
    batch_size: int,
    labels: Union[str, List[str]],
    sample_rate: int = 16000,
    num_threads: int = 4,
    max_duration: float = 0.0,
    min_duration: float = 0.0,
    blank_index: int = -1,
    unk_index: int = -1,
    normalize: bool = True,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    pad_id: int = 0,
    trim: bool = False,
    shuffle: bool = False,
    drop_last: bool = False,
    parser: Union[str, Callable] = 'en',
    device_id: int = 0,
    global_rank: int = 0,
    world_size: int = 1,
    preprocessor_cfg: DictConfig = None,
    return_sample_id: bool = False,
):
    self.labels = labels

    parser = parsers.make_parser(labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize)

    super().__init__(
        manifest_filepath=manifest_filepath,
        device=device,
        batch_size=batch_size,
        sample_rate=sample_rate,
        num_threads=num_threads,
        max_duration=max_duration,
        min_duration=min_duration,
        bos_id=bos_id,
        eos_id=eos_id,
        pad_id=pad_id,
        trim=trim,
        shuffle=shuffle,
        drop_last=drop_last,
        parser=parser,
        device_id=device_id,
        global_rank=global_rank,
        world_size=world_size,
        preprocessor_cfg=preprocessor_cfg,
        return_sample_id=return_sample_id,
    )
def __init__(
    self,
    manifest_filepath: Union[str, 'pathlib.Path'],
    n_segments: int,
    max_duration: Optional[float] = None,
    min_duration: Optional[float] = None,
    trim: Optional[bool] = False,
    truncate_to: Optional[int] = 1,
):
    """
    See above AudioDataset for details on dataset and manifest formats.

    Unlike the regular AudioDataset, which samples random segments from each audio array as an example,
    SplicedAudioDataset concatenates all audio arrays together and indexes segments as examples. This way,
    the model sees more data (about 9x for LJSpeech) per epoch.

    Note: this class is not recommended for use in validation.

    Args:
        manifest_filepath (str, Path): Path to manifest json as described above. Can be comma-separated paths
            such as "train_1.json,train_2.json" which is treated as two separate json files.
        n_segments (int): The length of audio in samples to load. For example, given a sample rate of 16kHz, and
            n_segments=16000, a random 1-second section of audio from the clip will be loaded. The section will
            be randomly sampled every time the audio is batched. Can be set to -1 to load the entire audio.
        max_duration (float): If audio exceeds this length in seconds, it is filtered from the dataset.
            Defaults to None, which does not filter any audio.
        min_duration (float): If audio is less than this length in seconds, it is filtered from the dataset.
            Defaults to None, which does not filter any audio.
        trim (bool): Whether to use librosa.effects.trim on the audio clip.
        truncate_to (int): Ensures that the audio segment returned is a multiple of truncate_to. Defaults to 1,
            which does no truncating.
    """
    assert n_segments > 0

    collection = collections.ASRAudioText(
        manifests_files=manifest_filepath.split(','),
        parser=parsers.make_parser(),
        min_duration=min_duration,
        max_duration=max_duration,
    )
    self.trim = trim
    self.n_segments = n_segments
    self.truncate_to = truncate_to

    self.samples = []
    for index in range(len(collection)):
        example = collection[index]
        with sf.SoundFile(example.audio_file, 'r') as f:
            samples = f.read(dtype='float32').transpose()
        self.samples.append(samples)
    self.samples = np.concatenate(self.samples, axis=0)
    self.samples = self.samples[:self.samples.shape[0] - (self.samples.shape[0] % self.n_segments), ...]
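# A minimal sketch (not part of the source) of how the concatenated buffer built
# above can be indexed: example i is the i-th contiguous window of n_segments
# samples, which is what lets every segment of the corpus be seen each epoch.
def __len__(self):
    return self.samples.shape[0] // self.n_segments

def __getitem__(self, index):
    start = index * self.n_segments
    return self.samples[start:start + self.n_segments]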
def __init__(
    self,
    audio_tar_filepaths: Union[str, List[str]],
    manifest_filepath: str,
    labels: List[str],
    sample_rate: int,
    int_values: bool = False,
    augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None,
    shuffle_n: int = 0,
    min_duration: Optional[float] = None,
    max_duration: Optional[float] = None,
    max_utts: int = 0,
    blank_index: int = -1,
    unk_index: int = -1,
    normalize: bool = True,
    trim: bool = False,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    parser: Optional[str] = 'en',
    pad_id: int = 0,
    shard_strategy: str = "scatter",
    global_rank: int = 0,
    world_size: int = 0,
    return_sample_id: bool = False,
):
    self.labels = labels

    parser = parsers.make_parser(labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize)

    super().__init__(
        audio_tar_filepaths=audio_tar_filepaths,
        manifest_filepath=manifest_filepath,
        parser=parser,
        sample_rate=sample_rate,
        int_values=int_values,
        augmentor=augmentor,
        shuffle_n=shuffle_n,
        min_duration=min_duration,
        max_duration=max_duration,
        max_utts=max_utts,
        trim=trim,
        bos_id=bos_id,
        eos_id=eos_id,
        pad_id=pad_id,
        shard_strategy=shard_strategy,
        global_rank=global_rank,
        world_size=world_size,
        return_sample_id=return_sample_id,
    )
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    super().__init__(cfg=cfg, trainer=trainer)

    self.audio_to_melspec_precessor = instantiate(cfg.preprocessor)
    self.encoder = instantiate(cfg.encoder)
    self.variance_adapter = instantiate(cfg.variance_adaptor)

    self.generator = instantiate(cfg.generator)
    self.multiperioddisc = MultiPeriodDiscriminator()
    self.multiscaledisc = MultiScaleDiscriminator()

    self.melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)
    self.mel_val_loss = L1MelLoss()
    self.durationloss = DurationLoss()
    self.feat_matching_loss = FeatureMatchingLoss()
    self.disc_loss = DiscriminatorLoss()
    self.gen_loss = GeneratorLoss()
    self.mseloss = torch.nn.MSELoss()

    self.energy = cfg.add_energy_predictor
    self.pitch = cfg.add_pitch_predictor
    self.mel_loss_coeff = cfg.mel_loss_coeff
    self.pitch_loss_coeff = cfg.pitch_loss_coeff
    self.energy_loss_coeff = cfg.energy_loss_coeff
    self.splice_length = cfg.splice_length

    self.use_energy_pred = False
    self.use_pitch_pred = False
    self.log_train_images = False
    self.logged_real_samples = False
    self._tb_logger = None
    self.sample_rate = cfg.sample_rate
    self.hop_size = cfg.hop_size

    # Parser and mappings are used for inference only.
    self.parser = parsers.make_parser(name='en')
    if 'mappings_filepath' in cfg:
        mappings_filepath = cfg.get('mappings_filepath')
    else:
        logging.error(
            "ERROR: You must specify a mappings.json file in the config file under model.mappings_filepath."
        )
    mappings_filepath = self.register_artifact('mappings_filepath', mappings_filepath)
    with open(mappings_filepath, 'r') as f:
        mappings = json.load(f)
        self.word2phones = mappings['word2phones']
        self.phone2idx = mappings['phone2idx']
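# Hedged illustration of the mappings.json structure consumed above. Only the two
# top-level keys are grounded in the code; the entries themselves are made up.
example_mappings = {
    "word2phones": {"hello": ["HH", "AH0", "L", "OW1"]},
    "phone2idx": {"HH": 0, "AH0": 1, "L": 2, "OW1": 3},
}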
def __init__(self, manifest_path=None, rng=None, audio_tar_filepaths=None, shuffle_n=128, shift_impulse=False):
    self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True)
    self._audiodataset = None
    self._tarred_audio = False
    self._shift_impulse = shift_impulse
    self._data_iterator = None

    if audio_tar_filepaths:
        self._tarred_audio = True
        self._audiodataset = AugmentationDataset(manifest_path, audio_tar_filepaths, shuffle_n)
        self._data_iterator = iter(self._audiodataset)

    self._rng = random.Random() if rng is None else rng
def parser(self):
    if self._parser is not None:
        return self._parser

    self._parser = parsers.make_parser(
        labels=self._cfg.labels,
        name='en',
        unk_id=-1,
        blank_id=-1,
        do_normalize=True,
        abbreviation_version="fastpitch",
        make_table=False,
    )
    return self._parser
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    super().__init__(cfg=cfg, trainer=trainer)

    schema = OmegaConf.structured(FastSpeech2Config)
    # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    # Ensure passed cfg is compliant with schema
    OmegaConf.merge(cfg, schema)

    self.pitch = cfg.add_pitch_predictor
    self.energy = cfg.add_energy_predictor
    self.duration_coeff = cfg.duration_coeff

    self.audio_to_melspec_preprocessor = instantiate(self._cfg.preprocessor)
    self.encoder = instantiate(self._cfg.encoder)
    self.mel_decoder = instantiate(self._cfg.decoder)
    self.variance_adapter = instantiate(self._cfg.variance_adaptor)
    self.loss = L2MelLoss()
    self.mseloss = torch.nn.MSELoss()
    self.durationloss = DurationLoss()

    self.log_train_images = False

    # Parser and mappings are used for inference only.
    self.parser = parsers.make_parser(name='en')
    if 'mappings_filepath' in cfg:
        mappings_filepath = cfg.get('mappings_filepath')
    else:
        logging.error(
            "ERROR: You must specify a mappings.json file in the config file under model.mappings_filepath."
        )
    mappings_filepath = self.register_artifact('mappings_filepath', mappings_filepath)
    with open(mappings_filepath, 'r') as f:
        mappings = json.load(f)
        self.word2phones = mappings['word2phones']
        self.phone2idx = mappings['phone2idx']
def __init__( self, manifest_filepath: Union[str, "pathlib.Path"], n_segments: int, max_duration: Optional[float] = None, min_duration: Optional[float] = None, trim: Optional[bool] = False, truncate_to: Optional[int] = 1, ): """ Mostly compliant with nemo.collections.asr.data.datalayers.AudioToTextDataset except it only returns Audio without text. Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). Each new line is a different sample. Note that text is required, but is ignored for AudioDataset. Example below: {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} ... {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": "utterance_id", "ctm_utt": "en_4156", "side": "A"} Args: manifest_filepath (str, Path): Path to manifest json as described above. Can be comma-separated paths such as "train_1.json,train_2.json" which is treated as two separate json files. n_segments (int): The length of audio in samples to load. For example, given a sample rate of 16kHz, and n_segments=16000, a random 1 second section of audio from the clip will be loaded. The section will be randomly sampled everytime the audio is batched. Can be set to -1 to load the entire audio. max_duration (float): If audio exceeds this length in seconds, it is filtered from the dataset. Defaults to None, which does not filter any audio. min_duration(float): If audio is less than this length in seconds, it is filtered from the dataset. Defaults to None, which does not filter any audio. trim (bool): Whether to use librosa.effects.trim on the audio clip truncate_to (int): Ensures that the audio segment returned is a multiple of truncate_to. Defaults to 1, which does no truncating. """ self.collection = collections.ASRAudioText( manifests_files=manifest_filepath.split(","), parser=parsers.make_parser(), min_duration=min_duration, max_duration=max_duration, ) self.trim = trim self.n_segments = n_segments self.truncate_to = truncate_to
def parser(self):
    if self._parser is not None:
        return self._parser

    if self.learn_alignment:
        vocab = AudioToCharWithDursF0Dataset.make_vocab(**self._cfg.train_ds.dataset.vocab)
        self._parser = vocab.encode
    else:
        self._parser = parsers.make_parser(
            labels=self._cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )
    return self._parser
def __init__(
    self,
    manifest_filepath: str,
    labels: Union[str, List[str]],
    sample_rate: int,
    int_values: bool = False,
    augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
    max_duration: Optional[float] = None,
    min_duration: Optional[float] = None,
    max_utts: int = 0,
    blank_index: int = -1,
    unk_index: int = -1,
    normalize: bool = True,
    trim: bool = False,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    pad_id: int = 0,
    parser: Union[str, Callable] = 'en',
    return_sample_id: bool = False,
):
    self.labels = labels

    parser = parsers.make_parser(labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize)

    super().__init__(
        manifest_filepath=manifest_filepath,
        parser=parser,
        sample_rate=sample_rate,
        int_values=int_values,
        augmentor=augmentor,
        max_duration=max_duration,
        min_duration=min_duration,
        max_utts=max_utts,
        trim=trim,
        bos_id=bos_id,
        eos_id=eos_id,
        pad_id=pad_id,
        return_sample_id=return_sample_id,
    )
def main():
    filelist_base = 'https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists/'
    filelists = ['train', 'val', 'test']

    # NeMo parser for text normalization
    text_parser = parsers.make_parser(name='en')

    for split in filelists:
        # Download file list if necessary
        filelist_path = os.path.join(args.ljspeech_base, f"ljs_audio_text_{split}_filelist.txt")
        if not os.path.exists(filelist_path):
            wget.download(f"{filelist_base}/ljs_audio_text_{split}_filelist.txt", out=args.ljspeech_base)

        manifest_target = os.path.join(args.ljspeech_base, f"ljspeech_{split}.json")
        with open(manifest_target, 'w') as f_out:
            with open(filelist_path, 'r') as filelist:
                print(f"\nCreating {manifest_target}...")
                for line in filelist:
                    basename = line[6:16]
                    text = text_parser._normalize(line[21:].strip())

                    # Make sure corresponding wavfile exists and write .txt transcript
                    wav_path = os.path.join(args.ljspeech_base, 'wavs/', basename + '.wav')
                    assert os.path.exists(wav_path)
                    txt_path = os.path.join(args.ljspeech_base, 'wavs/', basename + '.txt')
                    with open(txt_path, 'w') as f_txt:
                        f_txt.write(text)

                    # Write manifest entry
                    entry = {
                        'audio_filepath': wav_path,
                        'duration': sox.file_info.duration(wav_path),
                        'text': text,
                    }
                    f_out.write(json.dumps(entry) + '\n')
def parser(self):
    if self._parser is not None:
        return self._parser

    if self._validation_dl is not None:
        return self._validation_dl.dataset.manifest_processor.parser
    if self._test_dl is not None:
        return self._test_dl.dataset.manifest_processor.parser
    if self._train_dl is not None:
        return self._train_dl.dataset.manifest_processor.parser

    # Else construct a parser
    # Try to get params from validation, test, and then train
    params = {}
    try:
        params = self._cfg.validation_ds.dataset
    except ConfigAttributeError:
        pass
    if params == {}:
        try:
            params = self._cfg.test_ds.dataset
        except ConfigAttributeError:
            pass
    if params == {}:
        try:
            params = self._cfg.train_ds.dataset
        except ConfigAttributeError:
            pass

    name = params.get('parser', None) or 'en'
    unk_id = params.get('unk_index', None) or -1
    blank_id = params.get('blank_index', None) or -1
    do_normalize = params.get('normalize', True)
    self._parser = parsers.make_parser(
        labels=self._cfg.labels,
        name=name,
        unk_id=unk_id,
        blank_id=blank_id,
        do_normalize=do_normalize,
    )

    return self._parser
def main():
    args = get_args()
    ljspeech_dir = args.ljspeech_dir

    # Download LJSpeech dataset if needed
    if args.download_ljspeech:
        get_lj_speech(args.ljspeech_dir)
        ljspeech_dir = os.path.join(args.ljspeech_dir, "LJSpeech-1.1")

    # Create normalizer
    if args.normalizer_class == "ENCharParser":
        normalizer_call = parsers.make_parser(name='en')._normalize
    elif args.normalizer_class == "Normalizer":
        whitelist_path = args.whitelist_path
        if whitelist_path is None:
            wget.download(
                "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
                out=ljspeech_dir,
            )
            whitelist_path = os.path.join(ljspeech_dir, "whitelist_lj_speech.tsv")

        text_normalizer = Normalizer(
            lang="en",
            input_case="cased",
            whitelist=whitelist_path,
            overwrite_cache=True,
            cache_dir=os.path.join(ljspeech_dir, "cache_dir"),
        )
        text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}
        normalizer_call = lambda x: text_normalizer.normalize(x, **text_normalizer_call_kwargs)
    else:
        raise ValueError("normalizer_class must be ENCharParser or Normalizer")

    # Create manifests (based on NVIDIA's predefined split) and optionally save transcripts in .txt files
    filelist_base = 'https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists'
    filelists = ['train', 'val', 'test']
    for split in filelists:
        # Download file list if necessary
        filelist_path = os.path.join(ljspeech_dir, f"ljs_audio_text_{split}_filelist.txt")
        if not os.path.exists(filelist_path):
            wget.download(f"{filelist_base}/ljs_audio_text_{split}_filelist.txt", out=ljspeech_dir)

        manifest_target = os.path.join(ljspeech_dir, f"ljspeech_{split}.json")
        with open(manifest_target, 'w') as f_out:
            with open(filelist_path, 'r') as filelist:
                print(f"\nCreating {manifest_target}...")
                for line in filelist:
                    basename = line[6:16]
                    text = line[21:].strip()
                    norm_text = normalizer_call(text)

                    # Make sure corresponding wavfile exists
                    wav_path = os.path.join(ljspeech_dir, 'wavs', basename + '.wav')
                    assert os.path.exists(wav_path)

                    if args.save_transcripts_in_txt:
                        txt_path = os.path.join(ljspeech_dir, 'wavs', basename + '.txt')
                        with open(txt_path, 'w') as f_txt:
                            f_txt.write(norm_text)

                    # Write manifest entry
                    entry = {
                        'audio_filepath': wav_path,
                        'duration': sox.file_info.duration(wav_path),
                        'text': norm_text if args.manifest_text_var_is_normalized else text,
                        'normalized_text': norm_text,
                    }
                    f_out.write(json.dumps(entry) + '\n')
def __init__(
    self,
    manifest_filepath: str,
    sample_rate: int,
    supplementary_folder: Path,
    max_duration: Optional[float] = None,
    min_duration: Optional[float] = None,
    ignore_file: Optional[str] = None,
    trim: bool = False,
    n_fft=1024,
    win_length=None,
    hop_length=None,
    window="hann",
    n_mels=64,
    lowfreq=0,
    highfreq=None,
    pitch_fmin=80,
    pitch_fmax=640,
    pitch_avg=0,
    pitch_std=1,
    tokenize_text=True,
):
    """Dataset that loads audio, log mel specs, text tokens, duration / attention priors, pitches, and energies.
    Log mels, priors, pitches, and energies will be computed on the fly and saved in the supplementary_folder if
    they did not exist before.

    Args:
        manifest_filepath (str, Path, List[str, Path]): Path(s) to the .json manifests containing information on
            the dataset. Each line in the .json file should be valid json. Note: the .json file itself is not
            valid json. Each line should contain the following:
                "audio_filepath": <PATH_TO_WAV>
                "mel_filepath": <PATH_TO_LOG_MEL_PT> (Optional)
                "duration": <Duration of audio clip in seconds> (Optional)
                "text": <THE_TRANSCRIPT> (Optional)
        sample_rate (int): The sample rate of the audio, or the sample rate that we will resample all files to.
        supplementary_folder (Path): A folder that contains or will contain extra information such as log_mel if
            not specified in the manifest .json file. It will also contain priors, pitches, and energies.
        max_duration (Optional[float]): Max duration of audio clips in seconds. All samples exceeding this will be
            pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not load
            audio to compute duration. Defaults to None which does not prune.
        min_duration (Optional[float]): Min duration of audio clips in seconds. All samples lower than this will be
            pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not load
            audio to compute duration. Defaults to None which does not prune.
        ignore_file (Optional[str, Path]): The location of a pickle-saved list of audio_ids (the stem of the audio
            files) that will be pruned prior to training. Defaults to None which does not prune.
        trim (Optional[bool]): Whether to apply librosa.effects.trim to the audio file. Defaults to False.
        n_fft (Optional[int]): The number of fft samples. Defaults to 1024.
        win_length (Optional[int]): The length of the stft windows. Defaults to None which uses n_fft.
        hop_length (Optional[int]): The hop length between fft computations. Defaults to None which uses n_fft//4.
        window (Optional[str]): One of 'hann', 'hamming', 'blackman', 'bartlett', 'none', corresponding to the
            equivalent torch window function.
        n_mels (Optional[int]): The number of mel filters. Defaults to 64.
        lowfreq (Optional[int]): The lowfreq input to the mel filter calculation. Defaults to 0.
        highfreq (Optional[int]): The highfreq input to the mel filter calculation. Defaults to None.
        pitch_fmin (Optional[int]): The fmin input to librosa.pyin. Defaults to 80.
        pitch_fmax (Optional[int]): The fmax input to librosa.pyin. Defaults to 640.
        pitch_avg (Optional[float]): The mean that we use to normalize the pitch. Defaults to 0.
        pitch_std (Optional[float]): The std that we use to normalize the pitch. Defaults to 1.
        tokenize_text (Optional[bool]): Whether to tokenize (turn chars into ints). Defaults to True.
    """
    super().__init__()

    self.pitch_fmin = pitch_fmin
    self.pitch_fmax = pitch_fmax
    self.pitch_avg = pitch_avg
    self.pitch_std = pitch_std
    self.win_length = win_length or n_fft
    self.sample_rate = sample_rate
    self.hop_len = hop_length or n_fft // 4

    self.parser = make_parser(name="en", do_tokenize=tokenize_text)
    self.pad_id = self.parser._blank_id

    Path(supplementary_folder).mkdir(parents=True, exist_ok=True)
    self.supplementary_folder = supplementary_folder

    audio_files = []
    total_duration = 0

    # Load data from manifests
    # Note: audio is always required, even for text -> mel_spectrogram models, due to the fact that most models
    # extract pitch from the audio
    # Note: mel_filepath is not required and if not present, we then check the supplementary folder. If we fail, we
    # compute the mel on the fly and save it to the supplementary folder
    # Note: text is not required. Any models that rely on text (spectrogram generators, end-to-end models) will
    # fail if not set. However vocoders (mel -> audio) will be able to work without text
    if isinstance(manifest_filepath, str):
        manifest_filepath = [manifest_filepath]
    for manifest_file in manifest_filepath:
        with open(Path(manifest_file).expanduser(), 'r') as f:
            logging.info(f"Loading dataset from {manifest_file}.")
            for line in f:
                item = json.loads(line)

                # Grab audio, text, mel if they exist
                file_info = {}
                file_info["audio_filepath"] = item["audio_filepath"]
                file_info["mel_filepath"] = item["mel_filepath"] if "mel_filepath" in item else None
                file_info["duration"] = item["duration"] if "duration" in item else None

                # Parse text
                file_info["text_tokens"] = None
                if "text" in item:
                    text = item["text"]
                    text_tokens = self.parser(text)
                    file_info["text_tokens"] = text_tokens

                audio_files.append(file_info)

                if file_info["duration"] is None:
                    logging.info(
                        "Not all audio files have duration information. Duration logging will be disabled."
                    )
                    total_duration = None
                if total_duration is not None:
                    total_duration += item["duration"]

    logging.info(f"Loaded dataset with {len(audio_files)} files.")
    if total_duration is not None:
        logging.info(f"Dataset contains {total_duration/3600:.2f} hours.")

    self.data = []

    if ignore_file:
        logging.info(f"Using {ignore_file} to prune dataset.")
        with open(Path(ignore_file).expanduser(), "rb") as f:
            wavs_to_ignore = set(pickle.load(f))

    pruned_duration = 0 if total_duration is not None else None
    pruned_items = 0
    for item in audio_files:
        audio_path = item['audio_filepath']
        audio_id = Path(audio_path).stem

        # Prune data according to min/max_duration & the ignore file
        if total_duration is not None:
            if (min_duration and item["duration"] < min_duration) or (
                max_duration and item["duration"] > max_duration
            ):
                pruned_duration += item["duration"]
                pruned_items += 1
                continue

        if ignore_file and (audio_id in wavs_to_ignore):
            pruned_items += 1
            pruned_duration += item["duration"]
            wavs_to_ignore.remove(audio_id)
            continue

        self.data.append(item)

    logging.info(f"Pruned {pruned_items} files. Final dataset contains {len(self.data)} files")
    if pruned_duration is not None:
        logging.info(
            f"Pruned {pruned_duration/3600:.2f} hours. Final dataset contains "
            f"{(total_duration-pruned_duration)/3600:.2f} hours."
        )

    self.featurizer = WaveformFeaturizer(sample_rate=sample_rate)
    self.trim = trim

    filterbanks = torch.tensor(
        librosa.filters.mel(sample_rate, n_fft, n_mels=n_mels, fmin=lowfreq, fmax=highfreq), dtype=torch.float
    ).unsqueeze(0)
    self.fb = filterbanks

    torch_windows = {
        'hann': torch.hann_window,
        'hamming': torch.hamming_window,
        'blackman': torch.blackman_window,
        'bartlett': torch.bartlett_window,
        'none': None,
    }
    window_fn = torch_windows.get(window, None)
    window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None

    self.stft = lambda x: stft_patch(
        input=x,
        n_fft=n_fft,
        hop_length=self.hop_len,
        win_length=self.win_length,
        # Guard against window='none', in which case window_tensor is None
        window=window_tensor.to(torch.float) if window_tensor is not None else None,
    )
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)

    self._parser = parsers.make_parser(
        labels=cfg.labels,
        name='en',
        unk_id=-1,
        blank_id=-1,
        do_normalize=True,
        abbreviation_version="fastpitch",
        make_table=False,
    )

    super().__init__(cfg=cfg, trainer=trainer)

    schema = OmegaConf.structured(FastPitchHifiGanE2EConfig)
    # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    # Ensure passed cfg is compliant with schema
    OmegaConf.merge(cfg, schema)

    self.preprocessor = instantiate(cfg.preprocessor)
    self.melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)

    self.encoder = instantiate(cfg.input_fft)
    self.duration_predictor = instantiate(cfg.duration_predictor)
    self.pitch_predictor = instantiate(cfg.pitch_predictor)

    self.generator = instantiate(cfg.generator)
    self.multiperioddisc = MultiPeriodDiscriminator()
    self.multiscaledisc = MultiScaleDiscriminator()
    self.mel_val_loss = L1MelLoss()
    self.feat_matching_loss = FeatureMatchingLoss()
    self.disc_loss = DiscriminatorLoss()
    self.gen_loss = GeneratorLoss()

    self.max_token_duration = cfg.max_token_duration

    self.pitch_emb = torch.nn.Conv1d(
        1,
        cfg.symbols_embedding_dim,
        kernel_size=cfg.pitch_embedding_kernel_size,
        padding=int((cfg.pitch_embedding_kernel_size - 1) / 2),
    )

    # Store values precomputed from training data for convenience
    self.register_buffer('pitch_mean', torch.zeros(1))
    self.register_buffer('pitch_std', torch.zeros(1))

    self.pitchloss = PitchLoss()
    self.durationloss = DurationLoss()

    self.mel_loss_coeff = cfg.mel_loss_coeff

    self.log_train_images = False
    self.logged_real_samples = False
    self._tb_logger = None
    self.hann_window = None
    self.splice_length = cfg.splice_length
    self.sample_rate = cfg.sample_rate
    self.hop_size = cfg.hop_size
def __init__(
    self,
    manifest_filepath: str,
    device: str,
    batch_size: int,
    labels: Union[str, List[str]],
    sample_rate: int = 16000,
    num_threads: int = 4,
    max_duration: float = 0.0,
    min_duration: float = 0.0,
    blank_index: int = -1,
    unk_index: int = -1,
    normalize: bool = True,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    trim: bool = False,
    shuffle: bool = True,
    drop_last: bool = False,
    parser: Union[str, Callable] = 'en',
    device_id: int = 0,
    global_rank: int = 0,
    world_size: int = 1,
    preprocessor_cfg: DictConfig = None,
):
    self.drop_last = drop_last  # used by lr_scheduler

    if not HAVE_DALI:
        raise ModuleNotFoundError(
            f"{self} requires NVIDIA DALI to be installed. "
            f"See: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#id1"
        )

    if device not in ('cpu', 'gpu'):
        raise ValueError(f"{self} received an unexpected device argument {device}. Supported values are: 'cpu', 'gpu'")

    self.batch_size = batch_size  # Used by NeMo

    self.device = device
    self.device_id = device_id

    if world_size > 1:
        self.shard_id = global_rank
        self.num_shards = world_size
    else:
        self.shard_id = None
        self.num_shards = None

    self.labels = labels
    if self.labels is None or len(self.labels) == 0:
        raise ValueError(f"{self} expects a non-empty labels list")

    self.parser = parsers.make_parser(
        labels=labels,
        name=parser,
        unk_id=unk_index,
        blank_id=blank_index,
        do_normalize=normalize,
    )

    self.eos_id = eos_id
    self.bos_id = bos_id
    self.sample_rate = sample_rate

    self.pipe = Pipeline(
        batch_size=batch_size,
        num_threads=num_threads,
        device_id=self.device_id,
        exec_async=True,
        exec_pipelined=True,
    )

    has_preprocessor = preprocessor_cfg is not None
    if has_preprocessor:
        if preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor":
            feature_type = "mel_spectrogram"
        elif preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMFCCPreprocessor":
            feature_type = "mfcc"
        else:
            raise ValueError(
                f"{self} received an unexpected preprocessor configuration: {preprocessor_cfg._target_}."
                f" Supported preprocessors are: AudioToMelSpectrogramPreprocessor, AudioToMFCCPreprocessor"
            )

        # Default values taken from AudioToMelSpectrogramPreprocessor
        params = preprocessor_cfg
        self.dither = params['dither'] if 'dither' in params else 0.0
        self.preemph = params['preemph'] if 'preemph' in params else 0.97
        self.window_size_sec = params['window_size'] if 'window_size' in params else 0.02
        self.window_stride_sec = params['window_stride'] if 'window_stride' in params else 0.01
        self.sample_rate = params['sample_rate'] if 'sample_rate' in params else sample_rate
        self.window_size = int(self.window_size_sec * self.sample_rate)
        # Note: the stride must come from window_stride_sec, not window_size_sec
        self.window_stride = int(self.window_stride_sec * self.sample_rate)

        normalize = params['normalize'] if 'normalize' in params else 'per_feature'
        if normalize == 'per_feature':  # Each freq channel independently
            self.normalization_axes = (1,)
        elif normalize == 'all_features':
            self.normalization_axes = (0, 1)
        else:
            raise ValueError(
                f"{self} received {normalize} for the normalize parameter."
                f" It must be either 'per_feature' or 'all_features'."
            )

        self.window = None
        window_name = params['window'] if 'window' in params else None
        torch_windows = {
            'hamming': torch.hamming_window,
            'blackman': torch.blackman_window,
            'bartlett': torch.bartlett_window,
        }

        if window_name is None or window_name == 'hann':
            self.window = None  # Hann is DALI's default
        elif window_name == 'ones':
            self.window = torch.ones(self.window_size)
        else:
            try:
                window_fn = torch_windows.get(window_name, None)
                self.window = window_fn(self.window_size, periodic=False)
            except:
                raise ValueError(
                    f"{self} received {window_name} for the window parameter."
                    f" It must be one of: ('hann', 'ones', 'hamming', 'blackman', 'bartlett', None)."
                    f" None is equivalent to 'hann'."
                )

        self.n_fft = params['n_fft'] if 'n_fft' in params else None  # None means default
        self.n_mels = params['n_mels'] if 'n_mels' in params else 64
        self.n_mfcc = params['n_mfcc'] if 'n_mfcc' in params else 64

        features = params['features'] if 'features' in params else 0
        if features > 0:
            if feature_type == 'mel_spectrogram':
                self.n_mels = features
            elif feature_type == 'mfcc':
                self.n_mfcc = features

        # TODO Implement frame splicing
        if 'frame_splicing' in params:
            assert params['frame_splicing'] == 1, "Frame splicing is not implemented"

        self.freq_low = params['lowfreq'] if 'lowfreq' in params else 0.0
        self.freq_high = params['highfreq'] if 'highfreq' in params else self.sample_rate / 2.0
        self.log_features = params['log'] if 'log' in params else True

        # We want to avoid taking the log of zero
        # There are two options: either adding or clamping to a small value
        self.log_zero_guard_type = params['log_zero_guard_type'] if 'log_zero_guard_type' in params else 'add'
        if self.log_zero_guard_type not in ["add", "clamp"]:
            raise ValueError(
                f"{self} received {self.log_zero_guard_type} for the "
                f"log_zero_guard_type parameter. It must be either 'add' or "
                f"'clamp'."
            )

        self.log_zero_guard_value = params['log_zero_guard_value'] if 'log_zero_guard_value' in params else 1e-05
        if isinstance(self.log_zero_guard_value, str):
            if self.log_zero_guard_value == "tiny":
                self.log_zero_guard_value = torch.finfo(torch.float32).tiny
            elif self.log_zero_guard_value == "eps":
                self.log_zero_guard_value = torch.finfo(torch.float32).eps
            else:
                raise ValueError(
                    f"{self} received {self.log_zero_guard_value} for the log_zero_guard_type parameter."
                    f" It must be either a number, 'tiny', or 'eps'"
                )

        self.mag_power = params['mag_power'] if 'mag_power' in params else 2
        if self.mag_power != 1.0 and self.mag_power != 2.0:
            raise ValueError(
                f"{self} received {self.mag_power} for the mag_power parameter."
                f" It must be either 1.0 or 2.0."
            )

        self.pad_to = params['pad_to'] if 'pad_to' in params else 16
        self.pad_value = params['pad_value'] if 'pad_value' in params else 0.0

    with self.pipe:
        audio, transcript = dali.fn.nemo_asr_reader(
            name="Reader",
            manifest_filepaths=manifest_filepath.split(','),
            dtype=dali.types.FLOAT,
            downmix=True,
            sample_rate=float(self.sample_rate),
            min_duration=min_duration,
            max_duration=max_duration,
            read_sample_rate=False,
            read_text=True,
            random_shuffle=shuffle,
            shard_id=self.shard_id,
            num_shards=self.num_shards,
            pad_last_batch=False,
        )

        transcript_len = dali.fn.shapes(dali.fn.reshape(transcript, shape=[-1]))
        transcript = dali.fn.pad(transcript)

        # Extract nonsilent region, if necessary
        if trim:
            # Need to extract non-silent region before moving to the GPU
            roi_start, roi_len = dali.fn.nonsilent_region(audio, cutoff_db=-60)
            audio = audio.gpu() if self.device == 'gpu' else audio
            audio = dali.fn.slice(
                audio, roi_start, roi_len, normalized_anchor=False, normalized_shape=False, axes=[0]
            )
        else:
            audio = audio.gpu() if self.device == 'gpu' else audio

        if not has_preprocessor:
            # No preprocessing, the output is the audio signal
            audio_len = dali.fn.shapes(dali.fn.reshape(audio, shape=[-1]))
            audio = dali.fn.pad(audio)
            self.pipe.set_outputs(audio, audio_len, transcript, transcript_len)
        else:
            # Additive gaussian noise (dither)
            if self.dither > 0.0:
                gaussian_noise = dali.fn.normal_distribution(device=self.device)
                audio = audio + self.dither * gaussian_noise

            # Preemphasis filter
            if self.preemph > 0.0:
                audio = dali.fn.preemphasis_filter(audio, preemph_coeff=self.preemph)

            # Power spectrogram
            spec = dali.fn.spectrogram(
                audio, nfft=self.n_fft, window_length=self.window_size, window_step=self.window_stride
            )

            if feature_type == 'mel_spectrogram' or feature_type == 'mfcc':
                # Spectrogram to Mel Spectrogram
                spec = dali.fn.mel_filter_bank(
                    spec,
                    sample_rate=self.sample_rate,
                    nfilter=self.n_mels,
                    normalize=True,
                    freq_low=self.freq_low,
                    freq_high=self.freq_high,
                )
                # Mel Spectrogram to MFCC
                if feature_type == 'mfcc':
                    spec = dali.fn.mfcc(spec, n_mfcc=self.n_mfcc)

            # Logarithm
            if self.log_zero_guard_type == 'add':
                spec = spec + self.log_zero_guard_value

            spec = dali.fn.to_decibels(
                spec, multiplier=math.log(10), reference=1.0, cutoff_db=math.log(self.log_zero_guard_value)
            )

            # Normalization
            spec = dali.fn.normalize(spec, axes=self.normalization_axes)

            # Extracting the length of the spectrogram
            spec_len = dali.fn.slice(dali.fn.shapes(spec), 1, 1, axes=(0,))

            # Pads feature dimension to be a multiple of `pad_to` and the temporal dimension to be as big as the
            # largest sample (shape -1)
            spec = dali.fn.pad(spec, fill_value=self.pad_value, axes=(0, 1), align=(self.pad_to, 1), shape=(1, -1))
            self.pipe.set_outputs(spec, spec_len, transcript, transcript_len)

    # Building DALI pipeline
    self.pipe.build()

    if has_preprocessor:
        output_names = ['processed_signal', 'processed_signal_len', 'transcript_raw', 'transcript_raw_len']
    else:
        output_names = ['audio', 'audio_len', 'transcript_raw', 'transcript_raw_len']

    last_batch_policy = LastBatchPolicy.DROP if drop_last else LastBatchPolicy.PARTIAL
    self._iter = DALIPytorchIterator(
        [self.pipe],
        output_map=output_names,
        reader_name="Reader",
        last_batch_policy=last_batch_policy,
        dynamic_shape=True,
        auto_reset=True,
    )

    # TODO come up with a better solution
    class DummyDataset:
        def __init__(self, parent):
            self.parent = parent

        def __len__(self):
            return self.parent.size

    self.dataset = DummyDataset(self)  # Used by NeMo
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        choices=[x.pretrained_model_name for x in EncDecCTCModel.list_available_models()],
    )
    parser.add_argument(
        "--tts_model_spec",
        type=str,
        default="tts_en_tacotron2",
        choices=[x.pretrained_model_name for x in SpectrogramGenerator.list_available_models()],
    )
    parser.add_argument(
        "--tts_model_vocoder",
        type=str,
        default="tts_waveglow_88m",
        choices=[x.pretrained_model_name for x in Vocoder.list_available_models()],
    )
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument("--trim", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.debug:
        logging.set_verbosity(logging.DEBUG)

    logging.info(f"Using NGC cloud ASR model {args.asr_model}")
    asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)
    logging.info(f"Using NGC cloud TTS Spectrogram Generator model {args.tts_model_spec}")
    tts_model_spec = SpectrogramGenerator.from_pretrained(model_name=args.tts_model_spec)
    logging.info(f"Using NGC cloud TTS Vocoder model {args.tts_model_vocoder}")
    tts_model_vocoder = Vocoder.from_pretrained(model_name=args.tts_model_vocoder)

    models = [asr_model, tts_model_spec, tts_model_vocoder]
    if torch.cuda.is_available():
        for i, m in enumerate(models):
            models[i] = m.cuda()
    for m in models:
        m.eval()
    asr_model, tts_model_spec, tts_model_vocoder = models

    parser = parsers.make_parser(
        labels=asr_model.decoder.vocabulary,
        name="en",
        unk_id=-1,
        blank_id=-1,
        do_normalize=True,
    )
    labels_map = dict([(i, asr_model.decoder.vocabulary[i]) for i in range(len(asr_model.decoder.vocabulary))])

    tts_input = []
    asr_references = []
    longest_tts_input = 0
    for test_str in LIST_OF_TEST_STRINGS:
        tts_parsed_input = tts_model_spec.parse(test_str)
        if len(tts_parsed_input[0]) > longest_tts_input:
            longest_tts_input = len(tts_parsed_input[0])
        tts_input.append(tts_parsed_input.squeeze())

        asr_parsed = parser(test_str)
        asr_parsed = ''.join([labels_map[c] for c in asr_parsed])
        asr_references.append(asr_parsed)

    # Pad TTS Inputs
    for i, text in enumerate(tts_input):
        pad = (0, longest_tts_input - len(text))
        tts_input[i] = torch.nn.functional.pad(text, pad, value=68)

    logging.debug(tts_input)

    # Do TTS
    tts_input = torch.stack(tts_input)
    if torch.cuda.is_available():
        tts_input = tts_input.cuda()
    specs = tts_model_spec.generate_spectrogram(tokens=tts_input)
    audio = []
    step = ceil(len(specs) / 4)
    for i in range(4):
        audio.append(tts_model_vocoder.convert_spectrogram_to_audio(spec=specs[i * step:i * step + step]))
    audio = [item for sublist in audio for item in sublist]

    audio_file_paths = []
    # Save audio
    logging.debug(f"args.trim: {args.trim}")
    for i, aud in enumerate(audio):
        aud = aud.cpu().numpy()
        if args.trim:
            aud = librosa.effects.trim(aud, top_db=40)[0]
        soundfile.write(f"{i}.wav", aud, samplerate=22050)
        audio_file_paths.append(str(Path(f"{i}.wav")))

    # Do ASR
    hypotheses = asr_model.transcribe(audio_file_paths)
    for i, _ in enumerate(hypotheses):
        logging.debug(f"{i}")
        logging.debug(f"ref:'{asr_references[i]}'")
        logging.debug(f"hyp:'{hypotheses[i]}'")
    wer_value = word_error_rate(hypotheses=hypotheses, references=asr_references)
    if wer_value > args.wer_tolerance:
        raise ValueError(f"Got WER of {wer_value}. It was higher than {args.wer_tolerance}")
    logging.info(f'Got WER of {wer_value}. Tolerance was {args.wer_tolerance}')