def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Build the FastPitch model from its configuration.

    Args:
        cfg: Model configuration (``dict`` or ``DictConfig``); validated
            against the ``FastPitchConfig`` schema after ModelPT init.
        trainer: Optional PyTorch Lightning ``Trainer`` forwarded to ModelPT.
    """
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)

    # Must be read before super().__init__(): the flag drives the loss
    # scaling and aligner construction below.
    # Idiom/consistency fix: `.get` with a default replaces the original
    # `if "learn_alignment" in cfg:` membership check, matching the style
    # of the sibling constructor in this file.
    self.learn_alignment = cfg.get("learn_alignment", False)
    self._parser = None
    self._tb_logger = None
    super().__init__(cfg=cfg, trainer=trainer)

    schema = OmegaConf.structured(FastPitchConfig)
    # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    # Ensure passed cfg is compliant with schema; merge raises on a schema
    # violation and the merged result is intentionally discarded.
    OmegaConf.merge(cfg, schema)

    self.bin_loss_warmup_epochs = 100
    self.aligner = None
    self.log_train_images = False

    self.mel_loss = MelLoss()
    # With a learned aligner, pitch/duration losses are down-weighted so the
    # alignment losses dominate early training.
    loss_scale = 0.1 if self.learn_alignment else 1.0
    self.pitch_loss = PitchLoss(loss_scale=loss_scale)
    self.duration_loss = DurationLoss(loss_scale=loss_scale)

    input_fft_kwargs = {}
    if self.learn_alignment:
        self.aligner = instantiate(self._cfg.alignment_module)
        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()
        # The vocab determines the text-embedding table size and padding
        # index passed into the input FFT (encoder) below.
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**self._cfg.train_ds.dataset.vocab)
        input_fft_kwargs["n_embed"] = len(self.vocab.labels)
        input_fft_kwargs["padding_idx"] = self.vocab.pad

    self.preprocessor = instantiate(self._cfg.preprocessor)

    input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
    output_fft = instantiate(self._cfg.output_fft)
    duration_predictor = instantiate(self._cfg.duration_predictor)
    pitch_predictor = instantiate(self._cfg.pitch_predictor)

    self.fastpitch = FastPitchModule(
        input_fft,
        output_fft,
        duration_predictor,
        pitch_predictor,
        self.aligner,
        cfg.n_speakers,
        cfg.symbols_embedding_dim,
        cfg.pitch_embedding_kernel_size,
        cfg.n_mel_channels,
    )
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Build the FastPitch model (tokenizer/normalizer variant).

    Note on ordering: the normalizer, tokenizer/vocab, and
    ``input_fft_kwargs`` must all be prepared BEFORE ``super().__init__()``
    — presumably because ModelPT's init sets up datasets/dataloaders that
    depend on them (TODO confirm against ModelPT).

    Args:
        cfg: Model configuration; converted to a Hydra-1.0-compatible
            ``DictConfig`` first.
        trainer: Optional PyTorch Lightning ``Trainer`` forwarded to ModelPT.
    """
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.convert_model_config_to_dict_config(cfg)
    cfg = model_utils.maybe_update_config_version(cfg)

    # Setup normalizer
    self.normalizer = None
    self.text_normalizer_call = None
    self.text_normalizer_call_kwargs = {}
    self._setup_normalizer(cfg)

    self.learn_alignment = cfg.get("learn_alignment", False)

    # Setup vocabulary (=tokenizer) and input_fft_kwargs (supported only with self.learn_alignment=True)
    input_fft_kwargs = {}
    if self.learn_alignment:
        self.vocab = None
        # Dispatch on the dataset class name taken from the Hydra _target_ path.
        self.ds_class_name = cfg.train_ds.dataset._target_.split(".")[-1]

        if self.ds_class_name == "TTSDataset":
            self._setup_tokenizer(cfg)
            assert self.vocab is not None
            input_fft_kwargs["n_embed"] = len(self.vocab.tokens)
            input_fft_kwargs["padding_idx"] = self.vocab.pad
        elif self.ds_class_name == "AudioToCharWithPriorAndPitchDataset":
            # Deprecated dataset: fall back to a default tokenizer config,
            # inference only.
            logging.warning(
                "AudioToCharWithPriorAndPitchDataset class has been deprecated. No support for"
                " training or finetuning. Only inference is supported."
            )
            tokenizer_conf = self._get_default_text_tokenizer_conf()
            self._setup_tokenizer(tokenizer_conf)
            assert self.vocab is not None
            input_fft_kwargs["n_embed"] = len(self.vocab.tokens)
            input_fft_kwargs["padding_idx"] = self.vocab.pad
        else:
            raise ValueError(f"Unknown dataset class: {self.ds_class_name}")

    self._parser = None
    self._tb_logger = None
    super().__init__(cfg=cfg, trainer=trainer)

    self.bin_loss_warmup_epochs = cfg.get("bin_loss_warmup_epochs", 100)
    self.log_train_images = False

    # Default loss scaling depends on whether alignment is learned; either
    # scale can be overridden individually from the config.
    loss_scale = 0.1 if self.learn_alignment else 1.0
    dur_loss_scale = loss_scale
    pitch_loss_scale = loss_scale
    if "dur_loss_scale" in cfg:
        dur_loss_scale = cfg.dur_loss_scale
    if "pitch_loss_scale" in cfg:
        pitch_loss_scale = cfg.pitch_loss_scale

    self.mel_loss = MelLoss()
    self.pitch_loss = PitchLoss(loss_scale=pitch_loss_scale)
    self.duration_loss = DurationLoss(loss_scale=dur_loss_scale)

    self.aligner = None
    if self.learn_alignment:
        self.aligner = instantiate(self._cfg.alignment_module)
        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()

    self.preprocessor = instantiate(self._cfg.preprocessor)
    # input_fft_kwargs carries n_embed/padding_idx when alignment is learned.
    input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
    output_fft = instantiate(self._cfg.output_fft)
    duration_predictor = instantiate(self._cfg.duration_predictor)
    pitch_predictor = instantiate(self._cfg.pitch_predictor)

    self.fastpitch = FastPitchModule(
        input_fft,
        output_fft,
        duration_predictor,
        pitch_predictor,
        self.aligner,
        cfg.n_speakers,
        cfg.symbols_embedding_dim,
        cfg.pitch_embedding_kernel_size,
        cfg.n_mel_channels,
    )
    # Neural-type caches; populated lazily elsewhere.
    self._input_types = self._output_types = None
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    """Build the end-to-end FastPitch + HiFi-GAN model.

    Args:
        cfg: Model configuration (``dict`` or ``DictConfig``); validated
            against the ``FastPitchHifiGanE2EConfig`` schema after ModelPT init.
        trainer: Optional PyTorch Lightning ``Trainer`` forwarded to ModelPT.
    """
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)

    # Character parser for text input; built before super().__init__()
    # (presumably needed during ModelPT's dataset setup — TODO confirm).
    self._parser = parsers.make_parser(
        labels=cfg.labels,
        name='en',
        unk_id=-1,
        blank_id=-1,
        do_normalize=True,
        abbreviation_version="fastpitch",
        make_table=False,
    )
    super().__init__(cfg=cfg, trainer=trainer)

    schema = OmegaConf.structured(FastPitchHifiGanE2EConfig)
    # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    # Ensure passed cfg is compliant with schema; merge raises on a schema
    # violation and the merged result is intentionally discarded.
    OmegaConf.merge(cfg, schema)

    self.preprocessor = instantiate(cfg.preprocessor)
    # Second mel-spectrogram instance with gradients enabled, used where a
    # differentiable spectrogram is needed (e.g. the mel loss path).
    self.melspec_fn = instantiate(cfg.preprocessor, highfreq=None, use_grads=True)

    self.encoder = instantiate(cfg.input_fft)
    self.duration_predictor = instantiate(cfg.duration_predictor)
    self.pitch_predictor = instantiate(cfg.pitch_predictor)

    # HiFi-GAN generator and its two discriminators.
    self.generator = instantiate(cfg.generator)
    self.multiperioddisc = MultiPeriodDiscriminator()
    self.multiscaledisc = MultiScaleDiscriminator()

    # GAN training losses.
    self.mel_val_loss = L1MelLoss()
    self.feat_matching_loss = FeatureMatchingLoss()
    self.disc_loss = DiscriminatorLoss()
    self.gen_loss = GeneratorLoss()

    self.max_token_duration = cfg.max_token_duration

    # Projects the scalar pitch contour into the symbol embedding space;
    # padding keeps the sequence length unchanged for the odd kernel size.
    self.pitch_emb = torch.nn.Conv1d(
        1,
        cfg.symbols_embedding_dim,
        kernel_size=cfg.pitch_embedding_kernel_size,
        padding=int((cfg.pitch_embedding_kernel_size - 1) / 2),
    )

    # Store values precomputed from training data for convenience
    self.register_buffer('pitch_mean', torch.zeros(1))
    self.register_buffer('pitch_std', torch.zeros(1))

    self.pitchloss = PitchLoss()
    self.durationloss = DurationLoss()

    self.mel_loss_coeff = cfg.mel_loss_coeff

    self.log_train_images = False
    self.logged_real_samples = False
    self._tb_logger = None
    self.hann_window = None
    self.splice_length = cfg.splice_length
    self.sample_rate = cfg.sample_rate
    self.hop_size = cfg.hop_size