def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)

    self.learn_alignment = False
    if "learn_alignment" in cfg:
        self.learn_alignment = cfg.learn_alignment
    self._parser = None
    self._tb_logger = None
    super().__init__(cfg=cfg, trainer=trainer)

    schema = OmegaConf.structured(FastPitchConfig)
    # ModelPT ensures that cfg is a DictConfig, but do this second check in case ModelPT changes
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    # Ensure passed cfg is compliant with schema; the merge result is discarded,
    # the call is only there to raise if cfg does not validate against FastPitchConfig
    OmegaConf.merge(cfg, schema)

    self.bin_loss_warmup_epochs = 100
    self.aligner = None
    self.log_train_images = False
    self.mel_loss = MelLoss()
    # Down-weight the pitch and duration losses when the aligner is trained jointly
    loss_scale = 0.1 if self.learn_alignment else 1.0
    self.pitch_loss = PitchLoss(loss_scale=loss_scale)
    self.duration_loss = DurationLoss(loss_scale=loss_scale)

    input_fft_kwargs = {}
    if self.learn_alignment:
        self.aligner = instantiate(self._cfg.alignment_module)
        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()
        self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**self._cfg.train_ds.dataset.vocab)
        input_fft_kwargs["n_embed"] = len(self.vocab.labels)
        input_fft_kwargs["padding_idx"] = self.vocab.pad

    self.preprocessor = instantiate(self._cfg.preprocessor)
    input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
    output_fft = instantiate(self._cfg.output_fft)
    duration_predictor = instantiate(self._cfg.duration_predictor)
    pitch_predictor = instantiate(self._cfg.pitch_predictor)

    self.fastpitch = FastPitchModule(
        input_fft,
        output_fft,
        duration_predictor,
        pitch_predictor,
        self.aligner,
        cfg.n_speakers,
        cfg.symbols_embedding_dim,
        cfg.pitch_embedding_kernel_size,
        cfg.n_mel_channels,
    )
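
# --- Usage sketch (illustrative, not from the source) ---
# Assumes a Hydra/OmegaConf YAML that exposes the sub-configs referenced above
# (preprocessor, input_fft, output_fft, duration_predictor, pitch_predictor, ...).
# Both the file path and the class name `FastPitchModel` are assumptions of this sketch.
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/fastpitch.yaml")  # hypothetical config file
model = FastPitchModel(cfg=cfg.model)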
def __init__(self, cfg: DictConfig):
    super().__init__(cfg=cfg)
    typecheck.set_typecheck_enabled(enabled=False)
    cfg = self._cfg
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
    self.embed = nn.Embedding(len(self.vocab.labels), cfg.d_char)
    self.model = instantiate(cfg.model)
    d_out = cfg.model.jasper[-1].filters
    self.proj = nn.Conv1d(d_out, 1, kernel_size=1)
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    super().__init__(cfg=cfg, trainer=trainer)
    cfg = self._cfg
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
    self.embed = nn.Embedding(len(self.vocab.labels), cfg.d_char)
    self.encoder = instantiate(cfg.encoder)
    d_out = cfg.encoder.jasper[-1].filters
    self.proj = nn.Conv1d(d_out, 1, kernel_size=1)
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    super().__init__(cfg=cfg, trainer=trainer)
    cfg = self._cfg
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
    self.embed = GaussianEmbedding(self.vocab, cfg.d_char)
    self.encoder = instantiate(cfg.encoder)
    d_out = cfg.encoder.jasper[-1].filters
    # Two per-frame scalar heads; the names suggest a silence mask and the F0 body value
    self.sil_proj = nn.Conv1d(d_out, 1, kernel_size=1)
    self.body_proj = nn.Conv1d(d_out, 1, kernel_size=1)
    self.f0_mean, self.f0_std = cfg.f0_mean, cfg.f0_std
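
# Illustrative sketch (an assumption, not from the source): f0_mean and f0_std stored
# above are the usual corpus statistics for z-scoring pitch targets. The helper below
# is hypothetical and only shows the intended use of those two values:
def _normalize_f0(self, f0):
    # Z-score F0 with the configured corpus statistics.
    return (f0 - self.f0_mean) / self.f0_std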
def __init__(self, cfg: DictConfig):
    super().__init__(cfg=cfg)
    typecheck.set_typecheck_enabled(enabled=False)
    cfg = self._cfg
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
    self.preprocessor = instantiate(cfg.preprocessor)
    self.embed = GaussianEmbedding(self.vocab, cfg.d_char)
    self.norm_f0 = MaskedInstanceNorm1d(1)
    self.res_f0 = StyleResidual(cfg.d_char, 1, kernel_size=3)
    self.model = instantiate(cfg.model)
    d_out = cfg.model.jasper[-1].filters
    self.proj = nn.Conv1d(d_out, cfg.n_mels, kernel_size=1)
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    super().__init__(cfg=cfg, trainer=trainer)
    cfg = self._cfg
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
    self.blanking = cfg.train_ds.dataset.blanking
    self.preprocessor = instantiate(cfg.preprocessor)
    self.embed = GaussianEmbedding(self.vocab, cfg.d_char)
    self.norm_f0 = MaskedInstanceNorm1d(1)
    self.res_f0 = StyleResidual(cfg.d_char, 1, kernel_size=3)
    self.encoder = instantiate(cfg.encoder)
    d_out = cfg.encoder.jasper[-1].filters
    self.proj = nn.Conv1d(d_out, cfg.n_mels, kernel_size=1)
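
# Self-contained illustration (an assumption about behavior, not the library code):
# MaskedInstanceNorm1d(1), used by the two mel models above to whiten the single F0
# channel, plausibly normalizes each sequence over its valid (unpadded) frames only:
import torch

def masked_instance_norm_1d(x: torch.Tensor, mask: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # x: (B, C, T) features; mask: (B, 1, T) with 1.0 for valid frames, 0.0 for padding
    n = mask.sum(dim=-1, keepdim=True).clamp(min=1.0)
    mean = (x * mask).sum(dim=-1, keepdim=True) / n
    var = ((x - mean) ** 2 * mask).sum(dim=-1, keepdim=True) / n
    return (x - mean) / torch.sqrt(var + eps) * mask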
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
    super().__init__(cfg=cfg, trainer=trainer)
    typecheck.set_typecheck_enabled(enabled=False)
    cfg = self._cfg
    self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
    self.embed = nn.Embedding(len(self.vocab.labels), cfg.d_char)
    self.preprocessor = instantiate(cfg.preprocessor)
    self.alignment_encoder = instantiate(cfg.alignment_encoder)
    self.forward_sum_loss = ForwardSumLoss()
    self.bin_loss = BinLoss()
    self.bin_start_ratio = cfg.bin_start_ratio
    self.add_bin_loss = False
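
# Hedged sketch (not from the source): one plausible way bin_start_ratio could gate the
# binarization loss, flipping add_bin_loss once the given fraction of training has
# elapsed. The hook follows PyTorch Lightning conventions; treat it as an assumption.
def on_train_epoch_start(self):
    if self.trainer.current_epoch >= self.bin_start_ratio * self.trainer.max_epochs:
        self.add_bin_loss = True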
@property  # read-only accessor implied by the cached-attribute pattern below
def parser(self):
    if self._parser is not None:
        return self._parser
    if self.learn_alignment:
        vocab = AudioToCharWithDursF0Dataset.make_vocab(**self._cfg.train_ds.dataset.vocab)
        self._parser = vocab.encode
    else:
        self._parser = parsers.make_parser(
            labels=self._cfg.labels,
            name='en',
            unk_id=-1,
            blank_id=-1,
            do_normalize=True,
            abbreviation_version="fastpitch",
            make_table=False,
        )
    return self._parser
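
# Standalone illustration of the lazy-caching accessor pattern used by `parser` above:
# the underlying callable is built once, on first access, then reused. All names here
# are illustrative only.
class LazyParserHolder:
    def __init__(self):
        self._parser = None

    @property
    def parser(self):
        if self._parser is None:
            self._parser = str.split  # stand-in for the real parser factory
        return self._parser

tokens = LazyParserHolder().parser("hello world")  # -> ['hello', 'world']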
def __init__(self, cfg: DictConfig, trainer: Trainer = None):
    # Convert to Hydra 1.0 compatible DictConfig
    cfg = model_utils.convert_model_config_to_dict_config(cfg)
    cfg = model_utils.maybe_update_config_version(cfg)

    # Setup normalizer
    self.normalizer = None
    self.text_normalizer_call = None
    self.text_normalizer_call_kwargs = {}
    self._setup_normalizer(cfg)

    self.learn_alignment = cfg.get("learn_alignment", False)

    # Setup vocabulary (=tokenizer) and input_fft_kwargs (supported only with self.learn_alignment=True)
    input_fft_kwargs = {}
    if self.learn_alignment:
        self.vocab = None
        self.ds_class_name = cfg.train_ds.dataset._target_.split(".")[-1]
        if self.ds_class_name == "AudioToCharWithPriorAndPitchDataset":
            self.vocab = AudioToCharWithDursF0Dataset.make_vocab(**cfg.train_ds.dataset.vocab)
            input_fft_kwargs["n_embed"] = len(self.vocab.labels)
            input_fft_kwargs["padding_idx"] = self.vocab.pad
        elif self.ds_class_name == "TTSDataset":
            self._setup_tokenizer(cfg)
            assert self.vocab is not None
            input_fft_kwargs["n_embed"] = len(self.vocab.tokens)
            input_fft_kwargs["padding_idx"] = self.vocab.pad
        else:
            raise ValueError(f"Unknown dataset class: {self.ds_class_name}")

    self._parser = None
    self._tb_logger = None
    super().__init__(cfg=cfg, trainer=trainer)

    self.bin_loss_warmup_epochs = cfg.get("bin_loss_warmup_epochs", 100)
    self.log_train_images = False

    # Default loss scales, individually overridable from the config
    loss_scale = 0.1 if self.learn_alignment else 1.0
    dur_loss_scale = loss_scale
    pitch_loss_scale = loss_scale
    if "dur_loss_scale" in cfg:
        dur_loss_scale = cfg.dur_loss_scale
    if "pitch_loss_scale" in cfg:
        pitch_loss_scale = cfg.pitch_loss_scale

    self.mel_loss = MelLoss()
    self.pitch_loss = PitchLoss(loss_scale=pitch_loss_scale)
    self.duration_loss = DurationLoss(loss_scale=dur_loss_scale)

    self.aligner = None
    if self.learn_alignment:
        self.aligner = instantiate(self._cfg.alignment_module)
        self.forward_sum_loss = ForwardSumLoss()
        self.bin_loss = BinLoss()

    self.preprocessor = instantiate(self._cfg.preprocessor)
    input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)
    output_fft = instantiate(self._cfg.output_fft)
    duration_predictor = instantiate(self._cfg.duration_predictor)
    pitch_predictor = instantiate(self._cfg.pitch_predictor)

    self.fastpitch = FastPitchModule(
        input_fft,
        output_fft,
        duration_predictor,
        pitch_predictor,
        self.aligner,
        cfg.n_speakers,
        cfg.symbols_embedding_dim,
        cfg.pitch_embedding_kernel_size,
        cfg.n_mel_channels,
    )
    self._input_types = self._output_types = None
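
# Usage sketch (illustrative; all values are assumptions): the init above reads only
# `learn_alignment` and, when it is true, the final component of the dataset `_target_`
# to decide how the vocabulary is built, so the dispatch reduces to a string comparison.
from omegaconf import OmegaConf

stub = OmegaConf.create({
    "learn_alignment": True,
    "train_ds": {"dataset": {"_target_": "some.module.TTSDataset"}},  # hypothetical path
})
assert stub.get("learn_alignment", False)
assert stub.train_ds.dataset._target_.split(".")[-1] == "TTSDataset"  # tokenizer branch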