def create_NMs(tacotron2_params, logger=None, decoder_infer=False):
    """Instantiate the neural modules that make up a Tacotron 2 pipeline.

    Args:
        tacotron2_params: Mapping with a "labels" entry plus one mapping of
            constructor keyword arguments per module, keyed
            "AudioPreprocessing", "TextEmbedding", "Tacotron2Encoder",
            "Tacotron2Decoder" and "Tacotron2Postnet".
        logger: Optional logger. When truthy, the summed ``num_weights`` of
            the embedding, encoder, decoder and postnet is reported through
            ``logger.info``.
        decoder_infer: When true, ``Tacotron2DecoderInfer`` is built instead
            of ``Tacotron2Decoder``; both are configured from the
            "Tacotron2Decoder" section.

    Returns:
        The tuple ``(data_preprocessor, text_embedding, t2_enc, t2_dec,
        t2_postnet, t2_loss, makegatetarget)``.
    """
    preprocessor = nemo_asr.AudioPreprocessing(
        **tacotron2_params["AudioPreprocessing"]
    )
    # The embedding table is sized by the number of labels.
    embedding = nemo_tts.TextEmbedding(
        len(tacotron2_params["labels"]),
        **tacotron2_params["TextEmbedding"],
    )
    encoder = nemo_tts.Tacotron2Encoder(
        **tacotron2_params["Tacotron2Encoder"]
    )

    # Both decoder flavours share the same configuration section.
    decoder_cls = (
        nemo_tts.Tacotron2DecoderInfer
        if decoder_infer
        else nemo_tts.Tacotron2Decoder
    )
    decoder = decoder_cls(**tacotron2_params["Tacotron2Decoder"])

    postnet = nemo_tts.Tacotron2Postnet(
        **tacotron2_params["Tacotron2Postnet"]
    )
    loss = nemo_tts.Tacotron2Loss()
    gate_target = nemo_tts.MakeGate()

    if logger:
        # Only these four modules are counted; the preprocessor, loss and
        # gate-target modules are left out of the total.
        counted_modules = (embedding, encoder, decoder, postnet)
        param_total = sum(module.num_weights for module in counted_modules)
        banner = '================================'
        logger.info(banner)
        logger.info(f"Total number of parameters: {param_total}")
        logger.info(banner)

    return (
        preprocessor,
        embedding,
        encoder,
        decoder,
        postnet,
        loss,
        gate_target,
    )
def create_NMs(tacotron2_params, decoder_infer=False):
    """Instantiate the neural modules that make up a Tacotron 2 pipeline.

    The combined ``num_weights`` of the embedding, encoder, decoder and
    postnet is always reported through ``nemo.logging.info``.

    Args:
        tacotron2_params: Mapping with a "labels" entry plus one mapping of
            constructor keyword arguments per module, keyed
            "AudioToMelSpectrogramPreprocessor", "TextEmbedding",
            "Tacotron2Encoder", "Tacotron2Decoder", "Tacotron2Postnet" and
            "Tacotron2Loss".
        decoder_infer: When true, ``Tacotron2DecoderInfer`` is built instead
            of ``Tacotron2Decoder``; both are configured from the
            "Tacotron2Decoder" section.

    Returns:
        The tuple ``(data_preprocessor, text_embedding, t2_enc, t2_dec,
        t2_postnet, t2_loss, makegatetarget)``.
    """
    preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **tacotron2_params["AudioToMelSpectrogramPreprocessor"]
    )
    embedding = nemo_tts.TextEmbedding(
        # Label count plus 3 additional entries for special characters.
        len(tacotron2_params["labels"]) + 3,
        **tacotron2_params["TextEmbedding"],
    )
    encoder = nemo_tts.Tacotron2Encoder(
        **tacotron2_params["Tacotron2Encoder"]
    )

    # Both decoder flavours share the same configuration section.
    decoder_cls = (
        nemo_tts.Tacotron2DecoderInfer
        if decoder_infer
        else nemo_tts.Tacotron2Decoder
    )
    decoder = decoder_cls(**tacotron2_params["Tacotron2Decoder"])

    postnet = nemo_tts.Tacotron2Postnet(
        **tacotron2_params["Tacotron2Postnet"]
    )
    loss = nemo_tts.Tacotron2Loss(**tacotron2_params["Tacotron2Loss"])
    gate_target = nemo_tts.MakeGate()

    # Only these four modules are counted; the preprocessor, loss and
    # gate-target modules are left out of the total.
    counted_modules = (embedding, encoder, decoder, postnet)
    param_total = sum(module.num_weights for module in counted_modules)
    banner = '================================'
    nemo.logging.info(banner)
    nemo.logging.info(f"Total number of parameters: {param_total}")
    nemo.logging.info(banner)

    return (
        preprocessor,
        embedding,
        encoder,
        decoder,
        postnet,
        loss,
        gate_target,
    )
def __init__(self):
    """Load the pre-trained text-to-speech models on CPU.

    Reads the "tts" section of ``conf.yaml`` and ``tacotron2.yaml`` from the
    configured ``model_dir``, then builds the Tacotron 2 modules (text
    embedding, encoder, inference decoder, postnet) and restores each one
    from its ``.pt`` checkpoint in that directory. Per-module and total
    ``num_weights`` are logged through the factory's logger.

    When the configured ``vocoder`` is "waveglow", the WaveGlow inference
    module is built from ``waveglow.yaml`` and restored from
    ``WaveGlowNM.pt`` in ``vocoder_dir``; a denoiser is set up when
    ``denoising_strength`` is greater than zero. For any other vocoder
    value, ``self.waveglow`` and ``self.waveglow_params`` are not set.
    """
    self.nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.DeviceType.CPU)
    log = self.nf.logger

    # Create text to spectrogram model
    self.tts_conf = parse_yaml("conf.yaml")["tts"]
    model_dir = self.tts_conf["model_dir"]
    self.tacotron2_params = parse_yaml(
        os.path.join(model_dir, "tacotron2.yaml"))
    params = self.tacotron2_params
    log.info('================================')

    # Text embedding: label count plus 3 entries for special characters.
    self.text_embedding = nemo_tts.TextEmbedding(
        len(params["labels"]) + 3,
        **params["TextEmbedding"])
    self.text_embedding.restore_from(
        os.path.join(model_dir, "TextEmbedding.pt"))
    log.info(
        f"Number of parameters in text-embedding: "
        f"{self.text_embedding.num_weights}")

    # Encoder
    self.t2_enc = nemo_tts.Tacotron2Encoder(**params["Tacotron2Encoder"])
    self.t2_enc.restore_from(
        os.path.join(model_dir, "Tacotron2Encoder.pt"))
    log.info(
        f"Number of parameters in encoder: {self.t2_enc.num_weights}")

    # Decoder: the inference variant is configured from, and restored
    # with, the "Tacotron2Decoder" section and checkpoint.
    self.t2_dec = nemo_tts.Tacotron2DecoderInfer(
        **params["Tacotron2Decoder"])
    self.t2_dec.restore_from(
        os.path.join(model_dir, "Tacotron2Decoder.pt"))
    log.info(
        f"Number of parameters in decoder: {self.t2_dec.num_weights}")

    # PostNet
    self.t2_postnet = nemo_tts.Tacotron2Postnet(
        **params["Tacotron2Postnet"])
    self.t2_postnet.restore_from(
        os.path.join(model_dir, "Tacotron2Postnet.pt"))
    log.info(
        f"Number of parameters in postnet: {self.t2_postnet.num_weights}")

    total_weights = (
        self.text_embedding.num_weights
        + self.t2_enc.num_weights
        + self.t2_dec.num_weights
        + self.t2_postnet.num_weights
    )
    log.info(f"Total number of parameters in model: {total_weights}")

    # Load WaveGlow only if it is the chosen vocoder.
    if self.tts_conf["vocoder"] == "waveglow":
        log.info("Loading waveglow as a vocoder")
        vocoder_dir = self.tts_conf["vocoder_dir"]
        self.waveglow_params = parse_yaml(
            os.path.join(vocoder_dir, "waveglow.yaml"))
        self.waveglow = nemo_tts.WaveGlowInferNM(
            sigma=self.tts_conf["sigma"],
            **self.waveglow_params["WaveGlowNM"])
        self.waveglow.restore_from(
            os.path.join(vocoder_dir, "WaveGlowNM.pt"))
        if self.tts_conf["denoising_strength"] > 0:
            log.info("Setting up a denoiser for waveglow")
            self.waveglow.setup_denoiser()
            log.info("Waveglow denoiser is ready")

    # NOTE(review): the closing separator is assumed to be unconditional,
    # mirroring the opening separator above -- confirm against the
    # original layout.
    log.info('================================')