示例#1
0
def create_NMs(tacotron2_params, decoder_infer=False):
    """Instantiate the neural modules of a Tacotron 2 pipeline from a config.

    Args:
        tacotron2_params: Mapping with a ``"labels"`` entry and one
            keyword-argument mapping per module, stored under the keys
            ``"AudioToMelSpectrogramPreprocessor"``, ``"TextEmbedding"``,
            ``"Tacotron2Encoder"``, ``"Tacotron2Decoder"``,
            ``"Tacotron2Postnet"`` and ``"Tacotron2Loss"``.
        decoder_infer: If true, the decoder is built as
            ``Tacotron2DecoderInfer`` rather than ``Tacotron2Decoder``. Both
            variants are configured from the ``"Tacotron2Decoder"`` entry.

    Returns:
        The tuple ``(data_preprocessor, text_embedding, t2_enc, t2_dec,
        t2_postnet, t2_loss, makegatetarget)``.
    """
    cfg = tacotron2_params

    preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **cfg["AudioToMelSpectrogramPreprocessor"]
    )

    # Add 3 to the label count to account for the special characters.
    vocab_size = len(cfg["labels"]) + 3
    embedding = nemo_tts.TextEmbedding(vocab_size, **cfg["TextEmbedding"])

    encoder = nemo_tts.Tacotron2Encoder(**cfg["Tacotron2Encoder"])

    # Training and inference decoders share the same configuration entry.
    decoder_cls = (
        nemo_tts.Tacotron2DecoderInfer
        if decoder_infer
        else nemo_tts.Tacotron2Decoder
    )
    decoder = decoder_cls(**cfg["Tacotron2Decoder"])

    postnet = nemo_tts.Tacotron2Postnet(**cfg["Tacotron2Postnet"])
    loss = nemo_tts.Tacotron2Loss(**cfg["Tacotron2Loss"])
    gate_maker = nemo_tts.MakeGate()

    # Only the embedding, encoder, decoder and postnet are counted here.
    total_weights = sum(
        module.num_weights
        for module in (embedding, encoder, decoder, postnet)
    )

    logging.info('================================')
    logging.info(f"Total number of parameters: {total_weights}")
    logging.info('================================')

    return preprocessor, embedding, encoder, decoder, postnet, loss, gate_maker
示例#2
0
    def test_tacotron2_training(self):
        """Wire up a reduced-size Tacotron 2 training graph and train it.

        The graph goes data layer -> mel-spectrogram preprocessor -> text
        embedding -> encoder -> decoder -> postnet -> loss. Training uses SGD
        with ``num_epochs=10`` and ``lr=0.0003`` and logs the loss through a
        callback. No assertion is made on the resulting loss values.
        """
        # --- Neural modules (instantiated in the same order as before) ---
        source = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        mel_frontend = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
        )
        embedder = nemo_tts.TextEmbedding(len(self.labels), 256)
        encoder = nemo_tts.Tacotron2Encoder(
            encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256
        )
        decoder = nemo_tts.Tacotron2Decoder(
            n_mel_channels=64,
            n_frames_per_step=1,
            encoder_embedding_dim=256,
            gate_threshold=0.5,
            prenet_dim=128,
            max_decoder_steps=1000,
            decoder_rnn_dim=512,
            p_decoder_dropout=0.1,
            p_attention_dropout=0.1,
            attention_rnn_dim=512,
            attention_dim=64,
            attention_location_n_filters=16,
            attention_location_kernel_size=15,
        )
        postnet = nemo_tts.Tacotron2Postnet(
            n_mel_channels=64,
            postnet_embedding_dim=256,
            postnet_kernel_size=5,
            postnet_n_convolutions=3,
        )
        criterion = nemo_tts.Tacotron2Loss()
        gate_maker = nemo_tts.MakeGate()

        # --- DAG wiring ---
        wav, wav_len, tokens, tokens_len = source()
        mel_ref, mel_ref_len = mel_frontend(input_signal=wav, length=wav_len)

        embedded = embedder(char_phone=tokens)
        encoded = encoder(
            char_phone_embeddings=embedded, embedding_length=tokens_len
        )
        # The decoder yields three outputs; the third one is not used here.
        mel_pred, gate_pred, _ = decoder(
            char_phone_encoded=encoded,
            encoded_length=tokens_len,
            mel_target=mel_ref,
        )
        mel_refined = postnet(mel_input=mel_pred)
        gate_ref = gate_maker(mel_target=mel_ref, target_len=mel_ref_len)
        train_loss = criterion(
            mel_out=mel_pred,
            mel_out_postnet=mel_refined,
            gate_out=gate_pred,
            mel_target=mel_ref,
            gate_target=gate_ref,
            target_len=mel_ref_len,
            seq_len=wav_len,
        )

        def log_train_loss(x):
            # ``x`` holds the evaluated callback tensors; the loss is the first.
            return logging.info(f'Train Loss: {str(x[0].item())}')

        loss_logger = nemo.core.SimpleLossLoggerCallback(
            tensors=[train_loss],
            print_func=log_train_loss,
        )

        # The actions object performs the `train` action on the loss tensor.
        trainer = nemo.backends.pytorch.actions.PtActions()
        sgd_settings = {"num_epochs": 10, "lr": 0.0003}
        trainer.train(
            [train_loss],
            callbacks=[loss_logger],
            optimizer="sgd",
            optimization_params=sgd_settings,
        )
示例#3
0
    def test_tacotron2_training(self):
        """Integration test: train a reduced-size Tacotron 2 on the sample ASR data.

        A small model is instantiated and wired into a training graph, then
        trained with SGD (``max_steps=3``, ``lr=0.01``). Loss values are
        collected through ``self.print_and_log_loss`` and the test asserts
        that the last recorded loss is smaller than the first one.
        """
        # --- Neural modules (instantiated in the same order as before) ---
        source = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        mel_frontend = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            log_zero_guard_type="clamp",
            log_zero_guard_value=1e-05,
        )
        embedder = nemo_tts.TextEmbedding(len(self.labels), 256)
        encoder = nemo_tts.Tacotron2Encoder(
            encoder_n_convolutions=2,
            encoder_kernel_size=5,
            encoder_embedding_dim=256,
        )
        decoder = nemo_tts.Tacotron2Decoder(
            n_mel_channels=64,
            n_frames_per_step=1,
            encoder_embedding_dim=256,
            gate_threshold=0.5,
            prenet_dim=128,
            max_decoder_steps=1000,
            decoder_rnn_dim=512,
            p_decoder_dropout=0.1,
            p_attention_dropout=0.1,
            attention_rnn_dim=512,
            attention_dim=64,
            attention_location_n_filters=16,
            attention_location_kernel_size=15,
        )
        postnet = nemo_tts.Tacotron2Postnet(
            n_mel_channels=64,
            postnet_embedding_dim=256,
            postnet_kernel_size=5,
            postnet_n_convolutions=3,
        )
        criterion = nemo_tts.Tacotron2Loss()
        gate_maker = nemo_tts.MakeGate()

        # --- DAG wiring ---
        wav, wav_len, tokens, tokens_len = source()
        mel_ref, mel_ref_len = mel_frontend(input_signal=wav, length=wav_len)

        embedded = embedder(char_phone=tokens)
        encoded = encoder(
            char_phone_embeddings=embedded,
            embedding_length=tokens_len,
        )
        # The decoder yields three outputs; the third one is not used here.
        mel_pred, gate_pred, _ = decoder(
            char_phone_encoded=encoded,
            encoded_length=tokens_len,
            mel_target=mel_ref,
        )
        mel_refined = postnet(mel_input=mel_pred)
        gate_ref = gate_maker(mel_target=mel_ref, target_len=mel_ref_len)
        train_loss = criterion(
            mel_out=mel_pred,
            mel_out_postnet=mel_refined,
            gate_out=gate_pred,
            mel_target=mel_ref,
            gate_target=gate_ref,
            target_len=mel_ref_len,
            seq_len=wav_len,
        )

        # Handed to the logging helper as ``loss_log_list``; presumably the
        # helper appends each reported loss to it (helper not shown here).
        recorded_losses = []
        record_loss = partial(self.print_and_log_loss, loss_log_list=recorded_losses)

        loss_logger = SimpleLossLoggerCallback(
            tensors=[train_loss],
            print_func=record_loss,
            step_freq=1,
        )

        # The actions object performs the `train` action on the loss tensor.
        trainer = PtActions()
        sgd_settings = {"max_steps": 3, "lr": 0.01}
        trainer.train(
            [train_loss],
            callbacks=[loss_logger],
            optimizer="sgd",
            optimization_params=sgd_settings,
        )

        # Training must have reduced the loss between first and last record.
        final_loss = recorded_losses[-1]
        initial_loss = recorded_losses[0]
        assert final_loss < initial_loss
    