Example #1
def create_infer_dags(neural_factory,
                      neural_modules,
                      tacotron2_params,
                      infer_dataset,
                      infer_batch_size,
                      cpu_per_dl=1):
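    """Builds the Tacotron 2 inference DAG and returns its output tensors."""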
    (_, text_embedding, t2_enc, t2_dec, t2_postnet, _, _) = neural_modules

    data_layer = nemo_asr.TranscriptDataLayer(
        path=infer_dataset,
        labels=tacotron2_params['labels'],
        batch_size=infer_batch_size,
        num_workers=cpu_per_dl,
        load_audio=False,
        bos_id=len(tacotron2_params['labels']),
        eos_id=len(tacotron2_params['labels']) + 1,
        pad_id=len(tacotron2_params['labels']) + 2,
        shuffle=False)
    transcript, transcript_len = data_layer()

    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded,
                                embedding_length=transcript_len)
    if isinstance(t2_dec, nemo_tts.Tacotron2DecoderInfer):
        mel_decoder, gate, alignments, mel_len = t2_dec(
            char_phone_encoded=transcript_encoded,
            encoded_length=transcript_len)
    else:
        raise ValueError(
            "The Neural Module for tacotron2 decoder was not understood")
    mel_postnet = t2_postnet(mel_input=mel_decoder)

    return [mel_postnet, gate, alignments, mel_len]
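
For context, a minimal sketch of how these inference tensors might be consumed; the factory call and argument values below are assumptions inferred from this snippet, not confirmed by the source.

# Hypothetical driver (names assumed; only create_infer_dags comes from the example above).
infer_tensors = create_infer_dags(
    neural_factory=neural_factory,
    neural_modules=neural_modules,
    tacotron2_params=tacotron2_params,
    infer_dataset="transcripts.json",
    infer_batch_size=32)
# NeuralModuleFactory.infer evaluates the DAG and returns one result list per output tensor
mel_postnet, gate, alignments, mel_len = neural_factory.infer(tensors=infer_tensors)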
Example #2
def create_dag(args, cfg, num_gpus):
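    """Builds the training and evaluation DAGs; returns (losses, cfg, callbacks)."""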
    # Defining nodes
    data = nemo_asr.TranscriptDataLayer(
        path=args.train_dataset,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        pad_id=cfg['target']['pad_id'],
        batch_size=cfg['optimization']['batch_size'],
        drop_last=True,
    )
    data_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        batch_size=cfg['inference']['batch_size'],
        load_audio=False
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN']
    )
    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = num_data // batch_size
    total_steps = num_epochs * steps_per_epoch
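    # Hold the teacher-forcing ratio constant at 1.0 for the entire run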
    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder, 'teacher_forcing',
        policies=[
            vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0),
        ],
        total_steps=total_steps
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef']
    )
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[decoder],
        folder=args.checkpoint_dir,
        step_freq=args.checkpoint_save_freq
    )

    # Creating DAG
    texts, _ = data()
    log_probs, _ = decoder(
        targets=texts
    )
    train_loss = seq_loss(
        log_probs=log_probs,
        targets=texts
    )
    evals = []
    _, _, texts, _ = data_eval()
    log_probs, _ = decoder(
        targets=texts
    )
    eval_loss = seq_loss(
        log_probs=log_probs,
        targets=texts
    )
    evals.append((args.eval_datasets,
                  (eval_loss, log_probs, texts)))

    # Update config
    cfg['num_params'] = {'decoder': decoder.num_weights}
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (train_loss, evals), cfg, [tf_callback, saver_callback]
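
A minimal sketch of how the returned tuple might drive training, assuming the NeMo 0.x NeuralModuleFactory.train signature; the optimizer keys are assumptions about this particular cfg layout.

# Hypothetical training driver (assumed wiring; not part of the example above).
(train_loss, evals), cfg, callbacks = create_dag(args, cfg, num_gpus)
neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer=cfg['optimization']['optimizer'],  # assumed cfg key
    optimization_params=cfg['optimization']['params'])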
Example #3
    def synthesis(self, text):
        """Reads text and returns the audio signal in a wav file"""
        self.nf.logger.info('Starting speech synthesis')
        # create inference DAGs
        data_layer = nemo_asr.TranscriptDataLayer(
            path=build_text_path(text),
            labels=self.tacotron2_params['labels'],
            batch_size=1,
            num_workers=1,
            load_audio=False,
            bos_id=len(self.tacotron2_params['labels']),
            eos_id=len(self.tacotron2_params['labels']) + 1,
            pad_id=len(self.tacotron2_params['labels']) + 2,
            shuffle=False
        )
        os.remove("text.json")
        self.nf.logger.info("Running Tacotron 2")
        transcript, transcript_len = data_layer()

        transcript_embedded = self.text_embedding(char_phone=transcript)
        transcript_encoded = self.t2_enc(
                                char_phone_embeddings=transcript_embedded,
                                embedding_length=transcript_len)
        mel_decoder, gate, alignments, mel_len = self.t2_dec(
                                char_phone_encoded=transcript_encoded,
                                encoded_length=transcript_len)
        mel_postnet = self.t2_postnet(mel_input=mel_decoder)
        infer_tensors = [mel_postnet, gate, alignments, mel_len]
        # Run tacotron 2
        evaluated_tensors = self.nf.infer(
            tensors=infer_tensors,
            cache=True,
            offload_to_cpu=True)
        mel_len = evaluated_tensors[-1]

        # creating vocoder
        if self.tts_conf["vocoder"] == "griffin-lim":
            self.nf.logger.info("Running Griffin-Lim as a vocoder")
            mel_spec = evaluated_tensors[0][0]
            log_mel = mel_spec.cpu().numpy().transpose(0, 2, 1)
            mel = np.exp(log_mel)
            filterbank = librosa.filters.mel(
                sr=self.tacotron2_params["sample_rate"],
                n_fft=self.tacotron2_params["n_fft"],
                n_mels=self.tacotron2_params["n_mels"],
                fmax=self.tacotron2_params["fmax"])
            sample = np.dot(mel, filterbank) * self.tts_conf["mag_scale"]
            sample = sample[0][:mel_len[0][0], :]

            # convert magnitude spectrograms to audio signal
            magnitudes = sample.T ** self.tts_conf["power"]
            phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
            complex_spec = magnitudes * phase
            signal = librosa.istft(complex_spec)
            if np.isfinite(signal).all():
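                # Griffin-Lim iterations: keep the magnitudes fixed and
                # re-estimate phase via repeated STFT/ISTFT round trips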
                for _ in range(self.tts_conf["n_iters"]):
                    _, phase = librosa.magphase(librosa.stft(
                        signal, n_fft=self.tts_conf["n_fft"]))
                    complex_spec = magnitudes * phase
                    signal = librosa.istft(complex_spec)
            else:
                self.nf.logger.warning(
                    "Audio signal was not finite; skipping Griffin-Lim iterations")
                signal = np.array([0])
            outwav = "griffin_sample.wav"
            wavfile.write(outwav, self.tacotron2_params["sample_rate"], signal)
            self.nf.logger.info("Wav file was generated and named: " + outwav)
        
        elif self.tts_conf["vocoder"] == "waveglow":
            self.nf.logger.info("Running Waveglow as a vocoder")
            audio_pred = self.waveglow(mel_spectrogram=mel_postnet)
            # Run waveglow
            evaluated_tensors = self.nf.infer(
                tensors=[audio_pred],
                modules_to_restore=[self.waveglow],
                use_cache=True)
            # the evaluated tensor is the predicted audio, not a mel spectrogram
            audio = evaluated_tensors[0][0]
            sample = audio.cpu().numpy()[0]
            sample_len = mel_len[0][0] * self.tacotron2_params["n_stride"]
            sample = sample[:sample_len]
            
            # apply denoiser
            waveglow_denoiser_strength = self.tts_conf["denoising_strength"]
            if waveglow_denoiser_strength > 0:
                sample, spec = self.waveglow.denoise(
                    sample, strength=waveglow_denoiser_strength)
            else:
                spec, _ = librosa.core.magphase(librosa.core.stft(
                    sample, n_fft=self.waveglow_params["n_fft"]))
            outwav = "waveglow_sample.wav"
            wavfile.write(outwav, self.waveglow_params["sample_rate"], sample)
            self.nf.logger.info("Wav file was generated and named: " + outwav)
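
build_text_path is not shown in this example; since the method later deletes text.json and TranscriptDataLayer reads a manifest-style file, a plausible sketch is below. The exact line format TranscriptDataLayer expects should be checked against the NeMo version in use.

import json

def build_text_path(text, path="text.json"):
    # Hypothetical helper (not in the source): writes a one-line,
    # manifest-style JSON file holding the text to synthesize.
    with open(path, "w") as f:
        f.write(json.dumps({"text": text}) + "\n")
    return path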