Example #1
File: tts_infer.py  Project: rajdipa/NeMo
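End-to-end Tacotron 2 inference: the script builds the spectrogram-prediction DAG from a YAML config, restores a checkpoint, runs inference, and then vocodes the predicted mel spectrograms with either Griffin-Lim or WaveGlow. Note that the output file index i * 32 + j hard-codes a batch size of 32.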
def main():
    args = parse_args()
    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level,
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
    )

    use_cache = True
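    # Caching is disabled for distributed (multi-GPU) runs; otherwise the first
    # infer() call caches its tensors so the vocoder pass below can reuse them.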
    if args.local_rank is not None:
        print("Doing ALL GPU")
        use_cache = False

    # Create text to spectrogram model
    if args.spec_model == "tacotron2":
        yaml = YAML(typ="safe")
        with open(args.spec_model_config) as file:
            tacotron2_params = yaml.load(file)
        spec_neural_modules = create_NMs(tacotron2_params, decoder_infer=True)
        infer_tensors = create_infer_dags(
            neural_factory=neural_factory,
            neural_modules=spec_neural_modules,
            tacotron2_params=tacotron2_params,
            infer_dataset=args.eval_dataset,
            infer_batch_size=args.batch_size,
        )

    print("Running Tacotron 2")
    # Run tacotron 2
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=False,
    )
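    # The last returned tensor list holds the per-sample mel-spectrogram lengths.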
    mel_len = evaluated_tensors[-1]
    print("Done Running Tacotron 2")
    filterbank = librosa.filters.mel(
        sr=tacotron2_params["sample_rate"],
        n_fft=tacotron2_params["n_fft"],
        n_mels=tacotron2_params["n_mels"],
        fmax=tacotron2_params["fmax"],
    )

    if args.vocoder == "griffin-lim":
        print("Running Griffin-Lim")
        mel_spec = evaluated_tensors[0]
        for i, batch in enumerate(mel_spec):
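            # (B, n_mels, T) -> (B, T, n_mels), undo the log, then project the
            # mel energies onto linear-frequency bins before running Griffin-Lim.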
            log_mel = batch.cpu().numpy().transpose(0, 2, 1)
            mel = np.exp(log_mel)
            magnitudes = np.dot(mel, filterbank) * args.griffin_lim_mag_scale
            for j, sample in enumerate(magnitudes):
                sample = sample[:mel_len[i][j], :]
                audio = griffin_lim(sample.T**args.griffin_lim_power)
                save_file = f"sample_{i * 32 + j}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                write(save_file, tacotron2_params["sample_rate"], audio)
                plot_and_save_spec(log_mel[j][:mel_len[i][j], :].T, i * 32 + j,
                                   args.save_dir)

    elif args.vocoder == "waveglow":
        (mel_pred, _, _, _) = infer_tensors
        if not args.vocoder_model_config or not args.vocoder_model_load_dir:
            raise ValueError(
                "Using waveglow as the vocoder requires the "
                "--vocoder_model_config and --vocoder_model_load_dir args")

        yaml = YAML(typ="safe")
        with open(args.vocoder_model_config) as file:
            waveglow_params = yaml.load(file)
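        # Instantiate the WaveGlow inference module and wire it to the predicted
        # mel spectrogram; its weights are restored inside infer() below via
        # modules_to_restore.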
        waveglow = nemo_tts.WaveGlowInferNM(sigma=args.waveglow_sigma,
                                            **waveglow_params["WaveGlowNM"])
        audio_pred = waveglow(mel_spectrogram=mel_pred)
        # waveglow.restore_from(args.vocoder_model_load_dir)

        # Run waveglow
        print("Running Waveglow")
        evaluated_tensors = neural_factory.infer(
            tensors=[audio_pred],
            checkpoint_dir=args.vocoder_model_load_dir,
            # checkpoint_dir=None,
            modules_to_restore=[waveglow],
            use_cache=use_cache,
        )
        print("Done Running Waveglow")

        if args.waveglow_denoiser_strength > 0:
            print("Setup denoiser")
            waveglow.setup_denoiser()

        print("Saving results to disk")
        for i, batch in enumerate(evaluated_tensors[0]):
            audio = batch.cpu().numpy()
            for j, sample in enumerate(audio):
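                # Trim the audio to its true length: mel frames times the hop size (n_stride).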
                sample_len = mel_len[i][j] * tacotron2_params["n_stride"]
                sample = sample[:sample_len]
                save_file = f"sample_{i * 32 + j}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                if args.waveglow_denoiser_strength > 0:
                    sample, spec = waveglow.denoise(
                        sample, strength=args.waveglow_denoiser_strength)
                else:
                    spec, _ = librosa.core.magphase(
                        librosa.core.stft(sample,
                                          n_fft=waveglow_params["n_fft"]))
                write(save_file, waveglow_params["sample_rate"], sample)
                spec = np.dot(filterbank, spec)
                spec = np.log(np.clip(spec, a_min=1e-5, a_max=None))
                plot_and_save_spec(spec, i * 32 + j, args.save_dir)
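The Griffin-Lim branch above relies on a project-local griffin_lim helper that is not shown. As a rough, self-contained sketch of the same mel-to-audio path, the snippet below uses librosa's built-in librosa.griffinlim instead; the sample_rate/n_fft/n_mels/fmax values, the exponent 1.2 (standing in for args.griffin_lim_power), and the random log-mel input are placeholders, not values from the example.

import numpy as np
import librosa
from scipy.io.wavfile import write

sample_rate, n_fft, n_mels, fmax = 22050, 1024, 80, 8000.0  # placeholder config

# One decoded sample: a (frames, n_mels) log-mel spectrogram (random stand-in here).
log_mel = np.random.randn(200, n_mels).astype(np.float32)
mel = np.exp(log_mel)  # undo the log, as in the example above

# Same projection as the example: mel energies -> linear-frequency magnitudes.
mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmax=fmax)
magnitudes = np.dot(mel, mel_basis)  # (frames, 1 + n_fft // 2)

# librosa expects (freq, frames); the exponent plays the role of griffin_lim_power.
audio = librosa.griffinlim(magnitudes.T ** 1.2, n_iter=50, hop_length=n_fft // 4)
write("sample_griffin_lim.wav", sample_rate, audio.astype(np.float32))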
Example #2
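A second variant of the inference script: it builds the Tacotron 2 DAG with decoder_infer=False, runs inference, and then converts the attention alignments into per-token durations with get_D, saving one .npy duration file per utterance into args.durations_dir.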
def main():
    args = parse_args()
    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level,
        local_rank=args.local_rank,
    )

    use_cache = True
    if args.local_rank is not None:
        logging.info("Doing ALL GPU")
        use_cache = False

    # Create text to spectrogram model
    if args.spec_model == "tacotron2":
        yaml = YAML(typ="safe")
        with open(args.spec_model_config) as file:
            tacotron2_params = yaml.load(file)
        spec_neural_modules = create_NMs(args.spec_model_config,
                                         labels=tacotron2_params['labels'],
                                         decoder_infer=False)
        infer_tensors = create_infer_dags(
            neural_factory=neural_factory,
            neural_modules=spec_neural_modules,
            tacotron2_config_file=args.spec_model_config,
            tacotron2_params=tacotron2_params,
            infer_dataset=args.eval_dataset,
            infer_batch_size=args.batch_size,
            labels=tacotron2_params['labels'],
        )

    logging.info("Running Tacotron 2")
    # Run tacotron 2
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=True,
    )

    def get_D(alignment, true_len):
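        # For each mel frame, find the text token with the largest attention
        # weight and count how many frames map to each token; the counts are
        # the per-token durations.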
        D = np.array([0 for _ in range(np.shape(alignment)[1])])

        for i in range(np.shape(alignment)[0]):
            max_index = alignment[i].tolist().index(alignment[i].max())
            D[max_index] = D[max_index] + 1

        assert D.sum() == alignment.shape[0]
        assert D.sum() == true_len

        return D

    # Save durations.
    alignments_dir = pathlib.Path(args.durations_dir)
    alignments_dir.mkdir(exist_ok=True)
    k = -1
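    # evaluated_tensors[2], [3], and [4] hold the attention alignments, the mel
    # lengths, and the text lengths, respectively.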
    for alignments, mel_lens, text_lens in zip(
            tqdm.tqdm(evaluated_tensors[2]),
            evaluated_tensors[3],
            evaluated_tensors[4],
    ):
        for alignment, mel_len, text_len in zip(alignments, mel_lens,
                                                text_lens):
            alignment = alignment.cpu().numpy()
            mel_len = mel_len.cpu().numpy().item()
            text_len = text_len.cpu().numpy().item()
            dur = get_D(alignment[:mel_len, :text_len], mel_len)
            k += 1
            np.save(alignments_dir / f'{k}.npy', dur, allow_pickle=False)
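To make the duration-extraction step concrete, here is a compact restatement of get_D on a made-up attention matrix: each mel frame is assigned to the text token it attends to most strongly, and the per-token frame counts are the durations written out as .npy files.

import numpy as np

def get_D(alignment, true_len):
    # alignment: (mel_len, text_len) attention weights for one utterance.
    D = np.zeros(alignment.shape[1], dtype=np.int64)
    for i in range(alignment.shape[0]):
        D[alignment[i].argmax()] += 1  # frame i counts toward its strongest token
    assert D.sum() == alignment.shape[0] == true_len
    return D

# Toy alignment: 4 mel frames attending over 3 text tokens.
alignment = np.array([
    [0.9, 0.1, 0.0],
    [0.7, 0.2, 0.1],
    [0.2, 0.6, 0.2],
    [0.1, 0.3, 0.6],
])
print(get_D(alignment, true_len=4))  # -> [2 1 1]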