def main():
    """Synthesize audio files from text with Tacotron 2 plus a vocoder.

    All settings come from ``parse_args()``. The Tacotron 2 inference graph
    is run on ``args.eval_dataset``; its first output is then converted to
    audio with either Griffin-Lim or WaveGlow, depending on ``args.vocoder``.
    Every sample is written as ``sample_<n>.wav`` (inside ``args.save_dir``
    when that is set) and a spectrogram is passed to ``plot_and_save_spec``.
    Any other ``args.vocoder`` value runs Tacotron 2 but writes nothing.

    Samples are numbered with a running counter, so file names stay unique
    and contiguous for any ``args.batch_size``.

    Raises:
        ValueError: If ``args.spec_model`` is not ``"tacotron2"``, or if
            WaveGlow is selected without both ``--vocoder_model_config`` and
            ``--vocoder_model_load_dir``.
    """
    args = parse_args()

    # Validate the command line before building any model, so that a bad
    # configuration fails immediately instead of after Tacotron 2 has run.
    if args.spec_model != "tacotron2":
        raise ValueError(
            f"Unsupported spec_model {args.spec_model!r}; "
            "only 'tacotron2' is supported")
    if args.vocoder == "waveglow" and not (
            args.vocoder_model_config and args.vocoder_model_load_dir):
        raise ValueError(
            "Using waveglow as the vocoder requires the "
            "--vocoder_model_config and --vocoder_model_load_dir args")

    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level,
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
    )

    # The cache flag is only enabled when no local rank was given.
    use_cache = args.local_rank is None
    if not use_cache:
        print("Doing ALL GPU")

    # Create text to spectrogram model
    yaml = YAML(typ="safe")
    with open(args.spec_model_config) as file:
        tacotron2_params = yaml.load(file)
    spec_neural_modules = create_NMs(tacotron2_params, decoder_infer=True)
    infer_tensors = create_infer_dags(
        neural_factory=neural_factory,
        neural_modules=spec_neural_modules,
        tacotron2_params=tacotron2_params,
        infer_dataset=args.eval_dataset,
        infer_batch_size=args.batch_size,
    )

    # Run tacotron 2
    print("Running Tacotron 2")
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=False,
    )
    # The last evaluated tensor is used below as the per-sample mel length,
    # indexed as mel_len[batch][sample].
    mel_len = evaluated_tensors[-1]
    print("Done Running Tacotron 2")

    filterbank = librosa.filters.mel(
        sr=tacotron2_params["sample_rate"],
        n_fft=tacotron2_params["n_fft"],
        n_mels=tacotron2_params["n_mels"],
        fmax=tacotron2_params["fmax"],
    )

    # Running index used to name output files across all batches.
    sample_idx = 0

    if args.vocoder == "griffin-lim":
        print("Running Griffin-Lim")
        mel_spec = evaluated_tensors[0]
        for i, batch in enumerate(mel_spec):
            # Swap the last two axes; the per-sample slices below then trim
            # along the first axis using the mel length.
            log_mel = batch.cpu().numpy().transpose(0, 2, 1)
            mel = np.exp(log_mel)
            magnitudes = np.dot(mel, filterbank) * args.griffin_lim_mag_scale
            for j, sample in enumerate(magnitudes):
                n_frames = mel_len[i][j]
                sample = sample[:n_frames, :]
                audio = griffin_lim(sample.T ** args.griffin_lim_power)
                save_file = f"sample_{sample_idx}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                write(save_file, tacotron2_params["sample_rate"], audio)
                plot_and_save_spec(
                    log_mel[j][:n_frames, :].T, sample_idx, args.save_dir)
                sample_idx += 1

    elif args.vocoder == "waveglow":
        (mel_pred, _, _, _) = infer_tensors

        yaml = YAML(typ="safe")
        with open(args.vocoder_model_config) as file:
            waveglow_params = yaml.load(file)
        waveglow = nemo_tts.WaveGlowInferNM(
            sigma=args.waveglow_sigma, **waveglow_params["WaveGlowNM"])
        audio_pred = waveglow(mel_spectrogram=mel_pred)

        # Run waveglow
        # NOTE(review): the Tacotron 2 call above passes ``cache=`` while this
        # call passes ``use_cache=``; presumably the first stores tensors that
        # this one reuses -- confirm against NeuralModuleFactory.infer.
        print("Running Waveglow")
        evaluated_tensors = neural_factory.infer(
            tensors=[audio_pred],
            checkpoint_dir=args.vocoder_model_load_dir,
            modules_to_restore=[waveglow],
            use_cache=use_cache,
        )
        print("Done Running Waveglow")

        use_denoiser = args.waveglow_denoiser_strength > 0
        if use_denoiser:
            print("Setup denoiser")
            waveglow.setup_denoiser()

        print("Saving results to disk")
        for i, batch in enumerate(evaluated_tensors[0]):
            audio = batch.cpu().numpy()
            for j, sample in enumerate(audio):
                # Trim the audio to mel length times the configured stride.
                sample_len = mel_len[i][j] * tacotron2_params["n_stride"]
                sample = sample[:sample_len]
                save_file = f"sample_{sample_idx}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                if use_denoiser:
                    sample, spec = waveglow.denoise(
                        sample, strength=args.waveglow_denoiser_strength)
                else:
                    spec, _ = librosa.core.magphase(
                        librosa.core.stft(
                            sample, n_fft=waveglow_params["n_fft"]))
                write(save_file, waveglow_params["sample_rate"], sample)
                # Apply the mel filterbank, then take a floored log for
                # plotting.
                spec = np.dot(filterbank, spec)
                spec = np.log(np.clip(spec, a_min=1e-5, a_max=None))
                plot_and_save_spec(spec, sample_idx, args.save_dir)
                sample_idx += 1
def main():
    """Run Tacotron 2 inference and save one duration array per sample.

    All settings come from ``parse_args()``. After inference, the tensors at
    positions 2, 3 and 4 of the outputs are consumed as alignments, mel
    lengths and text lengths. Each alignment is cropped to its own mel and
    text length and turned into a vector that counts, for every column, the
    rows whose maximum falls on it. The vectors are written to
    ``args.durations_dir`` as ``0.npy``, ``1.npy``, ... in iteration order.

    Raises:
        ValueError: If ``args.spec_model`` is not ``"tacotron2"``, or if a
            cropped alignment does not have exactly ``mel_len`` rows.
    """
    args = parse_args()

    # Fail early with a clear message; otherwise the code below would hit an
    # undefined ``infer_tensors``.
    if args.spec_model != "tacotron2":
        raise ValueError(
            f"Unsupported spec_model {args.spec_model!r}; "
            "only 'tacotron2' is supported")

    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level, local_rank=args.local_rank,
    )

    # The cache flag is only enabled when no local rank was given.
    use_cache = args.local_rank is None
    if not use_cache:
        logging.info("Doing ALL GPU")

    # Create text to spectrogram model
    yaml = YAML(typ="safe")
    with open(args.spec_model_config) as file:
        tacotron2_params = yaml.load(file)
    labels = tacotron2_params['labels']
    spec_neural_modules = create_NMs(
        args.spec_model_config, labels=labels, decoder_infer=False)
    infer_tensors = create_infer_dags(
        neural_factory=neural_factory,
        neural_modules=spec_neural_modules,
        tacotron2_config_file=args.spec_model_config,
        tacotron2_params=tacotron2_params,
        infer_dataset=args.eval_dataset,
        infer_batch_size=args.batch_size,
        labels=labels,
    )

    # Run tacotron 2
    logging.info("Running Tacotron 2")
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=True,
    )

    def get_D(alignment, true_len):
        """Return per-column counts of row-wise maxima in ``alignment``.

        ``alignment`` is a 2-D array. Entry ``c`` of the result is the number
        of rows whose first maximum sits in column ``c``, so the result sums
        to the number of rows.

        Raises:
            ValueError: If the number of rows differs from ``true_len``.
        """
        n_cols = np.shape(alignment)[1]
        # argmax picks the first maximum in each row, the same tie-breaking
        # as a list ``index(max)`` lookup.
        peak_cols = np.argmax(alignment, axis=1)
        # astype(int) keeps NumPy's default integer dtype for the saved data.
        D = np.bincount(peak_cols, minlength=n_cols).astype(int)
        # An explicit check rather than ``assert`` so it survives ``python -O``.
        if D.sum() != true_len:
            raise ValueError(
                f"Alignment has {D.sum()} rows but expected {true_len}")
        return D

    # Save durations.
    alignments_dir = pathlib.Path(args.durations_dir)
    # parents=True so a nested output path does not need to exist beforehand.
    alignments_dir.mkdir(parents=True, exist_ok=True)

    n_saved = 0
    for alignments, mel_lens, text_lens in zip(
        tqdm.tqdm(evaluated_tensors[2]),
        evaluated_tensors[3],
        evaluated_tensors[4],
    ):
        for alignment, mel_len, text_len in zip(alignments, mel_lens, text_lens):
            alignment = alignment.cpu().numpy()
            mel_len = mel_len.cpu().numpy().item()
            text_len = text_len.cpu().numpy().item()
            # Crop to this sample's own mel and text length before counting.
            dur = get_D(alignment[:mel_len, :text_len], mel_len)
            np.save(alignments_dir / f'{n_saved}.npy', dur, allow_pickle=False)
            n_saved += 1