def load_model(self):
    """Load Tacotron2, WaveGlow and the denoiser, then optionally warm up.

    Reads ``self.args`` and ``self.parser``. On return the TorchScript
    Tacotron2 is stored in ``self.jitted_tacotron2`` and the vocoder and
    denoiser in ``self.waveglow`` / ``self.denoiser``.
    """
    print("loading model...")
    args, parser = self.args, self.parser
    on_cpu = args.cpu_run

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, on_cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, on_cpu,
                                    forward_is_infer=True)

    denoiser = Denoiser(waveglow, on_cpu)
    if not on_cpu:
        denoiser = denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # The banner is printed whether or not warm-up is enabled.
    print("warming up...")
    if args.include_warmup:
        # Random token ids are enough to exercise both networks.
        sequence = torch.randint(low=0, high=148, size=(1, 50),
                                 dtype=torch.long)
        input_lengths = torch.IntTensor([sequence.size(1)])
        if not on_cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        input_lengths = input_lengths.long()

        with torch.no_grad():
            for _step in range(3):
                mel, mel_lengths, _ = jitted_tacotron2(sequence,
                                                       input_lengths)
                _ = waveglow(mel)

    self.jitted_tacotron2 = jitted_tacotron2
    self.waveglow = waveglow
    self.denoiser = denoiser
    print("done...")
def _denoiser(waveglow, filter_length=1024, n_overlap=4, win_length=1024,
              mode='zeros'):
    """Return a ``Denoiser`` built for ``waveglow``.

    All arguments are forwarded positionally, in the order given, to the
    ``Denoiser`` constructor.
    """
    return Denoiser(waveglow, filter_length, n_overlap, win_length, mode)
def setup():
    """Initialise the module globals ``model``, ``waveglow``, ``denoiser``
    and ``hparams``.

    Downloads both checkpoints via ``downloads.download_from_gdrive``, loads
    the Tacotron2 state dict and the pickled WaveGlow model, and moves both
    networks to CUDA in eval mode with half precision.
    """
    global model, waveglow, denoiser, hparams

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    tacotron_ckpt = downloads.download_from_gdrive(
        gdrive_fileid='1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA',
        output_path='tacotron2/tacotron2_statedict.pt')
    model = load_model(hparams)
    tacotron_state = torch.load(tacotron_ckpt)
    model.load_state_dict(tacotron_state['state_dict'])
    model.cuda().eval().half()

    vocoder_ckpt = downloads.download_from_gdrive(
        gdrive_fileid='1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF',
        output_path='tacotron2/waveglow_256channels_universal_v5.pt')
    # The checkpoint is loaded inside the local-import context manager.
    with submodules.localimport('submodules/tacotron2/waveglow') as _importer:
        waveglow = torch.load(vocoder_ckpt)['model']

    waveglow.cuda().eval().half()
    # After the half() cast, the convinv layers are cast back to float.
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    denoiser = Denoiser(waveglow)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Synthesize audio with WaveGlow for every saved mel file.

    Args:
        mel_files: passed to ``files_to_list`` to obtain the mel file paths.
        waveglow_path: checkpoint whose ``'model'`` entry is the WaveGlow net.
        sigma: forwarded to ``waveglow.infer``.
        output_dir: directory that receives ``<name>_synthesis.wav`` files.
        sampling_rate: sample rate passed to ``write``.
        is_fp16: when true, initialise apex AMP (O3) and feed half-precision
            mels.
        denoiser_strength: values > 0 enable the denoiser with that strength.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    use_denoiser = denoiser_strength > 0
    if use_denoiser:
        denoiser = Denoiser(waveglow).cuda()

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # torch.autograd.Variable is a deprecated no-op wrapper since
        # tensors and variables were merged; use the tensor directly.
        mel = torch.load(file_path).cuda()
        mel = torch.unsqueeze(mel, 0)
        if is_fp16:
            mel = mel.half()
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if use_denoiser:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def init_vocoder():
    """Load the WaveGlow checkpoint and build its denoiser.

    Returns:
        ``(waveglow, denoiser)``.
    """
    checkpoint = torch.load('../waveglow_256channels_universal_v5.pt')
    waveglow = checkpoint['model']
    # Cast every convinv layer to float.
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    return waveglow, Denoiser(waveglow)
def load_denoiser(waveglow_path):
    """Build a ``Denoiser`` from the WaveGlow checkpoint at ``waveglow_path``.

    The checkpoint's ``'model'`` entry is moved to CUDA and wrapped in a
    ``Denoiser`` created with ``mode='zeros'``.

    Library:
        from waveglow.denoiser import Denoiser
    """
    vocoder = torch.load(waveglow_path)['model']
    vocoder.cuda()
    return Denoiser(vocoder, mode='zeros')
def load_models(hparams, checkpoint_path, waveglow_path):
    """Load the acoustic model and the WaveGlow vocoder onto CUDA.

    Args:
        hparams: hyper-parameters passed to ``load_model``.
        checkpoint_path: checkpoint whose ``'state_dict'`` is loaded into the
            acoustic model.
        waveglow_path: checkpoint whose ``'model'`` entry is the vocoder.

    Returns:
        ``(model, waveglow)``, both in eval mode.
    """
    print("load models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.cuda().eval()

    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval()
    for k in waveglow.convinv:
        k.float()
    # A Denoiser used to be constructed here but was never used or
    # returned, so the dead construction was dropped.
    print("loaded!")
    return model, waveglow
def init_model():
    """Load the Mellotron model, WaveGlow and a denoiser, all on CPU.

    Returns:
        ``(tacotron, waveglow, denoiser)``, each in eval mode.
    """
    hparams = create_hparams()

    tacotron = load_model(hparams).cpu().eval()
    tacotron_ckpt = torch.load("checkpoints/mellotron_libritts.pt",
                               map_location=torch.device('cpu'))
    tacotron.load_state_dict(tacotron_ckpt['state_dict'])

    waveglow_ckpt = torch.load('checkpoints/waveglow_256channels_v4.pt',
                               map_location=torch.device('cpu'))
    waveglow = waveglow_ckpt['model'].cpu().eval()

    denoiser = Denoiser(waveglow).cpu().eval()
    return (tacotron, waveglow, denoiser)
def load_tts_vocoder_models(tacotron_checkpoint_path,
                            waveglow_checkpoint_path):
    """Load the TTS model, WaveGlow and a denoiser onto CUDA.

    Returns:
        ``(model, waveglow, denoiser, hparams)``; ``hparams.sampling_rate`` is
        forced to 22050.
    """
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    tacotron_state = torch.load(tacotron_checkpoint_path)['state_dict']
    model.load_state_dict(tacotron_state)
    model.cuda().eval()

    waveglow = torch.load(waveglow_checkpoint_path)['model']
    waveglow.cuda().eval()
    # The float cast of the waveglow.convinv layers was commented out in
    # this loader and stays disabled.

    return model, waveglow, Denoiser(waveglow), hparams
def load_waveglow_model(model_path: str, device: torch.device):
    """Load a WaveGlow checkpoint onto ``device`` and build its denoiser.

    Args:
        model_path: checkpoint whose ``'model'`` entry is the WaveGlow net.
        device: target device; on any non-CPU device the model is cast to
            half precision and its convinv layers back to float.

    Returns:
        ``(waveglow, denoiser)``.
    """
    # this is required for pickle to see glow module
    glow_dir = "tts_dev/waveglow/"
    if glow_dir not in sys.path:
        # Guard against growing sys.path on every call.
        sys.path.append(glow_dir)

    waveglow = torch.load(model_path, map_location=device)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.eval()
    if device.type != 'cpu':
        # .to(device) keeps the requested device index; .cuda() would move
        # the model to the current default GPU instead.
        waveglow.to(device).half()
        for k in waveglow.convinv:
            k.float()
    denoiser = Denoiser(waveglow)
    return waveglow, denoiser
def __init__(self, ckpt, wglw, n_speakers=123):
    """Load Mellotron, WaveGlow, the denoiser and the CMU dictionary.

    Args:
        ckpt: checkpoint whose ``'state_dict'`` is loaded into Mellotron.
        wglw: checkpoint whose ``'model'`` entry is the WaveGlow net.
        n_speakers: value written to ``hparams.n_speakers``.
    """
    print("[Loading Model]")
    self.ckpt = ckpt

    hp = create_hparams()
    hp.n_speakers = n_speakers
    self.hparams = hp

    self.stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
                             hp.n_mel_channels, hp.sampling_rate,
                             hp.mel_fmin, hp.mel_fmax)

    self.mellotron = load_model(hp).cuda().eval()
    waveglow_ckpt = torch.load(wglw)
    self.waveglow = waveglow_ckpt['model'].cuda().eval()
    self.denoiser = Denoiser(self.waveglow).cuda().eval()
    self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')

    # Weights are loaded last, after the model has been moved to CUDA.
    mellotron_ckpt = torch.load(ckpt)
    self.mellotron.load_state_dict(mellotron_ckpt['state_dict'])
    print('[Loaded Model]')
def load_tts_model(checkpoint_path=None, waveglow_path=None):
    """Load the TTS model, WaveGlow and a denoiser with CPU-mapped weights.

    Returns:
        ``(model, denoiser, waveglow, hparams)``; the denoiser comes before
        the vocoder in the tuple.
    """
    hparams = create_hparams()

    # Acoustic model.
    model = load_model(hparams)
    tts_state = torch.load(checkpoint_path, map_location='cpu')['state_dict']
    model.load_state_dict(tts_state)
    model.eval()

    # WaveGlow for mel2audio synthesis, plus its denoiser.
    waveglow = torch.load(waveglow_path, map_location='cpu')['model']
    waveglow.eval()
    for inv_conv in waveglow.convinv:
        inv_conv.float()

    return model, Denoiser(waveglow), waveglow, hparams
os.mkdir(output_dir) logging.basicConfig(filename=os.path.join(output_dir, 'debug.log'), level=logging.DEBUG) logging.info('Output dir: %s', output_dir) # Parameters teacher_utt_path = args.teacher_utterance_path checkpoint_path = args.ppg2mel_model waveglow_path = args.waveglow_model is_clip = False # Set to True to control the output length of AC. fs = 16000 waveglow_sigma = 0.6 waveglow_for_denoiser = torch.load(waveglow_path)['model'] waveglow_for_denoiser.cuda() denoiser_mode = 'zeros' denoiser = Denoiser(waveglow_for_denoiser, mode=denoiser_mode) denoiser_strength = 0.005 # End of parameters logging.debug('Tacotron: %s', checkpoint_path) logging.debug('Waveglow: %s', waveglow_path) logging.debug('AM: SI model') logging.debug('is_clip: %d', is_clip) logging.debug('Fs: %d', fs) logging.debug('Sigma: %f', waveglow_sigma) logging.debug('Denoiser strength: %f', denoiser_strength) logging.debug('Denoiser mode: %s', denoiser_mode) hparams = create_hparams_stage() taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.n_acoustic_feat_dims,
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Reads the lines of ``args.input``, runs the TorchScript Tacotron2 model,
    WaveGlow and the denoiser, logs throughput and latency through DLLogger
    and writes one ``audio_<i>.wav`` per synthesized audio into
    ``args.output``. Exits with status 1 when the input cannot be read.
    """
    import os  # local import: only needed to build the output wav paths

    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT,
                          args.output + '/' + args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, forward_is_infer=True)
    denoiser = Denoiser(waveglow).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        # The context manager closes the handle even if reading fails.
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeDecodeError):
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0, high=148, size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for _step in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios,
                          strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")
    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0,
                 data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={
                     "latency": (measurements['tacotron2_time'] +
                                 measurements['waveglow_time'])
                 })

    for i, audio in enumerate(audios):
        # Trim to the predicted length, then peak-normalise.
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        # args.output is used as a directory for the log file above, so
        # join with a separator rather than concatenating raw strings.
        audio_path = os.path.join(args.output, "audio_" + str(i) + ".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Either stage can be disabled by passing ``SKIP`` for ``args.fastpitch``
    or ``args.waveglow``. With the generator skipped, ``b['mel']`` from the
    input batches is fed to WaveGlow instead. Per-batch metrics are logged
    only when ``args.repeats == 1``; aggregate statistics are always logged
    at the end.

    Raises:
        ValueError: if unrecognised command-line options remain after both
            models have consumed theirs.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
                            StdOutBackend(Verbosity.VERBOSE,
                                          metric_format=stdout_metric_format)])
    init_inference_metadata()
    # A plain loop: the calls are made for their side effect only.
    for k, v in vars(args).items():
        DLLogger.log("PARAMETER", {k: v})

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch != 'SKIP':
        generator = load_and_setup_model(
            'FastPitch', parser, args.fastpitch, args.amp, device,
            unk_args=unk_args, forward_is_infer=True, ema=args.ema,
            jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow != 'SKIP':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model(
                'WaveGlow', parser, args.waveglow, args.amp, device,
                unk_args=unk_args, forward_is_infer=True, ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(
        fields, device, args.symbol_set, args.text_cleaners, args.batch_size,
        args.dataset_path, load_mels=(generator is None),
        p_arpabet=args.p_arpabet)

    # Use real data rather than synthetic - FastPitch predicts len
    for _ in tqdm(range(args.warmup_steps), 'Warmup'):
        with torch.no_grad():
            if generator is not None:
                b = batches[0]
                mel, *_ = generator(b['text'])
            elif waveglow is not None:
                # Without a generator the vocoder warm-up previously hit an
                # undefined `mel`; use the ground-truth mel instead.
                mel = batches[0]['mel']
            if waveglow is not None:
                audios = waveglow(mel, sigma=args.sigma_infer).float()
                _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime(cuda=args.cuda)
    waveglow_measures = MeasureTime(cuda=args.cuda)

    gen_kw = {'pace': args.pace,
              'speaker': args.speaker,
              'pitch_tgt': None,
              'pitch_transform': build_pitch_transformation(args)}

    if args.torchscript:
        gen_kw.pop('pitch_transform')
        print('NOTE: Pitch transforms are disabled with TorchScript')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    log_enabled = reps == 1
    # `log` reads `log_enabled` at call time, so flipping the flag after the
    # loop re-enables logging for the summary statistics.
    log = lambda s, d: DLLogger.log(step=s, data=d) if log_enabled else None

    for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                # NOTE(review): the payload is a set literal, not a dict -
                # confirm the logger backends accept it.
                log(rep, {'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(rep, {"fastpitch_frames/s": gen_infer_perf})
                log(rep, {"fastpitch_latency": gen_measures[-1]})

                if args.save_mels:
                    for i, mel_ in enumerate(mel):
                        m = mel_[:, :mel_lens[i].item()].permute(1, 0)
                        fname = b['output'][i] if 'output' in b else f'mel_{i}.npy'
                        mel_path = Path(args.output, Path(fname).stem + '.npy')
                        np.save(mel_path, m.cpu().numpy())

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(audios.float(),
                                      strength=args.denoising_strength
                                      ).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (
                    audios.size(0) * audios.size(1) / waveglow_measures[-1])

                log(rep, {"waveglow_samples/s": waveglow_infer_perf})
                log(rep, {"waveglow_latency": waveglow_measures[-1]})

                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        audio = audio[:mel_lens[i].item() * args.stft_hop_length]

                        if args.fade_out:
                            # Linear fade over the last `fade_out` frames.
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate,
                              audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    log_enabled = True
    if generator is not None:
        gm = np.sort(np.asarray(gen_measures))
        rtf = all_samples / (all_utterances * gm.mean() * args.sampling_rate)
        log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()})
        log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()})
        log((), {"avg_fastpitch_latency": gm.mean()})
        log((), {"avg_fastpitch_RTF": rtf})
        log((), {"90%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()})
        log((), {"95%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()})
        log((), {"99%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()})
    if waveglow is not None:
        wm = np.sort(np.asarray(waveglow_measures))
        rtf = all_samples / (all_utterances * wm.mean() * args.sampling_rate)
        log((), {"avg_waveglow_samples/s": all_samples / wm.sum()})
        log((), {"avg_waveglow_latency": wm.mean()})
        log((), {"avg_waveglow_RTF": rtf})
        log((), {"90%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()})
        log((), {"95%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()})
        log((), {"99%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()})
    if generator is not None and waveglow is not None:
        m = gm + wm
        rtf = all_samples / (all_utterances * m.mean() * args.sampling_rate)
        log((), {"avg_samples/s": all_samples / m.sum()})
        log((), {"avg_letters/s": all_letters / m.sum()})
        log((), {"avg_latency": m.mean()})
        log((), {"avg_RTF": rtf})
        log((), {"90%_latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()})
        log((), {"95%_latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()})
        log((), {"99%_latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()})
    DLLogger.flush()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.

    Three modes are supported:
      * ``args.use_extracted_mels``: load a mel from ``args.mel_path`` and
        vocode it with ``apply_griffin_lim``.
      * ``args.use_griffin_lim``: run Tacotron2, then ``apply_griffin_lim``.
      * otherwise: run Tacotron2, WaveGlow and the denoiser, and log
        throughput/latency through DLLogger.

    Alignment plots and wav files are written into ``args.output``. Exits
    with status 1 when the input text file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    use_custom_naming = args.custom_name
    input_path = args.input
    text_cleaners = args.text_cleaners

    check_directory_and_create(args.output, exists_warning=True)

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT,
                          args.output + '/' + args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    if args.use_extracted_mels:
        print(f"mel found in {args.mel_path}")
        mel = torch.load(args.mel_path)
        mel = mel.unsqueeze(0)
        print(f"The size of the mel we just loaded is {mel.shape}")
        audios = apply_griffin_lim(args, mel)
    else:
        tacotron2 = load_and_setup_model('Tacotron2',
                                         parser,
                                         args.tacotron2,
                                         args.fp16,
                                         args.cpu,
                                         forward_is_infer=True)
        if not args.use_griffin_lim:
            waveglow = load_and_setup_model('WaveGlow',
                                            parser,
                                            args.waveglow,
                                            args.fp16,
                                            args.cpu,
                                            forward_is_infer=True)
            denoiser = Denoiser(waveglow)
            if not args.cpu:
                denoiser.cuda()

        jitted_tacotron2 = torch.jit.script(tacotron2)

        texts = []
        try:
            # The context manager closes the handle even if reading fails.
            with open(args.input, 'r') as f:
                texts = f.readlines()
        except (OSError, UnicodeDecodeError):
            print("Could not read file")
            sys.exit(1)

        if args.include_warmup and (not args.use_griffin_lim):
            sequence = torch.randint(low=0, high=148, size=(1, 50)).long()
            input_lengths = torch.IntTensor([sequence.size(1)]).long()
            if not args.cpu:
                sequence = sequence.cuda()
                input_lengths = input_lengths.cuda()
            for _step in range(3):
                with torch.no_grad():
                    mel, mel_lengths, _ = jitted_tacotron2(
                        sequence, input_lengths)
                    _ = waveglow(mel)

        measurements = {}

        sequences_padded, input_lengths = \
            prepare_input_sequence(texts, args.cpu, text_cleaners)

        with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                          args.cpu):
            mel, mel_lengths, alignments = jitted_tacotron2(
                sequences_padded, input_lengths)

        if args.use_griffin_lim:
            print(f"The size of the generated mel spec is {mel.shape}")
            audios = apply_griffin_lim(args, mel)
        else:
            with torch.no_grad(), MeasureTime(measurements, "waveglow_time",
                                              args.cpu):
                audios = waveglow(mel, sigma=args.sigma_infer)
                audios = audios.float()
            with torch.no_grad(), MeasureTime(measurements, "denoiser_time",
                                              args.cpu):
                audios = denoiser(
                    audios, strength=args.denoising_strength).squeeze(1)

            # NOTE(review): kept inside the WaveGlow branch because the
            # 'waveglow_time' / 'denoiser_time' keys only exist here -
            # confirm against the original layout.
            print("Stopping after", mel.size(2), "decoder steps")
            tacotron2_infer_perf = mel.size(0) * mel.size(
                2) / measurements['tacotron2_time']
            waveglow_infer_perf = audios.size(0) * audios.size(
                1) / measurements['waveglow_time']

            DLLogger.log(
                step=0,
                data={"tacotron2_items_per_sec": tacotron2_infer_perf})
            DLLogger.log(
                step=0,
                data={"tacotron2_latency": measurements['tacotron2_time']})
            DLLogger.log(
                step=0,
                data={"waveglow_items_per_sec": waveglow_infer_perf})
            DLLogger.log(
                step=0,
                data={"waveglow_latency": measurements['waveglow_time']})
            DLLogger.log(
                step=0,
                data={"denoiser_latency": measurements['denoiser_time']})
            DLLogger.log(step=0,
                         data={
                             "latency": (measurements['tacotron2_time'] +
                                         measurements['waveglow_time'] +
                                         measurements['denoiser_time'])
                         })

    for i, audio in enumerate(audios):
        if use_custom_naming:
            # Name outputs after the source file (mel or text), minus its
            # extension.
            if args.use_extracted_mels:
                custom_name = (args.mel_path.split("/")[-1]).split(".")[0]
            else:
                custom_name = (input_path.split("/")[-1]).split(".")[0]
            custom_path = os.path.join(args.output, custom_name)

            if not args.use_extracted_mels:
                # save alignment (a leftover pdb.set_trace() that halted
                # every run here was removed)
                plt.imshow(alignments[i].float().data.cpu().numpy().T,
                           aspect="auto",
                           origin="lower")
                figure_path = custom_path + "_alignment.png"
                plt.savefig(figure_path)
                meltitle = "_predicted"
            else:
                meltitle = "_extracetd"

            # save predicted mel
            plot_mel_spectrogram(
                mel,
                title=meltitle,
                dirname=custom_path,
                append_name=True,
                load_mel_path=False,
            )

            # save generated audio
            if not args.use_extracted_mels:
                # mel_lengths only exists when Tacotron2 produced the mel.
                audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            audio_path = custom_path + ".wav"
            write(audio_path, args.sampling_rate, audio.cpu().numpy())
        else:
            plt.imshow(alignments[i].float().data.cpu().numpy().T,
                       aspect="auto",
                       origin="lower")
            figure_path = "alignment_" + str(i) + "_" + args.suffix + ".png"
            figure_path = os.path.join(args.output, figure_path)
            plt.savefig(figure_path)

            audio = audio[:mel_lengths[i] * args.stft_hop_length]
            audio = audio / torch.max(torch.abs(audio))
            audio_path = \
                os.path.join(args.output,
                             "audio_"+str(i)+"_"+args.suffix+".wav")
            write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
def get_evaluator(evaluator_classname: str,
                  encoder_hparams: HParams,
                  encoder_checkpoint_path: str,
                  vocoder_hparams: HParams,
                  vocoder_checkpoint_path: str,
                  use_denoiser: bool = True,
                  device: str = 'cpu') -> BaseEvaluator:
    """
    Create an Evaluator instance for synthesis.

    Args:
        evaluator_classname: `str` class of evaluator
        encoder_hparams: `HParams` with tacotron2 meta
        encoder_checkpoint_path: `str` path to tacotron2 checkpoint
        vocoder_hparams: `HParams` with waveglow meta
        vocoder_checkpoint_path: `str` path to waveglow checkpoint; either a
            dict holding ``'model_state_dict'`` or a bare state dict
        use_denoiser: `bool` whether to apply postprocessing denoising
        device: `str` identifier for device to use

    Returns:
        `BaseEvaluator` instance
    """
    encoder = Factory.get_object(
        f"tacotron2.models.{encoder_hparams['model_class_name']}",
        encoder_hparams)
    encoder_checkpoint = torch.load(encoder_checkpoint_path,
                                    map_location=device)
    encoder.load_state_dict(encoder_checkpoint['model_state_dict'])
    encoder.to(device)

    vocoder = Factory.get_object(
        f"waveglow.models.{vocoder_hparams['model_class_name']}",
        vocoder_hparams)
    # Read the checkpoint once and reuse it, instead of reloading the same
    # file from disk a second time.
    vocoder_loaded_weights = torch.load(vocoder_checkpoint_path,
                                        map_location=device)
    if 'model_state_dict' in vocoder_loaded_weights:
        vocoder_state = vocoder_loaded_weights['model_state_dict']
    else:
        vocoder_state = vocoder_loaded_weights
    vocoder.load_state_dict(vocoder_state)
    vocoder.to(device)

    if use_denoiser:
        denoiser = Denoiser(vocoder, device=device)
    else:
        denoiser = None

    tokenizer = Factory.get_object(
        f"tacotron2.tokenizers.{encoder_hparams['tokenizer_class_name']}")

    evaluator = Factory.get_object(
        f"tacotron2.evaluators.{evaluator_classname}",
        encoder=encoder,
        vocoder=vocoder,
        tokenizer=tokenizer,
        denoiser=denoiser,
        device=device)

    return evaluator
hparams = create_hparams() # Load model from checkpoint checkpoint_path = "./outdir/4/checkpoint_57500" model, _ = load_Tacotron2(hparams, device) model.load_state_dict( torch.load(checkpoint_path, map_location=device)['state_dict']) _ = model.eval() # Load WaveGlow for mel2audio synthesis if device == torch.device('cuda'): sys.path.insert(0, "waveglow/") # To look glow(original version) first from waveglow.denoiser import Denoiser else: sys.path.insert( 0, "waveglow_cpu_components/") # To look glow(cpu version) first from waveglow_cpu_components.denoiser import Denoiser waveglow_path = './waveglow/waveglow_170000_22k' waveglow = torch.load(waveglow_path, map_location=device)['model'] waveglow.eval() denoiser = Denoiser(waveglow).to(device) # Start Server tornado_logger = TornadoLogger() logger = tornado_logger.logger logger.info("Server Start") app = make_app(model, waveglow, hparams, denoiser, device, logger) app.listen(8888) tornado.ioloop.IOLoop.current().start()
model = load_model(hparams) model.load_state_dict(torch.load(checkpoint_path)['state_dict']) _ = model.cuda().eval().half() # 경상 "/home/ubuntu/Workspaces/thien/nvidia-tacotron-je/outdir/male/gyeongsang/waveglow_gyeongsang_266000" # 제주 "/home/ubuntu/Workspaces/thien/nvidia-tacotron-je/outdir/waveglow_jeju_146000" # 전라 "/home/ubuntu/Workspaces/thien/nvidia-tacotron-jeonla/outdir/waveglow_240000" waveglow_path = "/home/ubuntu/Workspaces/thien/nvidia-tacotron-je/outdir/male/gyeongsang/waveglow_gyeongsang_266000" taco = checkpoint_path.split('_')[-1] wave = waveglow_path.split('_')[-1] waveglow = torch.load(waveglow_path)['model'] waveglow.cuda().eval().half() for k in waveglow.convinv: k.float() denoiser = Denoiser(waveglow) # 텍스트 넣기 txt_list = [ '여러분이 있었기 땜시 즈희가 잘할 수 있었십니다', '니는 어떤 제목 드라마를 좋아하노?', '내 친구들은 다 휴가 갔습니더', '어. 그라모 다덜 이메일 주소들 좀 도.', '내는 이 책으로 열심히 공부하고 싶어예', '이 둘은 같은 디자인인데 사이즈가 다릅니더', '온라인상에서도 마찬가지입니더', '고객분들에 한해 무료로 배포하는거 아닙니꺼?', '애들이 묵기에는 쪼매 그렇네예.', '당신은 기차역에서 열차를 잘못 탔습니더', '그녀는 매사에 정확한 사람입니더', '갈비탕을 맛있게 하는 곳이 있으믄 거 가고 싶데이.', '건물 중에 어데 갈라꼬 하시는건가예?', '훨씬 나아지긴 했는데 지금은 너무 밝아서 파이다.', '예, 문제가 있으신가예?', '당신은 내랑 꼭 같이 가지 않아도 됩니더', '당신 마이 아파 보이는데 병원에 가보는 게 어떻습니꺼?', '저는 제가 결혼하게 되어가 기쁩니더', '오늘이 물리치료 몇 번째 받으시는 긴가예?', '영어보다 중국어로 말씀을 더 잘하시네예' ] for i, text in enumerate(txt_list): # text = "야. 도로모깡도 왜정시대나 낫주. 도로모깡도 엇일 땐양 허벅에."
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Either stage is skipped when ``args.fastpitch`` or ``args.waveglow`` is
    ``None``. With the generator skipped, ``b['mel']`` from the input batches
    is fed to WaveGlow instead. Per-batch metrics are logged only when
    ``args.repeats == 1``; aggregate statistics are always logged at the end.

    Raises:
        ValueError: if unrecognised command-line options remain after both
            models have consumed theirs.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'})

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch is not None:
        generator = load_and_setup_model('FastPitch',
                                         parser,
                                         args.fastpitch,
                                         args.amp_run,
                                         device,
                                         unk_args=unk_args,
                                         forward_is_infer=True,
                                         ema=args.ema,
                                         jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow is not None:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model('WaveGlow',
                                            parser,
                                            args.waveglow,
                                            args.amp_run,
                                            device,
                                            unk_args=unk_args,
                                            forward_is_infer=True,
                                            ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(fields,
                                     device,
                                     args.batch_size,
                                     args.dataset_path,
                                     load_mels=(generator is None))

    if args.include_warmup:
        # Use real data rather than synthetic - FastPitch predicts len
        for _step in range(3):
            with torch.no_grad():
                if generator is not None:
                    b = batches[0]
                    mel, *_ = generator(b['text'], b['text_lens'])
                elif waveglow is not None:
                    # Without a generator the vocoder warm-up previously hit
                    # an undefined `mel`; use the ground-truth mel instead.
                    mel = batches[0]['mel']
                if waveglow is not None:
                    audios = waveglow(mel, sigma=args.sigma_infer).float()
                    _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime()
    waveglow_measures = MeasureTime()

    gen_kw = {
        'pace': args.pace,
        'pitch_tgt': None,
        'pitch_transform': build_pitch_transformation(args)
    }

    if args.torchscript:
        gen_kw.pop('pitch_transform')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    log_enabled = reps == 1
    # `log` reads `log_enabled` at call time, so flipping the flag after the
    # loop re-enables logging for the summary statistics.
    log = lambda s, d: DLLogger.log(step=s, data=d) if log_enabled else None

    for repeat in (tqdm.tqdm(range(reps)) if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                # NOTE(review): the payload is a set literal, not a dict -
                # confirm the logger backends accept it.
                log(0, {'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], b['text_lens'],
                                                  **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(0, {"generator_frames_per_sec": gen_infer_perf})
                log(0, {"generator_latency": gen_measures[-1]})

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(
                        audios.float(),
                        strength=args.denoising_strength).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (audios.size(0) * audios.size(1) /
                                       waveglow_measures[-1])

                log(0, {"waveglow_samples_per_sec": waveglow_infer_perf})
                log(0, {"waveglow_latency": waveglow_measures[-1]})

                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        audio = audio[:mel_lens[i].item() *
                                      args.stft_hop_length]

                        if args.fade_out:
                            # Linear fade over the last `fade_out` frames.
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][
                            i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate,
                              audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(0, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    log_enabled = True
    if generator is not None:
        gm = np.sort(np.asarray(gen_measures))
        log('avg', {"generator letters/s": all_letters / gm.sum()})
        log('avg', {"generator_frames/s": all_frames / gm.sum()})
        log('avg', {"generator_latency": gm.mean()})
        log('90%', {
            "generator_latency":
            gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()
        })
        log('95%', {
            "generator_latency":
            gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()
        })
        log('99%', {
            "generator_latency":
            gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()
        })
    if waveglow is not None:
        wm = np.sort(np.asarray(waveglow_measures))
        log('avg', {"waveglow_samples/s": all_samples / wm.sum()})
        log('avg', {"waveglow_latency": wm.mean()})
        log('90%', {
            "waveglow_latency":
            wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()
        })
        log('95%', {
            "waveglow_latency":
            wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()
        })
        log('99%', {
            "waveglow_latency":
            wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()
        })
    if generator is not None and waveglow is not None:
        m = gm + wm
        rtf = all_samples / (len(batches) * all_utterances * m.mean() *
                             args.sampling_rate)
        log('avg', {"samples/s": all_samples / m.sum()})
        log('avg', {"letters/s": all_letters / m.sum()})
        log('avg', {"latency": m.mean()})
        log('avg', {"RTF": rtf})
        log('90%',
            {"latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()})
        log('95%',
            {"latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()})
        log('99%',
            {"latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()})
    DLLogger.flush()
# Export the denoiser's STFT bases, squared window and strength-scaled bias
# spectrum as JSON.
sys.path.append(sys.argv[1])

# must be imported after path is modified
from import_utils import load_waveglow
from waveglow.denoiser import Denoiser

# sys.argv[4] overrides the default strength only when exactly four
# arguments are given.
strength = float(sys.argv[4]) if len(sys.argv) == 5 else 0.1

print("Building denoiser")

waveglow = load_waveglow(sys.argv[2], WAVEGLOW_CONFIG)
denoiser = Denoiser(waveglow).cuda()

# Tensors are converted to nested lists so that json can serialise them.
statedict = {
    "denoiser.stft.forward_basis":
    denoiser.stft.forward_basis.cpu().numpy().tolist(),
    "denoiser.stft.inverse_basis":
    denoiser.stft.inverse_basis.cpu().numpy().tolist(),
    "denoiser.stft.win_sq":
    gen_win_sq(denoiser).tolist(),
    "denoiser.bias_spec":
    (denoiser.bias_spec * strength).cpu().numpy().tolist(),
}

with open(json_path, "w") as fout:
    json.dump(statedict, fout, indent=2)

print("Wrote to '%s'" % json_path)
def main():
    """
    Benchmarks Tacotron 2 + WaveGlow text to speech (inference).

    Runs ``args.num_iters`` synthesis passes over a fixed passage, timing each
    stage with ``MeasureTime``. The first three passes are treated as warm-up:
    only later passes are logged through DLLogger and accumulated for
    ``print_stats``. Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE),
    ])
    for name, value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={name: value})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # One list of per-iteration values for every metric collected below.
    metric_names = (
        "pre_processing",
        "tacotron2_latency",
        "waveglow_latency",
        "latency",
        "type_conversion",
        "data_transfer",
        "storage",
        "tacotron2_items_per_sec",
        "waveglow_items_per_sec",
        "num_mels_per_audio",
        "throughput",
    )
    measurements_all = {metric: [] for metric in metric_names}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, args.cpu_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, args.cpu_run)

    denoiser = Denoiser(waveglow, args.cpu_run)
    if not args.cpu_run:
        denoiser = denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # The same passage twice, cut to the requested number of characters and
    # replicated to fill the batch.
    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. "
             "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]] * args.batch_size

    warmup_iters = 3

    for iteration in range(args.num_iters):
        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu_run):
            sequences_padded, input_lengths = prepare_input_sequence(
                texts, args.cpu_run)

        # "latency" spans both models; the two inner timers split it up.
        with torch.no_grad(), MeasureTime(measurements, "latency",
                                          args.cpu_run):
            with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
                mel, mel_lengths, _ = jitted_tacotron2(sequences_padded,
                                                       input_lengths)

            with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
                audios = waveglow.infer(mel, sigma=args.sigma_infer)
                audios = audios.float()
                audios = denoiser(
                    audios, strength=args.denoising_strength).squeeze(1)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu_run):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer", args.cpu_run):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu_run):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                n_valid = mel_lengths[i]*args.stft_hop_length
                write(audio_path, args.sampling_rate, audio[:n_valid])

        measurements['tacotron2_items_per_sec'] = (
            num_mels / measurements['tacotron2_latency'])
        measurements['waveglow_items_per_sec'] = (
            num_samples / measurements['waveglow_latency'])
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        # Warm-up iterations are measured but neither logged nor accumulated.
        if iteration >= warmup_iters:
            for metric, value in measurements.items():
                measurements_all[metric].append(value)
                DLLogger.log(step=(iteration - warmup_iters),
                             data={metric: value})

    DLLogger.flush()

    print_stats(measurements_all)
def agumentation(arpabet_dict, audio_paths, target_spk_id_list, output_path,
                 ljs=False):
    """Re-synthesise every listed utterance once per target speaker.

    For each entry of the file list, the model is first run on the source
    sample to obtain its alignment ("rhythm"), then ``inference_noattention``
    is run with that rhythm for every target speaker id. The resulting mel is
    vocoded with WaveGlow, denoised and written to
    ``<output_path>/<file_idx>-<spk_id>.wav``. The mapping from ``file_idx``
    to the source audio path is recorded in ``<output_path>/source.scp``.

    Args:
        arpabet_dict: Passed to ``cmudict.CMUDict`` to build the pronunciation
            dictionary used by ``text_to_sequence``.
        audio_paths: File list handed to ``TextMelLoader``.
        target_spk_id_list: Iterable of speaker ids; each one is wrapped in a
            ``torch.LongTensor`` and fed to the model.
        output_path: Output directory; created if it does not exist.
        ljs: When true, load ``mellotron_ljs.pt`` instead of
            ``mellotron_libritts.pt``.
    """
    os.makedirs(output_path, exist_ok=True)

    use_cuda = torch.cuda.is_available()

    # Step1: Basic Setups
    # Whether to use lj speech
    checkpoint_path = "mellotron_ljs.pt" if ljs else "mellotron_libritts.pt"

    tacotron = load_model(hparams)
    tacotron = tacotron.cuda().eval() if use_cuda else tacotron.eval()
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    tacotron.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")['state_dict'])

    waveglow_path = 'waveglow_256channels_v4.pt'
    if use_cuda:
        waveglow = torch.load(waveglow_path)['model'].cuda().eval()
        denoiser = Denoiser(waveglow).cuda().eval()
    else:
        waveglow = torch.load(
            waveglow_path, map_location="cpu")['model'].eval().cpu()
        denoiser = Denoiser(waveglow).eval()

    arpabet_dict = cmudict.CMUDict(arpabet_dict)
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Step2: Load
    # The index file is opened once for the whole run so that it accumulates
    # one line per source file and is closed deterministically.
    scp_path = os.path.join(output_path, "source.scp")
    with open(scp_path, "w", encoding="utf-8") as source_scp:
        for file_idx in range(len(dataloader)):
            audio_path, text, _ = dataloader.audiopaths_and_text[file_idx]
            source_scp.write("{} {}\n".format(file_idx, audio_path))

            # get audio path, encoded text, pitch contour and mel for gst
            text_encoded = torch.LongTensor(
                text_to_sequence(text, hparams.text_cleaners,
                                 arpabet_dict))[None, :]
            pitch_contour = dataloader[file_idx][3][None]
            if use_cuda:
                text_encoded = text_encoded.cuda()
                pitch_contour = pitch_contour.cuda()
            mel = load_mel(audio_path)

            # load source data to obtain rhythm using tacotron 2 as a forced
            # aligner
            x, _ = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

            # Step3: Perform speaker transfer
            with torch.no_grad():
                # get rhythm (alignment map) using tacotron 2
                _, _, _, rhythm = tacotron.forward(x)
                rhythm = rhythm.permute(1, 0, 2)

            for spk_id in target_spk_id_list:
                speaker_id = torch.LongTensor([spk_id])
                if use_cuda:
                    speaker_id = speaker_id.cuda()

                with torch.no_grad():
                    _, mel_outputs_postnet, _, _ = \
                        tacotron.inference_noattention(
                            (text_encoded, mel, speaker_id,
                             pitch_contour * 0.4, rhythm))
                    audio = denoiser(
                        waveglow.infer(mel_outputs_postnet, sigma=0.8),
                        0.01)[:, 0]

                sf.write(
                    os.path.join(output_path,
                                 "{}-{}.wav".format(file_idx, spk_id)),
                    audio.detach().cpu().numpy().T, hparams.sampling_rate)
def main():
    """Run Tacotron 2 + WaveGlow inference with TensorRT engines.

    Loads the four serialized engines named on the command line, synthesises
    audio for every line of ``args.input``, optionally denoises it (only when
    ``args.waveglow_ckpt`` is non-empty), writes one ``*_trt.wav`` per input
    line and logs the stage latencies through DLLogger. A one-line summary for
    the first utterance is appended to ``log_bs1_<precision>.log``.

    Exits with status 1 if the input file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    use_denoiser = args.waveglow_ckpt != ""
    if use_denoiser:
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt,
                                             True, forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output+'/'+args.log_file),
        StdOutBackend(Verbosity.VERBOSE),
    ])

    # Read the whole input up front; the context manager closes the handle
    # even when reading fails part-way through.
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(
            encoder, decoder_iter, postnet,
            encoder_context, decoder_context, postnet_context,
            sequences, sequence_lengths, measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel,
                                    measurements, args.fp16)

    # NOTE(review): entering and immediately leaving the contexts presumably
    # releases the TensorRT execution contexts - confirm against the TRT API.
    with encoder_context, decoder_context, postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if use_denoiser:
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(
                audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        # Cut to the generated length, then peak-normalise.
        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        audio = audio/torch.max(torch.abs(audio))
        audio_path = args.output + "audio_"+str(i)+"_trt.wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.log(step=0, data={
        "tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={
        "tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={
        "tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if use_denoiser:
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1)/latency
    log_data = "1,"+str(sequence_lengths[0].item())+","+prec+","+str(latency)+","+str(throughput)+","+str(mel_lengths[0].item())+"\n"
    with open("log_bs1_"+prec+".log", 'a') as f:
        f.write(log_data)
def measure(output_directory, log_directory, checkpoint_path, warm_start,
            n_gpus, rank, group_name, hparams):
    """Handles all the validation scoring and printing.

    For every entry of the validation file list, the utterance is
    re-synthesised three times with a randomly drawn target speaker: once with
    ``with_tpse=False`` and the reference mel, once with ``with_tpse=True``,
    and once with a random integer in place of the reference mel. Each result
    is vocoded, denoised and compared with the source waveform via ``dtw``.
    The latest scores and the running mean / standard deviation of each
    variant are printed after every file.

    Args:
        checkpoint_path: Checkpoint whose ``'state_dict'`` is loaded into the
            model.
        hparams: Hyper-parameters used to build the STFT, the model and the
            data loaders.
        output_directory, log_directory, warm_start, n_gpus, rank, group_name:
            Accepted but not read by this function.
    """
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    waveglow_path = '/media/arsh/New Volume/Models/speech/waveglow_256channels_v4.pt'
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()

    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt",
        hparams).speaker_ids
    # Raw string: "\|" is a regex escape, not a Python string escape.
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python', header=None, comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    # Speakers missing from the training list are marked with -1.
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())

    def _vocode(mel_postnet):
        # WaveGlow synthesis followed by denoising; keeps channel 0 only.
        with torch.no_grad():
            return denoiser(
                waveglow.infer(mel_postnet, sigma=0.8), 0.01)[:, 0]

    MEL_DTW = []
    TPP_DTW = []
    RAND_DTW = []
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)

    for file_idx in range(len(dataloader)):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader[file_idx][3][None].cuda()
        mel = load_mel(audio_path, stft)
        _, audio = read(audio_path)

        # load source data to obtain rhythm using tacotron 2 as a forced
        # aligner
        x, _ = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            (mel_outputs, mel_outputs_postnet, gate_outputs, rhythm, gst,
             tpse_gst) = mellotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)

        # Coin flip between the shuffled female and male speaker pools.
        speaker_id = next(female_speakers) if np.random.randint(2) else next(
            male_speakers)
        speaker_id = torch.LongTensor([speaker_id]).cuda()

        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=False)
        audio_mel = _vocode(mel_outputs_postnet)

        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=True)
        audio_tpp = _vocode(mel_outputs_postnet)

        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = mellotron.inference_noattention(
                (text_encoded, np.random.randint(0, 9), speaker_id,
                 pitch_contour, rhythm),
                with_tpse=False)
        audio_rand = _vocode(mel_outputs_postnet)

        audio = np.pad(audio, 128)

        MEL_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_mel.data.cpu().numpy(), audio, eucCepDist)[0]))
        TPP_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_tpp.data.cpu().numpy(), audio, eucCepDist)[0]))
        RAND_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_rand.data.cpu().numpy(), audio, eucCepDist)[0]))

        print(MEL_DTW[-1], TPP_DTW[-1], RAND_DTW[-1])
        print("MEL DTW, Mean: ", np.mean(MEL_DTW), " SD: ", np.std(MEL_DTW))
        print("TPP DTW, Mean: ", np.mean(TPP_DTW), " SD: ", np.std(TPP_DTW))
        print("RAND DTW, Mean: ", np.mean(RAND_DTW), " SD: ", np.std(RAND_DTW))
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Reads one utterance per line from ``args.input``, runs the scripted
    Tacotron 2 model followed by WaveGlow and the denoiser, logs throughput
    and latency through ``LOGGER`` and writes one peak-normalised
    ``audio_<i>.wav`` per input line. Exits with status 1 if the input file
    cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("waveglow_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run)
    denoiser = Denoiser(waveglow).cuda()

    # Point forward() at infer() on both the instance and its class before
    # scripting, so the scripted module runs inference when called.
    tacotron2.forward = tacotron2.infer
    type(tacotron2).forward = type(tacotron2).infer
    jitted_tacotron2 = torch.jit.script(tacotron2)

    # The context manager closes the handle even when reading fails.
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        # Three untimed passes on random token ids before measuring.
        sequence = torch.randint(low=0, high=148, size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for _ in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow.infer(mel)

    LOGGER.iteration_start()

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    # Float conversion and denoising are included in the WaveGlow timing.
    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow.infer(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)
    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf)
    LOGGER.log(key="waveglow_latency", value=measurements['waveglow_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time'] +
                                     measurements['waveglow_time']))

    for i, audio in enumerate(audios):
        # Cut to the generated length, then peak-normalise.
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    LOGGER.iteration_stop()
    LOGGER.finish()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.

    Reads one utterance per line from ``args.input`` and synthesises it with
    the scripted Tacotron 2 model conditioned on a reference mel
    (``args.ref_mel``) and an emotion id (``args.emotion_id``), followed by
    WaveGlow and the denoiser. For every utterance an alignment plot and a
    peak-normalised wav file are written to ``args.output``; latencies are
    logged through DLLogger. Exits with status 1 if the input file cannot be
    read.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # Passing forward_is_infer routes calls to the Tacotron model's infer().
    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # The context manager closes the handle even when reading fails.
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        print("Could not read file")
        sys.exit(1)

    # Conditioning inputs shared by the warm-up and the measured run.
    # NOTE(review): load_mel decides which device ref_mel lives on - confirm
    # that it honours args.cpu.
    ref_mel = load_mel(args.ref_mel)
    emotion_id = torch.LongTensor([args.emotion_id])
    if not args.cpu:
        # Only move to the GPU when one is in use, like every other tensor.
        emotion_id = emotion_id.cuda()
    print(emotion_id)

    if args.include_warmup:
        # Three untimed passes on random token ids before measuring.
        sequence = torch.randint(low=0, high=80, size=(1, 50)).long()
        input_lengths = torch.IntTensor([sequence.size(1)]).long()
        if not args.cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        for _ in range(3):
            with torch.no_grad():
                mel, mel_lengths, _ = jitted_tacotron2(
                    sequence, input_lengths, ref_mel, emotion_id)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu):
        mel, mel_lengths, alignments = jitted_tacotron2(
            sequences_padded, input_lengths, ref_mel, emotion_id)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()

    with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu):
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")

    tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time']

    DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"denoiser_latency": measurements['denoiser_time']})
    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time']+measurements['denoiser_time'])})

    for i, audio in enumerate(audios):
        # Alignment plot for this utterance.
        plt.imshow(alignments[i].float().data.cpu().numpy().T,
                   aspect="auto", origin="lower")
        figure_path = os.path.join(
            args.output, "alignment_"+str(i)+args.suffix+".png")
        plt.savefig(figure_path)

        # Cut to the generated length, then peak-normalise.
        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        audio = audio/torch.max(torch.abs(audio))
        audio_path = os.path.join(
            args.output, "audio_"+str(i)+args.suffix+".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

# Hyper-parameters; the batch size is forced to one.
hparams = create_hparams()
hparams.batch_size = 1

stft = TacotronSTFT(
    hparams.filter_length,
    hparams.hop_length,
    hparams.win_length,
    hparams.n_mel_channels,
    hparams.sampling_rate,
    hparams.mel_fmin,
    hparams.mel_fmax,
)

# Name of the speaker looked up in the training speaker table below.
speaker = "nes"

# Synthesis model: built on the GPU in eval mode, weights loaded afterwards.
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])

# Vocoder and the denoiser built from it.
waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()

arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')

# Test data: one sample per batch, in file-list order.
test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(
    test_set,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    collate_fn=datacollate,
    pin_memory=False,
    drop_last=False,
)

# Map the speaker name to the id assigned by the training file list.
speaker_ids = TextMelLoader(hparams.training_files, hparams).speaker_ids
speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()

# Total number of elements across all model parameters.
pytorch_total_params = sum(p.numel() for p in model.parameters())
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Iterates over ``dataloader(args)`` in batches of five, runs
    ``tacotron2.infer`` followed by WaveGlow and the denoiser, and writes one
    peak-normalised ``<key>.wav`` per utterance into ``args.output``. A batch
    that raises is reported on stdout and skipped so that the remaining
    batches are still processed. Metric logging is currently disabled; the
    stage timings are collected in ``measurements`` but not reported.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run)
    # The vocoder is taken straight from the checkpoint instead of going
    # through load_and_setup_model.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    waveglow = torch.load(args.waveglow)['model']
    denoiser = Denoiser(waveglow).cuda()

    data_loader = torch.utils.data.DataLoader(dataloader(args), 5,
                                              shuffle=False,
                                              collate_fn=collate_fn)

    use_cuda = torch.cuda.is_available()
    measurements = {}

    for i, data in enumerate(data_loader):
        try:
            sequences_padded, input_lengths, keys = data
            if use_cuda:
                sequences_padded = sequences_padded.cuda()
                input_lengths = input_lengths.cuda()
            sequences_padded = sequences_padded.long()
            input_lengths = input_lengths.long()

            with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
                _, mel, _, _, mel_lengths = tacotron2.infer(sequences_padded,
                                                            input_lengths)

            # Float conversion and denoising are included in this timing.
            with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
                audios = waveglow.infer(mel, sigma=args.sigma_infer)
                audios = audios.float()
                audios = denoiser(
                    audios, strength=args.denoising_strength).squeeze(1)

            for j, audio in enumerate(audios):
                # Cut to the generated length, then peak-normalise.
                audio = audio[:mel_lengths[j]*args.stft_hop_length]
                audio = audio/torch.max(torch.abs(audio))
                save_path = os.path.join(args.output, str(keys[j]) + '.wav')
                write(save_path, args.sampling_rate, audio.cpu().numpy())
        except Exception as err:
            # Best effort: keep going, but say which batch was lost and why.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print("Skipping batch %d: %r" % (i, err))