def export_onnx(parser, args):
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run)

    # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256        # value from waveglow upsample
    kernel_size = 1024  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) - 1) * stride + (kernel_size - 1) + 1
    # corresponds to cutoff in infer_onnx
    z_size2 = z_size2 - (kernel_size - stride)
    z_size2 = z_size2 // n_group
    z = torch.randn(1, n_group, z_size2, 1).cuda()

    if args.amp_run:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

    # export to ONNX
    convert_1d_to_2d_(waveglow)
    waveglow.forward = waveglow.infer_onnx
    if args.amp_run:
        waveglow.half()
    mel = mel.unsqueeze(3)
    torch.onnx.export(waveglow, (mel, z), args.output)
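# Worked example (not in the original script) of the z_size2 arithmetic above,
# using the dummy input of 620 mel frames: the transposed-conv output length is
# (620 - 1) * 256 + (1024 - 1) + 1 = 159488 samples; infer_onnx cuts off the
# trailing kernel_size - stride = 768 samples, and grouping by n_group = 8
# gives (159488 - 768) // 8 = 19840, the length used for the noise tensor z.
assert ((620 - 1) * 256 + (1024 - 1) + 1 - (1024 - 256)) // 8 == 19840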
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch FastPitch Inference Benchmark') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = args.log_file DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'}) model = load_and_setup_model('FastPitch', parser, None, args.amp_run, 'cuda', unk_args=[], forward_is_infer=True, ema=False, jitable=True) # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug in pytorch:20.02-py3 and 20.03 # model = torch.jit.script(model) warmup_iters = 3 iters = 1 gen_measures = MeasureTime() all_frames = 0 for i in range(-warmup_iters, iters): text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 128), dtype=torch.long).to('cuda') input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).to('cuda') durs = torch.ones_like(text_padded).mul_(4).to('cuda') with torch.no_grad(), gen_measures: mels, *_ = model(text_padded, input_lengths, dur_tgt=durs) num_frames = mels.size(0) * mels.size(2) if i >= 0: all_frames += num_frames DLLogger.log(step=(i, ), data={"latency": gen_measures[-1]}) DLLogger.log(step=(i, ), data={"frames/s": num_frames / gen_measures[-1]}) measures = gen_measures[warmup_iters:] DLLogger.log(step=(), data={'avg latency': np.mean(measures)}) DLLogger.log(step=(), data={'avg frames/s': all_frames / np.sum(measures)}) DLLogger.flush()
def export_onnx(parser, args):
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    fp16_run=args.fp16, cpu_run=False,
                                    forward_is_infer=False)

    # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) * stride) // n_group
    z = torch.randn(1, n_group, z_size2, 1).cuda()

    if args.fp16:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

    convert_1d_to_2d_(waveglow)
    mel = mel.unsqueeze(3)

    # export to ONNX
    if args.fp16:
        waveglow = waveglow.half()
    waveglow.forward = waveglow.infer_onnx

    opset_version = 11

    if os.path.isdir(args.output):
        output_path = os.path.join(args.output, "waveglow.onnx")
    else:
        output_path = args.output

    torch.onnx.export(waveglow, (mel, z), output_path,
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["mel", "z"],
                      output_names=["audio"],
                      dynamic_axes={"mel": {0: "batch_size", 2: "mel_seq"},
                                    "z": {0: "batch_size", 2: "z_seq"},
                                    "audio": {0: "batch_size", 1: "audio_seq"}})
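# Optional post-export check (an illustrative addition, not part of the
# original scripts): load an exported WaveGlow graph with onnxruntime and run
# dummy fp32 inputs through it to confirm the declared input names and dynamic
# axes. The input ranks mirror the export above (mel unsqueezed to 4-D);
# an fp16 export would need half-precision inputs and a CUDA provider instead.
def check_waveglow_onnx(onnx_path, mel_frames=620, n_group=8, stride=256):
    import numpy as np
    import onnxruntime as ort
    mel = np.random.randn(1, 80, mel_frames, 1).astype(np.float32)
    z = np.random.randn(1, n_group, (mel_frames * stride) // n_group,
                        1).astype(np.float32)
    sess = ort.InferenceSession(onnx_path)
    audio, = sess.run(["audio"], {"mel": mel, "z": z})
    print("ONNX audio shape:", audio.shape)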
def export_onnx(parser, args):
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, forward_is_infer=False)

    # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256        # value from waveglow upsample
    kernel_size = 1024  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) - 1) * stride + (kernel_size - 1) + 1
    # corresponds to cutoff in infer_onnx
    z_size2 = z_size2 - (kernel_size - stride)
    z_size2 = z_size2 // n_group
    z = torch.randn(1, n_group, z_size2, 1).cuda()

    if args.amp_run:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

    # export to ONNX
    convert_1d_to_2d_(waveglow)
    fType = types.MethodType
    waveglow.forward = fType(infer_onnx, waveglow)
    if args.amp_run:
        waveglow.half()
    mel = mel.unsqueeze(3)

    opset_version = 10

    torch.onnx.export(waveglow, (mel, z),
                      os.path.join(args.output, "waveglow.onnx"),
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["mel", "z"],
                      output_names=["audio"],
                      dynamic_axes={"mel": {0: "batch_size", 2: "mel_seq"},
                                    "z": {0: "batch_size", 2: "z_seq"},
                                    "audio": {0: "batch_size", 1: "audio_seq"}})
def export_onnx(parser, args):
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    fp16_run=args.fp16, cpu_run=False,
                                    forward_is_infer=False)

    # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) * stride) // n_group
    z = torch.randn(1, n_group, z_size2).cuda()

    if args.fp16:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

    # export to ONNX
    if args.fp16:
        waveglow = waveglow.half()

    fType = types.MethodType
    waveglow.forward = fType(infer_onnx, waveglow)

    opset_version = 11

    torch.onnx.export(waveglow, (mel, z),
                      os.path.join(args.output, "waveglow.onnx"),
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["mel", "z"],
                      output_names=["audio"],
                      dynamic_axes={"mel": {0: "batch_size", 2: "mel_seq"},
                                    "z": {0: "batch_size", 2: "z_seq"},
                                    "audio": {0: "batch_size", 1: "audio_seq"}})
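# Illustrative aside (not from the repository): types.MethodType, as used in
# the two variants above, binds a free function as an instance method, so
# torch.onnx.export traces infer_onnx through waveglow.forward without
# subclassing the model. The toy names below are placeholders.
import types

class _Greeter:
    pass

def _hello(self):
    return f"hello from {type(self).__name__}"

g = _Greeter()
g.greet = types.MethodType(_hello, g)  # bound: `self` is supplied automatically
assert g.greet() == "hello from _Greeter"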
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args = parser.parse_args()

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, forward_is_infer=True)

    jitted_tacotron2 = torch.jit.script(tacotron2)
    torch.jit.save(jitted_tacotron2, args.output)
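# A minimal usage sketch (illustrative, not part of the original script): the
# saved TorchScript module can be reloaded without the Python model definition
# and called like the original; since forward_is_infer=True above, forward()
# runs inference directly.
def load_jitted(path, device='cuda'):
    model = torch.jit.load(path, map_location=device)
    model.eval()
    return model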
def main():
    parser = argparse.ArgumentParser(
        description='Export models to TorchScript')
    parser = parse_args(parser)
    args = parser.parse_args()

    model = load_and_setup_model(args.generator_name, parser,
                                 args.generator_checkpoint, args.amp,
                                 device='cpu', forward_is_infer=True,
                                 polyak=False, jitable=True)

    torch.jit.save(torch.jit.script(model), args.output)
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 export to TRT')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     fp16_run=args.fp16, cpu_run=False)

    opset_version = 10

    sequences = torch.randint(low=0, high=148, size=(1, 50),
                              dtype=torch.long).cuda()
    sequence_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
    dummy_input = (sequences, sequence_lengths)

    encoder = Encoder(tacotron2)
    encoder.eval()
    with torch.no_grad():
        encoder(*dummy_input)

    torch.onnx.export(encoder, dummy_input,
                      os.path.join(args.output, "encoder.onnx"),
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["sequences", "sequence_lengths"],
                      output_names=["memory", "processed_memory", "lens"],
                      dynamic_axes={"sequences": {1: "text_seq"},
                                    "memory": {1: "mem_seq"},
                                    "processed_memory": {1: "mem_seq"}})

    decoder_iter = DecoderIter(tacotron2)
    memory = torch.randn((1, sequence_lengths[0], 512)).cuda()  # encoder outputs
    if args.fp16:
        memory = memory.half()
    memory_lengths = sequence_lengths

    # initialize decoder states for dummy_input
    decoder_input = tacotron2.decoder.get_go_frame(memory)
    mask = get_mask_from_lengths(memory_lengths)
    (attention_hidden,
     attention_cell,
     decoder_hidden,
     decoder_cell,
     attention_weights,
     attention_weights_cum,
     attention_context,
     processed_memory) = tacotron2.decoder.initialize_decoder_states(memory)

    dummy_input = (decoder_input, attention_hidden, attention_cell,
                   decoder_hidden, decoder_cell, attention_weights,
                   attention_weights_cum, attention_context, memory,
                   processed_memory, mask)

    decoder_iter.eval()
    with torch.no_grad():
        decoder_iter(*dummy_input)

    torch.onnx.export(decoder_iter, dummy_input,
                      os.path.join(args.output, "decoder_iter.onnx"),
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["decoder_input",
                                   "attention_hidden",
                                   "attention_cell",
                                   "decoder_hidden",
                                   "decoder_cell",
                                   "attention_weights",
                                   "attention_weights_cum",
                                   "attention_context",
                                   "memory",
                                   "processed_memory",
                                   "mask"],
                      output_names=["decoder_output",
                                    "gate_prediction",
                                    "out_attention_hidden",
                                    "out_attention_cell",
                                    "out_decoder_hidden",
                                    "out_decoder_cell",
                                    "out_attention_weights",
                                    "out_attention_weights_cum",
                                    "out_attention_context"],
                      dynamic_axes={"attention_weights": {1: "seq_len"},
                                    "attention_weights_cum": {1: "seq_len"},
                                    "memory": {1: "seq_len"},
                                    "processed_memory": {1: "seq_len"},
                                    "mask": {1: "seq_len"},
                                    "out_attention_weights": {1: "seq_len"},
                                    "out_attention_weights_cum": {1: "seq_len"}})

    postnet = Postnet(tacotron2)
    dummy_input = torch.randn((1, 80, 620)).cuda()
    if args.fp16:
        dummy_input = dummy_input.half()
    torch.onnx.export(postnet, dummy_input,
                      os.path.join(args.output, "postnet.onnx"),
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["mel_outputs"],
                      output_names=["mel_outputs_postnet"],
                      dynamic_axes={"mel_outputs": {2: "mel_seq"},
                                    "mel_outputs_postnet": {2: "mel_seq"}})

    mel = test_inference(encoder, decoder_iter, postnet)
    torch.save(mel, "mel.pt")
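# A possible next step (illustrative; the repository ships its own conversion
# scripts): the three ONNX files exported above can be turned into TensorRT
# engines, e.g. with trtexec. The dynamic axes declared above must be given
# concrete min/opt/max shapes at build time, along the lines of:
#
#   trtexec --onnx=encoder.onnx --saveEngine=encoder.engine --fp16 \
#           --minShapes=sequences:1x4 --optShapes=sequences:1x128 \
#           --maxShapes=sequences:1x256
#
# (the shape values here are placeholders, not tuned settings).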
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch TTS Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    if args.extract_pitch_char:
        assert args.extract_durations, "Durations required for pitch extraction"

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})

    model = load_and_setup_model(
        'Tacotron2', parser, args.tacotron2_checkpoint, amp=False,
        device=torch.device('cuda' if args.cuda else 'cpu'),
        forward_is_infer=False, ema=False)

    if args.train_mode:
        model.train()

    # n_mel_channels arg has been consumed by model's arg parser
    args.n_mel_channels = model.n_mel_channels

    for datum in ('mels', 'mels_teacher', 'attentions', 'durations',
                  'pitch_mel', 'pitch_char', 'pitch_trichar'):
        if getattr(args, f'extract_{datum}'):
            Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True)

    filenames = [Path(l.split('|')[0]).stem
                 for l in open(args.wav_text_filelist, 'r')]

    # Compatibility with Tacotron2 Data loader
    args.n_speakers = 1

    dataset = FilenamedLoader(filenames, args.dataset_path,
                              args.wav_text_filelist, args,
                              load_mel_from_disk=False)
    # TextMelCollate supports only n_frames_per_step=1
    data_loader = DataLoader(dataset, batch_size=args.batch_size,
                             shuffle=False, sampler=None, num_workers=0,
                             collate_fn=TextMelCollate(1), pin_memory=False,
                             drop_last=False)

    pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}}

    for i, batch in enumerate(data_loader):
        tik = time.time()
        fnames = batch[-1]
        x, _, _ = batch_to_gpu(batch[:-1])
        _, text_lens, mels_padded, _, mel_lens = x

        for j, mel in enumerate(mels_padded):
            fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt')
            torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        with torch.no_grad():
            out_mels, out_mels_postnet, _, alignments = model.forward(x)

        if args.extract_mels_teacher:
            for j, mel in enumerate(out_mels_postnet):
                fpath = Path(args.dataset_path, 'mels_teacher',
                             fnames[j] + '.pt')
                torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        if args.extract_attentions:
            for j, ali in enumerate(alignments):
                ali = ali[:mel_lens[j], :text_lens[j]]
                fpath = Path(args.dataset_path, 'attentions',
                             fnames[j] + '.pt')
                torch.save(ali.cpu(), fpath)

        durations = []
        if args.extract_durations:
            for j, ali in enumerate(alignments):
                text_len = text_lens[j]
                ali = ali[:mel_lens[j], :text_len]
                dur = torch.histc(torch.argmax(ali, dim=1), min=0,
                                  max=text_len - 1, bins=text_len)
                durations.append(dur)
                fpath = Path(args.dataset_path, 'durations',
                             fnames[j] + '.pt')
                torch.save(dur.cpu().int(), fpath)

        if (args.extract_pitch_mel or args.extract_pitch_char
                or args.extract_pitch_trichar):
            for j, dur in enumerate(durations):
                wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav')
                p_mel, p_char, p_trichar = calculate_pitch(
                    str(wav), dur.cpu().numpy())
                pitch_vecs['mel'][fnames[j]] = p_mel
                pitch_vecs['char'][fnames[j]] = p_char
                pitch_vecs['trichar'][fnames[j]] = p_trichar

        nseconds = time.time() - tik
        DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)',
                     data={})

    if args.extract_pitch_mel:
        normalize_pitch_vectors(pitch_vecs['mel'])
        for fname, pitch in pitch_vecs['mel'].items():
            fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    if args.extract_pitch_char:
        mean, std = normalize_pitch_vectors(pitch_vecs['char'])
        for fname, pitch in pitch_vecs['char'].items():
            fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)
        save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char',
                   mean, std)

    if args.extract_pitch_trichar:
        normalize_pitch_vectors(pitch_vecs['trichar'])
        for fname, pitch in pitch_vecs['trichar'].items():
            fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    DLLogger.flush()
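# Worked example (illustrative, not from the repository) of the duration
# extraction above: argmax over each attention row picks the text position
# attended at that mel frame, and histc then counts frames per position,
# yielding a per-character duration. The cast to float is for histc, which
# expects a floating-point input.
ali = torch.tensor([[0.9, 0.1],   # frame 0 attends char 0
                    [0.8, 0.2],   # frame 1 attends char 0
                    [0.3, 0.7]])  # frame 2 attends char 1
dur = torch.histc(torch.argmax(ali, dim=1).float(), bins=2, min=0, max=1)
assert dur.tolist() == [2.0, 1.0]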
def main():
    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt, True,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need the WaveGlow PyTorch
        # checkpoint anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT,
                          os.path.join(args.output, args.log_file)),
        StdOutBackend(Verbosity.VERBOSE)
    ])

    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except OSError:
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context,
                                               decoder_context,
                                               postnet_context,
                                               sequences, sequence_lengths,
                                               measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel,
                                    measurements, args.fp16)

    with encoder_context, decoder_context, postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios,
                              strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(args.output, "audio_" + str(i) + "_trt.wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.log(step=0, data={"tacotron2_encoder_latency":
                               measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency":
                               measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency":
                               measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency":
                               measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1) / latency
    log_data = ",".join(["1", str(sequence_lengths[0].item()), prec,
                         str(latency), str(throughput),
                         str(mel_lengths[0].item())]) + "\n"
    with open("log_bs1_" + prec + ".log", 'a') as f:
        f.write(log_data)
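# A minimal sketch (an assumption about the helper, not its exact source) of
# the load_engine call used above, built from the public TensorRT Python API:
# deserialize a previously built engine file into an ICudaEngine.
def load_engine(engine_filepath, trt_logger):
    with open(engine_filepath, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        return runtime.deserialize_cuda_engine(f.read())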
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU or CPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, unknown_args = parser.parse_known_args() DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) measurements_all = { "pre_processing": [], "tacotron2_latency": [], "waveglow_latency": [], "denoiser_latency": [], "latency": [], "type_conversion": [], "data_transfer": [], "storage": [], "tacotron2_items_per_sec": [], "waveglow_items_per_sec": [], "num_mels_per_audio": [], "throughput": [] } print("args:", args, unknown_args) tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.fp16, args.cpu, forward_is_infer=True) waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.fp16, args.cpu, forward_is_infer=True) denoiser = Denoiser(waveglow) if not args.cpu: denoiser.cuda() texts = [ "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves." ] texts = [texts[0][:args.input_length]] texts = texts * args.batch_size warmup_iters = 3 for iter in range(args.num_iters): measurements = {} with MeasureTime(measurements, "pre_processing", args.cpu): sequences_padded, input_lengths = prepare_input_sequence( texts, args.cpu) with torch.no_grad(): with MeasureTime(measurements, "latency", args.cpu): with MeasureTime(measurements, "tacotron2_latency", args.cpu): mel, mel_lengths, _ = tacotron2.infer( sequences_padded, input_lengths) with MeasureTime(measurements, "waveglow_latency", args.cpu): audios = waveglow.infer(mel, sigma=args.sigma_infer) num_mels = mel.size(0) * mel.size(2) num_samples = audios.size(0) * audios.size(1) with MeasureTime(measurements, "type_conversion", args.cpu): audios = audios.float() with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu): audios = denoiser( audios, strength=args.denoising_strength).squeeze(1) with MeasureTime(measurements, "data_transfer", args.cpu): audios = audios.cpu() with MeasureTime(measurements, "storage", args.cpu): audios = audios.numpy() for i, audio in enumerate(audios): audio_path = "audio_" + str(i) + ".wav" write(audio_path, args.sampling_rate, audio[:mel_lengths[i] * args.stft_hop_length]) measurements['tacotron2_items_per_sec'] = num_mels / measurements[ 'tacotron2_latency'] measurements['waveglow_items_per_sec'] = num_samples / measurements[ 'waveglow_latency'] measurements['num_mels_per_audio'] = mel.size(2) measurements['throughput'] = num_samples / measurements['latency'] if iter >= warmup_iters: for k, v in measurements.items(): measurements_all[k].append(v) DLLogger.log(step=(iter - warmup_iters), data={k: v}) DLLogger.flush() print_stats(measurements_all)
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = args.log_file DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) model = load_and_setup_model(args.model_name, parser, None, args.amp_run, forward_is_infer=True) if args.model_name == "Tacotron2": model = torch.jit.script(model) warmup_iters = 3 num_iters = 1 + warmup_iters for i in range(num_iters): measurements = {} if args.model_name == 'Tacotron2': text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 140), dtype=torch.long).cuda() input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).cuda().long() with torch.no_grad(), MeasureTime(measurements, "inference_time"): mels, _, _ = model(text_padded, input_lengths) num_items = mels.size(0) * mels.size(2) if args.model_name == 'WaveGlow': n_mel_channels = model.upsample.in_channels num_mels = 895 mel_padded = torch.zeros(args.batch_size, n_mel_channels, num_mels).normal_(-5.62, 1.98).cuda() if args.amp_run: mel_padded = mel_padded.half() with torch.no_grad(), MeasureTime(measurements, "inference_time"): audios = model(mel_padded) audios = audios.float() num_items = audios.size(0) * audios.size(1) if i >= warmup_iters: DLLogger.log(step=(i - warmup_iters, ), data={"latency": measurements['inference_time']}) DLLogger.log(step=(i - warmup_iters, ), data={ "items_per_sec": num_items / measurements['inference_time'] }) DLLogger.log(step=tuple(), data={'infer_latency': measurements['inference_time']}) DLLogger.log(step=tuple(), data={ 'infer_items_per_sec': num_items / measurements['inference_time'] }) DLLogger.flush()
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, unknown_args = parser.parse_known_args() DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) measurements_all = { "pre_processing": [], "tacotron2_encoder_time": [], "tacotron2_decoder_time": [], "tacotron2_postnet_time": [], "tacotron2_latency": [], "waveglow_latency": [], "latency": [], "type_conversion": [], "data_transfer": [], "storage": [], "tacotron2_items_per_sec": [], "waveglow_items_per_sec": [], "num_mels_per_audio": [], "throughput": [] } print("args:", args, unknown_args) torch.cuda.init() TRT_LOGGER = trt.Logger(trt.Logger.WARNING) encoder = load_engine(args.encoder, TRT_LOGGER) decoder_iter = load_engine(args.decoder, TRT_LOGGER) postnet = load_engine(args.postnet, TRT_LOGGER) waveglow = load_engine(args.waveglow, TRT_LOGGER) if args.waveglow_ckpt != "": # setup denoiser using WaveGlow PyTorch checkpoint waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt, fp16_run=args.fp16, cpu_run=False, forward_is_infer=True) denoiser = Denoiser(waveglow_ckpt).cuda() # after initialization, we don't need WaveGlow PyTorch checkpoint # anymore - deleting del waveglow_ckpt torch.cuda.empty_cache() # create TRT contexts for each engine encoder_context = encoder.create_execution_context() decoder_context = decoder_iter.create_execution_context() postnet_context = postnet.create_execution_context() waveglow_context = waveglow.create_execution_context() texts = [ "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves." 
] texts = [texts[0][:args.input_length]] texts = texts * args.batch_size warmup_iters = 3 for iter in range(args.num_iters): measurements = {} with MeasureTime(measurements, "pre_processing"): sequences_padded, input_lengths = prepare_input_sequence(texts) sequences_padded = sequences_padded.to(torch.int32) input_lengths = input_lengths.to(torch.int32) with torch.no_grad(): with MeasureTime(measurements, "latency"): with MeasureTime(measurements, "tacotron2_latency"): mel, mel_lengths = infer_tacotron2_trt( encoder, decoder_iter, postnet, encoder_context, decoder_context, postnet_context, sequences_padded, input_lengths, measurements, args.fp16) with MeasureTime(measurements, "waveglow_latency"): audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16) num_mels = mel.size(0) * mel.size(2) num_samples = audios.size(0) * audios.size(1) with MeasureTime(measurements, "type_conversion"): audios = audios.float() with MeasureTime(measurements, "data_transfer"): audios = audios.cpu() with MeasureTime(measurements, "storage"): audios = audios.numpy() for i, audio in enumerate(audios): audio_path = "audio_" + str(i) + ".wav" write(audio_path, args.sampling_rate, audio[:mel_lengths[i] * args.stft_hop_length]) measurements['tacotron2_items_per_sec'] = num_mels / measurements[ 'tacotron2_latency'] measurements['waveglow_items_per_sec'] = num_samples / measurements[ 'waveglow_latency'] measurements['num_mels_per_audio'] = mel.size(2) measurements['throughput'] = num_samples / measurements['latency'] if iter >= warmup_iters: for k, v in measurements.items(): if k in measurements_all.keys(): measurements_all[k].append(v) DLLogger.log(step=(iter - warmup_iters), data={k: v}) DLLogger.flush() print_stats(measurements_all)
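# A plausible sketch (an assumption; the repository ships its own version) of
# the print_stats helper used by the inference mains above: report mean and
# standard deviation for every collected measurement series.
def print_stats(measurements_all):
    for key, values in measurements_all.items():
        if len(values) > 0:
            print(f'{key}: mean {np.mean(values):.6f}, '
                  f'std {np.std(values):.6f}')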
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = args.log_file LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.register_metric("items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) log_hardware() log_args(args) model = load_and_setup_model(args.model_name, parser, None, args.amp_run) warmup_iters = 3 num_iters = 1 + warmup_iters for i in range(num_iters): if i >= warmup_iters: LOGGER.iteration_start() measurements = {} if args.model_name == 'Tacotron2': text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 140), dtype=torch.long).cuda() input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).cuda().long() with torch.no_grad(), MeasureTime(measurements, "inference_time"): mels, _ = model.infer(text_padded, input_lengths) num_items = mels.size(0) * mels.size(2) if args.model_name == 'WaveGlow': n_mel_channels = model.upsample.in_channels num_mels = 895 mel_padded = torch.zeros(args.batch_size, n_mel_channels, num_mels).normal_(-5.62, 1.98).cuda() if args.amp_run: mel_padded = mel_padded.half() with torch.no_grad(), MeasureTime(measurements, "inference_time"): audios = model.infer(mel_padded) audios = audios.float() num_items = audios.size(0) * audios.size(1) if i >= warmup_iters: LOGGER.log(key="items_per_sec", value=(num_items / measurements['inference_time'])) LOGGER.log(key="latency", value=measurements['inference_time']) LOGGER.iteration_stop() LOGGER.finish()
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = ("qa/baselines/" + args.model_name + "_inferbench_BS" + str(args.batch_size) + "_FP" + ("16" if args.fp16_run else "32") + "_DGX1_16GB_1GPU_single" + ".json") \ if args.create_benchmark else \ (args.model_name + "_infer_BS" + str(args.batch_size) + "_FP" + ("16" if args.fp16_run else "32") + \ "_DGX1_16GB_1GPU_single" + ".json") LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.register_metric("items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) log_hardware() log_args(args) # ## uncomment to generate new padded text # texts = [] # f = open('qa/ljs_text_train_subset_2500.txt', 'r') # texts = f.readlines() # sequence = [] # for i, text in enumerate(texts): # sequence.append(torch.IntTensor(text_to_sequence(text, ['english_cleaners']))) # text_padded, input_lengths = collate_text(sequence) # text_padded = torch.autograd.Variable(text_padded).cuda().long() # torch.save(text_padded, "qa/text_padded.pt") # torch.save(input_lengths, "qa/input_lengths.pt") model = load_and_setup_model(args.model_name, parser, None, args.fp16_run) dry_runs = 3 num_iters = (16 + dry_runs) if args.create_benchmark else (1 + dry_runs) for i in range(num_iters): ## skipping the first inference which is slower if i >= dry_runs: LOGGER.iteration_start() if args.model_name == 'Tacotron2': text_padded = torch.load(args.input_text) text_padded = text_padded[:args.batch_size] text_padded = torch.autograd.Variable(text_padded).cuda().long() t0 = time.time() with torch.no_grad(): _, mels, _, _ = model.infer(text_padded) t1 = time.time() inference_time = t1 - t0 num_items = text_padded.size(0) * text_padded.size(1) # # ## uncomment to generate new padded mels # torch.save(mels, "qa/mel_padded.pt") if args.model_name == 'WaveGlow': mel_padded = torch.load(args.input_mels) mel_padded = torch.cat( (mel_padded, mel_padded, mel_padded, mel_padded)) mel_padded = mel_padded[:args.batch_size] mel_padded = mel_padded.cuda() if args.fp16_run: mel_padded = mel_padded.half() t0 = time.time() with torch.no_grad(): audios = model.infer(mel_padded) audios = audios.float() t1 = time.time() inference_time = t1 - t0 num_items = audios.size(0) * audios.size(1) if i >= dry_runs: LOGGER.log(key="items_per_sec", value=(num_items / inference_time)) LOGGER.iteration_stop() LOGGER.finish()
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = os.path.join(args.output, args.log_file) DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) if args.synth_data: model = load_and_setup_model(args.model_name, parser, None, args.fp16, cpu_run=False, forward_is_infer=True) else: if not os.path.isfile(args.model): print(f"File {args.model} does not exist!") sys.exit(1) model = load_and_setup_model(args.model_name, parser, args.model, args.fp16, cpu_run=False, forward_is_infer=True) if args.model_name == "Tacotron2": model = torch.jit.script(model) warmup_iters = 3 num_iters = 1 + warmup_iters for i in range(num_iters): measurements = {} if args.model_name == 'Tacotron2': text_padded, input_lengths = gen_text(args.synth_data) with torch.no_grad(), MeasureTime(measurements, "inference_time"): mels, _, _ = model(text_padded, input_lengths) num_items = mels.size(0) * mels.size(2) if args.model_name == 'WaveGlow': n_mel_channels = model.upsample.in_channels mel_padded = gen_mel(args.synth_data, n_mel_channels, args.fp16) with torch.no_grad(), MeasureTime(measurements, "inference_time"): audios = model(mel_padded) audios = audios.float() num_items = audios.size(0) * audios.size(1) if i >= warmup_iters: DLLogger.log(step=(i - warmup_iters, ), data={"latency": measurements['inference_time']}) DLLogger.log(step=(i - warmup_iters, ), data={ "items_per_sec": num_items / measurements['inference_time'] }) DLLogger.log(step=tuple(), data={'infer_latency': measurements['inference_time']}) DLLogger.log(step=tuple(), data={ 'infer_items_per_sec': num_items / measurements['inference_time'] }) DLLogger.flush()