Example #1
def export_onnx(parser, args):

    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run)

    # 80 mel channels, 620 mel-spectrogram frames ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256  # value from waveglow upsample
    kernel_size = 1024  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) - 1) * stride + (kernel_size - 1) + 1
    # corresponds to cutoff in infer_onnx
    z_size2 = z_size2 - (kernel_size - stride)
    z_size2 = z_size2 // n_group
    z = torch.randn(1, n_group, z_size2, 1).cuda()

    if args.amp_run:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

        # export to ONNX
        convert_1d_to_2d_(waveglow)
        waveglow.forward = waveglow.infer_onnx
        if args.amp_run:
            waveglow.half()
        mel = mel.unsqueeze(3)
        torch.onnx.export(waveglow, (mel, z), args.output)
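
The z-length arithmetic above is the transposed-convolution output-length formula, L_out = (L_in - 1) * stride + kernel_size, minus the (kernel_size - stride) tail that infer_onnx cuts off, divided across the n_group latent channels. A worked check with the hard-coded values:

mel_len, stride, kernel_size, n_group = 620, 256, 1024, 8

upsampled = (mel_len - 1) * stride + (kernel_size - 1) + 1  # 159488 samples
cutoff = upsampled - (kernel_size - stride)                 # 158720 after trimming
z_len = cutoff // n_group                                   # 19840 steps per group

assert (upsampled, cutoff, z_len) == (159488, 158720, 19840)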
Example #2
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch FastPitch Inference Benchmark')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = args.log_file
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'})

    model = load_and_setup_model('FastPitch',
                                 parser,
                                 None,
                                 args.amp_run,
                                 'cuda',
                                 unk_args=[],
                                 forward_is_infer=True,
                                 ema=False,
                                 jitable=True)

    # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug in pytorch:20.02-py3 and 20.03
    # model = torch.jit.script(model)

    warmup_iters = 3
    iters = 1
    gen_measures = MeasureTime()
    all_frames = 0
    for i in range(-warmup_iters, iters):
        text_padded = torch.randint(low=0,
                                    high=148,
                                    size=(args.batch_size, 128),
                                    dtype=torch.long).to('cuda')
        input_lengths = torch.IntTensor([text_padded.size(1)] *
                                        args.batch_size).to('cuda')
        durs = torch.ones_like(text_padded).mul_(4).to('cuda')

        with torch.no_grad(), gen_measures:
            mels, *_ = model(text_padded, input_lengths, dur_tgt=durs)
        num_frames = mels.size(0) * mels.size(2)

        if i >= 0:
            all_frames += num_frames
            DLLogger.log(step=(i, ), data={"latency": gen_measures[-1]})
            DLLogger.log(step=(i, ),
                         data={"frames/s": num_frames / gen_measures[-1]})

    measures = gen_measures[warmup_iters:]
    DLLogger.log(step=(), data={'avg latency': np.mean(measures)})
    DLLogger.log(step=(), data={'avg frames/s': all_frames / np.sum(measures)})
    DLLogger.flush()
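
MeasureTime is imported from the repository's common utilities and is not shown in these snippets. A minimal stand-in matching the usage above (a list-like context manager that appends elapsed seconds, so gen_measures[-1] and gen_measures[warmup_iters:] work) could look like the sketch below; the CUDA synchronization is an assumption, added so asynchronous GPU kernels are included in the timing. Later examples use a dict-based variant, MeasureTime(measurements, key), built on the same idea.

import time

import torch

class MeasureTime(list):
    # Appends the elapsed wall-clock time of each `with` block.
    def __enter__(self):
        torch.cuda.synchronize()
        self.t0 = time.perf_counter()

    def __exit__(self, exc_type, exc_value, exc_tb):
        torch.cuda.synchronize()
        self.append(time.perf_counter() - self.t0)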
Example #3
def export_onnx(parser, args):

    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    fp16_run=args.fp16,
                                    cpu_run=False,
                                    forward_is_infer=False)

    # 80 mel channels, 620 mel-spectrogram frames ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) * stride) // n_group
    z = torch.randn(1, n_group, z_size2, 1).cuda()

    if args.fp16:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

        convert_1d_to_2d_(waveglow)
        mel = mel.unsqueeze(3)

        # export to ONNX
        if args.fp16:
            waveglow = waveglow.half()

        waveglow.forward = waveglow.infer_onnx

        opset_version = 11

        if os.path.isdir(args.output):
            output_path = os.path.join(args.output, "waveglow.onnx")
        else:
            output_path = args.output

        torch.onnx.export(waveglow, (mel, z),
                          output_path,
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["mel", "z"],
                          output_names=["audio"],
                          dynamic_axes={
                              "mel": {
                                  0: "batch_size",
                                  2: "mel_seq"
                              },
                              "z": {
                                  0: "batch_size",
                                  2: "z_seq"
                              },
                              "audio": {
                                  0: "batch_size",
                                  1: "audio_seq"
                              }
                          })
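
A quick smoke test of the exported graph with onnxruntime (assumed to be installed; it is not part of the original script), reusing the fp32 dummy shapes from the export above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("waveglow.onnx", providers=["CPUExecutionProvider"])
mel = np.random.randn(1, 80, 620, 1).astype(np.float32)          # matches mel.unsqueeze(3)
z = np.random.randn(1, 8, 620 * 256 // 8, 1).astype(np.float32)  # z_size2 = 19840
(audio,) = sess.run(["audio"], {"mel": mel, "z": z})
print(audio.shape)  # (1, audio_seq)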
Example #4
def export_onnx(parser, args):

    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.amp_run,
                                    forward_is_infer=False)

    # 80 mel channels, 620 mel-spectrogram frames ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256  # value from waveglow upsample
    kernel_size = 1024  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) - 1) * stride + (kernel_size - 1) + 1
    # corresponds to cutoff in infer_onnx
    z_size2 = z_size2 - (kernel_size - stride)
    z_size2 = z_size2 // n_group
    z = torch.randn(1, n_group, z_size2, 1).cuda()

    if args.amp_run:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

        # export to ONNX
        convert_1d_to_2d_(waveglow)

        fType = types.MethodType
        waveglow.forward = fType(infer_onnx, waveglow)

        if args.amp_run:
            waveglow.half()
        mel = mel.unsqueeze(3)

        opset_version = 10

        torch.onnx.export(waveglow, (mel, z),
                          args.output + "/" + "waveglow.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["mel", "z"],
                          output_names=["audio"],
                          dynamic_axes={
                              "mel": {
                                  0: "batch_size",
                                  2: "mel_seq"
                              },
                              "z": {
                                  0: "batch_size",
                                  2: "z_seq"
                              },
                              "audio": {
                                  0: "batch_size",
                                  1: "audio_seq"
                              }
                          })
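
Unlike Example #1, which reassigned the already-bound waveglow.infer_onnx method, this variant binds a module-level infer_onnx function to the instance with types.MethodType. The binding in isolation:

import types

class Net:
    pass

def infer_onnx(self, mel, z):
    # `self` is supplied automatically once the function is bound
    return (mel, z)

net = Net()
net.forward = types.MethodType(infer_onnx, net)  # bound method on this instance
assert net.forward(1, 2) == (1, 2)               # same as infer_onnx(net, 1, 2)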
Example #5
def export_onnx(parser, args):

    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    fp16_run=args.fp16,
                                    cpu_run=False,
                                    forward_is_infer=False)

    # 80 mel channels, 620 mel-spectrogram frames ~ 7 seconds of speech
    mel = torch.randn(1, 80, 620).cuda()
    stride = 256  # value from waveglow upsample
    n_group = 8
    z_size2 = (mel.size(2) * stride) // n_group
    z = torch.randn(1, n_group, z_size2).cuda()

    if args.fp16:
        mel = mel.half()
        z = z.half()
    with torch.no_grad():
        # run inference to force calculation of inverses
        waveglow.infer(mel, sigma=args.sigma_infer)

        # export to ONNX
        if args.fp16:
            waveglow = waveglow.half()

        fType = types.MethodType
        waveglow.forward = fType(infer_onnx, waveglow)

        opset_version = 11

        torch.onnx.export(waveglow, (mel, z),
                          args.output + "/" + "waveglow.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["mel", "z"],
                          output_names=["audio"],
                          dynamic_axes={
                              "mel": {
                                  0: "batch_size",
                                  2: "mel_seq"
                              },
                              "z": {
                                  0: "batch_size",
                                  2: "z_seq"
                              },
                              "audio": {
                                  0: "batch_size",
                                  1: "audio_seq"
                              }
                          })
Example #6
def main():

    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args = parser.parse_args()

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, forward_is_infer=True)
    
    jitted_tacotron2 = torch.jit.script(tacotron2)

    torch.jit.save(jitted_tacotron2, args.output)
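
The saved TorchScript archive can later be loaded without the Python class definition. A minimal consumer sketch (the file name stands in for whatever was passed as --output; the random token ids mirror the dummy inputs used in the export examples):

import torch

tacotron2 = torch.jit.load("tacotron2_jit.pt", map_location="cuda")
tacotron2.eval()

sequences = torch.randint(low=0, high=148, size=(1, 50), dtype=torch.long).cuda()
lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
with torch.no_grad():
    mel, mel_lengths, alignments = tacotron2(sequences, lengths)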
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Export models to TorchScript')
    parser = parse_args(parser)
    args = parser.parse_args()

    model = load_and_setup_model(args.generator_name,
                                 parser,
                                 args.generator_checkpoint,
                                 args.amp,
                                 device='cpu',
                                 forward_is_infer=True,
                                 polyak=False,
                                 jitable=True)

    torch.jit.save(torch.jit.script(model), args.output)
Example #8
def main():

    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 export to TRT')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     fp16_run=args.fp16, cpu_run=False)

    opset_version = 10

    sequences = torch.randint(low=0, high=148, size=(1, 50),
                              dtype=torch.long).cuda()
    sequence_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
    dummy_input = (sequences, sequence_lengths)

    encoder = Encoder(tacotron2)
    encoder.eval()
    with torch.no_grad():
        encoder(*dummy_input)

    torch.onnx.export(encoder, dummy_input, args.output+"/"+"encoder.onnx",
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["sequences", "sequence_lengths"],
                      output_names=["memory", "processed_memory", "lens"],
                      dynamic_axes={"sequences": {1: "text_seq"},
                                    "memory": {1: "mem_seq"},
                                    "processed_memory": {1: "mem_seq"}
                      })

    decoder_iter = DecoderIter(tacotron2)
    memory = torch.randn((1, sequence_lengths[0], 512)).cuda()  # encoder_outputs
    if args.fp16:
        memory = memory.half()
    memory_lengths = sequence_lengths
    # initialize decoder states for dummy_input
    decoder_input = tacotron2.decoder.get_go_frame(memory)
    mask = get_mask_from_lengths(memory_lengths)
    (attention_hidden,
     attention_cell,
     decoder_hidden,
     decoder_cell,
     attention_weights,
     attention_weights_cum,
     attention_context,
     processed_memory) = tacotron2.decoder.initialize_decoder_states(memory)
    dummy_input = (decoder_input,
                   attention_hidden,
                   attention_cell,
                   decoder_hidden,
                   decoder_cell,
                   attention_weights,
                   attention_weights_cum,
                   attention_context,
                   memory,
                   processed_memory,
                   mask)

    decoder_iter.eval()
    with torch.no_grad():
        decoder_iter(*dummy_input)

    torch.onnx.export(decoder_iter, dummy_input, args.output+"/"+"decoder_iter.onnx",
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["decoder_input",
                                   "attention_hidden",
                                   "attention_cell",
                                   "decoder_hidden",
                                   "decoder_cell",
                                   "attention_weights",
                                   "attention_weights_cum",
                                   "attention_context",
                                   "memory",
                                   "processed_memory",
                                   "mask"],
                      output_names=["decoder_output",
                                    "gate_prediction",
                                    "out_attention_hidden",
                                    "out_attention_cell",
                                    "out_decoder_hidden",
                                    "out_decoder_cell",
                                    "out_attention_weights",
                                    "out_attention_weights_cum",
                                    "out_attention_context"],
                      dynamic_axes={"attention_weights" : {1: "seq_len"},
                                    "attention_weights_cum" : {1: "seq_len"},
                                    "memory" : {1: "seq_len"},
                                    "processed_memory" : {1: "seq_len"},
                                    "mask" : {1: "seq_len"},
                                    "out_attention_weights" : {1: "seq_len"},
                                    "out_attention_weights_cum" : {1: "seq_len"}
                      })

    postnet = Postnet(tacotron2)
    dummy_input = torch.randn((1,80,620)).cuda()
    if args.fp16:
        dummy_input = dummy_input.half()
    torch.onnx.export(postnet, dummy_input, args.output+"/"+"postnet.onnx",
                      opset_version=opset_version,
                      do_constant_folding=True,
                      input_names=["mel_outputs"],
                      output_names=["mel_outputs_postnet"],
                      dynamic_axes={"mel_outputs": {2: "mel_seq"},
                                    "mel_outputs_postnet": {2: "mel_seq"}})

    mel = test_inference(encoder, decoder_iter, postnet)
    torch.save(mel, "mel.pt")
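
The encoder, decoder_iter and postnet ONNX files produced here are meant to be compiled into TensorRT engines. A minimal builder sketch against the TensorRT 7-era Python API (the repository ships its own conversion scripts, so this is illustrative only):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path, fp16=False):
    # Parse the ONNX graph and build a CUDA engine from it.
    builder = trt.Builder(TRT_LOGGER)
    flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(flags)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            raise RuntimeError(parser.get_error(0))
    builder.max_workspace_size = 1 << 30  # 1 GiB of build scratch space
    builder.fp16_mode = fp16
    return builder.build_cuda_engine(network)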
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch TTS Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    if args.extract_pitch_char:
        assert args.extract_durations, "Durations required for pitch extraction"

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})

    model = load_and_setup_model(
        'Tacotron2',
        parser,
        args.tacotron2_checkpoint,
        amp=False,
        device=torch.device('cuda' if args.cuda else 'cpu'),
        forward_is_infer=False,
        ema=False)

    if args.train_mode:
        model.train()

    # n_mel_channels arg has been consumed by model's arg parser
    args.n_mel_channels = model.n_mel_channels

    for datum in ('mels', 'mels_teacher', 'attentions', 'durations',
                  'pitch_mel', 'pitch_char', 'pitch_trichar'):
        if getattr(args, f'extract_{datum}'):
            Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True)

    with open(args.wav_text_filelist, 'r') as f:
        filenames = [Path(line.split('|')[0]).stem for line in f]
    # Compatibility with Tacotron2 Data loader
    args.n_speakers = 1
    dataset = FilenamedLoader(filenames,
                              args.dataset_path,
                              args.wav_text_filelist,
                              args,
                              load_mel_from_disk=False)
    # TextMelCollate supports only n_frames_per_step=1
    data_loader = DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             sampler=None,
                             num_workers=0,
                             collate_fn=TextMelCollate(1),
                             pin_memory=False,
                             drop_last=False)
    pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}}
    for i, batch in enumerate(data_loader):
        tik = time.time()
        fnames = batch[-1]
        x, _, _ = batch_to_gpu(batch[:-1])
        _, text_lens, mels_padded, _, mel_lens = x

        for j, mel in enumerate(mels_padded):
            fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt')
            torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        with torch.no_grad():
            out_mels, out_mels_postnet, _, alignments = model.forward(x)

        if args.extract_mels_teacher:
            for j, mel in enumerate(out_mels_postnet):
                fpath = Path(args.dataset_path, 'mels_teacher',
                             fnames[j] + '.pt')
                torch.save(mel[:, :mel_lens[j]].cpu(), fpath)
        if args.extract_attentions:
            for j, ali in enumerate(alignments):
                ali = ali[:mel_lens[j], :text_lens[j]]
                fpath = Path(args.dataset_path, 'attentions',
                             fnames[j] + '.pt')
                torch.save(ali.cpu(), fpath)
        durations = []
        if args.extract_durations:
            for j, ali in enumerate(alignments):
                text_len = text_lens[j]
                ali = ali[:mel_lens[j], :text_len]
                dur = torch.histc(torch.argmax(ali, dim=1),
                                  min=0,
                                  max=text_len - 1,
                                  bins=text_len)
                durations.append(dur)
                fpath = Path(args.dataset_path, 'durations', fnames[j] + '.pt')
                torch.save(dur.cpu().int(), fpath)
        if args.extract_pitch_mel or args.extract_pitch_char or args.extract_pitch_trichar:
            for j, dur in enumerate(durations):
                wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav')
                p_mel, p_char, p_trichar = calculate_pitch(
                    str(wav),
                    dur.cpu().numpy())
                pitch_vecs['mel'][fnames[j]] = p_mel
                pitch_vecs['char'][fnames[j]] = p_char
                pitch_vecs['trichar'][fnames[j]] = p_trichar

        nseconds = time.time() - tik
        DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)',
                     data={})

    if args.extract_pitch_mel:
        normalize_pitch_vectors(pitch_vecs['mel'])
        for fname, pitch in pitch_vecs['mel'].items():
            fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    if args.extract_pitch_char:
        mean, std = normalize_pitch_vectors(pitch_vecs['char'])
        for fname, pitch in pitch_vecs['char'].items():
            fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)
        save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char',
                   mean, std)

    if args.extract_pitch_trichar:
        normalize_pitch_vectors(pitch_vecs['trichar'])
        for fname, pitch in pitch_vecs['trichar'].items():
            fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    DLLogger.flush()
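
The duration extraction above uses torch.histc to count, per input token, how many mel frames place their attention argmax on that token. A toy check (the cast to float is deliberate: histc requires a floating tensor on CPU):

import torch

# 6 mel frames attending over 3 text tokens.
ali = torch.tensor([[0.9, 0.1, 0.0],
                    [0.8, 0.1, 0.1],
                    [0.1, 0.8, 0.1],
                    [0.1, 0.7, 0.2],
                    [0.2, 0.7, 0.1],
                    [0.1, 0.2, 0.7]])
text_len = 3
dur = torch.histc(torch.argmax(ali, dim=1).float(),
                  min=0, max=text_len - 1, bins=text_len)
print(dur)  # tensor([2., 3., 1.]) -> frames per token, summing to the mel length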
Example #10
def main():

    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
                                             True, forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
                                              args.output+'/'+args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])

    texts = []
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print(f"Could not read file {args.input}")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context, decoder_context, postnet_context,
                                               sequences, sequence_lengths, measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)

    with encoder_context, decoder_context, postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(args.output, "audio_" + str(i) + "_trt.wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())


    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1) / latency
    log_data = (f"1,{sequence_lengths[0].item()},{prec},{latency},"
                f"{throughput},{mel_lengths[0].item()}\n")
    with open(f"log_bs1_{prec}.log", 'a') as f:
        f.write(log_data)
Example #11
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {
        "pre_processing": [],
        "tacotron2_latency": [],
        "waveglow_latency": [],
        "denoiser_latency": [],
        "latency": [],
        "type_conversion": [],
        "data_transfer": [],
        "storage": [],
        "tacotron2_items_per_sec": [],
        "waveglow_items_per_sec": [],
        "num_mels_per_audio": [],
        "throughput": []
    }

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2',
                                     parser,
                                     args.tacotron2,
                                     args.fp16,
                                     args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.fp16,
                                    args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    texts = [
        "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    ]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu):
            sequences_padded, input_lengths = prepare_input_sequence(
                texts, args.cpu)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu):
                    mel, mel_lengths, _ = tacotron2.infer(
                        sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)

                num_mels = mel.size(0) * mel.size(2)
                num_samples = audios.size(0) * audios.size(1)

                with MeasureTime(measurements, "type_conversion", args.cpu):
                    audios = audios.float()

                with torch.no_grad(), MeasureTime(measurements,
                                                  "denoiser_latency",
                                                  args.cpu):
                    audios = denoiser(
                        audios, strength=args.denoising_strength).squeeze(1)

        with MeasureTime(measurements, "data_transfer", args.cpu):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels / measurements[
            'tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples / measurements[
            'waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if iter >= warmup_iters:
            for k, v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(iter - warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
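
prepare_input_sequence comes from the repository's inference utilities. A stand-in with the observed interface, i.e. raw texts in, padded LongTensor batch and lengths out (the text_to_sequence import path and the sort-by-length behavior are assumptions):

import torch
from tacotron2.text import text_to_sequence  # import path assumed

def prepare_input_sequence(texts, cpu_run=False):
    # Encode each text, sort by length (descending), and zero-pad the batch.
    device = 'cpu' if cpu_run else 'cuda'
    seqs = sorted((torch.LongTensor(text_to_sequence(t, ['english_cleaners']))
                   for t in texts), key=len, reverse=True)
    lengths = torch.LongTensor([len(s) for s in seqs])
    padded = torch.zeros(len(seqs), int(lengths.max()), dtype=torch.long)
    for i, s in enumerate(seqs):
        padded[i, :len(s)] = s
    return padded.to(device), lengths.to(device)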
Example #12
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    model = load_and_setup_model(args.model_name,
                                 parser,
                                 None,
                                 args.amp_run,
                                 forward_is_infer=True)

    if args.model_name == "Tacotron2":
        model = torch.jit.script(model)

    warmup_iters = 3
    num_iters = 1 + warmup_iters

    for i in range(num_iters):

        measurements = {}

        if args.model_name == 'Tacotron2':
            text_padded = torch.randint(low=0,
                                        high=148,
                                        size=(args.batch_size, 140),
                                        dtype=torch.long).cuda()
            input_lengths = torch.IntTensor([text_padded.size(1)] *
                                            args.batch_size).cuda().long()
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _, _ = model(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)

        if args.model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            num_mels = 895
            mel_padded = torch.zeros(args.batch_size, n_mel_channels,
                                     num_mels).normal_(-5.62, 1.98).cuda()
            if args.amp_run:
                mel_padded = mel_padded.half()

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if i >= warmup_iters:
            DLLogger.log(step=(i - warmup_iters, ),
                         data={"latency": measurements['inference_time']})
            DLLogger.log(step=(i - warmup_iters, ),
                         data={
                             "items_per_sec":
                             num_items / measurements['inference_time']
                         })

    DLLogger.log(step=tuple(),
                 data={'infer_latency': measurements['inference_time']})
    DLLogger.log(step=tuple(),
                 data={
                     'infer_items_per_sec':
                     num_items / measurements['inference_time']
                 })

    DLLogger.flush()
Example #13
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {
        "pre_processing": [],
        "tacotron2_encoder_time": [],
        "tacotron2_decoder_time": [],
        "tacotron2_postnet_time": [],
        "tacotron2_latency": [],
        "waveglow_latency": [],
        "latency": [],
        "type_conversion": [],
        "data_transfer": [],
        "storage": [],
        "tacotron2_items_per_sec": [],
        "waveglow_items_per_sec": [],
        "num_mels_per_audio": [],
        "throughput": []
    }

    print("args:", args, unknown_args)

    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow',
                                             parser,
                                             args.waveglow_ckpt,
                                             fp16_run=args.fp16,
                                             cpu_run=False,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    texts = [
        "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    ]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing"):
            sequences_padded, input_lengths = prepare_input_sequence(texts)
            sequences_padded = sequences_padded.to(torch.int32)
            input_lengths = input_lengths.to(torch.int32)

        with torch.no_grad():
            with MeasureTime(measurements, "latency"):
                with MeasureTime(measurements, "tacotron2_latency"):
                    mel, mel_lengths = infer_tacotron2_trt(
                        encoder, decoder_iter, postnet, encoder_context,
                        decoder_context, postnet_context, sequences_padded,
                        input_lengths, measurements, args.fp16)

                with MeasureTime(measurements, "waveglow_latency"):
                    audios = infer_waveglow_trt(waveglow, waveglow_context,
                                                mel, measurements, args.fp16)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion"):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer"):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage"):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels / measurements[
            'tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples / measurements[
            'waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if iter >= warmup_iters:
            for k, v in measurements.items():
                if k in measurements_all.keys():
                    measurements_all[k].append(v)
                    DLLogger.log(step=(iter - warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
Example #14
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = args.log_file

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    model = load_and_setup_model(args.model_name, parser, None, args.amp_run)

    warmup_iters = 3
    num_iters = 1 + warmup_iters

    for i in range(num_iters):
        if i >= warmup_iters:
            LOGGER.iteration_start()

        measurements = {}

        if args.model_name == 'Tacotron2':
            text_padded = torch.randint(low=0,
                                        high=148,
                                        size=(args.batch_size, 140),
                                        dtype=torch.long).cuda()
            input_lengths = torch.IntTensor([text_padded.size(1)] *
                                            args.batch_size).cuda().long()
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _ = model.infer(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)

        if args.model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            num_mels = 895
            mel_padded = torch.zeros(args.batch_size, n_mel_channels,
                                     num_mels).normal_(-5.62, 1.98).cuda()
            if args.amp_run:
                mel_padded = mel_padded.half()

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model.infer(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if i >= warmup_iters:
            LOGGER.log(key="items_per_sec",
                       value=(num_items / measurements['inference_time']))
            LOGGER.log(key="latency", value=measurements['inference_time'])
            LOGGER.iteration_stop()

    LOGGER.finish()
Example #15
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = ("qa/baselines/" + args.model_name + "_inferbench_BS" + str(args.batch_size) + "_FP" + ("16" if args.fp16_run else "32") +
                "_DGX1_16GB_1GPU_single" + ".json") \
                if args.create_benchmark else \
                   (args.model_name + "_infer_BS" + str(args.batch_size) + "_FP" + ("16" if args.fp16_run else "32") + \
                   "_DGX1_16GB_1GPU_single" + ".json")

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    # ## uncomment to generate new padded text
    # texts = []
    # f = open('qa/ljs_text_train_subset_2500.txt', 'r')
    # texts = f.readlines()
    # sequence = []
    # for i, text in enumerate(texts):
    #     sequence.append(torch.IntTensor(text_to_sequence(text, ['english_cleaners'])))

    # text_padded, input_lengths = collate_text(sequence)
    # text_padded = torch.autograd.Variable(text_padded).cuda().long()
    # torch.save(text_padded, "qa/text_padded.pt")
    # torch.save(input_lengths, "qa/input_lengths.pt")

    model = load_and_setup_model(args.model_name, parser, None, args.fp16_run)

    dry_runs = 3
    num_iters = (16 + dry_runs) if args.create_benchmark else (1 + dry_runs)

    for i in range(num_iters):
        ## skipping the first inference which is slower
        if i >= dry_runs:
            LOGGER.iteration_start()

        if args.model_name == 'Tacotron2':
            text_padded = torch.load(args.input_text)
            text_padded = text_padded[:args.batch_size]
            text_padded = torch.autograd.Variable(text_padded).cuda().long()

            t0 = time.time()
            with torch.no_grad():
                _, mels, _, _ = model.infer(text_padded)
            t1 = time.time()
            inference_time = t1 - t0
            num_items = text_padded.size(0) * text_padded.size(1)

            # # ## uncomment to generate new padded mels
            # torch.save(mels, "qa/mel_padded.pt")

        if args.model_name == 'WaveGlow':
            mel_padded = torch.load(args.input_mels)
            mel_padded = torch.cat(
                (mel_padded, mel_padded, mel_padded, mel_padded))
            mel_padded = mel_padded[:args.batch_size]
            mel_padded = mel_padded.cuda()

            if args.fp16_run:
                mel_padded = mel_padded.half()

            t0 = time.time()
            with torch.no_grad():
                audios = model.infer(mel_padded)
                audios = audios.float()
            t1 = time.time()
            inference_time = t1 - t0
            num_items = audios.size(0) * audios.size(1)

        if i >= dry_runs:
            LOGGER.log(key="items_per_sec", value=(num_items / inference_time))
            LOGGER.iteration_stop()

    LOGGER.finish()
Example #16
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    if args.synth_data:
        model = load_and_setup_model(args.model_name,
                                     parser,
                                     None,
                                     args.fp16,
                                     cpu_run=False,
                                     forward_is_infer=True)
    else:
        if not os.path.isfile(args.model):
            print(f"File {args.model} does not exist!")
            sys.exit(1)
        model = load_and_setup_model(args.model_name,
                                     parser,
                                     args.model,
                                     args.fp16,
                                     cpu_run=False,
                                     forward_is_infer=True)

    if args.model_name == "Tacotron2":
        model = torch.jit.script(model)

    warmup_iters = 3
    num_iters = 1 + warmup_iters

    for i in range(num_iters):

        measurements = {}

        if args.model_name == 'Tacotron2':
            text_padded, input_lengths = gen_text(args.synth_data)

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _, _ = model(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)

        if args.model_name == 'WaveGlow':

            n_mel_channels = model.upsample.in_channels
            mel_padded = gen_mel(args.synth_data, n_mel_channels, args.fp16)

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if i >= warmup_iters:
            DLLogger.log(step=(i - warmup_iters, ),
                         data={"latency": measurements['inference_time']})
            DLLogger.log(step=(i - warmup_iters, ),
                         data={
                             "items_per_sec":
                             num_items / measurements['inference_time']
                         })

    DLLogger.log(step=tuple(),
                 data={'infer_latency': measurements['inference_time']})
    DLLogger.log(step=tuple(),
                 data={
                     'infer_items_per_sec':
                     num_items / measurements['inference_time']
                 })

    DLLogger.flush()
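
gen_text and gen_mel are repository helpers not shown here. For the synthetic-data path their behavior can be approximated with the same random inputs used in the earlier benchmark examples (the shapes, the token range, and the normal_(-5.62, 1.98) statistics are taken from those examples; the batch_size default is an assumption):

import torch

def gen_text(use_synthetic, batch_size=1):
    # Synthetic token-id batch, mirroring the randint inputs used above.
    text_padded = torch.randint(low=0, high=148, size=(batch_size, 140),
                                dtype=torch.long).cuda()
    input_lengths = torch.IntTensor([text_padded.size(1)] *
                                    batch_size).cuda().long()
    return text_padded, input_lengths

def gen_mel(use_synthetic, n_mel_channels, fp16, batch_size=1, num_mels=895):
    # Synthetic mel batch with the statistics used in the earlier benchmarks.
    mel = torch.zeros(batch_size, n_mel_channels,
                      num_mels).normal_(-5.62, 1.98).cuda()
    return mel.half() if fp16 else mel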