    def train(self):
        try:
            with torch.autograd.profiler.emit_nvtx(
                    enabled=self.pyprof_enabled):
                for i in range(self.step + 1, self.final_steps + 1):
                    self.step = i
                    tprint(
                        "------------- TRAIN step : {} -------------".format(
                            i))

                    if self.nvprof_iter_start and i == self.nvprof_iter_start:
                        profiler.start()

                    with Nvtx("step #{}".format(self.step)):
                        loss, meta = self.do_step()

                    if self.nvprof_iter_end and i == self.nvprof_iter_end:
                        profiler.stop()

                    if self.lr_scheduler:
                        for param_group in self.optimizer.param_groups:
                            tprint("lr: {:06f}".format(param_group['lr']))
                        self.lr_scheduler.step(self.step)

                    if self.step % self.log_steps == 0:
                        self.log(loss, meta)

                    if self.ckpt_path and self.save_steps and i % self.save_steps == 0:
                        self.save()

            tprint("Training has been done.")
        except StopIteration:  # done by n_epochs
            tprint("Training has been done. (by n_epochs)")
        except KeyboardInterrupt:
            tprint("Training has been canceled.")
    def __init__(self,
                 ckpt_file,
                 device='cuda',
                 use_fp16=False,
                 use_denoiser=False):
        self.ckpt_file = ckpt_file
        self.device = device
        self.use_fp16 = use_fp16
        self.use_denoiser = use_denoiser

        # model
        sys.path.append('waveglow')
        self.model = torch.load(self.ckpt_file,
                                map_location=self.device)['model']
        self.model = self.model.remove_weightnorm(self.model)
        self.model.eval()
        self.model = to_device_async(self.model, self.device)
        if self.use_fp16:
            self.model = self.model.half()

        if self.use_denoiser:
            self.denoiser = Denoiser(self.model, device=device)
            self.denoiser = to_device_async(self.denoiser, self.device)

            tprint('Using WaveGlow denoiser.')
Example #3
def preprocess_mel(hparam="base.yaml", **kwargs):
    """The script for preprocessing mel-spectrograms from the dataset.

    By default, this script loads parameters from the default config file, fastspeech/hparams/base.yaml.

    Besides the flags, you can also override parameters in the config file via the command line. For example:
    --dataset_path=DATASET_PATH
        Path to dataset directory.
    --mels_path=MELS_PATH
        Path to output preprocessed mels directory.

    Refer to fastspeech/hparams/base.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "base.yaml".
    """

    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))
    
    pathlib.Path(hp.mels_path).mkdir(parents=True, exist_ok=True)

    dataset = LJSpeechDataset(hp.dataset_path, mels_path=None)

    for data in tqdm(dataset):
        name = data["name"]
        mel = data["mel"]

        save_path = os.path.join(hp.mels_path, name + ".mel.npy")

        if os.path.exists(save_path):
            continue

        # print(name, mel)
        np.save(save_path, mel)
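
# A hedged usage sketch: preprocess_mel() takes keyword overrides that correspond to the
# --flag=VALUE options in the docstring above; the paths below are placeholders.
#
#   preprocess_mel(hparam="base.yaml",
#                  dataset_path="/path/to/LJSpeech-1.1",
#                  mels_path="/path/to/mels")
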
def verify(hparam="trt.yaml", text=SAMPLE_TEXT, **kwargs):
    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))
    tprint("Device count: {}".format(torch.cuda.device_count()))

    outs_trt, acts_trt = infer_trt(text)
    outs, acts = infer_pytorch(text)

    both, pytorch, trt = join_dict(acts, acts_trt)

    # print diff
    print("## Diff ##\n\n")
    for name, (act, act_trt) in both.items():
        act = act.float()
        act_trt = act_trt.float()
        diff = act.reshape(-1) - act_trt.reshape(-1)
        is_identical = diff.eq(0).all()
        errors = diff[diff.ne(0)]
        max_error = torch.max(torch.abs(errors)) if len(errors) > 0 else 0
        print(
            "# {} #\n\n[PyTorch]\n{}\n\n[TRT]: \n{}\n\n[Diff]: \n{}\n\n[Errors]: \n{}\n- identical? {}\n- {} errors out of {}\n- max: {}\n\n"
            .format(
                name,
                act,
                act_trt,
                diff,
                errors,
                is_identical,
                len(errors),
                len(diff),
                max_error,
            ))
Example #5
    def __exit__(self, *exc_info):
        if self.device == 'cuda' and self.cuda_sync:
            torch.cuda.synchronize()
        self.end_time = time.time()
        self.time_elapsed = self.end_time - self.start_time
        tprint(("[{}] Time elapsed: {" + self.format + "}").format(
            self.name, self.time_elapsed))
Example #6
    def end(self):
        if not hasattr(self, "start_time"):
            return
        if self.device == 'cuda' and self.cuda_sync:
            torch.cuda.synchronize()
        self.end_time = time.time()
        self.time_elapsed = self.end_time - self.start_time
        tprint(("[{}] Time elapsed: {" + self.format + "}").format(self.name, self.time_elapsed))
Example #7
    def save(self):
        state_dict = {
            'step': self.step,
            'model': self.model.state_dict(),
            'optim': self.optimizer.state_dict(),
        }
        torch.save(state_dict,
                   self.ckpt_path + '/checkpoint_{:06d}.pt'.format(self.step))

        tprint('[Save] Model "{}". Step={}.'.format(self.model_name,
                                                    self.step))
    def build_engine(self):
        # load engines and create contexts
        self.engine_list = []
        self.context_list = []
        for i, (trt_max_input_seq_len, trt_max_output_seq_len,
                trt_file_path) in enumerate(
                    self.max_seq_lens_and_file_path_list):
            if trt_file_path and os.path.isfile(
                    trt_file_path) and not self.trt_force_build:
                with open(trt_file_path, 'rb') as f:
                    engine_str = f.read()
                with trt.Runtime(TRT_LOGGER) as runtime:
                    engine = runtime.deserialize_cuda_engine(engine_str)
                tprint('TRT Engine Loaded from {} successfully.'.format(
                    trt_file_path))
            else:
                self.trt_max_input_seq_len = trt_max_input_seq_len
                self.trt_max_output_seq_len = trt_max_output_seq_len
                self.trt_file_path = trt_file_path

                tprint('Building a TRT Engine..')
                engine = self.do_build_engine()
                tprint('TRT Engine Built.')

                with open(self.trt_file_path, 'wb') as f:
                    f.write(engine.serialize())
                tprint('TRT Engine Saved in {}.'.format(self.trt_file_path))

            self.engine_list.append(engine)
Example #9
    def load(self, load_optim=True):
        files_exist = glob.glob(os.path.join(self.ckpt_path, '*'))
        if files_exist:
            # load the latest created file.
            latest_file = max(files_exist, key=os.path.getctime)
            state_dict = torch.load(latest_file)

            self.step = state_dict['step']
            self.model.load_state_dict(state_dict['model'])
            if load_optim:
                self.optimizer.load_state_dict(state_dict['optim'])

            tprint('[Load] Checkpoint \'{}\'. Step={}'.format(
                latest_file, self.step))
        else:
            tprint('No checkpoints in {}. Load skipped.'.format(
                self.ckpt_path))
    def load(self, ckpt_file):
        # load latest checkpoint file if not defined.
        if not ckpt_file:
            files_exist = glob.glob(os.path.join(self.ckpt_path, '*'))
            if files_exist:
                ckpt_file = max(files_exist, key=os.path.getctime)

        if ckpt_file:
            state_dict = torch.load(ckpt_file, map_location=self.device)

            self.step = state_dict['step']
            self.model.load_state_dict(state_dict['model'])

            tprint('[Load] Checkpoint \'{}\'. Step={}'.format(ckpt_file, self.step))
        else:
            tprint('No checkpoints in {}. Load skipped.'.format(self.ckpt_path))
            raise Exception("No checkpoints found.")
    def __init__(self, model_name, model, data_loader=None, ckpt_path=None, ckpt_file=None, log_path=None, device='cuda', use_fp16=False, seed=None):
        self.data_loader = data_loader
        self.model_name = model_name
        self.model = model
        self.ckpt_path = ckpt_path
        self.log_path = log_path
        self.device = device
        self.seed = seed
        self.step = 0
        self.ckpt_file = ckpt_file
        self.use_fp16 = use_fp16

        # model
        self.model.eval()
        to_device_async(self.model, self.device)
        num_param = sum(param.numel() for param in model.parameters())
        tprint('The number of {} parameters: {}'.format(self.model_name, num_param))

        # precision
        if self.use_fp16:
            self.model = self.model.half()

        # data parallel
        self.model = nn.DataParallel(self.model)

        # set seed
        if seed is None:
            seed = np.random.randint(2**16)
        np.random.seed(seed)
        torch.manual_seed(seed)

        self.data_loader_iter = iter(self.data_loader)

        # logging
        if log_path:
            # tensorboard log path: {log_path}/YYYYMMDD-HHMMSS
            log_path = os.path.join(log_path, time.strftime('%Y%m%d-%H%M%S'))
            self.tbwriter = SummaryWriter(log_dir=log_path, flush_secs=10)

        # checkpoint path
        if self.ckpt_path:
            self.ckpt_path = os.path.join(self.ckpt_path, self.model_name)
            pathlib.Path(self.ckpt_path).mkdir(parents=True, exist_ok=True)

            # load checkpoint
            self.load(ckpt_file)
Example #12
    def __init__(self, ckpt_file, device='cuda', use_fp16=False, use_denoiser=False):
        self.ckpt_file = ckpt_file
        self.device = device
        self.use_fp16 = use_fp16
        self.use_denoiser = use_denoiser

        # model
        # sys.path.append('waveglow')

        from waveglow.arg_parser import parse_waveglow_args
        parser = argparse.ArgumentParser()
        model_parser = parse_waveglow_args(parser)
        args, _ = model_parser.parse_known_args()
        model_config = dict(
            n_mel_channels=args.n_mel_channels,
            n_flows=args.flows,
            n_group=args.groups,
            n_early_every=args.early_every,
            n_early_size=args.early_size,
            WN_config=dict(
                n_layers=args.wn_layers,
                kernel_size=args.wn_kernel_size,
                n_channels=args.wn_channels
            )
        )        
        self.model = WaveGlow(**model_config)

        state_dict = torch.load(self.ckpt_file, map_location=self.device)['state_dict']
        state_dict = unwrap_distributed(state_dict)
        self.model.load_state_dict(state_dict)

        self.model = to_device_async(self.model, self.device)

        self.model = self.model.remove_weightnorm(self.model)

        self.model.eval()

        if self.use_fp16:
            self.model = self.model.half()

        if self.use_denoiser:
            self.denoiser = Denoiser(self.model, device=device)
            self.denoiser = to_device_async(self.denoiser, self.device)

            tprint('Using WaveGlow denoiser.')
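
# A hedged usage sketch, mirroring how generate() drives this class further below:
# build the inferencer from a WaveGlow checkpoint, then vocode a batch of
# mel-spectrograms; the (b, n_mels, t) layout is an assumption taken from the comments
# in generate().
#
#   vocoder = WaveGlowInferencer(ckpt_file="/path/to/waveglow.pt",
#                                device="cuda", use_fp16=True)
#   with torch.no_grad():
#       wavs = vocoder.infer(mels)  # mels: (b, n_mels, t) on the same device
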
    def set_engine_and_context(self, length):
        for i, (trt_max_input_seq_len, trt_max_output_seq_len,
                trt_file_path) in enumerate(
                    self.max_seq_lens_and_file_path_list):
            if length <= trt_max_input_seq_len:
                self.engine = self.engine_list[i]
                self.context = self.context_list[i]
                self.trt_max_input_seq_len = trt_max_input_seq_len
                self.trt_max_output_seq_len = trt_max_output_seq_len
                self.trt_file_path = trt_file_path
                break
        else:
            self.engine = self.engine_list[-1]
            self.context = self.context_list[-1]
            self.trt_max_input_seq_len = trt_max_input_seq_len
            self.trt_max_output_seq_len = trt_max_output_seq_len
            self.trt_file_path = trt_file_path
        tprint('TRT Engine {} is selected.'.format(self.trt_file_path))
Example #14
    def __init__(self, ckpt_file, engine_file, use_fp16=False, use_denoiser=False, stride=256, n_groups=8):
        self.ckpt_file = ckpt_file
        self.engine_file = engine_file
        self.use_fp16 = use_fp16
        self.use_denoiser = use_denoiser
        self.stride = stride
        self.n_groups = n_groups

        if self.use_denoiser:
            sys.path.append('waveglow')
            waveglow = torch.load(self.ckpt_file)['model']
            waveglow = waveglow.remove_weightnorm(waveglow)
            waveglow.eval()
            self.denoiser = Denoiser(waveglow)
            self.denoiser = to_gpu_async(self.denoiser)
            tprint('Using WaveGlow denoiser.')

            # after initialization, we don't need WaveGlow PyTorch checkpoint
            # anymore - deleting
            del waveglow
            torch.cuda.empty_cache()

        # load engine
        with open(self.engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())

        if self.engine:
            tprint('TRT Engine Loaded from {} successfully.'.format(self.engine_file))
            return
        else:
            tprint('Loading TRT Engine from {} failed.'.format(self.engine_file))
    def build_engine(self):
        if self.trt_file_path and os.path.isfile(
                self.trt_file_path) and not self.trt_force_build:
            with open(self.trt_file_path, 'rb') as f:
                engine_str = f.read()
            with trt.Runtime(TRT_LOGGER) as runtime:
                self.engine = runtime.deserialize_cuda_engine(engine_str)

        if self.engine:
            tprint('TRT Engine Loaded from {} successfully.'.format(
                self.trt_file_path))
            return
        else:
            tprint('Loading TRT Engine from {} failed.'.format(
                self.trt_file_path))

        tprint('Building a TRT Engine..')

        self.engine = self.do_build_engine()
        tprint('TRT Engine Built.')
        if self.trt_file_path:
            with open(self.trt_file_path, 'wb') as f:
                f.write(self.engine.serialize())
            tprint('TRT Engine Saved in {}.'.format(self.trt_file_path))
def generate(hparam='infer.yaml',
             text='test_sentences.txt',
             results_path='results',
             device=DEFAULT_DEVICE,
             **kwargs):
    """The script for generating waveforms from texts with a vocoder.

    By default, this script loads parameters from the default config file, fastspeech/hparams/infer.yaml.

    Besides the flags, you can also override parameters in the config file via the command line. For example:
    --checkpoint_path=CHECKPOINT_PATH
        Path to checkpoint directory. The latest checkpoint will be loaded.
    --waveglow_path=WAVEGLOW_PATH
        Path to the WaveGlow checkpoint file.
    --waveglow_engine_path=WAVEGLOW_ENGINE_PATH
        Path to the WaveGlow engine file. It can only be used with --use_trt=True.
    --batch_size=BATCH_SIZE
        Batch size to use. Defaults to 1.

    Refer to fastspeech/hparams/infer.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "infer.yaml".
        text (str, optional): A sample text, or the path of a text file, to generate waveforms for. Defaults to 'test_sentences.txt'.
        results_path (str, optional): Path to output waveforms directory. Defaults to 'results'.
        device (str, optional): Device to use. Defaults to "cuda" if available, otherwise "cpu".
    """

    hp.set_hparam(hparam, kwargs)

    if os.path.isfile(text):
        f = open(text, 'r', encoding="utf-8")
        texts = f.read().splitlines()
    else:  # single string
        texts = [text]

    dataset = TextDataset(texts)
    data_loader = PadDataLoader(dataset,
                                batch_size=hp.batch_size,
                                num_workers=hp.n_workers,
                                shuffle=False,
                                drop_last=False)

    # text to mel
    model = Fastspeech(
        max_seq_len=hp.max_seq_len,
        d_model=hp.d_model,
        phoneme_side_n_layer=hp.phoneme_side_n_layer,
        phoneme_side_head=hp.phoneme_side_head,
        phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size,
        phoneme_side_output_size=hp.phoneme_side_output_size,
        mel_side_n_layer=hp.mel_side_n_layer,
        mel_side_head=hp.mel_side_head,
        mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size,
        mel_side_output_size=hp.mel_side_output_size,
        duration_predictor_filter_size=hp.duration_predictor_filter_size,
        duration_predictor_kernel_size=hp.duration_predictor_kernel_size,
        fft_conv1d_kernel=hp.fft_conv1d_kernel,
        fft_conv1d_padding=hp.fft_conv1d_padding,
        dropout=hp.dropout,
        n_mels=hp.num_mels,
        fused_layernorm=hp.fused_layernorm)

    fs_inferencer = get_inferencer(model, data_loader, device)

    # set up WaveGlow
    if hp.use_trt:
        from fastspeech.trt.waveglow_trt_inferencer import WaveGlowTRTInferencer
        wb_inferencer = WaveGlowTRTInferencer(
            ckpt_file=hp.waveglow_path,
            engine_file=hp.waveglow_engine_path,
            use_fp16=hp.use_fp16)
    else:
        wb_inferencer = WaveGlowInferencer(ckpt_file=hp.waveglow_path,
                                           device=device,
                                           use_fp16=hp.use_fp16)

    tprint("Generating {} sentences.. ".format(len(dataset)))

    with fs_inferencer, wb_inferencer:
        try:
            for i in range(len(data_loader)):
                tprint("------------- BATCH # {} -------------".format(i))

                with TimeElapsed(name="Inferece Time: E2E", format=":.6f"):
                    ## Text-to-Mel ##
                    with TimeElapsed(name="Inferece Time: FastSpeech",
                                     device=device,
                                     cuda_sync=True,
                                     format=":.6f"), torch.no_grad():
                        outputs = fs_inferencer.infer()

                    texts = outputs["text"]
                    mels = outputs["mel"]  # (b, n_mels, t)
                    mel_masks = outputs['mel_mask']  # (b, t)
                    # assert(mels.is_cuda)

                    # remove paddings
                    mel_lens = mel_masks.sum(axis=1)
                    max_len = mel_lens.max()
                    mels = mels[..., :max_len]
                    mel_masks = mel_masks[..., :max_len]

                    ## Vocoder ##
                    with TimeElapsed(name="Inferece Time: WaveGlow",
                                     device=device,
                                     cuda_sync=True,
                                     format=":.6f"), torch.no_grad():
                        wavs = wb_inferencer.infer(mels)
                        wavs = to_cpu_numpy(wavs)

                ## Write wavs ##
                pathlib.Path(results_path).mkdir(parents=True, exist_ok=True)
                for i, (text, wav) in enumerate(zip(texts, wavs)):
                    tprint("TEXT #{}: \"{}\"".format(i, text))

                    # remove paddings in case of batch size > 1
                    wav_len = mel_lens[i] * hp.hop_len
                    wav = wav[:wav_len]

                    path = os.path.join(results_path, text + ".wav")
                    librosa.output.write_wav(path, wav, hp.sr)

        except StopIteration:
            tprint("Generation has been done.")
        except KeyboardInterrupt:
            tprint("Generation has been canceled.")
Example #17
def train(hparam="train.yaml", device=DEFAULT_DEVICE, **kwargs):
    """ The FastSpeech model training script.

    By default, this script loads parameters from the default config file, fastspeech/hparams/train.yaml.

    Besides the flags, you can also override parameters in the config file via the command line. For example:
    --dataset_path=DATASET_PATH
        Path to dataset directory.
    --tacotron2_path=TACOTRON2_PATH
        Path to tacotron2 checkpoint file.
    --mels_path=MELS_PATH
        Path to preprocessed mels directory.
    --aligns_path=ALIGNS_PATH
        Path to preprocessed alignments directory.
    --log_path=LOG_PATH
        Path to log directory.
    --checkpoint_path=CHECKPOINT_PATH
        Path to checkpoint directory. The latest checkpoint will be loaded.
    --batch_size=BATCH_SIZE
        Batch size to use. Defaults to 16.

    Refer to fastspeech/hparams/train.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "train.yaml".
        device (str, optional): Device to use. Defaults to "cuda" if available, otherwise "cpu".

    """
    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))
    tprint("Device count: {}".format(torch.cuda.device_count()))

    # model
    model = Fastspeech(
        max_seq_len=hp.max_seq_len,
        d_model=hp.d_model,
        phoneme_side_n_layer=hp.phoneme_side_n_layer,
        phoneme_side_head=hp.phoneme_side_head,
        phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size,
        phoneme_side_output_size=hp.phoneme_side_output_size,
        mel_side_n_layer=hp.mel_side_n_layer,
        mel_side_head=hp.mel_side_head,
        mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size,
        mel_side_output_size=hp.mel_side_output_size,
        duration_predictor_filter_size=hp.duration_predictor_filter_size,
        duration_predictor_kernel_size=hp.duration_predictor_kernel_size,
        fft_conv1d_kernel=hp.fft_conv1d_kernel,
        fft_conv1d_padding=hp.fft_conv1d_padding,
        dropout=hp.dropout,
        n_mels=hp.num_mels,
        fused_layernorm=hp.fused_layernorm)

    # dataset
    dataset = LJSpeechDataset(
        root_path=hp.dataset_path,
        meta_file=hp.meta_file,
        mels_path=hp.mels_path,
        aligns_path=hp.aligns_path,
        sr=hp.sr,
        n_fft=hp.n_fft,
        win_len=hp.win_len,
        hop_len=hp.hop_len,
        n_mels=hp.num_mels,
        mel_fmin=hp.mel_fmin,
        mel_fmax=hp.mel_fmax,
    )
    tprint("Dataset size: {}".format(len(dataset)))

    # data loader
    data_loader = PadDataLoader(
        dataset,
        batch_size=hp.batch_size,
        num_workers=hp.n_workers,
        drop_last=True,
    )

    # optimizer
    def get_optimizer(model):
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=hp.learning_rate,
                                     betas=(0.9, 0.98),
                                     eps=1e-9)
        return optimizer

    def get_warmup_lr_scheduler(optimizer):
        d_model = hp.d_model
        warmup_steps = hp.warmup_steps
        lr = lambda step: d_model**-0.5 * min(
            (step + 1)**-0.5,
            (step + 1) * warmup_steps**-1.5) / hp.learning_rate
        scheduler = LambdaLR(optimizer, lr_lambda=[lr])
        return scheduler
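
    # The lambda above implements the Transformer-style ("Noam") warmup schedule:
    # lr(step) = d_model^-0.5 * min((step + 1)^-0.5, (step + 1) * warmup_steps^-1.5).
    # It is divided by hp.learning_rate because LambdaLR multiplies the base lr back in,
    # so the effective lr rises linearly for the first warmup_steps steps and then
    # decays proportionally to 1/sqrt(step).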

    # trainer
    trainer = FastspeechTrainer(
        data_loader,
        'fastspeech',
        model,
        optimizer_fn=get_optimizer,
        final_steps=hp.final_steps,
        log_steps=hp.log_step,
        ckpt_path=hp.checkpoint_path,
        save_steps=hp.save_step,
        log_path=hp.log_path,
        lr_scheduler_fn=get_warmup_lr_scheduler,
        pre_aligns=True if hp.aligns_path else False,
        device=device,
        use_amp=hp.use_amp,
        nvprof_iter_start=hp.nvprof_iter_start,
        nvprof_iter_end=hp.nvprof_iter_end,
        pyprof_enabled=hp.pyprof_enabled,
    )
    trainer.train()
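
# A hedged usage sketch: train() forwards keyword overrides to the config, matching the
# --flag=VALUE options in its docstring; all paths below are placeholders.
#
#   train(hparam="train.yaml",
#         dataset_path="/path/to/LJSpeech-1.1",
#         mels_path="/path/to/mels",
#         aligns_path="/path/to/aligns",
#         checkpoint_path="/path/to/checkpoints",
#         log_path="/path/to/logs",
#         batch_size=16)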
Example #18
    def forward(self,
                seq,
                pos,
                duration_target=None,
                alpha=1.0,
                seq_output_len=None,
                use_fp16=False,
                acts=None):

        # Phoneme Embedding
        output = self.word_emb(seq)

        if acts is not None:
            acts["act.emb"] = output

        if use_fp16:
            output = output.half()

        # Phoneme Side FFT Blocks
        output, output_mask = self.phoneme_side(output, pos, acts=acts)

        if acts is not None:
            acts["act.phoneme_side.seq"] = output

        # Length Regulator
        output, pos, duration = self.length_regulator(output,
                                                      output_mask,
                                                      target=duration_target,
                                                      alpha=alpha)

        if seq_output_len:
            output = F.pad(output,
                           pad=(0, 0, 0, seq_output_len - output.size(1)))
            pos = F.pad(pos, pad=(0, seq_output_len - pos.size(1)))

        # length of output mel shouldn't exceed max_seq_len
        output = output[:, :self.max_seq_len]
        pos = pos[:, :self.max_seq_len]

        if acts is not None:
            acts["act.length_regulator.seq"] = output
            acts["act.length_regulator.dur"] = torch.round(duration)

        if self.training or output.bool().any():
            # Mel Side FFT Blocks
            output, output_mask = self.mel_side(output, pos, acts=acts)

            if acts is not None:
                acts["act.mel_side.seq"] = output

            # Linear Layer
            output = self.mel_linear(output)

            if acts is not None:
                acts["out.seq_mask"] = output_mask
                acts["out.seq"] = output
        else:
            # seq length could be zero, in case duration predictor outputs all zeros.
            # In this case, skip feed-forwarding.
            tprint(
                "Duration Predictor outputs all zeros. Output will be zero length."
            )
            output_shape = (output.size(0), 0, output_mask.size(2))
            output = torch.zeros(size=output_shape)
            output_mask = torch.ones(size=output_shape)

        if torch.cuda.device_count() > 1:
            # In a multi-gpu setting, all output mels from devices must have the same length.
            # Otherwise, an error occurs in the process of gathering outputs.
            if not seq_output_len:
                seq_output_len = self.max_seq_len
            padding = (0, 0, 0, seq_output_len - output.size(1))

            output = F.pad(output, padding)
            output = output[:, :seq_output_len, :]

            output_mask = F.pad(output_mask, padding)
            output_mask = output_mask[:, :seq_output_len, :]

        return output, output_mask, duration
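
# A hedged sketch of calling forward() directly; in this repo the inputs come from the
# text pipeline (TextDataset + PadDataLoader), so the tensors below are illustrative
# assumptions: seq is taken to hold padded token ids and pos to hold 1-based positions
# with 0 at padding positions.
#
#   seq = torch.tensor([[11, 5, 27, 0]], device='cuda')  # (b, t), 0 = pad (assumed)
#   pos = torch.tensor([[1, 2, 3, 0]], device='cuda')    # (b, t), 0 = pad (assumed)
#   with torch.no_grad():
#       mel, mel_mask, duration = model(seq, pos, alpha=1.0)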
Example #19
    def console_log(self, tag, output):
        # console logging
        msg = ""
        for key, value in sorted(output.items()):
            msg += ',\t{}: {}'.format(key, value)
        tprint(msg)
def perf_inference(hparam="infer.yaml",
                   with_vocoder=False,
                   n_iters=None,
                   device=DEFAULT_DEVICE,
                   **kwargs):
    """The script for estimating inference performance.

    By default, this script loads parameters from the default config file, fastspeech/hparams/infer.yaml.

    Besides the flags, you can also override parameters in the config file via the command line. For example:
    --dataset_path=DATASET_PATH
        Path to dataset directory.
    --checkpoint_path=CHECKPOINT_PATH
        Path to checkpoint directory. The latest checkpoint will be loaded.
    --batch_size=BATCH_SIZE
        Batch size to use. Defaults to 1.

    Refer to fastspeech/hparams/infer.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "infer.yaml".
        with_vocoder (bool, optional): Whether or not to estimate with a vocoder. Defaults to False.
        n_iters (int, optional): Number of batches to estimate. Defaults to None (an epoch).
        device (str, optional): Device to use. Defaults to "cuda" if available, otherwise "cpu".

    """

    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))
    tprint("Device count: {}".format(torch.cuda.device_count()))

    model = Fastspeech(
        max_seq_len=hp.max_seq_len,
        d_model=hp.d_model,
        phoneme_side_n_layer=hp.phoneme_side_n_layer,
        phoneme_side_head=hp.phoneme_side_head,
        phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size,
        phoneme_side_output_size=hp.phoneme_side_output_size,
        mel_side_n_layer=hp.mel_side_n_layer,
        mel_side_head=hp.mel_side_head,
        mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size,
        mel_side_output_size=hp.mel_side_output_size,
        duration_predictor_filter_size=hp.duration_predictor_filter_size,
        duration_predictor_kernel_size=hp.duration_predictor_kernel_size,
        fft_conv1d_kernel=hp.fft_conv1d_kernel,
        fft_conv1d_padding=hp.fft_conv1d_padding,
        dropout=hp.dropout,
        n_mels=hp.num_mels,
        fused_layernorm=hp.fused_layernorm)

    dataset_size = hp.batch_size * (n_iters if n_iters else 1)
    tprint("Dataset size: {}".format(dataset_size))
    dataset = TextDataset([INPUT_TEXT] * (dataset_size +
                                          (WARMUP_ITERS * hp.batch_size)))

    data_loader = PadDataLoader(
        dataset,
        batch_size=hp.batch_size,
        num_workers=hp.n_workers,
        shuffle=False if hp.use_trt and hp.trt_multi_engine else True,
        drop_last=True,
    )

    fs_inferencer = get_inferencer(model, data_loader, device)

    if with_vocoder:
        if hp.use_trt:
            from fastspeech.trt.waveglow_trt_inferencer import WaveGlowTRTInferencer
            wb_inferencer = WaveGlowTRTInferencer(
                ckpt_file=hp.waveglow_path,
                engine_file=hp.waveglow_engine_path,
                use_fp16=hp.use_fp16)
        else:
            wb_inferencer = WaveGlowInferencer(ckpt_file=hp.waveglow_path,
                                               device=device,
                                               use_fp16=hp.use_fp16)

    with fs_inferencer, wb_inferencer if with_vocoder else ExitStack():

        tprint("Perf started. Batch size={}.".format(hp.batch_size))

        latencies = []
        throughputs = []

        for i in tqdm(range(len(data_loader))):
            start = time.time()

            outputs = fs_inferencer.infer()

            mels = outputs['mel']
            mel_masks = outputs['mel_mask']
            assert (mels.is_cuda)

            if with_vocoder:
                # remove padding
                max_len = mel_masks.sum(axis=1).max()
                mels = mels[..., :max_len]
                mel_masks = mel_masks[..., :max_len]

                with torch.no_grad():
                    wavs = wb_inferencer.infer(mels)
                wavs = to_cpu_numpy(wavs)
            else:
                # include time for DtoH copy
                to_cpu_numpy(mels)
                to_cpu_numpy(mel_masks)

            end = time.time()

            if i > WARMUP_ITERS - 1:
                time_elapsed = end - start
                generated_samples = len(mel_masks.nonzero()) * hp.hop_len
                throughput = generated_samples / time_elapsed

                latencies.append(time_elapsed)
                throughputs.append(throughput)

        latencies.sort()

        avg_latency = np.mean(latencies)
        std_latency = np.std(latencies)
        # n_iters may be None (the default), so guard the comparison explicitly
        latency_90 = (max(latencies[:int(len(latencies) * 0.90)])
                      if n_iters and n_iters > 1 else 0)
        latency_95 = (max(latencies[:int(len(latencies) * 0.95)])
                      if n_iters and n_iters > 1 else 0)
        latency_99 = (max(latencies[:int(len(latencies) * 0.99)])
                      if n_iters and n_iters > 1 else 0)

        throughput = np.mean(throughputs)
        rtf = throughput / (hp.sr * hp.batch_size)

        tprint(
            "Batch size\tPrecision\tAvg Latency(s)\tStd Latency(s)\tLatency 90%(s)\tLatency 95%(s)\tLatency 99%(s)\tThroughput(samples/s)\tAvg RTF\n\
        {}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{}\t{:.2f}".format(
                hp.batch_size, "FP16" if hp.use_fp16 else "FP32",
                avg_latency, std_latency, latency_90, latency_95, latency_99,
                int(throughput), rtf))
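
# A worked example of the reported RTF, assuming hp.sr = 22050 (the LJSpeech sampling
# rate): rtf = throughput / (sr * batch_size), so a measured throughput of about
# 1,000,000 samples/s at batch_size=1 gives 1,000,000 / 22,050 ≈ 45, i.e. audio is
# generated roughly 45x faster than real time.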
Example #21
    def console_log(self, tag, loss, meta):
        # console logging
        msg = 'loss: {:.6f}'.format(loss)
        for key, value in meta.items():
            msg += ',\t{}: {:.4f}'.format(key, value)
        tprint(msg)
Example #22
    def __init__(self,
                 data_loader,
                 model_name,
                 model,
                 optimizer_fn,
                 final_steps,
                 lr_scheduler_fn=None,
                 step=0,
                 ckpt_path=None,
                 log_path=None,
                 n_epochs=None,
                 save_steps=None,
                 log_steps=10,
                 device='cuda',
                 use_amp=False,
                 nvprof_iter_start=None,
                 nvprof_iter_end=None,
                 pyprof_enabled=False,
                 detect_anomaly=False,
                 seed=None):
        self.data_loader = data_loader
        self.model_name = model_name
        self.model = model
        self.n_epochs = n_epochs
        self.save_steps = save_steps
        self.log_steps = log_steps
        self.ckpt_path = ckpt_path
        self.log_path = log_path
        self.final_steps = final_steps
        self.step = step
        self.device = device
        self.use_amp = use_amp
        self.nvprof_iter_start = nvprof_iter_start
        self.nvprof_iter_end = nvprof_iter_end
        self.pyprof_enabled = pyprof_enabled
        self.detect_anomaly = detect_anomaly

        # model
        self.model.train()
        to_device_async(self.model, self.device)
        num_param = sum(param.numel() for param in model.parameters())
        tprint('The number of {} parameters: {}'.format(
            self.model_name, num_param))

        # optimizer
        self.optimizer = optimizer_fn(model)

        # lr scheduler
        if lr_scheduler_fn:
            self.lr_scheduler = lr_scheduler_fn(self.optimizer)
        else:
            self.lr_scheduler = None

        # automatic mixed precision
        if self.use_amp:
            from apex import amp
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')

        # profile
        if nvprof_iter_start and nvprof_iter_end and pyprof_enabled:
            from apex import pyprof
            pyprof.nvtx.init()

        # data parallel
        self.model = nn.DataParallel(self.model)

        # set seed
        if seed is None:
            seed = np.random.randint(2**16)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # data loader
        self.data_loader_iter = self.repeat(self.data_loader, n_epochs)

        # logging
        if log_path:
            # tensorboard log path: {log_path}/YYYYMMDD-HHMMSS
            log_path = os.path.join(log_path, time.strftime('%Y%m%d-%H%M%S'))
            self.tbwriter = SummaryWriter(log_dir=log_path, flush_secs=10)

        # checkpoint path
        if self.ckpt_path:
            self.ckpt_path = os.path.join(self.ckpt_path, self.model_name)
            pathlib.Path(self.ckpt_path).mkdir(parents=True, exist_ok=True)

            # load checkpoint
            self.load()
Example #23
def infer(hparam="infer.yaml",
          device=DEFAULT_DEVICE, 
          n_iters=1,
          **kwargs):
    """ The FastSpeech model inference script.

    By default, this script loads parameters from the default config file, fastspeech/hparams/infer.yaml.

    Besides the flags, you can also override parameters in the config file via the command line. For example:
    --dataset_path=DATASET_PATH
        Path to dataset directory.
    --checkpoint_path=CHECKPOINT_PATH
        Path to checkpoint directory. The latest checkpoint will be loaded.
    --batch_size=BATCH_SIZE
        Batch size to use. Defaults to 1.

    Refer to fastspeech/hparams/infer.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to default config file. Defaults to "infer.yaml".
        device (str, optional): Device to use. Defaults to "cuda" if available, otherwise "cpu".
        n_iters (int, optional): Number of batches to infer. Defaults to 1.
    """

    hp.set_hparam(hparam, kwargs)
    tprint("Hparams:\n{}".format(pp.pformat(hp)))
    tprint("Device count: {}".format(torch.cuda.device_count()))

    # model
    model = Fastspeech(
        max_seq_len=hp.max_seq_len,
        d_model=hp.d_model,
        phoneme_side_n_layer=hp.phoneme_side_n_layer,
        phoneme_side_head=hp.phoneme_side_head,
        phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size,
        phoneme_side_output_size=hp.phoneme_side_output_size,
        mel_side_n_layer=hp.mel_side_n_layer,
        mel_side_head=hp.mel_side_head,
        mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size,
        mel_side_output_size=hp.mel_side_output_size,
        duration_predictor_filter_size=hp.duration_predictor_filter_size,
        duration_predictor_kernel_size=hp.duration_predictor_kernel_size,
        fft_conv1d_kernel=hp.fft_conv1d_kernel,
        fft_conv1d_padding=hp.fft_conv1d_padding,
        dropout=hp.dropout,
        n_mels=hp.num_mels,
        fused_layernorm=hp.fused_layernorm
    )

    dataset = LJSpeechDataset(root_path=hp.dataset_path,
                              meta_file=hp.meta_file,
                              sr=hp.sr,
                              n_fft=hp.n_fft,
                              win_len=hp.win_len,
                              hop_len=hp.hop_len,
                              n_mels=hp.num_mels,
                              mel_fmin=hp.mel_fmin,
                              mel_fmax=hp.mel_fmax,
                              exclude_mels=True,
                              sort_by_length=True if hp.use_trt and hp.trt_multi_engine else False
                              )
    tprint("Dataset size: {}".format(len(dataset)))

    data_loader = PadDataLoader(dataset,
                                batch_size=hp.batch_size,
                                num_workers=hp.n_workers,
                                shuffle=False if hp.use_trt and hp.trt_multi_engine else True,
                                drop_last=True,
                                )

    inferencer = get_inferencer(model, data_loader, device)

    try:
        n_iters = min(len(data_loader), n_iters) if n_iters else len(data_loader)
        tprint("Num of iters: {}".format(n_iters))
        with inferencer:
            for i in range(n_iters):
                    tprint("------------- INFERENCE : batch #{} -------------".format(i))
                    with TimeElapsed(name="Inference Time", cuda_sync=True):
                        out_batch = inferencer.infer()
                        # tprint("Output:\n{}".format(pp.pformat(out_batch)))
        tprint("Inference has been done.")
    except KeyboardInterrupt:
        tprint("Inference has been canceled.")