Example #1
    def __init__(self, device='cpu'):
        dict_path = "downloads/data/lang_1char/train_no_dev_units.txt"
        model_path = "downloads/exp/train_no_dev_pytorch_train_pytorch_tacotron2.v3/results/model.last1.avg.best"
        vocoder_path = "downloads/ljspeech.parallel_wavegan.v1/checkpoint-400000steps.pkl"
        vocoder_conf = "downloads/ljspeech.parallel_wavegan.v1/config.yml"

        device = torch.device(device)

        idim, odim, train_args = get_model_conf(model_path)
        model_class = dynamic_import(train_args.model_module)
        model = model_class(idim, odim, train_args)
        torch_load(model_path, model)
        model = model.eval().to(device)
        inference_args = Namespace(**{"threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0})

        with open(vocoder_conf) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        vocoder = ParallelWaveGANGenerator(**config["generator_params"])
        vocoder.load_state_dict(torch.load(vocoder_path, map_location="cpu")["model"]["generator"])
        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(device)

        with open(dict_path) as f:
            lines = f.readlines()
        lines = [line.replace("\n", "").split(" ") for line in lines]
        char_to_id = {c: int(i) for c, i in lines}

        self.device = device
        self.char_to_id = char_to_id
        self.idim = idim
        self.model = model
        self.inference_args = inference_args
        self.config = config
        self.vocoder = vocoder
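
A minimal end-to-end usage sketch for the class above (not part of the original example; synthesize() and its internals follow the usual ESPnet Tacotron 2 + Parallel WaveGAN demo pattern and are assumptions here):

def synthesize(tts, text):
    """Sketch: text -> waveform, assuming every char of text is in char_to_id."""
    with torch.no_grad():
        x = torch.LongTensor([tts.char_to_id[c] for c in text]).to(tts.device)
        # Tacotron 2 inference returns (mel, stop_probs, attention_weights)
        c, _, _ = tts.model.inference(x, tts.inference_args)
        # one noise sample per output waveform sample
        z = torch.randn(1, 1, len(c) * tts.config["hop_size"]).to(tts.device)
        # replicate edge frames to cover the generator's aux context window
        pad_fn = torch.nn.ReplicationPad1d(
            tts.config["generator_params"]["aux_context_window"])
        c = pad_fn(c.unsqueeze(0).transpose(2, 1))
        return tts.vocoder(z, c).view(-1).cpu().numpy()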
Example #2
    def __init__(self, conf):
        if conf["cuda"]:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.conf = MODEL_CONF[conf["model"]]

        # define E2E-TTS model
        self.idim, odim, train_args = get_model_conf(self.conf["model_path"])
        model_class = dynamic_import(train_args.model_module)
        self.model = model_class(self.idim, odim, train_args)
        torch_load(self.conf["model_path"], self.model)
        self.model = self.model.eval().to(self.device)

        # load neural vocoder
        with open(VOCODER_CONF["vocoder_conf"]) as f:
            self.vocoder_config = yaml.load(f, Loader=yaml.Loader)
        self.vocoder = ParallelWaveGANGenerator(
            **self.vocoder_config["generator_params"])
        self.vocoder.load_state_dict(
            torch.load(VOCODER_CONF["vocoder_path"], map_location="cpu")["model"]["generator"])
        self.vocoder.remove_weight_norm()
        self.vocoder = self.vocoder.eval().to(self.device)

        # define character-to-id dictionary
        with open(self.conf["dict_path"]) as f:
            lines = f.readlines()
        lines = [line.replace("\n", "").split(" ") for line in lines]
        self.char_to_id = {c: int(i) for c, i in lines}
Example #3
 def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
     sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally
     #pylint: disable=import-outside-toplevel
     from parallel_wavegan.models import ParallelWaveGANGenerator
     print(" > Loading PWGAN model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", model_file)
     with open(model_config) as f:
         self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
     self.pwgan = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"])
     self.pwgan.load_state_dict(torch.load(model_file, map_location="cpu")["model"]["generator"])
     self.pwgan.remove_weight_norm()
     if use_cuda:
         self.pwgan.cuda()
     self.pwgan.eval()
Example #4
def test_causal_parallel_wavegan(upsample_net, aux_context_window):
    batch_size = 1
    batch_length = 4096
    args_g = make_generator_args(
        use_causal_conv=True,
        upsample_net=upsample_net,
        aux_context_window=aux_context_window,
        dropout=0.0,
    )
    model_g = ParallelWaveGANGenerator(**args_g)
    z = torch.randn(batch_size, 1, batch_length)
    c = torch.randn(
        batch_size,
        args_g["aux_channels"],
        batch_length // np.prod(args_g["upsample_params"]["upsample_scales"]),
    )

    z_ = z.clone()
    c_ = c.clone()
    z_[..., z.size(-1) // 2:] = torch.randn(z[..., z.size(-1) // 2:].shape)
    c_[..., c.size(-1) // 2:] = torch.randn(c[..., c.size(-1) // 2:].shape)
    c = torch.nn.ConstantPad1d(args_g["aux_context_window"], 0.0)(c)
    c_ = torch.nn.ConstantPad1d(args_g["aux_context_window"], 0.0)(c_)
    try:
        # check not equal
        np.testing.assert_array_equal(c.numpy(), c_.numpy())
    except AssertionError:
        pass
    else:
        raise AssertionError("Must be different.")
    try:
        # check not equal
        np.testing.assert_array_equal(z.numpy(), z_.numpy())
    except AssertionError:
        pass
    else:
        raise AssertionError("Must be different.")

    # check causality
    y = model_g(z, c)
    y_ = model_g(z_, c_)
    np.testing.assert_array_equal(
        y[..., :y.size(-1) // 2].detach().cpu().numpy(),
        y_[..., :y_.size(-1) // 2].detach().cpu().numpy(),
    )
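
The check works because a causal convolution's output at time t depends only on inputs at positions <= t, so perturbing the second half of z and c cannot change the first half of y. The same property for a plain left-padded Conv1d (illustrative sketch, not part of the test suite):

import torch

conv = torch.nn.Conv1d(1, 1, kernel_size=3)

def causal(x):
    # left-pad by (kernel_size - 1) so y[t] sees only x[:t + 1]
    return conv(torch.nn.functional.pad(x, (conv.kernel_size[0] - 1, 0)))

x = torch.randn(1, 1, 16)
x_ = x.clone()
x_[..., 8:] = torch.randn(1, 1, 8)  # perturb the second half only
assert torch.equal(causal(x)[..., :8], causal(x_)[..., :8])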
Example #5
 def load_pwgan(self, model_file, model_config, use_cuda):
     #pylint: disable=import-outside-toplevel
     from parallel_wavegan.models import ParallelWaveGANGenerator
     from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder
     print(" > Loading PWGAN model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", model_file)
     with open(model_config) as f:
         self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
     self.pwgan = ParallelWaveGANGenerator(
         **self.pwgan_config["generator_params"])
     self.pwgan.load_state_dict(
         torch.load(model_file, map_location="cpu")["model"]["generator"])
     self.pwgan.remove_weight_norm()
     self.pwgan_ap = AudioProcessorVocoder(**self.pwgan_config["audio"])
     if use_cuda:
         self.pwgan.cuda()
     self.pwgan.eval()
Example #6
def test_parallel_wavegan_with_residual_discriminator_trainable(
        dict_g, dict_d, dict_loss):
    # setup
    batch_size = 4
    batch_length = 4096
    args_g = make_generator_args(**dict_g)
    args_d = make_residual_discriminator_args(**dict_d)
    args_loss = make_mutli_reso_stft_loss_args(**dict_loss)
    z = torch.randn(batch_size, 1, batch_length)
    y = torch.randn(batch_size, 1, batch_length)
    c = torch.randn(
        batch_size,
        args_g["aux_channels"],
        batch_length // np.prod(args_g["upsample_params"]["upsample_scales"]) +
        2 * args_g["aux_context_window"],
    )
    model_g = ParallelWaveGANGenerator(**args_g)
    model_d = ResidualParallelWaveGANDiscriminator(**args_d)
    aux_criterion = MultiResolutionSTFTLoss(**args_loss)
    gen_adv_criterion = GeneratorAdversarialLoss()
    dis_adv_criterion = DiscriminatorAdversarialLoss()
    optimizer_g = RAdam(model_g.parameters())
    optimizer_d = RAdam(model_d.parameters())

    # check generator trainable
    y_hat = model_g(z, c)
    p_hat = model_d(y_hat)
    adv_loss = gen_adv_criterion(p_hat)
    sc_loss, mag_loss = aux_criterion(y_hat, y)
    aux_loss = sc_loss + mag_loss
    loss_g = adv_loss + aux_loss
    optimizer_g.zero_grad()
    loss_g.backward()
    optimizer_g.step()

    # check discriminator trainable
    p = model_d(y)
    p_hat = model_d(y_hat.detach())
    real_loss, fake_loss = dis_adv_criterion(p_hat, p)
    loss_d = real_loss + fake_loss
    optimizer_d.zero_grad()
    loss_d.backward()
    optimizer_d.step()
Example #7
def test_parallel_wavegan_trainable(dict_g, dict_d, dict_loss):
    # setup
    batch_size = 4
    batch_length = 4096
    args_g = make_generator_args(**dict_g)
    args_d = make_discriminator_args(**dict_d)
    args_loss = make_mutli_reso_stft_loss_args(**dict_loss)
    z = torch.randn(batch_size, 1, batch_length)
    y = torch.randn(batch_size, 1, batch_length)
    c = torch.randn(
        batch_size, args_g["aux_channels"],
        batch_length // np.prod(args_g["upsample_params"]["upsample_scales"]) +
        2 * args_g["aux_context_window"])
    model_g = ParallelWaveGANGenerator(**args_g)
    model_d = ParallelWaveGANDiscriminator(**args_d)
    aux_criterion = MultiResolutionSTFTLoss(**args_loss)
    optimizer_g = RAdam(model_g.parameters())
    optimizer_d = RAdam(model_d.parameters())

    # check generator trainable
    y_hat = model_g(z, c)
    p_hat = model_d(y_hat)
    y, y_hat, p_hat = y.squeeze(1), y_hat.squeeze(1), p_hat.squeeze(1)
    adv_loss = F.mse_loss(p_hat, p_hat.new_ones(p_hat.size()))
    sc_loss, mag_loss = aux_criterion(y_hat, y)
    aux_loss = sc_loss + mag_loss
    loss_g = adv_loss + aux_loss
    optimizer_g.zero_grad()
    loss_g.backward()
    optimizer_g.step()

    # check discriminator trainable
    y, y_hat = y.unsqueeze(1), y_hat.unsqueeze(1).detach()
    p = model_d(y)
    p_hat = model_d(y_hat)
    p, p_hat = p.squeeze(1), p_hat.squeeze(1)
    loss_d = F.mse_loss(p, p.new_ones(p.size())) + F.mse_loss(
        p_hat, p_hat.new_zeros(p_hat.size()))
    optimizer_d.zero_grad()
    loss_d.backward()
    optimizer_d.step()
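
This test writes out the least-squares GAN objective explicitly (the previous example wraps essentially the same thing in GeneratorAdversarialLoss / DiscriminatorAdversarialLoss): the discriminator pushes its outputs toward 1 on real audio and toward 0 on generated audio, while the generator pushes the outputs on its fakes toward 1. The same losses in isolation (illustrative sketch):

import torch
import torch.nn.functional as F

p = torch.randn(4, 4096)      # stands in for D(real)
p_hat = torch.randn(4, 4096)  # stands in for D(generated)
loss_d = F.mse_loss(p, p.new_ones(p.size())) \
    + F.mse_loss(p_hat, p_hat.new_zeros(p_hat.size()))
loss_g = F.mse_loss(p_hat, p_hat.new_ones(p_hat.size()))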
Example #8
 def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
     if lib_path:
         # set this if ParallelWaveGAN is not installed globally
         sys.path.append(lib_path)
     try:
         #pylint: disable=import-outside-toplevel
         from parallel_wavegan.models import ParallelWaveGANGenerator
     except ImportError as e:
         raise RuntimeError(
             f"cannot import parallel-wavegan, either install it or set its directory using the --pwgan_lib_path command line argument: {e}"
         )
     print(" > Loading PWGAN model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", model_file)
     with open(model_config) as f:
         self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
     self.pwgan = ParallelWaveGANGenerator(
         **self.pwgan_config["generator_params"])
     self.pwgan.load_state_dict(
         torch.load(model_file, map_location="cpu")["model"]["generator"])
     self.pwgan.remove_weight_norm()
     if use_cuda:
         self.pwgan.cuda()
     self.pwgan.eval()
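
A sketch of how a vocoder loaded this way is typically driven; the method name, the hop_size lookup, and the mel layout below are assumptions, not part of the original class:

 def pwgan_generate(self, mel):
     """Sketch: vocode a mel spectrogram of shape (frames, num_mels)."""
     device = next(self.pwgan.parameters()).device
     c = torch.FloatTensor(mel.T).unsqueeze(0).to(device)  # (1, num_mels, frames)
     # one noise sample per output waveform sample
     z = torch.randn(1, 1, c.size(-1) * self.pwgan_config["hop_size"]).to(device)
     # replicate edge frames to cover the generator's aux context window
     window = self.pwgan_config["generator_params"]["aux_context_window"]
     c = torch.nn.ReplicationPad1d(window)(c)
     with torch.no_grad():
         return self.pwgan(z, c).view(-1).cpu().numpy()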
Example #9
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--train-dumpdir",
                        type=str,
                        required=True,
                        help="directory including trainning data.")
    parser.add_argument("--dev-dumpdir",
                        type=str,
                        required=True,
                        help="directory including development data.")
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help="checkpoint file path to resume training. (default=\"\")")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('skip DEBUG/INFO messages')

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["batch_max_steps"] // config["hop_size"] + \
            2 * config["generator_params"]["aux_context_window"]
    else:
        mel_length_threshold = None
    if config["format"] == "hdf5":
        audio_query, mel_query = "*.h5", "*.h5"
        audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
        mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
    elif config["format"] == "npy":
        audio_query, mel_query = "*-wave.npy", "*-feats.npy"
        audio_load_fn = np.load
        mel_load_fn = np.load
    else:
        raise ValueError("support only hdf5 or npy format.")
    dataset = {
        "train":
        AudioMelDataset(
            root_dir=args.train_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        ),
        "dev":
        AudioMelDataset(
            root_dir=args.dev_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        ),
    }

    # get data loader
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    collater = Collater(
        batch_max_steps=config["batch_max_steps"],
        hop_size=config["hop_size"],
        aux_context_window=config["generator_params"]["aux_context_window"],
    )
    data_loader = {
        "train":
        DataLoader(dataset=dataset["train"],
                   shuffle=True,
                   collate_fn=collater,
                   batch_size=config["batch_size"],
                   num_workers=config["num_workers"],
                   pin_memory=config["pin_memory"]),
        "dev":
        DataLoader(dataset=dataset["dev"],
                   shuffle=True,
                   collate_fn=collater,
                   batch_size=config["batch_size"],
                   num_workers=config["num_workers"],
                   pin_memory=config["pin_memory"]),
    }

    # define models and optimizers
    model = {
        "generator":
        ParallelWaveGANGenerator(**config["generator_params"]).to(device),
        "discriminator":
        ParallelWaveGANDiscriminator(
            **config["discriminator_params"]).to(device),
    }
    criterion = {
        "stft":
        MultiResolutionSTFTLoss(**config["stft_loss_params"]).to(device),
        "mse": torch.nn.MSELoss().to(device),
    }
    optimizer = {
        "generator":
        RAdam(model["generator"].parameters(),
              **config["generator_optimizer_params"]),
        "discriminator":
        RAdam(model["discriminator"].parameters(),
              **config["discriminator_optimizer_params"]),
    }
    scheduler = {
        "generator":
        torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["generator"],
            **config["generator_scheduler_params"]),
        "discriminator":
        torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["discriminator"],
            **config["discriminator_scheduler_params"]),
    }
    logging.info(model["generator"])
    logging.info(model["discriminator"])

    # define trainer
    trainer = Trainer(
        steps=0,
        epochs=0,
        data_loader=data_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        config=config,
        device=device,
    )

    # resume from checkpoint
    if len(args.resume) != 0:
        trainer.load_checkpoint(args.resume)
        logging.info(f"resumed from {args.resume}.")

    # run training loop
    try:
        trainer.run()
    finally:
        trainer.save_checkpoint(
            os.path.join(config["outdir"],
                         f"checkpoint-{trainer.steps}steps.pkl"))
        logging.info(f"successfully saved checkpoint @ {trainer.steps}steps.")
Example #10
def main():
    """Run decoding process."""
    parser = argparse.ArgumentParser(
        description="Decode dumped features with trained Parallel WaveGAN Generator.")
    parser.add_argument("--scp", default=None, type=str,
                        help="Kaldi-style feats.scp file.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="Directory including feature files.")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="Direcotry to save generated speech.")
    parser.add_argument("--checkpoint", default=None, type=str, required=True,
                        help="Checkpoint file.")
    parser.add_argument("--config", default=None, type=str,
                        help="Yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level (higher is more logging)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning("skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load config
    if args.config is None:
        dirname = os.path.dirname(args.checkpoint)
        args.config = os.path.join(dirname, "config.yml")
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.scp is not None and args.dumpdir is not None) or \
            (args.scp is None and args.dumpdir is None):
        raise ValueError("Please specify either dumpdir or scp.")

    # get dataset
    if args.scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset = MelDataset(
            args.dumpdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn,
            return_filename=True)
        logging.info(f"the number of features to be decoded = {len(dataset)}.")
    else:
        dataset = kaldiio.ReadHelper(f"scp:{args.scp}")
        logging.info(f"the feature loaded from {args.scp}.")

    # setup
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = ParallelWaveGANGenerator(**config["generator_params"])
    model.load_state_dict(torch.load(args.checkpoint, map_location="cpu")["model"]["generator"])
    model.remove_weight_norm()
    model = model.eval().to(device)
    logging.info(f"loaded model parameters from {args.checkpoint}.")

    # start generation
    pad_size = (config["generator_params"]["aux_context_window"],
                config["generator_params"]["aux_context_window"])
    total_rtf = 0.0
    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
        for idx, (feat_path, c) in enumerate(pbar, 1):
            # generate each utterance
            z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device)
            c = np.pad(c, (pad_size, (0, 0)), "edge")
            c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device)
            start = time.time()
            y = model(z, c).view(-1).cpu().numpy()
            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
            pbar.set_postfix({"RTF": rtf})
            total_rtf += rtf

            # save as PCM 16 bit wav file
            utt_id = os.path.splitext(os.path.basename(feat_path))[0]
            sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"),
                     y, config["sampling_rate"], "PCM_16")

    # report average RTF
    logging.info(f"finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f}).")
Example #11
model_class = dynamic_import(train_args.model_module)
model = model_class(idim, odim, train_args)
torch_load(model_path, model)
model = model.eval().to(device)
inference_args = Namespace(**{
    "threshold": 0.5,
    "minlenratio": 0.0,
    "maxlenratio": 10.0
})

# define neural vocoder
import yaml
from parallel_wavegan.models import ParallelWaveGANGenerator
with open(vocoder_conf) as f:
    config = yaml.load(f, Loader=yaml.Loader)
vocoder = ParallelWaveGANGenerator(**config["generator_params"])
vocoder.load_state_dict(
    torch.load(vocoder_path, map_location="cpu")["model"]["generator"])
vocoder.remove_weight_norm()
vocoder = vocoder.eval().to(device)

# define text frontend
from pypinyin import pinyin, Style
from pypinyin.style._utils import get_initials, get_finals
with open(dict_path) as f:
    lines = f.readlines()
lines = [line.replace("\n", "").split(" ") for line in lines]
char_to_id = {c: int(i) for c, i in lines}


def frontend(text):
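
The example is truncated at this point. A minimal sketch of what such a pypinyin-based frontend typically does (the body below is an assumption, not the original code): split each syllable into initial and final, then map the tokens through char_to_id.

def frontend_sketch(text):
    """Sketch: convert Mandarin text to a token id sequence."""
    tokens = []
    for syllable in pinyin(text, style=Style.TONE3):
        s = syllable[0]
        initial = get_initials(s, strict=True)
        final = get_finals(s, strict=True)
        tokens += [t for t in (initial, final) if t]
    return torch.LongTensor([char_to_id[t] for t in tokens if t in char_to_id])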
Example #12
File: vqvae2.py  Project: xinkez/crank
 def _construct_net(self):
     self.encoders = nn.ModuleList()
     self.decoders = nn.ModuleList()
     self.quantizers = nn.ModuleList()
     for n in range(self.conf["n_vq_stacks"]):
         if n == 0:
             enc_in_channels = self.conf["input_size"]
             enc_out_channels = self.conf["emb_dim"][n]
             if self.conf["encoder_spkr_classifier"]:
                 enc_out_channels += self.spkr_size
             enc_aux_channels = self.conf["enc_aux_size"]
             dec_in_channels = sum(
                 [self.conf["emb_dim"][i] for i in range(self.conf["n_vq_stacks"])]
             )
             dec_out_channels = self.conf["output_size"]
             if self.conf["use_spkr_embedding"]:
                 if not self.conf["use_embedding_transform"]:
                     dec_aux_channels = (
                         self.conf["dec_aux_size"] + self.conf["spkr_embedding_size"]
                     )
                 else:
                     dec_aux_channels = (
                         self.conf["dec_aux_size"]
                         + self.conf["embedding_transform_size"]
                     )
             else:
                 dec_aux_channels = self.conf["dec_aux_size"] + self.spkr_size
         elif n >= 1:
             enc_in_channels = self.conf["emb_dim"][n - 1]
             enc_out_channels = self.conf["emb_dim"][n]
             enc_aux_channels = 0
             dec_in_channels = self.conf["emb_dim"][n]
             dec_out_channels = self.conf["emb_dim"][n - 1]
             dec_aux_channels = 0
         self.encoders.append(
             ParallelWaveGANGenerator(
                 in_channels=enc_in_channels,
                 out_channels=enc_out_channels,
                 kernel_size=self.conf["kernel_size"][n],
                 layers=self.conf["n_layers"][n] * self.conf["n_layers_stacks"][n],
                 stacks=self.conf["n_layers_stacks"][n],
                 residual_channels=self.conf["residual_channels"],
                 gate_channels=128,
                 skip_channels=64,
                 aux_channels=enc_aux_channels,
                 aux_context_window=0,
                 dropout=0.0,
                 bias=True,
                 use_weight_norm=True,
                 use_causal_conv=self.conf["causal"],
                 upsample_conditional_features=False,
             )
         )
         self.decoders.append(
             ParallelWaveGANGenerator(
                 in_channels=dec_in_channels,
                 out_channels=dec_out_channels,
                 kernel_size=self.conf["kernel_size"][n],
                 layers=self.conf["n_layers"][n] * self.conf["n_layers_stacks"][n],
                 stacks=self.conf["n_layers_stacks"][n],
                 residual_channels=self.conf["residual_channels"],
                 gate_channels=128,
                 skip_channels=64,
                 aux_channels=dec_aux_channels,
                 aux_context_window=0,
                 dropout=0.0,
                 bias=True,
                 use_weight_norm=True,
                 use_causal_conv=self.conf["causal"],
                 upsample_conditional_features=False,
             )
         )
         self.quantizers.append(
             Quantizer(
                 self.conf["emb_dim"][n],
                 self.conf["emb_size"][n],
                 ema_flag=self.conf["ema_flag"],
                 bdt_flag=True,
             )
         )
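
Note how the generator is reused here as a generic 1-D sequence-to-sequence network: with upsample_conditional_features=False the output keeps the input's time resolution. A toy shape sketch (sizes are assumptions, and it presumes the installed version accepts c=None when aux_channels=0):

import torch
from parallel_wavegan.models import ParallelWaveGANGenerator

net = ParallelWaveGANGenerator(
    in_channels=39, out_channels=64, kernel_size=3,
    layers=4, stacks=2, residual_channels=64,
    gate_channels=128, skip_channels=64,
    aux_channels=0, aux_context_window=0,
    upsample_conditional_features=False,
)
x = torch.randn(2, 39, 100)  # (batch, channels, frames)
y = net(x, None)             # no auxiliary features
print(y.shape)               # torch.Size([2, 64, 100])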
Example #13
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--train-dumpdir",
                        type=str,
                        required=True,
                        help="directory including training data.")
    parser.add_argument("--dev-dumpdir",
                        type=str,
                        required=True,
                        help="directory including development data.")
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume",
        default="",
        type=str,
        nargs="?",
        help="checkpoint file path to resume training. (default=\"\")")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--rank",
        "--local_rank",
        default=0,
        type=int,
        help="rank for distributed training. no need to explictly specify.")
    args = parser.parse_args()

    args.distributed = False
    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")
        # effective when using fixed size inputs
        # see https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
        torch.backends.cudnn.benchmark = True
        torch.cuda.set_device(args.rank)
        # setup for distributed training
        # see example: https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed
        if "WORLD_SIZE" in os.environ:
            args.world_size = int(os.environ["WORLD_SIZE"])
            args.distributed = args.world_size > 1
        if args.distributed:
            torch.distributed.init_process_group(backend="nccl",
                                                 init_method="env://")

    # suppress logging for distributed training
    if args.rank != 0:
        sys.stdout = open(os.devnull, "w")

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning("skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = parallel_wavegan.__version__  # add version info
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        mel_length_threshold = config["batch_max_steps"] // config["hop_size"] + \
            2 * config["generator_params"]["aux_context_window"]
    else:
        mel_length_threshold = None
    if config["format"] == "hdf5":
        audio_query, mel_query = "*.h5", "*.h5"
        audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
        mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
    elif config["format"] == "npy":
        audio_query, mel_query = "*-wave.npy", "*-feats.npy"
        audio_load_fn = np.load
        mel_load_fn = np.load
    else:
        raise ValueError("support only hdf5 or npy format.")
    dataset = {
        "train":
        AudioMelDataset(root_dir=args.train_dumpdir,
                        audio_query=audio_query,
                        mel_query=mel_query,
                        audio_load_fn=audio_load_fn,
                        mel_load_fn=mel_load_fn,
                        mel_length_threshold=mel_length_threshold,
                        allow_cache=config.get("allow_cache",
                                               False)),  # keep compatibility
        "dev":
        AudioMelDataset(root_dir=args.dev_dumpdir,
                        audio_query=audio_query,
                        mel_query=mel_query,
                        audio_load_fn=audio_load_fn,
                        mel_load_fn=mel_load_fn,
                        mel_length_threshold=mel_length_threshold,
                        allow_cache=config.get("allow_cache",
                                               False)),  # keep compatibility
    }

    # get data loader
    collater = Collater(
        batch_max_steps=config["batch_max_steps"],
        hop_size=config["hop_size"],
        aux_context_window=config["generator_params"]["aux_context_window"],
    )
    train_sampler, dev_sampler = None, None
    if args.distributed:
        # setup sampler for distributed training
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(dataset=dataset["train"],
                                           num_replicas=args.world_size,
                                           rank=args.rank,
                                           shuffle=True)
        dev_sampler = DistributedSampler(dataset=dataset["dev"],
                                         num_replicas=args.world_size,
                                         rank=args.rank,
                                         shuffle=False)
    data_loader = {
        "train":
        DataLoader(dataset=dataset["train"],
                   shuffle=False if args.distributed else True,
                   collate_fn=collater,
                   batch_size=config["batch_size"],
                   num_workers=config["num_workers"],
                   sampler=train_sampler,
                   pin_memory=config["pin_memory"]),
        "dev":
        DataLoader(dataset=dataset["dev"],
                   shuffle=False if args.distributed else True,
                   collate_fn=collater,
                   batch_size=config["batch_size"],
                   num_workers=config["num_workers"],
                   sampler=dev_sampler,
                   pin_memory=config["pin_memory"]),
    }

    # define models and optimizers
    model = {
        "generator":
        ParallelWaveGANGenerator(**config["generator_params"]).to(device),
        "discriminator":
        ParallelWaveGANDiscriminator(
            **config["discriminator_params"]).to(device),
    }
    criterion = {
        "stft":
        MultiResolutionSTFTLoss(**config["stft_loss_params"]).to(device),
        "mse": torch.nn.MSELoss().to(device),
    }
    optimizer = {
        "generator":
        RAdam(model["generator"].parameters(),
              **config["generator_optimizer_params"]),
        "discriminator":
        RAdam(model["discriminator"].parameters(),
              **config["discriminator_optimizer_params"]),
    }
    scheduler = {
        "generator":
        torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["generator"],
            **config["generator_scheduler_params"]),
        "discriminator":
        torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["discriminator"],
            **config["discriminator_scheduler_params"]),
    }
    if args.distributed:
        # wrap model for distributed training
        try:
            from apex.parallel import DistributedDataParallel
        except ImportError:
            raise ImportError(
                "apex is not installed. please check https://github.com/NVIDIA/apex."
            )
        model["generator"] = DistributedDataParallel(model["generator"])
        model["discriminator"] = DistributedDataParallel(
            model["discriminator"])
    logging.info(model["generator"])
    logging.info(model["discriminator"])

    # define trainer
    trainer = Trainer(
        steps=0,
        epochs=0,
        data_loader=data_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        config=config,
        device=device,
    )

    # resume from checkpoint
    if len(args.resume) != 0:
        trainer.load_checkpoint(args.resume)
        logging.info(f"successfully resumed from {args.resume}.")

    # run training loop
    try:
        trainer.run()
    except KeyboardInterrupt:
        trainer.save_checkpoint(
            os.path.join(config["outdir"],
                         f"checkpoint-{trainer.steps}steps.pkl"))
        logging.info(f"successfully saved checkpoint @ {trainer.steps}steps.")
Example #14
# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])
print(cp['r'])

# set model stepsize
if 'r' in cp:
    model.decoder.set_r(cp['r'])

# load PWGAN
if not use_gl:
    vocoder_model = ParallelWaveGANGenerator(
        **PWGAN_CONFIG["generator_params"])
    vocoder_model.load_state_dict(
        torch.load(PWGAN_MODEL, map_location="cpu")["model"]["generator"])
    vocoder_model.remove_weight_norm()
    ap_vocoder = AudioProcessorVocoder(**PWGAN_CONFIG['audio'])
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

data = ''
with open('configuration/text/result/final.txt', 'r') as myfile:
    data = myfile.read()

data = data.replace(';', '')
sentence = data.split('.')
Example #15
    def __init__(self, domain: Domain = "", identifier: str = None, use_cuda=False, sub_topic_domains: Dict[str, str] = {}):
        """
        Text To Speech Module that reads out the system utterance.
        
        Args:
            domain (Domain): Needed for Service, no meaning here
            identifier (string): Needed for Service
            use_cuda (boolean): Whether or not to perform computations on GPU. Highly recommended if available
            sub_topic_domains: see `services.service.Service` constructor for more details
        """
        Service.__init__(self, domain=domain, identifier=identifier, sub_topic_domains=sub_topic_domains)
        self.models_directory = os.path.join(get_root_dir(), "resources", "models", "speech")

        # The following lines can be changed to incorporate different models.
        # This is the only thing that needs to be changed for that, everything else should be dynamic.
        self.transcription_type = "phn"
        self.dict_path = os.path.join(self.models_directory,
                                      "phn_train_no_dev_pytorch_train_fastspeech.v4", "data", "lang_1phn",
                                      "train_no_dev_units.txt")
        self.model_path = os.path.join(self.models_directory,
                                       "phn_train_no_dev_pytorch_train_fastspeech.v4", "exp",
                                       "phn_train_no_dev_pytorch_train_fastspeech.v4", "results",
                                       "model.last1.avg.best")
        self.vocoder_path = os.path.join(self.models_directory,
                                         "ljspeech.parallel_wavegan.v1", "checkpoint-400000steps.pkl")
        self.vocoder_conf = os.path.join(self.models_directory, "ljspeech.parallel_wavegan.v1", "config.yml")

        # define device to run the synthesis on
        if use_cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # define end to end TTS model
        self.input_dimensions, self.output_dimensions, self.train_args = get_model_conf(self.model_path)
        model_class = dynamic_import.dynamic_import(self.train_args.model_module)
        model = model_class(self.input_dimensions, self.output_dimensions, self.train_args)
        torch_load(self.model_path, model)
        self.model = model.eval().to(self.device)
        self.inference_args = Namespace(**{"threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0})

        # define neural vocoder
        with open(self.vocoder_conf) as vocoder_config_file:
            self.config = yaml.load(vocoder_config_file, Loader=yaml.Loader)
        vocoder = ParallelWaveGANGenerator(**self.config["generator_params"])
        vocoder.load_state_dict(torch.load(self.vocoder_path, map_location="cpu")["model"]["generator"])
        vocoder.remove_weight_norm()
        self.vocoder = vocoder.eval().to(self.device)

        with open(self.dict_path) as dictionary_file:
            lines = dictionary_file.readlines()
        lines = [line.replace("\n", "").split(" ") for line in lines]
        self.char_to_id = {c: int(i) for c, i in lines}
        self.g2p = G2p()

        # Download the pretrained Punkt tokenizer from NLTK. This happens only
        # the first time the code is executed on a machine; if it has been done
        # before, the download is skipped. We will probably redirect warnings
        # into a file rather than stderr in the future, since there are also a
        # lot of pytorch warnings going on etc.
        nltk.download('punkt', quiet=True)
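
Since the transcription type is "phn", a hypothetical helper (not part of the original class) showing how the g2p output would be mapped to model input ids before calling model.inference:

    def text_to_ids(self, text):
        """Sketch: convert text to phoneme ids for the loaded TTS model."""
        phonemes = [p for p in self.g2p(text) if p.strip()]
        return torch.LongTensor(
            [self.char_to_id[p] for p in phonemes if p in self.char_to_id]
        ).to(self.device)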