def __init__(self, device='cpu'):
    """Load the end-to-end TTS model, the neural vocoder and the symbol table.

    Every resource is read from a fixed path below ``downloads/``.

    Args:
        device: Value handed to ``torch.device``; both networks are moved
            to the resulting device (default ``'cpu'``).
    """
    units_file = "downloads/data/lang_1char/train_no_dev_units.txt"
    acoustic_ckpt = "downloads/exp/train_no_dev_pytorch_train_pytorch_tacotron2.v3/results/model.last1.avg.best"
    vocoder_ckpt = "downloads/ljspeech.parallel_wavegan.v1/checkpoint-400000steps.pkl"
    vocoder_yaml = "downloads/ljspeech.parallel_wavegan.v1/config.yml"

    target = torch.device(device)

    # Text-to-feature model: the class to instantiate is named by the
    # training arguments stored with the checkpoint.
    in_dim, out_dim, train_args = get_model_conf(acoustic_ckpt)
    acoustic_cls = dynamic_import(train_args.model_module)
    acoustic = acoustic_cls(in_dim, out_dim, train_args)
    torch_load(acoustic_ckpt, acoustic)
    acoustic = acoustic.eval().to(target)
    decode_opts = Namespace(threshold=0.5, minlenratio=0.0, maxlenratio=10.0)

    # Vocoder: weights are loaded on CPU first, then moved to the target.
    with open(vocoder_yaml) as handle:
        vocoder_cfg = yaml.load(handle, Loader=yaml.Loader)
    generator = ParallelWaveGANGenerator(**vocoder_cfg["generator_params"])
    checkpoint = torch.load(vocoder_ckpt, map_location="cpu")
    generator.load_state_dict(checkpoint["model"]["generator"])
    generator.remove_weight_norm()
    generator = generator.eval().to(target)

    # Symbol table: each line is split on a single space into a symbol
    # and its integer id.
    with open(units_file) as handle:
        rows = [row.replace("\n", "").split(" ") for row in handle.readlines()]
    symbol_ids = {symbol: int(index) for symbol, index in rows}

    self.device = target
    self.char_to_id = symbol_ids
    self.idim = in_dim
    self.model = acoustic
    self.inference_args = decode_opts
    self.config = vocoder_cfg
    self.vocoder = generator
def __init__(self, conf):
    """Build the TTS model, vocoder and symbol table selected by ``conf``.

    Args:
        conf: Mapping read for ``conf["cuda"]`` (truthy selects the CUDA
            device, otherwise CPU) and ``conf["model"]`` (key into
            ``MODEL_CONF``).
    """
    self.device = torch.device("cuda" if conf["cuda"] else "cpu")
    self.conf = MODEL_CONF[conf["model"]]

    # define E2E-TTS model
    tts_path = self.conf["model_path"]
    self.idim, odim, train_args = get_model_conf(tts_path)
    tts_cls = dynamic_import(train_args.model_module)
    self.model = tts_cls(self.idim, odim, train_args)
    torch_load(tts_path, self.model)
    self.model = self.model.eval().to(self.device)

    # load neural vocoder (weights are read on CPU, then moved)
    with open(VOCODER_CONF["vocoder_conf"]) as handle:
        self.vocoder_config = yaml.load(handle, Loader=yaml.Loader)
    self.vocoder = ParallelWaveGANGenerator(
        **self.vocoder_config["generator_params"])
    checkpoint = torch.load(VOCODER_CONF["vocoder_path"], map_location="cpu")
    self.vocoder.load_state_dict(checkpoint["model"]["generator"])
    self.vocoder.remove_weight_norm()
    self.vocoder = self.vocoder.eval().to(self.device)

    # define character-to-id dictionary: each line is split on a single
    # space into a symbol and its integer id
    with open(self.conf["dict_path"]) as handle:
        rows = [row.replace("\n", "").split(" ") for row in handle.readlines()]
    self.char_to_id = {symbol: int(index) for symbol, index in rows}
def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
    """Load a Parallel WaveGAN generator into ``self.pwgan``.

    Args:
        lib_path: Directory appended to ``sys.path`` before
            ``parallel_wavegan`` is imported.
        model_file: Checkpoint path; its ``["model"]["generator"]`` entry is
            loaded as the generator state dict.
        model_config: YAML file whose ``generator_params`` section supplies
            the generator's constructor arguments (kept in
            ``self.pwgan_config``).
        use_cuda: When truthy, the generator is moved with ``.cuda()``.
    """
    # set this if ParallelWaveGAN is not installed globally
    sys.path.append(lib_path)
    #pylint: disable=import-outside-toplevel
    from parallel_wavegan.models import ParallelWaveGANGenerator

    print(" > Loading PWGAN model ...")
    for label, value in ((" | > model config: ", model_config),
                         (" | > model file: ", model_file)):
        print(label, value)

    with open(model_config) as config_file:
        self.pwgan_config = yaml.load(config_file, Loader=yaml.Loader)

    generator = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"])
    self.pwgan = generator
    # Weights are always deserialized on CPU; the move to GPU is separate.
    checkpoint = torch.load(model_file, map_location="cpu")
    generator.load_state_dict(checkpoint["model"]["generator"])
    generator.remove_weight_norm()
    if use_cuda:
        generator.cuda()
    generator.eval()
def test_causal_parallel_wavegan(upsample_net, aux_context_window):
    """Check that a causal generator's early output ignores later input.

    Two input pairs share their first half and differ in the second half;
    the first half of the generator output must be identical for both.
    """
    n_batch, n_samples = 1, 4096
    gen_kwargs = make_generator_args(
        use_causal_conv=True,
        upsample_net=upsample_net,
        aux_context_window=aux_context_window,
        dropout=0.0,
    )
    generator = ParallelWaveGANGenerator(**gen_kwargs)

    noise = torch.randn(n_batch, 1, n_samples)
    n_frames = n_samples // np.prod(gen_kwargs["upsample_params"]["upsample_scales"])
    cond = torch.randn(n_batch, gen_kwargs["aux_channels"], n_frames)

    # Copies whose second half is replaced by fresh random values.
    noise_alt = noise.clone()
    cond_alt = cond.clone()
    noise_half = noise.size(-1) // 2
    cond_half = cond.size(-1) // 2
    noise_alt[..., noise_half:] = torch.randn(noise[..., noise_half:].shape)
    cond_alt[..., cond_half:] = torch.randn(cond[..., cond_half:].shape)

    # Zero-pad the conditioning features by the context window on both sides.
    pad = torch.nn.ConstantPad1d(gen_kwargs["aux_context_window"], 0.0)
    cond = pad(cond)
    cond_alt = pad(cond_alt)

    # check not equal: the perturbed inputs must really differ
    for first, second in ((cond, cond_alt), (noise, noise_alt)):
        try:
            np.testing.assert_array_equal(first.numpy(), second.numpy())
        except AssertionError:
            continue
        raise AssertionError("Must be different.")

    # check causality
    out = generator(noise, cond)
    out_alt = generator(noise_alt, cond_alt)
    np.testing.assert_array_equal(
        out[..., :out.size(-1) // 2].detach().cpu().numpy(),
        out_alt[..., :out_alt.size(-1) // 2].detach().cpu().numpy(),
    )
def load_pwgan(self, model_file, model_config, use_cuda):
    """Load a Parallel WaveGAN generator and its audio processor.

    Sets ``self.pwgan_config``, ``self.pwgan`` and ``self.pwgan_ap``.

    Args:
        model_file: Checkpoint path; its ``["model"]["generator"]`` entry is
            loaded as the generator state dict.
        model_config: YAML file providing the ``generator_params`` and
            ``audio`` sections.
        use_cuda: When truthy, the generator is moved with ``.cuda()``.
    """
    #pylint: disable=import-outside-toplevel
    from parallel_wavegan.models import ParallelWaveGANGenerator
    from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder

    print(" > Loading PWGAN model ...")
    for label, value in ((" | > model config: ", model_config),
                         (" | > model file: ", model_file)):
        print(label, value)

    with open(model_config) as config_file:
        self.pwgan_config = yaml.load(config_file, Loader=yaml.Loader)

    generator = ParallelWaveGANGenerator(**self.pwgan_config["generator_params"])
    self.pwgan = generator
    # Weights are always deserialized on CPU; the move to GPU is separate.
    checkpoint = torch.load(model_file, map_location="cpu")
    generator.load_state_dict(checkpoint["model"]["generator"])
    generator.remove_weight_norm()
    self.pwgan_ap = AudioProcessorVocoder(**self.pwgan_config["audio"])
    if use_cuda:
        generator.cuda()
    generator.eval()
def test_parallel_wavegan_with_residual_discriminator_trainable(
        dict_g, dict_d, dict_loss):
    """Run one optimizer step for the generator and the residual discriminator."""
    # setup
    n_batch, n_samples = 4, 4096
    gen_kwargs = make_generator_args(**dict_g)
    dis_kwargs = make_residual_discriminator_args(**dict_d)
    loss_kwargs = make_mutli_reso_stft_loss_args(**dict_loss)

    noise = torch.randn(n_batch, 1, n_samples)
    target = torch.randn(n_batch, 1, n_samples)
    # Conditioning length: one frame per upsampling hop plus the context
    # window on both sides.
    n_frames = (
        n_samples // np.prod(gen_kwargs["upsample_params"]["upsample_scales"])
        + 2 * gen_kwargs["aux_context_window"]
    )
    cond = torch.randn(n_batch, gen_kwargs["aux_channels"], n_frames)

    generator = ParallelWaveGANGenerator(**gen_kwargs)
    discriminator = ResidualParallelWaveGANDiscriminator(**dis_kwargs)
    stft_criterion = MultiResolutionSTFTLoss(**loss_kwargs)
    gen_adv_criterion = GeneratorAdversarialLoss()
    dis_adv_criterion = DiscriminatorAdversarialLoss()
    gen_optimizer = RAdam(generator.parameters())
    dis_optimizer = RAdam(discriminator.parameters())

    # check generator trainable
    fake = generator(noise, cond)
    fake_score = discriminator(fake)
    adversarial = gen_adv_criterion(fake_score)
    spectral_convergence, log_magnitude = stft_criterion(fake, target)
    gen_loss = adversarial + (spectral_convergence + log_magnitude)
    gen_optimizer.zero_grad()
    gen_loss.backward()
    gen_optimizer.step()

    # check discriminator trainable
    real_score = discriminator(target)
    # detach so the discriminator step does not backpropagate into the generator
    fake_score = discriminator(fake.detach())
    real_loss, fake_loss = dis_adv_criterion(fake_score, real_score)
    dis_loss = real_loss + fake_loss
    dis_optimizer.zero_grad()
    dis_loss.backward()
    dis_optimizer.step()
def test_parallel_wavegan_trainable(dict_g, dict_d, dict_loss):
    """Run one optimizer step for the generator and the discriminator."""
    # setup
    n_batch, n_samples = 4, 4096
    gen_kwargs = make_generator_args(**dict_g)
    dis_kwargs = make_discriminator_args(**dict_d)
    loss_kwargs = make_mutli_reso_stft_loss_args(**dict_loss)

    noise = torch.randn(n_batch, 1, n_samples)
    target = torch.randn(n_batch, 1, n_samples)
    # Conditioning length: one frame per upsampling hop plus the context
    # window on both sides.
    n_frames = (
        n_samples // np.prod(gen_kwargs["upsample_params"]["upsample_scales"])
        + 2 * gen_kwargs["aux_context_window"]
    )
    cond = torch.randn(n_batch, gen_kwargs["aux_channels"], n_frames)

    generator = ParallelWaveGANGenerator(**gen_kwargs)
    discriminator = ParallelWaveGANDiscriminator(**dis_kwargs)
    stft_criterion = MultiResolutionSTFTLoss(**loss_kwargs)
    gen_optimizer = RAdam(generator.parameters())
    dis_optimizer = RAdam(discriminator.parameters())

    # check generator trainable
    fake = generator(noise, cond)
    fake_score = discriminator(fake)
    # drop the singleton channel axis before computing the losses
    target = target.squeeze(1)
    fake = fake.squeeze(1)
    fake_score = fake_score.squeeze(1)
    adversarial = F.mse_loss(fake_score, fake_score.new_ones(fake_score.size()))
    spectral_convergence, log_magnitude = stft_criterion(fake, target)
    gen_loss = adversarial + (spectral_convergence + log_magnitude)
    gen_optimizer.zero_grad()
    gen_loss.backward()
    gen_optimizer.step()

    # check discriminator trainable
    target = target.unsqueeze(1)
    # detach so the discriminator step does not backpropagate into the generator
    fake = fake.unsqueeze(1).detach()
    real_score = discriminator(target).squeeze(1)
    fake_score = discriminator(fake).squeeze(1)
    real_term = F.mse_loss(real_score, real_score.new_ones(real_score.size()))
    fake_term = F.mse_loss(fake_score, fake_score.new_zeros(fake_score.size()))
    dis_loss = real_term + fake_term
    dis_optimizer.zero_grad()
    dis_loss.backward()
    dis_optimizer.step()
def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
    """Load a Parallel WaveGAN generator into ``self.pwgan``.

    Args:
        lib_path: Optional directory containing ``parallel_wavegan``. When
            truthy it is added to ``sys.path`` (once) before the import.
        model_file: Checkpoint path; its ``["model"]["generator"]`` entry is
            loaded as the generator state dict.
        model_config: YAML file whose ``generator_params`` section supplies
            the generator's constructor arguments (kept in
            ``self.pwgan_config``).
        use_cuda: When truthy, the generator is moved with ``.cuda()``.

    Raises:
        RuntimeError: If ``parallel_wavegan`` cannot be imported. The
            original ``ImportError`` is attached as ``__cause__``.
    """
    # set this if ParallelWaveGAN is not installed globally; skip the append
    # when the entry is already present so repeated calls do not grow sys.path
    if lib_path and lib_path not in sys.path:
        sys.path.append(lib_path)
    try:
        #pylint: disable=import-outside-toplevel
        from parallel_wavegan.models import ParallelWaveGANGenerator
    except ImportError as e:
        raise RuntimeError(
            f"cannot import parallel-wavegan, either install it or set its directory using the --pwgan_lib_path command line argument: {e}"
        ) from e

    print(" > Loading PWGAN model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", model_file)

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # point this at configuration files you trust.
    with open(model_config) as f:
        self.pwgan_config = yaml.load(f, Loader=yaml.Loader)

    self.pwgan = ParallelWaveGANGenerator(
        **self.pwgan_config["generator_params"])
    # Weights are always deserialized on CPU; the move to GPU is separate.
    self.pwgan.load_state_dict(
        torch.load(model_file, map_location="cpu")["model"]["generator"])
    self.pwgan.remove_weight_norm()
    if use_cuda:
        self.pwgan.cuda()
    self.pwgan.eval()
def main():
    """Run training process.

    Parses the command line, merges it into the YAML configuration, writes
    the merged configuration to ``<outdir>/config.yml``, builds datasets,
    data loaders, models, losses, optimizers and schedulers, then runs the
    trainer. A checkpoint is written when the training loop exits, whether
    it finished or raised.
    """
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--train-dumpdir", type=str, required=True,
                        help="directory including trainning data.")
    parser.add_argument("--dev-dumpdir", type=str, required=True,
                        help="directory including development data.")
    parser.add_argument("--outdir", type=str, required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume", default="", type=str, nargs="?",
        help="checkpoint file path to resume training. (default=\"\")")
    parser.add_argument(
        "--verbose", type=int, default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger: only the level depends on --verbose
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    if log_level == logging.WARN:
        logging.warning('skip DEBUG/INFO messages')

    # create the output directory; exist_ok avoids the check-then-create race
    os.makedirs(args.outdir, exist_ok=True)

    # load and save config (command-line values override the file)
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        # minimum number of feature frames needed to cut one training batch
        mel_length_threshold = config["batch_max_steps"] // config["hop_size"] + \
            2 * config["generator_params"]["aux_context_window"]
    else:
        mel_length_threshold = None
    if config["format"] == "hdf5":
        audio_query, mel_query = "*.h5", "*.h5"
        audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
        mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
    elif config["format"] == "npy":
        audio_query, mel_query = "*-wave.npy", "*-feats.npy"
        audio_load_fn = np.load
        mel_load_fn = np.load
    else:
        raise ValueError("support only hdf5 or npy format.")
    dataset = {
        "train": AudioMelDataset(
            root_dir=args.train_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        ),
        "dev": AudioMelDataset(
            root_dir=args.dev_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        ),
    }

    # get data loader
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    collater = Collater(
        batch_max_steps=config["batch_max_steps"],
        hop_size=config["hop_size"],
        aux_context_window=config["generator_params"]["aux_context_window"],
    )
    data_loader = {
        "train": DataLoader(dataset=dataset["train"],
                            shuffle=True,
                            collate_fn=collater,
                            batch_size=config["batch_size"],
                            num_workers=config["num_workers"],
                            pin_memory=config["pin_memory"]),
        "dev": DataLoader(dataset=dataset["dev"],
                          shuffle=True,
                          collate_fn=collater,
                          batch_size=config["batch_size"],
                          num_workers=config["num_workers"],
                          pin_memory=config["pin_memory"]),
    }

    # define models and optimizers
    model = {
        "generator": ParallelWaveGANGenerator(
            **config["generator_params"]).to(device),
        "discriminator": ParallelWaveGANDiscriminator(
            **config["discriminator_params"]).to(device),
    }
    criterion = {
        "stft": MultiResolutionSTFTLoss(
            **config["stft_loss_params"]).to(device),
        "mse": torch.nn.MSELoss().to(device),
    }
    optimizer = {
        "generator": RAdam(model["generator"].parameters(),
                           **config["generator_optimizer_params"]),
        "discriminator": RAdam(model["discriminator"].parameters(),
                               **config["discriminator_optimizer_params"]),
    }
    scheduler = {
        "generator": torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["generator"],
            **config["generator_scheduler_params"]),
        "discriminator": torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["discriminator"],
            **config["discriminator_scheduler_params"]),
    }
    logging.info(model["generator"])
    logging.info(model["discriminator"])

    # define trainer
    trainer = Trainer(
        steps=0,
        epochs=0,
        data_loader=data_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        config=config,
        device=device,
    )

    # resume from checkpoint
    if len(args.resume) != 0:
        trainer.load_checkpoint(args.resume)
        logging.info(f"resumed from {args.resume}.")

    # run training loop; always leave a checkpoint behind, even on failure
    try:
        trainer.run()
    finally:
        trainer.save_checkpoint(
            os.path.join(config["outdir"], f"checkpoint-{trainer.steps}steps.pkl"))
        logging.info(f"successfully saved checkpoint @ {trainer.steps}steps.")
def main():
    """Run decoding process.

    Reads features either from ``--dumpdir`` or from a Kaldi-style
    ``--scp`` file, generates a waveform for each utterance with the
    generator stored in ``--checkpoint`` and writes
    ``<outdir>/<utt_id>_gen.wav`` as 16-bit PCM. Logs the average
    real-time factor at the end.

    Raises:
        ValueError: If both or neither of ``--scp`` / ``--dumpdir`` are
            given, or if the configured feature format is unsupported.
    """
    parser = argparse.ArgumentParser(
        description="Decode dumped features with trained Parallel WaveGAN Generator.")
    parser.add_argument("--scp", default=None, type=str,
                        help="Kaldi-style feats.scp file.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="Directory including feature files.")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="Direcotry to save generated speech.")
    parser.add_argument("--checkpoint", default=None, type=str, required=True,
                        help="Checkpoint file.")
    parser.add_argument("--config", default=None, type=str,
                        help="Yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level (higher is more logging)")
    args = parser.parse_args()

    # set logger: only the level depends on --verbose
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    if log_level == logging.WARN:
        logging.warning("skip DEBUG/INFO messages")

    # create the output directory; exist_ok avoids the check-then-create race
    os.makedirs(args.outdir, exist_ok=True)

    # load config (defaults to the config.yml stored beside the checkpoint)
    if args.config is None:
        dirname = os.path.dirname(args.checkpoint)
        args.config = os.path.join(dirname, "config.yml")
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments: exactly one feature source must be given
    if (args.scp is None) == (args.dumpdir is None):
        raise ValueError("Please specify either dumpdir or scp.")

    # get dataset
    if args.scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset = MelDataset(
            args.dumpdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn,
            return_filename=True)
        logging.info(f"the number of features to be decoded = {len(dataset)}.")
    else:
        dataset = kaldiio.ReadHelper(f"scp:{args.scp}")
        logging.info(f"the feature loaded from {args.scp}.")

    # setup
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = ParallelWaveGANGenerator(**config["generator_params"])
    model.load_state_dict(
        torch.load(args.checkpoint, map_location="cpu")["model"]["generator"])
    model.remove_weight_norm()
    model = model.eval().to(device)
    logging.info(f"loaded model parameters from {args.checkpoint}.")

    # start generation
    pad_size = (config["generator_params"]["aux_context_window"],
                config["generator_params"]["aux_context_window"])
    total_rtf = 0.0
    # idx must exist even when the dataset yields nothing; otherwise the
    # summary below would fail on an unbound name.
    idx = 0
    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
        for idx, (feat_path, c) in enumerate(pbar, 1):
            # generate each utterance: one noise sample per output sample
            z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device)
            # replicate edge frames along the first axis to fill the context window
            c = np.pad(c, (pad_size, (0, 0)), "edge")
            c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device)
            start = time.time()
            y = model(z, c).view(-1).cpu().numpy()
            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
            pbar.set_postfix({"RTF": rtf})
            total_rtf += rtf

            # save as PCM 16 bit wav file
            utt_id = os.path.splitext(os.path.basename(feat_path))[0]
            sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"),
                     y, config["sampling_rate"], "PCM_16")

    # report average RTF
    if idx == 0:
        logging.warning("no utterances were found to decode.")
        return
    logging.info(f"finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f}).")
model_class = dynamic_import(train_args.model_module) model = model_class(idim, odim, train_args) torch_load(model_path, model) model = model.eval().to(device) inference_args = Namespace(**{ "threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0 }) # define neural vocoder import yaml from parallel_wavegan.models import ParallelWaveGANGenerator with open(vocoder_conf) as f: config = yaml.load(f, Loader=yaml.Loader) vocoder = ParallelWaveGANGenerator(**config["generator_params"]) vocoder.load_state_dict( torch.load(vocoder_path, map_location="cpu")["model"]["generator"]) vocoder.remove_weight_norm() vocoder = vocoder.eval().to(device) # define text frontend from pypinyin import pinyin, Style from pypinyin.style._utils import get_initials, get_finals with open(dict_path) as f: lines = f.readlines() lines = [line.replace("\n", "").split(" ") for line in lines] char_to_id = {c: int(i) for c, i in lines} def frontend(text):
def _construct_net(self):
    """Create one encoder, decoder and quantizer per VQ stack.

    Fills ``self.encoders``, ``self.decoders`` and ``self.quantizers``
    with ``self.conf["n_vq_stacks"]`` entries each. Stack 0 maps between
    the input/output feature sizes and the embeddings; every later stack
    maps between the embeddings of two consecutive stacks and takes no
    auxiliary input.
    """
    conf = self.conf
    self.encoders = nn.ModuleList()
    self.decoders = nn.ModuleList()
    self.quantizers = nn.ModuleList()

    for n in range(conf["n_vq_stacks"]):
        if n == 0:
            enc_in = conf["input_size"]
            enc_out = conf["emb_dim"][n]
            if conf["encoder_spkr_classifier"]:
                enc_out += self.spkr_size
            enc_aux = conf["enc_aux_size"]

            # The first decoder consumes the embeddings of every stack.
            dec_in = sum(
                conf["emb_dim"][i] for i in range(conf["n_vq_stacks"])
            )
            dec_out = conf["output_size"]

            # Size of the speaker conditioning appended to the decoder aux input.
            if not conf["use_spkr_embedding"]:
                spkr_channels = self.spkr_size
            elif conf["use_embedding_transform"]:
                spkr_channels = conf["embedding_transform_size"]
            else:
                spkr_channels = conf["spkr_embedding_size"]
            dec_aux = conf["dec_aux_size"] + spkr_channels
        else:
            enc_in = conf["emb_dim"][n - 1]
            enc_out = conf["emb_dim"][n]
            enc_aux = 0
            dec_in = conf["emb_dim"][n]
            dec_out = conf["emb_dim"][n - 1]
            dec_aux = 0

        # Settings shared by the encoder and the decoder of this stack.
        shared = dict(
            kernel_size=conf["kernel_size"][n],
            layers=conf["n_layers"][n] * conf["n_layers_stacks"][n],
            stacks=conf["n_layers_stacks"][n],
            residual_channels=conf["residual_channels"],
            gate_channels=128,
            skip_channels=64,
            aux_context_window=0,
            dropout=0.0,
            bias=True,
            use_weight_norm=True,
            use_causal_conv=conf["causal"],
            upsample_conditional_features=False,
        )
        self.encoders.append(
            ParallelWaveGANGenerator(
                in_channels=enc_in,
                out_channels=enc_out,
                aux_channels=enc_aux,
                **shared,
            )
        )
        self.decoders.append(
            ParallelWaveGANGenerator(
                in_channels=dec_in,
                out_channels=dec_out,
                aux_channels=dec_aux,
                **shared,
            )
        )
        self.quantizers.append(
            Quantizer(
                conf["emb_dim"][n],
                conf["emb_size"][n],
                ema_flag=conf["ema_flag"],
                bdt_flag=True,
            )
        )
def main():
    """Run training process.

    Parses the command line, optionally initializes distributed training
    (when CUDA is available and ``WORLD_SIZE`` > 1), merges the arguments
    into the YAML configuration, writes it to ``<outdir>/config.yml``,
    builds datasets, data loaders, models, losses, optimizers and
    schedulers, then runs the trainer. A checkpoint is written when
    training is interrupted from the keyboard.

    Raises:
        ValueError: If the configured data format is unsupported.
        ImportError: If distributed training is requested but ``apex`` is
            not installed.
    """
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--train-dumpdir", type=str, required=True,
                        help="directory including training data.")
    parser.add_argument("--dev-dumpdir", type=str, required=True,
                        help="directory including development data.")
    parser.add_argument("--outdir", type=str, required=True,
                        help="directory to save checkpoints.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--resume", default="", type=str, nargs="?",
        help="checkpoint file path to resume training. (default=\"\")")
    parser.add_argument(
        "--verbose", type=int, default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--rank", "--local_rank", default=0, type=int,
        help="rank for distributed training. no need to explictly specify.")
    args = parser.parse_args()

    args.distributed = False
    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")
        # effective when using fixed size inputs
        # see https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
        torch.backends.cudnn.benchmark = True
        torch.cuda.set_device(args.rank)
        # setup for distributed training
        # see example: https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed
        if "WORLD_SIZE" in os.environ:
            args.world_size = int(os.environ["WORLD_SIZE"])
            args.distributed = args.world_size > 1
        if args.distributed:
            torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # suppress logging for distributed training: non-zero ranks write to
    # the null device for the lifetime of the process
    if args.rank != 0:
        sys.stdout = open(os.devnull, "w")

    # set logger: only the level depends on --verbose
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    if log_level == logging.WARN:
        logging.warning("skip DEBUG/INFO messages")

    # create the output directory; with several ranks starting at once a
    # separate existence check would race, so let makedirs tolerate it
    os.makedirs(args.outdir, exist_ok=True)

    # load and save config (command-line values override the file)
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    config["version"] = parallel_wavegan.__version__  # add version info
    with open(os.path.join(args.outdir, "config.yml"), "w") as f:
        yaml.dump(config, f, Dumper=yaml.Dumper)
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    # get dataset
    if config["remove_short_samples"]:
        # minimum number of feature frames needed to cut one training batch
        mel_length_threshold = config["batch_max_steps"] // config["hop_size"] + \
            2 * config["generator_params"]["aux_context_window"]
    else:
        mel_length_threshold = None
    if config["format"] == "hdf5":
        audio_query, mel_query = "*.h5", "*.h5"
        audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
        mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
    elif config["format"] == "npy":
        audio_query, mel_query = "*-wave.npy", "*-feats.npy"
        audio_load_fn = np.load
        mel_load_fn = np.load
    else:
        raise ValueError("support only hdf5 or npy format.")
    dataset = {
        "train": AudioMelDataset(
            root_dir=args.train_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        ),
        "dev": AudioMelDataset(
            root_dir=args.dev_dumpdir,
            audio_query=audio_query,
            mel_query=mel_query,
            audio_load_fn=audio_load_fn,
            mel_load_fn=mel_load_fn,
            mel_length_threshold=mel_length_threshold,
            allow_cache=config.get("allow_cache", False),  # keep compatibility
        ),
    }

    # get data loader
    collater = Collater(
        batch_max_steps=config["batch_max_steps"],
        hop_size=config["hop_size"],
        aux_context_window=config["generator_params"]["aux_context_window"],
    )
    train_sampler, dev_sampler = None, None
    if args.distributed:
        # setup sampler for distributed training
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(dataset=dataset["train"],
                                           num_replicas=args.world_size,
                                           rank=args.rank,
                                           shuffle=True)
        dev_sampler = DistributedSampler(dataset=dataset["dev"],
                                         num_replicas=args.world_size,
                                         rank=args.rank,
                                         shuffle=False)
    # DataLoader rejects shuffle=True together with a sampler, so shuffling
    # is delegated to the sampler in the distributed case.
    data_loader = {
        "train": DataLoader(dataset=dataset["train"],
                            shuffle=not args.distributed,
                            collate_fn=collater,
                            batch_size=config["batch_size"],
                            num_workers=config["num_workers"],
                            sampler=train_sampler,
                            pin_memory=config["pin_memory"]),
        "dev": DataLoader(dataset=dataset["dev"],
                          shuffle=not args.distributed,
                          collate_fn=collater,
                          batch_size=config["batch_size"],
                          num_workers=config["num_workers"],
                          sampler=dev_sampler,
                          pin_memory=config["pin_memory"]),
    }

    # define models and optimizers
    model = {
        "generator": ParallelWaveGANGenerator(
            **config["generator_params"]).to(device),
        "discriminator": ParallelWaveGANDiscriminator(
            **config["discriminator_params"]).to(device),
    }
    criterion = {
        "stft": MultiResolutionSTFTLoss(
            **config["stft_loss_params"]).to(device),
        "mse": torch.nn.MSELoss().to(device),
    }
    optimizer = {
        "generator": RAdam(model["generator"].parameters(),
                           **config["generator_optimizer_params"]),
        "discriminator": RAdam(model["discriminator"].parameters(),
                               **config["discriminator_optimizer_params"]),
    }
    scheduler = {
        "generator": torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["generator"],
            **config["generator_scheduler_params"]),
        "discriminator": torch.optim.lr_scheduler.StepLR(
            optimizer=optimizer["discriminator"],
            **config["discriminator_scheduler_params"]),
    }
    if args.distributed:
        # wrap model for distributed training
        try:
            from apex.parallel import DistributedDataParallel
        except ImportError as err:
            raise ImportError(
                "apex is not installed. please check https://github.com/NVIDIA/apex."
            ) from err
        model["generator"] = DistributedDataParallel(model["generator"])
        model["discriminator"] = DistributedDataParallel(
            model["discriminator"])
    logging.info(model["generator"])
    logging.info(model["discriminator"])

    # define trainer
    trainer = Trainer(
        steps=0,
        epochs=0,
        data_loader=data_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        config=config,
        device=device,
    )

    # resume from checkpoint
    if len(args.resume) != 0:
        trainer.load_checkpoint(args.resume)
        logging.info(f"successfully resumed from {args.resume}.")

    # run training loop; keep the progress made so far on Ctrl-C
    try:
        trainer.run()
    except KeyboardInterrupt:
        trainer.save_checkpoint(
            os.path.join(config["outdir"], f"checkpoint-{trainer.steps}steps.pkl"))
        logging.info(f"successfully saved checkpoint @ {trainer.steps}steps.")
# load the model weights from the checkpoint dict and switch to inference mode
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])

# set model stepsize; older checkpoints may not carry 'r', so it is only
# printed and applied when present
if 'r' in cp:
    print(cp['r'])
    model.decoder.set_r(cp['r'])

# load PWGAN (only needed when Griffin-Lim is not used)
if not use_gl:
    vocoder_model = ParallelWaveGANGenerator(
        **PWGAN_CONFIG["generator_params"])
    # weights are deserialized on CPU; the move to GPU is separate
    vocoder_model.load_state_dict(
        torch.load(PWGAN_MODEL, map_location="cpu")["model"]["generator"])
    vocoder_model.remove_weight_norm()
    ap_vocoder = AudioProcessorVocoder(**PWGAN_CONFIG['audio'])
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

# read the input text, drop semicolons and split it on full stops
with open('configuration/text/result/final.txt', 'r') as myfile:
    data = myfile.read()
data = data.replace(';', '')
sentence = data.split('.')
def __init__(self, domain: Domain = "", identifier: str = None, use_cuda=False,
             sub_topic_domains: Dict[str, str] = None):
    """
    Text To Speech Module that reads out the system utterance.

    Loads the end-to-end TTS model, the Parallel WaveGAN vocoder, the
    symbol-to-id dictionary and the grapheme-to-phoneme converter from the
    ``resources/models/speech`` directory.

    Args:
        domain (Domain): Needed for Service, no meaning here
        identifier (string): Needed for Service
        use_cuda (boolean): Whether or not to perform computations on GPU. Highly recommended if available
        sub_topic_domains: see `services.service.Service` constructor for more details.
            ``None`` (the default) is treated as an empty dict.
    """
    # A dict literal as default would be one object shared by every
    # instance; create a fresh one per call instead.
    if sub_topic_domains is None:
        sub_topic_domains = {}
    Service.__init__(self, domain=domain, identifier=identifier,
                     sub_topic_domains=sub_topic_domains)
    self.models_directory = os.path.join(get_root_dir(), "resources", "models", "speech")

    # The following lines can be changed to incorporate different models.
    # This is the only thing that needs to be changed for that, everything else should be dynamic.
    self.transcription_type = "phn"
    self.dict_path = os.path.join(self.models_directory,
                                  "phn_train_no_dev_pytorch_train_fastspeech.v4",
                                  "data",
                                  "lang_1phn",
                                  "train_no_dev_units.txt")
    self.model_path = os.path.join(self.models_directory,
                                   "phn_train_no_dev_pytorch_train_fastspeech.v4",
                                   "exp",
                                   "phn_train_no_dev_pytorch_train_fastspeech.v4",
                                   "results",
                                   "model.last1.avg.best")
    self.vocoder_path = os.path.join(self.models_directory,
                                     "ljspeech.parallel_wavegan.v1",
                                     "checkpoint-400000steps.pkl")
    self.vocoder_conf = os.path.join(self.models_directory,
                                     "ljspeech.parallel_wavegan.v1",
                                     "config.yml")

    # define device to run the synthesis on
    if use_cuda:
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    # define end to end TTS model
    self.input_dimensions, self.output_dimensions, self.train_args = get_model_conf(self.model_path)
    model_class = dynamic_import.dynamic_import(self.train_args.model_module)
    model = model_class(self.input_dimensions, self.output_dimensions, self.train_args)
    torch_load(self.model_path, model)
    self.model = model.eval().to(self.device)
    self.inference_args = Namespace(**{"threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0})

    # define neural vocoder (weights are read on CPU, then moved to the device)
    with open(self.vocoder_conf) as vocoder_config_file:
        self.config = yaml.load(vocoder_config_file, Loader=yaml.Loader)
    vocoder = ParallelWaveGANGenerator(**self.config["generator_params"])
    vocoder.load_state_dict(torch.load(self.vocoder_path, map_location="cpu")["model"]["generator"])
    vocoder.remove_weight_norm()
    self.vocoder = vocoder.eval().to(self.device)

    # symbol-to-id dictionary: each line is split on a single space into a
    # symbol and its integer id
    with open(self.dict_path) as dictionary_file:
        lines = dictionary_file.readlines()
    lines = [line.replace("\n", "").split(" ") for line in lines]
    self.char_to_id = {c: int(i) for c, i in lines}
    self.g2p = G2p()

    # download the pretrained Punkt tokenizer from NLTK. This is done only
    # the first time the code is executed on a machine, if it has been done
    # before, this line will be skipped and output a warning. We will probably
    # redirect warnings into a file rather than std_err in the future, since
    # there's also a lot of pytorch warnings going on etc.
    nltk.download('punkt', quiet=True)