예제 #1
0
파일: train.py 프로젝트: dmc31a42/glow-tts
def train_and_eval(rank, n_gpus, hps):
  """Entry point for one DDP worker.

  Builds the data loaders and FlowGenerator, restores the latest
  checkpoint if one exists, then runs the train/eval loop.

  Args:
    rank: GPU / process index; rank 0 additionally logs to TensorBoard,
      evaluates, and writes checkpoints.
    n_gpus: total number of distributed workers.
    hps: hyper-parameter namespace with `train`, `data` and `model` sections.
  """
  global global_step
  if rank == 0:
    # Only the first worker writes logs and TensorBoard summaries.
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)
    writer = SummaryWriter(log_dir=hps.model_dir)
    writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

  dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
  torch.manual_seed(hps.train.seed)
  torch.cuda.set_device(rank)

  train_dataset = TextMelLoader(hps.data.training_files, hps.data)
  train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_dataset,
      num_replicas=n_gpus,
      rank=rank,
      shuffle=True)
  collate_fn = TextMelCollate(1)
  # shuffle=False because the DistributedSampler already shuffles per epoch.
  train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False,
      batch_size=hps.train.batch_size, pin_memory=True,
      drop_last=True, collate_fn=collate_fn, sampler=train_sampler)
  if rank == 0:
    val_dataset = TextMelLoader(hps.data.validation_files, hps.data)
    val_loader = DataLoader(val_dataset, num_workers=8, shuffle=False,
        batch_size=hps.train.batch_size, pin_memory=True,
        drop_last=True, collate_fn=collate_fn)

  generator = models.FlowGenerator(
      n_vocab=len(symbols),
      out_channels=hps.data.n_mel_channels,
      **hps.model).cuda(rank)
  optimizer_g = commons.Adam(
      generator.parameters(),
      scheduler=hps.train.scheduler,
      dim_model=hps.model.hidden_channels,
      warmup_steps=hps.train.warmup_steps,
      lr=hps.train.learning_rate,
      betas=hps.train.betas,
      eps=hps.train.eps)
  if hps.train.fp16_run:
    generator, optimizer_g._optim = amp.initialize(generator, optimizer_g._optim, opt_level="O1")
  generator = DDP(generator)
  epoch_str = 1
  global_step = 0
  try:
    # Resume from the most recent G_*.pth checkpoint if one exists.
    _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, optimizer_g)
    epoch_str += 1
    # Fast-forward the LR schedule and global step to the resumed position.
    optimizer_g.step_num = (epoch_str - 1) * len(train_loader)
    optimizer_g._update_learning_rate()
    global_step = (epoch_str - 1) * len(train_loader)
  except Exception:
    # Was a bare `except:` that also swallowed SystemExit/KeyboardInterrupt.
    # No resumable checkpoint: fall back to DDI warm-start when enabled.
    if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")):
      _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g)

  for epoch in range(epoch_str, hps.train.epochs + 1):
    if rank == 0:
      train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer)
      evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval)
      if epoch % 50 == 0:
        utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(epoch)))
    else:
      train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None)
예제 #2
0
def init(checkpoint_path, config_path, device="cuda"):
    """Build a FlowGenerator for inference plus the CMU pronouncing dict.

    Returns a ``(cmu_dict, model)`` tuple; the model is in eval mode with
    its decoder inverses pre-stored for fast decoding.
    """
    hps = glow_utils.get_hparams_from_json(checkpoint_path, config_path)

    generator = models.FlowGenerator(
        len(symbols),
        out_channels=hps.data.n_mel_channels,
        **hps.model,
    ).to(device)

    # A directory argument means "use the newest checkpoint found inside it".
    resolved_path = checkpoint_path
    if os.path.isdir(resolved_path):
        resolved_path = glow_utils.latest_checkpoint_path(resolved_path)
    glow_utils.load_checkpoint(resolved_path, generator)

    # Pre-store inverse flows so decoding skips Jacobian computation.
    generator.decoder.store_inverse()
    generator.eval()

    return cmudict.CMUDict(hps.data.cmudict_path), generator
예제 #3
0
def main():
    """CLI entry point: synthesize speech for a text file with Glow-TTS + WaveGlow.

    Parses command-line arguments, loads both models onto the chosen device,
    and writes the synthesized audio into the output directory.
    """
    parser = argparse.ArgumentParser()
    # `default=None` on required args was redundant and has been dropped.
    parser.add_argument('-f', '--input_file', type=str, required=True,
                        help='Input file with text inside')
    parser.add_argument("-c", "--checkpoint_glow", type=str, required=True,
                        help="Path to glow checkpoint.")
    parser.add_argument("-hp", "--hyperparams", type=str, required=True,
                        help="Path to config file in JSON format")
    parser.add_argument("-o", "--output_dir", type=str, required=True,
                        help="Output directory path, where plots and wavs will be put.")
    parser.add_argument("--cuda", action='store_true', help="Add to run on gpu")
    parser.add_argument("--spaces", action='store_true',
                        help="Add to add start/end spaces for glow synthesis")
    parser.add_argument("-w", "--waveglow_path", type=str, required=True,
                        help="Path to waveglow checkpoint")
    args = parser.parse_args()

    # set device (CPU unless --cuda was given)
    device = torch.device('cuda' if args.cuda else 'cpu')

    # loading models
    print("Loading models...")

    # NOTE(review): help text says "-hp" is a JSON file but this helper is
    # named get_hparams_from_dir — confirm which one the project expects.
    hps = utils.get_hparams_from_dir(args.hyperparams)
    model = models.FlowGenerator(
        speaker_dim=hps.model.speaker_embedding,
        n_vocab=len(symbols),
        out_channels=hps.data.n_mel_channels,
        **hps.model).to(device)
    utils.load_checkpoint(args.checkpoint_glow, model)
    model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
    _ = model.eval()
    print("---GLOW--- loaded")
    # handle case of no path
    waveglow, denoiser = load_waveglow_model(args.waveglow_path, device)
    print("Using waveglow neural vocoder")

    # synthesis — the returned (speakers, audio_names) pair was never used,
    # so the dead assignment has been removed.
    print("Synthesizing...")
    synthesize_glow(model, device, hps, args.input_file, args.output_dir,
                    waveglow, denoiser, args.spaces)
    print("Speech synthesis complete.")
예제 #4
0
import gradio as gr

# load WaveGlow
# WAVEGLOW_PATH is expected to be defined earlier in the file / notebook.
waveglow_path = WAVEGLOW_PATH
# The checkpoint is a dict whose 'model' entry holds the WaveGlow module;
# it is mapped onto CPU here.
waveglow = torch.load(waveglow_path, map_location=torch.device('cpu'))['model']
# Weight norm is only useful during training; removing it speeds up inference.
waveglow = waveglow.remove_weightnorm(waveglow)
_ = waveglow.eval()
# from apex import amp
# waveglow, _ = amp.initialize(waveglow, [], opt_level="O3") # Try if you want to boost up synthesis speed.

# If you are using a provided pretrained model
hps = utils.get_hparams_from_file("./configs/base.json")
checkpoint_path = PRETRAINED_GLOW_TTS_PATH

# Vocabulary size grows by one when hps.data.add_blank is truthy (bool
# arithmetic: True adds 1); defaults to False for configs without the flag.
model = models.FlowGenerator(len(symbols) +
                             getattr(hps.data, "add_blank", False),
                             out_channels=hps.data.n_mel_channels,
                             **hps.model)

utils.load_checkpoint(checkpoint_path, model)
model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
_ = model.eval()

cmu_dict = cmudict.CMUDict(hps.data.cmudict_path)


# normalizing & type casting
def normalize_audio(x, max_wav_value=None):
    """Peak-normalize audio and cast it to int16 PCM.

    Args:
        x: numpy array of audio samples (any numeric scale).
        max_wav_value: full-scale amplitude. Defaults to
            ``hps.data.max_wav_value``, now resolved at call time instead of
            being baked in at import time.

    Returns:
        int16 numpy array clipped to the valid PCM range [-32768, 32767].
    """
    if max_wav_value is None:
        max_wav_value = hps.data.max_wav_value
    peak = np.abs(x).max()
    if peak == 0:
        # All-zero input previously divided by zero and produced NaNs.
        return np.zeros_like(x, dtype="int16")
    return np.clip((x / peak) * max_wav_value, -32768, 32767).astype("int16")