Example #1
def get_collate_function(model_name, n_frames_per_step):
    if model_name == 'Tacotron2':
        collate_fn = TextMelCollate(n_frames_per_step)
    elif model_name == 'WaveGlow':
        collate_fn = torch.utils.data.dataloader.default_collate
    else:
        raise NotImplementedError(
            "unknown collate function requested: {}".format(model_name))

    return collate_fn
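
A minimal usage sketch (an assumption, not from the source): wiring the returned collate function into a DataLoader. `dataset` stands in for any torch Dataset prepared elsewhere.

from torch.utils.data import DataLoader

# Pick the collate function for the model being trained. Tacotron2
# needs the reduction factor (n_frames_per_step); WaveGlow batches
# fixed-size segments, so the default collate suffices.
collate_fn = get_collate_function('Tacotron2', n_frames_per_step=1)
loader = DataLoader(dataset,  # hypothetical Dataset instance
                    batch_size=32,
                    shuffle=True,
                    collate_fn=collate_fn)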
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch TTS Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    if args.extract_pitch_char:
        assert args.extract_durations, "Durations required for pitch extraction"

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})

    model = load_and_setup_model(
        'Tacotron2',
        parser,
        args.tacotron2_checkpoint,
        amp=False,
        device=torch.device('cuda' if args.cuda else 'cpu'),
        forward_is_infer=False,
        ema=False)

    if args.train_mode:
        model.train()

    # n_mel_channels arg has been consumed by model's arg parser
    args.n_mel_channels = model.n_mel_channels

    for datum in ('mels', 'mels_teacher', 'attentions', 'durations',
                  'pitch_mel', 'pitch_char', 'pitch_trichar'):
        if getattr(args, f'extract_{datum}'):
            Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True)

    with open(args.wav_text_filelist, 'r') as f:
        filenames = [Path(line.split('|')[0]).stem for line in f]
    # Compatibility with Tacotron2 Data loader
    args.n_speakers = 1
    dataset = FilenamedLoader(filenames,
                              args.dataset_path,
                              args.wav_text_filelist,
                              args,
                              load_mel_from_disk=False)
    # TextMelCollate supports only n_frames_per_step=1
    data_loader = DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             sampler=None,
                             num_workers=0,
                             collate_fn=TextMelCollate(1),
                             pin_memory=False,
                             drop_last=False)
    pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}}
    for i, batch in enumerate(data_loader):
        tik = time.time()
        fnames = batch[-1]
        x, _, _ = batch_to_gpu(batch[:-1])
        _, text_lens, mels_padded, _, mel_lens = x

        for j, mel in enumerate(mels_padded):
            fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt')
            torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        with torch.no_grad():
            out_mels, out_mels_postnet, _, alignments = model.forward(x)

        if args.extract_mels_teacher:
            for j, mel in enumerate(out_mels_postnet):
                fpath = Path(args.dataset_path, 'mels_teacher',
                             fnames[j] + '.pt')
                torch.save(mel[:, :mel_lens[j]].cpu(), fpath)
        if args.extract_attentions:
            for j, ali in enumerate(alignments):
                ali = ali[:mel_lens[j], :text_lens[j]]
                fpath = Path(args.dataset_path, 'attentions',
                             fnames[j] + '.pt')
                torch.save(ali.cpu(), fpath)
        durations = []
        if args.extract_durations:
            for j, ali in enumerate(alignments):
                text_len = text_lens[j]
                ali = ali[:mel_lens[j], :text_len]
                dur = torch.histc(torch.argmax(ali, dim=1),
                                  min=0,
                                  max=text_len - 1,
                                  bins=text_len)
                durations.append(dur)
                fpath = Path(args.dataset_path, 'durations', fnames[j] + '.pt')
                torch.save(dur.cpu().int(), fpath)
        if args.extract_pitch_mel or args.extract_pitch_char or args.extract_pitch_trichar:
            # pitch extraction reuses the durations computed above
            for j, dur in enumerate(durations):
                wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav')
                p_mel, p_char, p_trichar = calculate_pitch(
                    str(wav), dur.cpu().numpy())
                pitch_vecs['mel'][fnames[j]] = p_mel
                pitch_vecs['char'][fnames[j]] = p_char
                pitch_vecs['trichar'][fnames[j]] = p_trichar

        nseconds = time.time() - tik
        DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)',
                     data={})

    if args.extract_pitch_mel:
        normalize_pitch_vectors(pitch_vecs['mel'])
        for fname, pitch in pitch_vecs['mel'].items():
            fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    if args.extract_pitch_char:
        mean, std = normalize_pitch_vectors(pitch_vecs['char'])
        for fname, pitch in pitch_vecs['char'].items():
            fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)
        save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char',
                   mean, std)

    if args.extract_pitch_trichar:
        normalize_pitch_vectors(pitch_vecs['trichar'])
        for fname, pitch in pitch_vecs['trichar'].items():
            fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    DLLogger.flush()
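
The duration extraction above hinges on one trick: taking the per-frame argmax of the attention matrix and histogramming it with torch.histc counts how many decoder frames attend hardest to each input symbol. A standalone toy run of that step (a sketch; note torch.histc expects a floating-point tensor, so the argmax indices are cast to float here):

import torch

# Toy alignment: 6 decoder (mel) frames over 3 text symbols.
ali = torch.tensor([[0.9, 0.05, 0.05],
                    [0.8, 0.10, 0.10],
                    [0.2, 0.70, 0.10],
                    [0.1, 0.80, 0.10],
                    [0.1, 0.20, 0.70],
                    [0.0, 0.10, 0.90]])
text_len = ali.size(1)
# Hardest-attended symbol per frame -> histogram = frames per symbol.
idx = torch.argmax(ali, dim=1).float()
dur = torch.histc(idx, bins=text_len, min=0, max=text_len - 1)
print(dur)  # tensor([2., 2., 2.]) -- durations sum to the frame count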
Example #3
def main():

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=os.path.join(
            args.output_directory, args.log_file) if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_frames/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_frames/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss",
                           metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    parser = parse_tacotron2_args(parser)
    args = parser.parse_args()

    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    distributed_run = args.world_size > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    os.makedirs(args.output_directory, exist_ok=True)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    model = get_tacotron2_model(args,
                                len(args.training_anchor_dirs),
                                is_training=True)

    if not args.amp_run and distributed_run:
        model = DDP(model)

    model.restore_checkpoint(
        os.path.join(args.output_directory, args.latest_checkpoint_file))

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.init_lr,
                                 weight_decay=args.weight_decay)

    if args.amp_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if distributed_run:
            model = DDP(model)

    criterion = Tacotron2Loss()

    collate_fn = TextMelCollate(args)
    train_dataset = TextMelDataset(args, args.training_anchor_dirs)
    train_loader = DataLoader(train_dataset,
                              num_workers=2,
                              shuffle=False,
                              batch_size=args.batch_size // len(args.training_anchor_dirs),
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    # valate_dataset = TextMelDataset(args, args.validation_anchor_dirs)

    model.train()

    elapsed_epochs = model.get_elapsed_epochs()
    epochs = args.epochs - elapsed_epochs
    iteration = elapsed_epochs * len(train_loader)

    LOGGER.log(key=tags.TRAIN_LOOP)

    for epoch in range(1, epochs + 1):
        LOGGER.epoch_start()
        epoch_start_time = time.time()
        epoch += elapsed_epochs
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

        # used to calculate avg frames/sec over epoch
        reduced_num_frames_epoch = 0

        # used to calculate avg loss over epoch
        train_epoch_avg_loss = 0.0
        train_epoch_avg_frames_per_sec = 0.0
        num_iters = 0

        adjust_learning_rate(optimizer, epoch, args)

        for i, batch in enumerate(train_loader):
            print(f"Batch: {i}/{len(train_loader)} epoch {epoch}")
            LOGGER.iteration_start()
            iter_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)

            # start = time.perf_counter()

            optimizer.zero_grad()
            x, y, num_frames = batch_to_gpu(batch)

            y_pred = model(x)

            loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_frames = num_frames.item()
            if np.isnan(reduced_loss):
                raise ValueError("loss is NaN")

            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

            train_epoch_avg_loss += reduced_loss
            num_iters += 1

            # accumulate number of frames processed in this epoch
            reduced_num_frames_epoch += reduced_num_frames

            if args.amp_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), args.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

            optimizer.step()

            iteration += 1

            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)

            iter_stop_time = time.time()
            iter_time = iter_stop_time - iter_start_time
            frames_per_sec = reduced_num_frames / iter_time
            train_epoch_avg_frames_per_sec += frames_per_sec

            LOGGER.log(key="train_iter_frames/sec", value=frames_per_sec)
            LOGGER.log(key="iter_time", value=iter_time)
            LOGGER.iteration_stop()

        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        LOGGER.log(key="train_epoch_frames/sec",
                   value=(reduced_num_frames_epoch / epoch_time))
        LOGGER.log(key="train_epoch_avg_frames/sec",
                   value=(train_epoch_avg_frames_per_sec /
                          num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="train_epoch_avg_loss",
                   value=(train_epoch_avg_loss /
                          num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="epoch_time", value=epoch_time)

        LOGGER.log(key=tags.EVAL_START, value=epoch)

        # validate(model, criterion, valate_dataset, iteration, collate_fn, distributed_run, args)

        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        # Store latest checkpoint in each epoch
        model.elapse_epoch()
        checkpoint_path = os.path.join(args.output_directory,
                                       args.latest_checkpoint_file)
        torch.save(model.state_dict(), checkpoint_path)

        # Plot alignment
        if epoch % args.epochs_per_alignment == 0 and args.rank == 0:
            alignments = y_pred[3].data.cpu().numpy()
            index = np.random.randint(len(alignments))
            plot_alignment(
                alignments[index].transpose(1, 0),  # -> [enc_step, dec_step]
                os.path.join(args.output_directory,
                             f"align_{epoch:04d}_{iteration}.png"),
                info=f"{datetime.now().strftime('%Y-%m-%d %H:%M')} "
                     f"Epoch={epoch:04d} Iteration={iteration} "
                     f"Average loss={train_epoch_avg_loss/num_iters:.5f}")

        # Save checkpoint
        if epoch % args.epochs_per_checkpoint == 0 and args.rank == 0:
            checkpoint_path = os.path.join(args.output_directory,
                                           f"checkpoint_{epoch:04d}.pt")
            print(
                f"Saving model and optimizer state at epoch {epoch:04d} to {checkpoint_path}"
            )
            torch.save(model.state_dict(), checkpoint_path)

            # Save evaluation
            # save_sample(model, args.tacotron2_checkpoint, args.phrase_path,
            #             os.path.join(args.output_directory, f"sample_{epoch:04d}_{iteration}.wav"), args.sampling_rate)

        LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)

    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
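
The loop above calls a reduce_tensor helper that is not shown. A common definition in multi-GPU PyTorch training scripts (an assumption here, not necessarily this repo's exact code) all-reduces across workers and divides by the second argument, which matches the two call sites: averaging the loss over world_size and plain-summing the frame count with n=1.

import torch.distributed as dist

def reduce_tensor(tensor, n):
    # Sum across all workers, then divide: mean for n=world_size,
    # plain sum for n=1.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n
    return rt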
Example #4
def get_collate_function(model_name):
    return {
        'Tacotron2': lambda: TextMelCollate(n_frames_per_step=1),
        'WaveGlow': lambda: torch.utils.data.dataloader.default_collate,
        'FastPitch': TextMelAliCollate
    }[model_name]()
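
This dispatch-table variant replaces the if/elif chain of Example #1: every entry is a zero-argument factory, so the lookup [model_name]() selects and constructs the collate function in one step, and an unknown model name surfaces as a KeyError instead of an explicit NotImplementedError.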