예제 #1
0
def _loss_to_float(batch_loss):
    """Return a scalar loss as a plain Python number.

    Newer PyTorch releases represent a scalar loss as a 0-dim tensor, which
    cannot be indexed with ``[0]`` and must be read with ``.item()``. Older
    releases have no ``.item()`` and only support ``.data[0]``, so both
    access paths are kept.
    """
    item = getattr(batch_loss, "item", None)
    if item is not None:
        return item()
    return batch_loss.data[0]


def main():
    """Parse command-line options and train a WaveNet model on GPU.

    The function builds the network from the ``--n_*`` / ``--dilation_*``
    options, creates a background minibatch generator from the waveform and
    auxiliary-feature lists, optionally restores a checkpoint given via
    ``--resume``, and then runs ``--iters`` optimisation steps with Adam and
    a cross-entropy loss.

    Side effects:
        * creates ``--expdir`` if needed and writes ``model.conf`` there,
        * periodically calls ``save_checkpoint`` every ``--checkpoints``
          iterations,
        * writes ``checkpoint-final.pkl`` when training ends,
        * exits the process with status 1 when ``--waveforms`` is neither a
          directory nor a file, when the wav / feat lists differ in length,
          or when no CUDA device is available.
    """
    parser = argparse.ArgumentParser()
    # path setting
    parser.add_argument("--waveforms", required=True, type=str,
                        help="directory or list of wav files")
    parser.add_argument("--feats", required=True, type=str,
                        help="directory or list of aux feat files")
    parser.add_argument("--stats", required=True, type=str,
                        help="hdf5 file including statistics")
    parser.add_argument("--expdir", required=True, type=str,
                        help="directory to save the model")
    # network structure setting
    parser.add_argument("--n_quantize", default=256, type=int,
                        help="number of quantization")
    parser.add_argument("--n_aux", default=28, type=int,
                        help="number of dimension of aux feats")
    parser.add_argument("--n_resch", default=512, type=int,
                        help="number of channels of residual output")
    parser.add_argument("--n_skipch", default=256, type=int,
                        help="number of channels of skip output")
    parser.add_argument("--dilation_depth", default=10, type=int,
                        help="depth of dilation")
    parser.add_argument("--dilation_repeat", default=1, type=int,
                        help="number of repeating of dilation")
    parser.add_argument("--kernel_size", default=2, type=int,
                        help="kernel size of dilated causal convolution")
    parser.add_argument("--upsampling_factor", default=0, type=int,
                        help="upsampling factor of aux features"
                        "(if set 0, do not apply)")
    parser.add_argument("--use_speaker_code", default=False, type=strtobool,
                        help="flag to use speaker code")
    # network training setting
    parser.add_argument("--lr", default=1e-4, type=float, help="learning rate")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="weight decay coefficient")
    parser.add_argument(
        "--batch_length", default=20000, type=int,
        help="batch length (if set 0, utterance batch will be used)")
    parser.add_argument(
        "--batch_size", default=1, type=int,
        help="batch size (if use utterance batch, batch_size will be 1.")
    parser.add_argument("--iters", default=200000, type=int,
                        help="number of iterations")
    # other setting
    parser.add_argument("--checkpoints", default=10000, type=int,
                        help="how frequent saving model")
    parser.add_argument("--intervals", default=100, type=int,
                        help="log interval")
    parser.add_argument("--seed", default=1, type=int, help="seed number")
    parser.add_argument("--resume", default=None, nargs="?", type=str,
                        help="model path to restart training")
    parser.add_argument("--n_gpus", default=1, type=int, help="number of gpus")
    parser.add_argument("--verbose", default=1, type=int, help="log level")
    args = parser.parse_args()

    # set log level: 1 -> INFO, >1 -> DEBUG, otherwise only warnings
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S')
    if args.verbose < 1:
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s", key, value)

    # make experimental directory
    if not os.path.exists(args.expdir):
        os.makedirs(args.expdir)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # save args as conf
    torch.save(args, args.expdir + "/model.conf")

    # define network
    model = WaveNet(n_quantize=args.n_quantize,
                    n_aux=args.n_aux,
                    n_resch=args.n_resch,
                    n_skipch=args.n_skipch,
                    dilation_depth=args.dilation_depth,
                    dilation_repeat=args.dilation_repeat,
                    kernel_size=args.kernel_size,
                    upsampling_factor=args.upsampling_factor)
    logging.info(model)
    model.apply(initialize)
    model.train()

    # `net` always refers to the bare network, whose state dict is what gets
    # saved and restored; `model` may be wrapped for multi-gpu execution.
    net = model
    if args.n_gpus > 1:
        model = torch.nn.DataParallel(model, list(range(args.n_gpus)))
        if args.n_gpus > args.batch_size:
            logging.warning("batch size is less than number of gpus.")
    receptive_field = net.receptive_field

    # define loss and optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    # define transforms
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/scale")
    wav_transform = transforms.Compose(
        [lambda x: encode_mu_law(x, args.n_quantize)])
    feat_transform = transforms.Compose([lambda x: scaler.transform(x)])

    # collect training files
    if os.path.isdir(args.waveforms):
        filenames = sorted(
            find_files(args.waveforms, "*.wav", use_dir_name=False))
        wav_list = [args.waveforms + "/" + filename for filename in filenames]
        feat_list = [
            args.feats + "/" + filename.replace(".wav", ".h5")
            for filename in filenames
        ]
    elif os.path.isfile(args.waveforms):
        wav_list = read_txt(args.waveforms)
        feat_list = read_txt(args.feats)
    else:
        logging.error("--waveforms should be directory or list.")
        sys.exit(1)
    # explicit check instead of `assert`, which is stripped under `python -O`
    if len(wav_list) != len(feat_list):
        logging.error(
            "number of wav files (%d) and feat files (%d) must match.",
            len(wav_list), len(feat_list))
        sys.exit(1)
    logging.info("number of training data = %d." % len(wav_list))

    # Fail fast when no GPU is present: checking here avoids starting the
    # minibatch generator and avoids the resume path calling
    # `storage.cuda(0)` before the user sees a readable error message.
    if not torch.cuda.is_available():
        logging.error("gpu is not available. please check the setting.")
        sys.exit(1)

    # define generator
    generator = train_generator(wav_list,
                                feat_list,
                                receptive_field=receptive_field,
                                batch_length=args.batch_length,
                                batch_size=args.batch_size,
                                wav_transform=wav_transform,
                                feat_transform=feat_transform,
                                shuffle=True,
                                upsampling_factor=args.upsampling_factor,
                                use_speaker_code=args.use_speaker_code)

    # charge minibatch in queue
    while not generator.queue.full():
        time.sleep(0.1)

    # resume
    if args.resume:
        checkpoint = torch.load(
            args.resume, map_location=lambda storage, loc: storage.cuda(0))
        net.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        iterations = checkpoint["iterations"]
        logging.info("restored from %d-iter checkpoint." % iterations)
    else:
        iterations = 0

    # send to gpu (availability was verified above)
    model.cuda()
    criterion.cuda()

    # train
    loss = 0
    total = 0
    for i in six.moves.range(iterations, args.iters):
        start = time.time()
        # NOTE(review): the generator object exposes `.queue` and `.next()`;
        # its type is defined elsewhere, so the explicit method call is kept
        # rather than switching to the builtin `next()`.
        (batch_x, batch_h), batch_t = generator.next()
        batch_output = model(batch_x, batch_h)
        # the first `receptive_field` positions are excluded from the loss
        batch_loss = criterion(
            batch_output[:, receptive_field:].contiguous().view(
                -1, args.n_quantize),
            batch_t[:, receptive_field:].contiguous().view(-1))
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_value = _loss_to_float(batch_loss)
        elapsed = time.time() - start
        loss += loss_value
        total += elapsed
        logging.debug("batch loss = %.3f (%.3f sec / batch)", loss_value,
                      elapsed)

        # report progress
        if (i + 1) % args.intervals == 0:
            avg_time = total / args.intervals
            logging.info("(iter:%d) average loss = %.6f (%.3f sec / batch)",
                         i + 1, loss / args.intervals, avg_time)
            logging.info(
                "estimated required time = "
                "{0.days:02}:{0.hours:02}:{0.minutes:02}:{0.seconds:02}".
                format(
                    relativedelta(
                        seconds=int((args.iters - (i + 1)) * avg_time))))
            loss = 0
            total = 0

        # save intermediate model
        if (i + 1) % args.checkpoints == 0:
            save_checkpoint(args.expdir, net, optimizer, i + 1)

    # save final model
    torch.save({"model": net.state_dict()},
               args.expdir + "/checkpoint-final.pkl")
    logging.info("final checkpoint created.")
예제 #2
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    """Train a WaveNet model and periodically write checkpoints and samples.

    Relies on module-level names that are not defined in this function:
    ``wavenet_config``, ``dist_config``, ``data_config`` and ``writer``.

    Args:
        num_gpus: number of GPUs; values above 1 enable the distributed
            setup (``init_distributed`` / ``apply_gradient_allreduce``).
        rank: rank of this process; rank 0 prepares ``output_directory``.
        group_name: forwarded to ``init_distributed``.
        output_directory: directory that receives ``wavenet_<iteration>``
            checkpoints.
        epochs: epoch index at which training stops (exclusive).
        learning_rate: learning rate given to Adam and ``save_checkpoint``.
        iters_per_checkpoint: a checkpoint and audio sample are written
            whenever the global iteration is a multiple of this value.
        batch_size: batch size of the training ``DataLoader``.
        seed: seed for the CPU and CUDA random number generators.
        checkpoint_path: checkpoint to resume from; an empty string starts
            training from scratch.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    print(f"receptive_field: {model.receptive_field()}")
    trainset = WavenetDataset(
        dataset_file='data/dataset.npz',
        item_length=model.receptive_field() + 1000 + model.output_length - 1,
        target_length=model.output_length,
        file_location='data/',
        test_stride=500,
    )
    print(trainset._length)
    print('the dataset has ' + str(len(trainset)) + ' items')
    train_loader = DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=False,
    )

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    iters_per_epoch = len(train_loader)
    # A resumed run restarts at the beginning of the epoch that contains the
    # restored iteration.
    epoch_offset = max(0, iteration // iters_per_epoch)
    # Iterations this invocation will execute; used for the time estimate.
    total_iterations = (epochs - epoch_offset) * iters_per_epoch
    # ================ MAIN TRAINING LOOP! ===================
    start = time.time()
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            y, target = batch
            y = to_gpu(y).float()
            target = to_gpu(target)
            y_pred = model((None, y))
            # only the last `output_length` predictions are scored
            loss = criterion(y_pred[:, :, -model.output_length:], target)
            loss.backward()
            optimizer.step()

            # Convert once to a Python float so that printing and logging do
            # not hold on to the autograd graph.
            loss_value = loss.item()
            print("{}:\t{:.9f}".format(iteration, loss_value))
            # `start` and `total_iterations` are relative to this run, so the
            # progress counter must be relative to `epoch_offset` as well;
            # otherwise the estimate is wrong after resuming.
            print_etr(start,
                      total_iterations=total_iterations,
                      current_iteration=(epoch - epoch_offset) *
                      iters_per_epoch + i + 1)
            writer.add_scalar('Loss/train', loss_value, global_step=iteration)

            # NOTE(review): this runs on every rank, while only rank 0
            # creates `output_directory` — confirm for multi-gpu runs.
            if (iteration % iters_per_checkpoint == 0):
                # sample one class per time step from the first batch item
                y_choice = y_pred[0].detach().cpu().transpose(0, 1)
                y_prob = F.softmax(y_choice, dim=1)
                y_prob_collapsed = torch.multinomial(y_prob,
                                                     num_samples=1).squeeze(1)
                y_pred_audio = mu_law_decode_numpy(y_prob_collapsed.numpy(),
                                                   model.n_out_channels)
                import torchaudio
                # `.cpu()` keeps this working if `to_gpu` moved the batch to
                # a CUDA device; `.numpy()` only accepts CPU tensors.
                y_audio = mu_law_decode_numpy(y.detach().cpu().numpy(),
                                              model.n_out_channels)
                # NOTE(review): the debug dumps use a fixed 16000 Hz while
                # tensorboard uses data_config['sampling_rate'] — confirm
                # they are meant to agree.
                torchaudio.save("test_in.wav", torch.tensor(y_audio), 16000)
                torchaudio.save("test_out.wav", torch.tensor(y_pred_audio),
                                16000)
                writer.add_audio('Audio',
                                 y_pred_audio,
                                 global_step=iteration,
                                 sample_rate=data_config['sampling_rate'])
                # separate name so the `checkpoint_path` argument is not
                # overwritten
                save_path = "{}/wavenet_{}".format(output_directory,
                                                   iteration)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                save_path)

            writer.flush()
            iteration += 1