def noise_shaping(wav_list, args):
    """APPLY NOISE SHAPING BASED ON MLSA FILTER."""
    # load coefficient of filter
    if check_hdf5(args.stats, "/mlsa/coef"):
        mlsa_coef = read_hdf5(args.stats, "/mlsa/coef")
        alpha = read_hdf5(args.stats, "/mlsa/alpha")
    else:
        raise KeyError("\"/mlsa/coef\" is not found in %s." % (args.stats))
    if args.inv:
        mlsa_coef *= -1.0

    # define synthesizer
    shiftl = int(args.fs / 1000 * args.shiftms)
    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(
            order=mlsa_coef.shape[0] - 1,
            alpha=alpha),
        hopsize=shiftl
    )

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")
        x = np.float64(x)

        # check sampling frequency
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # replicate coef for time-invariant filtering
        num_frames = int(len(x) / shiftl) + 1
        mlsa_coefs = np.float64(np.tile(mlsa_coef, [num_frames, 1]))

        # synthesis and write
        x_ns = synthesizer.synthesis(x, mlsa_coefs)
        write_name = args.outdir + "/" + os.path.basename(wav_name)
        wavfile.write(write_name, args.fs, np.int16(x_ns))
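
# NOTE: read_hdf5() / write_hdf5() / check_hdf5() used above are project-local
# helpers; minimal sketches with h5py, assuming datasets live under plain
# string keys:
import h5py
import numpy as np

def check_hdf5(path, key):
    """Return True if `key` exists in the hdf5 file."""
    with h5py.File(path, "r") as f:
        return key in f

def read_hdf5(path, key):
    """Read a dataset from an hdf5 file as a numpy array."""
    with h5py.File(path, "r") as f:
        return f[key][()]

def write_hdf5(path, key, value):
    """Write a numpy array to an hdf5 file, replacing an existing dataset."""
    with h5py.File(path, "a") as f:
        if key in f:
            del f[key]
        f.create_dataset(key, data=np.asarray(value))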
def test_preprocessing(feature_type):
    # make arguments
    args = make_args(feature_type=feature_type)

    # prepare dummy wav files
    wavdir = "tmp/wav"
    if not os.path.exists(wavdir):
        os.makedirs(wavdir)
    for i in range(5):
        make_dummy_wav(wavdir + "/%d.wav" % i, 8000, args.fs)

    # feature extract
    wav_list = find_files(wavdir, "*.wav")
    if not os.path.exists(args.wavdir):
        os.makedirs(args.wavdir)
    if args.feature_type == "world":
        world_feature_extract(wav_list, args)
    elif args.feature_type == "melspc":
        melspectrogram_extract(wav_list, args)
    else:
        melcepstrum_extract(wav_list, args)

    # calc_stats
    file_list = find_files(args.hdf5dir, "*.h5")
    calc_stats(file_list, args)

    # noise shaping
    if feature_type != "melspc":
        wav_list = find_files(args.wavdir, "*.wav")
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
        if not check_hdf5(args.stats, "/mlsa/coef"):
            avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
            if args.feature_type == "world":
                avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
            mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag,
                                                  args.mcep_alpha)
            write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
            write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)
        noise_shaping(wav_list, args)

    # remove
    shutil.rmtree("tmp")
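
# NOTE: convert_mcep_to_mlsa_coef() above is project-local; a plausible sketch
# using pysptk.mc2b (mel-cepstrum -> MLSA filter coefficients), where `mag`
# scales the strength of the noise shaping and the gain term is dropped:
import numpy as np
import pysptk

def convert_mcep_to_mlsa_coef(avg_mcep, mag, alpha):
    """Convert an averaged mel-cepstrum to MLSA filter coefficients."""
    avg_mcep = np.asarray(avg_mcep, dtype=np.float64) * mag
    avg_mcep[0] = 0.0  # zero out the gain (0th) coefficient
    return pysptk.mc2b(avg_mcep, alpha)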
Example #3
def calc_stats(file_list, args):
    """CALCULATE STATISTICS."""
    scaler = StandardScaler()

    # process over all data
    for i, filename in enumerate(file_list):
        logging.info("now processing %s (%d/%d)" %
                     (filename, i + 1, len(file_list)))
        feat = read_hdf5(filename, "/" + args.feature_type)
        scaler.partial_fit(feat)

    # exclude the U/V term from normalization (mean 0, scale 1)
    mean = scaler.mean_
    scale = scaler.scale_
    if args.feature_type == "world":
        mean[0] = 0.0
        scale[0] = 1.0

    # write to hdf5
    write_hdf5(args.stats, "/" + args.feature_type + "/mean", np.float32(mean))
    write_hdf5(args.stats, "/" + args.feature_type + "/scale",
               np.float32(scale))
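
# Sanity check for the streaming statistics above: partial_fit over chunks
# yields the same mean/scale as a single fit over the concatenated data
# (the arrays here are synthetic, for illustration only).
import numpy as np
from sklearn.preprocessing import StandardScaler

chunks = [np.random.randn(100, 5) for _ in range(3)]
streaming = StandardScaler()
for chunk in chunks:
    streaming.partial_fit(chunk)
full = StandardScaler().fit(np.concatenate(chunks, axis=0))
assert np.allclose(streaming.mean_, full.mean_)
assert np.allclose(streaming.scale_, full.scale_)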
def train_generator(wav_list, feat_list, receptive_field,
                    batch_length=None,
                    batch_size=1,
                    feature_type="world",
                    wav_transform=None,
                    feat_transform=None,
                    shuffle=True,
                    upsampling_factor=80,
                    use_upsampling_layer=True,
                    use_speaker_code=False):
    """GENERATE TRAINING BATCH.

    Args:
        wav_list (list): List of wav files.
        feat_list (list): List of feat files.
        receptive_field (int): Size of receptive field.
        batch_length (int): Batch length (if set None, utterance batch will be used.).
        batch_size (int): Batch size (if batch_length = None, batch_size will be 1.).
        feature_type (str): Auxiliary feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        shuffle (bool): Whether to shuffle the file list.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.

    Returns:
        generator: Generator instance.

    """
    # shuffle list
    if shuffle:
        n_files = len(wav_list)
        idx = np.random.permutation(n_files)
        wav_list = [wav_list[i] for i in idx]
        feat_list = [feat_list[i] for i in idx]

    # check batch_length
    if batch_length is not None and use_upsampling_layer:
        batch_mod = (receptive_field + batch_length) % upsampling_factor
        if batch_mod > 0:
            logging.warning("batch length is decreased due to upsampling (%d -> %d)" % (
                batch_length, batch_length - batch_mod))
        batch_length -= batch_mod

    # show warning
    if batch_length is None and batch_size > 1:
        logging.warning("in utterance batch mode, batchsize will be 1.")

    while True:
        batch_x, batch_h, batch_t = [], [], []
        # process over all files
        for wavfile, featfile in zip(wav_list, feat_list):
            # load waveform and aux feature
            x, fs = sf.read(wavfile, dtype=np.float32)
            h = read_hdf5(featfile, "/" + feature_type)
            if not use_upsampling_layer:
                h = extend_time(h, upsampling_factor)
            if use_speaker_code:
                sc = read_hdf5(featfile, "/speaker_code")
                sc = np.tile(sc, [h.shape[0], 1])
                h = np.concatenate([h, sc], axis=1)

            # check both lengths are same
            logging.debug("before x length = %d" % x.shape[0])
            logging.debug("before h length = %d" % h.shape[0])
            if use_upsampling_layer:
                x, h = validate_length(x, h, upsampling_factor)
            else:
                x, h = validate_length(x, h)
            logging.debug("after x length = %d" % x.shape[0])
            logging.debug("after h length = %d" % h.shape[0])

            # ---------------------------------------
            # use mini batch without upsampling layer
            # ---------------------------------------
            if batch_length is not None and not use_upsampling_layer:
                # make buffer array
                if "x_buffer" not in locals():
                    x_buffer = np.empty((0), dtype=np.float32)
                    h_buffer = np.empty((0, h.shape[1]), dtype=np.float32)
                x_buffer = np.concatenate([x_buffer, x], axis=0)
                h_buffer = np.concatenate([h_buffer, h], axis=0)

                while len(x_buffer) > receptive_field + batch_length:
                    # get pieces
                    x_ = x_buffer[:receptive_field + batch_length]
                    h_ = h_buffer[:receptive_field + batch_length]

                    # perform pre-processing
                    if wav_transform is not None:
                        x_ = wav_transform(x_)
                    if feat_transform is not None:
                        h_ = feat_transform(h_)

                    # convert to torch variable
                    x_ = torch.from_numpy(x_).long()
                    h_ = torch.from_numpy(h_).float()

                    # remove the last and first sample for training
                    batch_x += [x_[:-1]]  # (T)
                    batch_h += [h_[:-1].transpose(0, 1)]  # (D x T)
                    batch_t += [x_[1:]]  # (T)

                    # update buffer
                    x_buffer = x_buffer[batch_length:]
                    h_buffer = h_buffer[batch_length:]

                    # return mini batch
                    if len(batch_x) == batch_size:
                        batch_x = torch.stack(batch_x)
                        batch_h = torch.stack(batch_h)
                        batch_t = torch.stack(batch_t)

                        # send to cuda
                        if torch.cuda.is_available():
                            batch_x = batch_x.cuda()
                            batch_h = batch_h.cuda()
                            batch_t = batch_t.cuda()

                        yield (batch_x, batch_h), batch_t

                        batch_x, batch_h, batch_t = [], [], []

            # ------------------------------------
            # use mini batch with upsampling layer
            # ------------------------------------
            elif batch_length is not None and use_upsampling_layer:
                # make buffer array
                if "x_buffer" not in locals():
                    x_buffer = np.empty((0), dtype=np.float32)
                    h_buffer = np.empty((0, h.shape[1]), dtype=np.float32)
                x_buffer = np.concatenate([x_buffer, x], axis=0)
                h_buffer = np.concatenate([h_buffer, h], axis=0)

                while len(h_buffer) > (receptive_field + batch_length) // upsampling_factor:
                    # set batch size
                    h_bs = (receptive_field + batch_length) // upsampling_factor
                    x_bs = h_bs * upsampling_factor + 1

                    # get pieces
                    h_ = h_buffer[:h_bs]
                    x_ = x_buffer[:x_bs]

                    # perform pre-processing
                    if wav_transform is not None:
                        x_ = wav_transform(x_)
                    if feat_transform is not None:
                        h_ = feat_transform(h_)

                    # convert to torch variable
                    x_ = torch.from_numpy(x_).long()
                    h_ = torch.from_numpy(h_).float()

                    # remove the last and first sample for training
                    batch_h += [h_.transpose(0, 1)]  # (D x T)
                    batch_x += [x_[:-1]]  # (T)
                    batch_t += [x_[1:]]  # (T)

                    # set shift size
                    h_ss = batch_length // upsampling_factor
                    x_ss = h_ss * upsampling_factor

                    # update buffer
                    h_buffer = h_buffer[h_ss:]
                    x_buffer = x_buffer[x_ss:]

                    # return mini batch
                    if len(batch_x) == batch_size:
                        batch_x = torch.stack(batch_x)
                        batch_h = torch.stack(batch_h)
                        batch_t = torch.stack(batch_t)

                        # send to cuda
                        if torch.cuda.is_available():
                            batch_x = batch_x.cuda()
                            batch_h = batch_h.cuda()
                            batch_t = batch_t.cuda()

                        yield (batch_x, batch_h), batch_t

                        batch_x, batch_h, batch_t = [], [], []

            # --------------------------------------------
            # use utterance batch without upsampling layer
            # --------------------------------------------
            elif batch_length is None and not use_upsampling_layer:
                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                # convert to torch variable
                x = torch.from_numpy(x).long()
                h = torch.from_numpy(h).float()

                # remove the last and first sample for training
                batch_x = x[:-1].unsqueeze(0)  # (1 x T)
                batch_h = h[:-1].transpose(0, 1).unsqueeze(0)  # (1 x D x T)
                batch_t = x[1:].unsqueeze(0)  # (1 x T)

                # send to cuda
                if torch.cuda.is_available():
                    batch_x = batch_x.cuda()
                    batch_h = batch_h.cuda()
                    batch_t = batch_t.cuda()

                yield (batch_x, batch_h), batch_t

            # -----------------------------------------
            # use utterance batch with upsampling layer
            # -----------------------------------------
            else:
                # remove last frame
                h = h[:-1]
                x = x[:-upsampling_factor + 1]

                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                # convert to torch variable
                x = torch.from_numpy(x).long()
                h = torch.from_numpy(h).float()

                # remove the last and first sample for training
                batch_h = h.transpose(0, 1).unsqueeze(0)  # (1 x D x T')
                batch_x = x[:-1].unsqueeze(0)  # (1 x T)
                batch_t = x[1:].unsqueeze(0)  # (1 x T)

                # send to cuda
                if torch.cuda.is_available():
                    batch_x = batch_x.cuda()
                    batch_h = batch_h.cuda()
                    batch_t = batch_t.cuda()

                yield (batch_x, batch_h), batch_t

        # re-shuffle
        if shuffle:
            idx = np.random.permutation(n_files)
            wav_list = [wav_list[i] for i in idx]
            feat_list = [feat_list[i] for i in idx]
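
# NOTE: validate_length() and extend_time() used above are project-local;
# minimal sketches consistent with how they are called. validate_length trims
# waveform and frames so the lengths agree (samples == frames, or
# samples == frames * upsampling_factor); extend_time repeats each frame so
# features align with samples when no upsampling layer is used.
import numpy as np

def validate_length(x, h, upsampling_factor=None):
    """Trim x (samples) and h (frames) to mutually consistent lengths."""
    if upsampling_factor is None:
        n = min(x.shape[0], h.shape[0])
        return x[:n], h[:n]
    n_frames = min(x.shape[0] // upsampling_factor, h.shape[0])
    return x[:n_frames * upsampling_factor], h[:n_frames]

def extend_time(h, upsampling_factor):
    """Repeat each feature frame upsampling_factor times along time."""
    return np.repeat(h, upsampling_factor, axis=0)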
def main():
    """RUN TRAINING."""
    parser = argparse.ArgumentParser()
    # path setting
    parser.add_argument("--waveforms", required=True,
                        type=str, help="directory or list of wav files")
    parser.add_argument("--feats", required=True,
                        type=str, help="directory or list of aux feat files")
    parser.add_argument("--stats", required=True,
                        type=str, help="hdf5 file including statistics")
    parser.add_argument("--expdir", required=True,
                        type=str, help="directory to save the model")
    parser.add_argument("--feature_type", default="world", choices=["world", "melspc"],
                        type=str, help="feature type")
    # network structure setting
    parser.add_argument("--n_quantize", default=256,
                        type=int, help="number of quantization")
    parser.add_argument("--n_aux", default=28,
                        type=int, help="number of dimension of aux feats")
    parser.add_argument("--n_resch", default=512,
                        type=int, help="number of channels of residual output")
    parser.add_argument("--n_skipch", default=256,
                        type=int, help="number of channels of skip output")
    parser.add_argument("--dilation_depth", default=10,
                        type=int, help="depth of dilation")
    parser.add_argument("--dilation_repeat", default=1,
                        type=int, help="number of repeating of dilation")
    parser.add_argument("--kernel_size", default=2,
                        type=int, help="kernel size of dilated causal convolution")
    parser.add_argument("--upsampling_factor", default=80,
                        type=int, help="upsampling factor of aux features")
    parser.add_argument("--use_upsampling_layer", default=True,
                        type=strtobool, help="flag to use upsampling layer")
    parser.add_argument("--use_speaker_code", default=False,
                        type=strtobool, help="flag to use speaker code")
    # network training setting
    parser.add_argument("--lr", default=1e-4,
                        type=float, help="learning rate")
    parser.add_argument("--weight_decay", default=0.0,
                        type=float, help="weight decay coefficient")
    parser.add_argument("--batch_length", default=20000,
                        type=int, help="batch length (if set 0, utterance batch will be used)")
    parser.add_argument("--batch_size", default=1,
                        type=int, help="batch size (if use utterance batch, batch_size will be 1.")
    parser.add_argument("--iters", default=200000,
                        type=int, help="number of iterations")
    # other setting
    parser.add_argument("--checkpoint_interval", default=10000,
                        type=int, help="how frequent saving model")
    parser.add_argument("--intervals", default=100,
                        type=int, help="log interval")
    parser.add_argument("--seed", default=1,
                        type=int, help="seed number")
    parser.add_argument("--resume", default=None, nargs="?",
                        type=str, help="model path to restart training")
    parser.add_argument("--n_gpus", default=1,
                        type=int, help="number of gpus")
    parser.add_argument("--verbose", default=1,
                        type=int, help="log level")
    args = parser.parse_args()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARNING,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # make experimental directory
    if not os.path.exists(args.expdir):
        os.makedirs(args.expdir)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # fix slow computation of dilated conv
    # https://github.com/pytorch/pytorch/issues/15054#issuecomment-450191923
    torch.backends.cudnn.benchmark = True

    # save args as conf
    torch.save(args, args.expdir + "/model.conf")

    # define network
    if args.use_upsampling_layer:
        upsampling_factor = args.upsampling_factor
    else:
        upsampling_factor = 0
    model = WaveNet(
        n_quantize=args.n_quantize,
        n_aux=args.n_aux,
        n_resch=args.n_resch,
        n_skipch=args.n_skipch,
        dilation_depth=args.dilation_depth,
        dilation_repeat=args.dilation_repeat,
        kernel_size=args.kernel_size,
        upsampling_factor=upsampling_factor)
    logging.info(model)
    model.apply(initialize)
    model.train()

    if args.n_gpus > 1:
        device_ids = range(args.n_gpus)
        model = torch.nn.DataParallel(model, device_ids)
        model.receptive_field = model.module.receptive_field
        if args.n_gpus > args.batch_size:
            logging.warning("batch size is less than number of gpus.")

    # define optimizer and loss
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    # define transforms
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + args.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + args.feature_type + "/scale")
    wav_transform = transforms.Compose([
        lambda x: encode_mu_law(x, args.n_quantize)])
    feat_transform = transforms.Compose([
        lambda x: scaler.transform(x)])

    # define generator
    if os.path.isdir(args.waveforms):
        filenames = sorted(find_files(args.waveforms, "*.wav", use_dir_name=False))
        wav_list = [args.waveforms + "/" + filename for filename in filenames]
        feat_list = [args.feats + "/" + filename.replace(".wav", ".h5") for filename in filenames]
    elif os.path.isfile(args.waveforms):
        wav_list = read_txt(args.waveforms)
        feat_list = read_txt(args.feats)
    else:
        logging.error("--waveforms should be directory or list.")
        sys.exit(1)
    assert len(wav_list) == len(feat_list)
    logging.info("number of training data = %d." % len(wav_list))
    generator = train_generator(
        wav_list, feat_list,
        receptive_field=model.receptive_field,
        batch_length=args.batch_length,
        batch_size=args.batch_size,
        feature_type=args.feature_type,
        wav_transform=wav_transform,
        feat_transform=feat_transform,
        shuffle=True,
        upsampling_factor=args.upsampling_factor,
        use_upsampling_layer=args.use_upsampling_layer,
        use_speaker_code=args.use_speaker_code)

    # charge minibatch in queue
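    # NOTE: this assumes train_generator is wrapped by a background-prefetch
    # decorator (as in the original repository) that exposes `.queue` here and
    # `.next()` in the training loop below; see the BackgroundGenerator sketch
    # near the end of this section.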
    while not generator.queue.full():
        time.sleep(0.1)

    # resume model and optimizer
    if args.resume is not None and len(args.resume) != 0:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
        iterations = checkpoint["iterations"]
        if args.n_gpus > 1:
            model.module.load_state_dict(checkpoint["model"])
        else:
            model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        logging.info("restored from %d-iter checkpoint." % iterations)
    else:
        iterations = 0

    # check gpu and then send to gpu
    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()
        for state in optimizer.state.values():
            for key, value in state.items():
                if torch.is_tensor(value):
                    state[key] = value.cuda()
    else:
        logging.error("gpu is not available. please check the setting.")
        sys.exit(1)

    # train
    loss = 0
    total = 0
    for i in six.moves.range(iterations, args.iters):
        start = time.time()
        (batch_x, batch_h), batch_t = generator.next()
        batch_output = model(batch_x, batch_h)
        batch_loss = criterion(
            batch_output[:, model.receptive_field:].contiguous().view(-1, args.n_quantize),
            batch_t[:, model.receptive_field:].contiguous().view(-1))
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        loss += batch_loss.item()
        total += time.time() - start
        logging.debug("batch loss = %.3f (%.3f sec / batch)" % (
            batch_loss.item(), time.time() - start))

        # report progress
        if (i + 1) % args.intervals == 0:
            logging.info("(iter:%d) average loss = %.6f (%.3f sec / batch)" % (
                i + 1, loss / args.intervals, total / args.intervals))
            logging.info("estimated required time = "
                         "{0.days:02}:{0.hours:02}:{0.minutes:02}:{0.seconds:02}"
                         .format(relativedelta(
                             seconds=int((args.iters - (i + 1)) * (total / args.intervals)))))
            loss = 0
            total = 0

        # save intermediate model
        if (i + 1) % args.checkpoint_interval == 0:
            if args.n_gpus > 1:
                save_checkpoint(args.expdir, model.module, optimizer, i + 1)
            else:
                save_checkpoint(args.expdir, model, optimizer, i + 1)

    # save final model
    if args.n_gpus > 1:
        torch.save({"model": model.module.state_dict()}, args.expdir + "/checkpoint-final.pkl")
    else:
        torch.save({"model": model.state_dict()}, args.expdir + "/checkpoint-final.pkl")
    logging.info("final checkpoint created.")
Example #6
def decode_generator(
    feat_list,
    batch_size=32,
    feature_type="world",
    wav_transform=None,
    feat_transform=None,
    pulse_transform=p_trans_binary_multi_channel,
    upsampling_factor=80,
    use_upsampling_layer=True,
    use_speaker_code=False,
    use_pulse=True,
    f0_transform=None,
):
    """GENERATE DECODING BATCH.

    Args:
        feat_list (list): List of feature files.
        batch_size (int): Batch size in decoding.
        feature_type (str): Feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.
        use_pulse (bool): Whether to use pulse conditioning.
        pulse_transform (func): Preprocessing function for the pulse signal.
        f0_transform (func): Transform applied to F0 before pulse re-synthesis.

    Returns:
        generator: Generator instance.

    """
    # ---------------------------
    # sample-by-sample generation
    # ---------------------------
    if batch_size == 1:
        raise NotImplementedError

    # ----------------
    # batch generation
    # ----------------
    else:
        # sort with the feature length
        shape_list = [shape_hdf5(f, "/" + feature_type)[0] for f in feat_list]

        idx = np.argsort(shape_list)
        feat_list = [feat_list[i] for i in idx]

        # divide into batch list
        n_batch = math.ceil(len(feat_list) / batch_size)
        batch_lists = np.array_split(feat_list, n_batch)
        batch_lists = [f.tolist() for f in batch_lists]

        for batch_list in batch_lists:
            batch_x = []
            batch_h = []
            batch_p = []
            n_samples_list = []
            feat_ids = []
            for featfile in batch_list:
                # make seed waveform and load aux feature
                x = np.zeros((1))
                h = read_hdf5(featfile, "/" + feature_type)

                if f0_transform is not None:
                    f0 = read_hdf5(featfile, "/" + 'world_f0')
                    f0 = f0_transform(f0)
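                    # NOTE: `args` (fs, shiftms) is not a parameter of this
                    # function and is assumed to be visible at module scope;
                    # `pw.synthesize_pulse_new` is a project-specific pulse
                    # synthesis helper, not a stock pyworld API.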
                    fs = args.fs
                    p = pw.synthesize_pulse_new(
                        f0, fs, frame_period=args.shiftms).astype(np.int32)
                    __p = read_hdf5(featfile, "/" + 'world_pulse')
                    assert len(p) == len(__p)
                else:
                    p = read_hdf5(featfile, "/" + 'world_pulse')

                if pulse_transform is not None:
                    p = pulse_transform(p)
                    assert p.max() <= 1.0

                if not use_upsampling_layer:
                    h = extend_time(h, upsampling_factor)
                if use_speaker_code:
                    sc = read_hdf5(featfile, "/speaker_code")
                    sc = np.tile(sc, [h.shape[0], 1])
                    h = np.concatenate([h, sc], axis=1)

                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                if use_pulse:
                    h = np.concatenate([h[:, 0:1], h[:, 2:]],
                                       axis=1)  # remove cont_f0_lpf
                # append to list
                batch_x += [x]
                batch_h += [h]
                batch_p += [p]
                if not use_upsampling_layer:
                    n_samples_list += [h.shape[0] - 1]
                else:
                    n_samples_list += [h.shape[0] * upsampling_factor - 1]
                feat_ids += [os.path.basename(featfile).replace(".h5", "")]

            # convert list to ndarray
            batch_x = np.stack(batch_x, axis=0)

            len_p_max = max([len(p) for p in batch_p])
            batch_p = [pad_along_axis(p, len_p_max, axis=0) for p in batch_p]
            batch_p = np.stack(batch_p)
            batch_h = pad_list(batch_h)

            # convert to torch variable
            batch_x = torch.from_numpy(batch_x).long()  # B, 1
            batch_p = torch.from_numpy(batch_p).float().transpose(
                1, 2)  # B, C=1, T
            batch_h = torch.from_numpy(batch_h).float().transpose(
                1, 2)  # B, C, T(Frame)

            # send to cuda
            if torch.cuda.is_available():
                batch_x = batch_x.cuda()
                batch_h = batch_h.cuda()
                batch_p = batch_p.cuda()

            yield feat_ids, (batch_x, batch_h, batch_p, n_samples_list)
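
# NOTE: pad_along_axis() and pad_list() used above are project-local; minimal
# zero-padding sketches consistent with their call sites:
import numpy as np

def pad_along_axis(x, target_length, axis=0):
    """Zero-pad x up to target_length along the given axis."""
    pad = target_length - x.shape[axis]
    if pad <= 0:
        return x
    widths = [(0, 0)] * x.ndim
    widths[axis] = (0, pad)
    return np.pad(x, widths, mode="constant")

def pad_list(xs):
    """Stack variable-length (T x D) arrays into (B x T_max x D) with padding."""
    max_len = max(x.shape[0] for x in xs)
    return np.stack([pad_along_axis(x, max_len, axis=0) for x in xs], axis=0)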
Example #7
def main(args):
    """RUN DECODING."""

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARNING,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warning("verbose logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # check arguments
    if args.stats is None:
        args.stats = os.path.dirname(args.checkpoint) + "/stats.h5"
    if args.config is None:
        args.config = os.path.dirname(args.checkpoint) + "/model.conf"
    if not os.path.exists(args.stats):
        raise FileNotFoundError("statistics file is missing (%s)." %
                                (args.stats))
    if not os.path.exists(args.config):
        raise FileNotFoundError("config file is missing (%s)." % (args.config))

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # fix slow computation of dilated conv
    # https://github.com/pytorch/pytorch/issues/15054#issuecomment-450191923
    torch.backends.cudnn.benchmark = True

    # load config
    config = torch.load(args.config)

    # get file list
    if os.path.isdir(args.feats):
        feat_list = sorted(find_files(args.feats, "*.h5"))
    elif os.path.isfile(args.feats):
        feat_list = read_txt(args.feats)
    else:
        logging.error("--feats should be directory or list.")
        sys.exit(1)

    # prepare the file list for parallel decoding
    feat_lists = np.array_split(feat_list, args.n_gpus)
    feat_lists = [f_list.tolist() for f_list in feat_lists]

    # define transform
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + config.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + config.feature_type + "/scale")
    wav_transform = transforms.Compose(
        [lambda x: encode_mu_law(x, config.n_quantize)])
    feat_transform = transforms.Compose([lambda x: scaler.transform(x)])
    f0_transform = transforms.Compose(
        [partial(shift_semi_tone_f0_pulse, shift=args.f0_shift)])

    # define gpu decode function
    def gpu_decode(feat_list, gpu):
        # set default gpu and do not track gradient
        torch.cuda.set_device(gpu)
        torch.set_grad_enabled(False)

        # define model and load parameters
        if config.use_upsampling_layer:
            upsampling_factor = config.upsampling_factor
        else:
            upsampling_factor = 0

        if args.use_pulse:
            _WaveNet = WaveNetPulse
        else:
            _WaveNet = WaveNet
            config.n_aux = 28

        model = _WaveNet(n_quantize=config.n_quantize,
                         n_aux=config.n_aux,
                         n_resch=config.n_resch,
                         n_skipch=config.n_skipch,
                         dilation_depth=config.dilation_depth,
                         dilation_repeat=config.dilation_repeat,
                         kernel_size=config.kernel_size,
                         upsampling_factor=upsampling_factor)

        model.load_state_dict(
            torch.load(args.checkpoint,
                       map_location=lambda storage, loc: storage)["model"])
        model.eval()
        model.cuda()
        # define generator
        generator = decode_generator(
            feat_list,
            batch_size=args.batch_size,
            feature_type=config.feature_type,
            wav_transform=wav_transform,
            feat_transform=feat_transform,
            f0_transform=f0_transform,
            upsampling_factor=config.upsampling_factor,
            use_upsampling_layer=config.use_upsampling_layer,
            use_speaker_code=config.use_speaker_code,
            use_pulse=args.use_pulse)

        # decode
        if args.batch_size > 1:
            for feat_ids, (batch_x, batch_h, batch_p,
                           n_samples_list) in generator:
                logging.info("decoding start")
                samples_list = model.batch_fast_generate(
                    batch_x,
                    batch_h,
                    n_samples_list,
                    batch_p,
                    intervals=args.intervals)
                for feat_id, samples in zip(feat_ids, samples_list):
                    wav = decode_mu_law(samples, config.n_quantize)
                    sf.write(args.outdir + "/" + feat_id + ".wav", wav,
                             args.fs, "PCM_16")
                    logging.info("wrote %s.wav in %s." %
                                 (feat_id, args.outdir))
        else:
            raise NotImplementedError

    # parallel decode
    processes = []
    for gpu, feat_list in enumerate(feat_lists):
        p = mp.Process(target=gpu_decode, args=(
            feat_list,
            gpu,
        ))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()
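
# NOTE: shift_semi_tone_f0_pulse() used for f0_transform above is
# project-local; a plausible sketch that shifts F0 by `shift` semitones while
# leaving unvoiced (zero) frames untouched:
import numpy as np

def shift_semi_tone_f0_pulse(f0, shift=0.0):
    """Scale F0 by 2 ** (shift / 12); unvoiced frames (f0 == 0) stay zero."""
    return np.where(f0 > 0, f0 * (2.0 ** (shift / 12.0)), 0.0)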
def main():
    """RUN NOISE SHAPING IN PARALLEL."""
    parser = argparse.ArgumentParser(
        description="run noise shaping in parallel.")

    parser.add_argument(
        "--waveforms", default=None,
        help="directory or list of filename of input wavfile")
    parser.add_argument(
        "--stats", default=None,
        help="filename of hdf5 format")
    parser.add_argument(
        "--outdir", default=None,
        help="directory to save preprocessed wav file")
    parser.add_argument(
        "--fs", default=16000,
        type=int, help="Sampling frequency")
    parser.add_argument(
        "--shiftms", default=5,
        type=float, help="Frame shift in msec")
    parser.add_argument(
        "--feature_type", default="world", choices=["world", "mcep", "melspc"],
        type=str, help="feature type")
    parser.add_argument(
        "--mcep_dim_start", default=2,
        type=int, help="Start index of mel cepstrum")
    parser.add_argument(
        "--mcep_dim_end", default=27,
        type=int, help="End index of mel cepstrum")
    parser.add_argument(
        "--mcep_alpha", default=0.41,
        type=float, help="Alpha of mel cepstrum")
    parser.add_argument(
        "--mag", default=0.5,
        type=float, help="magnification of noise shaping")
    parser.add_argument(
        "--verbose", default=1,
        type=int, help="log message level")
    parser.add_argument(
        '--n_jobs', default=10,
        type=int, help="number of parallel jobs")
    parser.add_argument(
        '--inv', default=False, type=strtobool,
        help="if True, inverse filtering will be performed")

    args = parser.parse_args()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARNING,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # divide list
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]

    # calculate MLSA coef and save it
    if not check_hdf5(args.stats, "/mlsa/coef"):
        avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
        if args.feature_type == "world":
            avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
        mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha)
        write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
        write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)

    # multi processing
    processes = []
    if args.feature_type == "melspc":
        # TODO(kan-bayashi): implement noise shaping using melspectrogram
        raise NotImplementedError("currently, support only world and mcep.")
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()
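
# NOTE: find_files() used above is project-local; a minimal sketch that walks
# a directory tree and collects files matching a glob pattern:
import fnmatch
import os

def find_files(directory, pattern="*.wav", use_dir_name=True):
    """Recursively collect files under `directory` matching `pattern`."""
    files = []
    for root, _, filenames in os.walk(directory, followlinks=True):
        for filename in fnmatch.filter(filenames, pattern):
            files.append(os.path.join(root, filename))
    if not use_dir_name:
        files = [f.replace(directory + "/", "") for f in files]
    return files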
Example #9
    def get_dataloader(self):
        args = self.args
        # define transforms
        scaler = StandardScaler()
        scaler.mean_ = read_hdf5(args.stats, "/" + args.feature_type + "/mean")
        scaler.scale_ = read_hdf5(args.stats,
                                  "/" + args.feature_type + "/scale")
        wav_transform = transforms.Compose(
            [lambda x: encode_mu_law(x, args.n_quantize)])
        feat_transform = transforms.Compose([lambda x: scaler.transform(x)])

        # define generator
        if os.path.isdir(args.waveforms):
            filenames = sorted(
                find_files(args.waveforms, "*.wav", use_dir_name=False))
            wav_list_train = [
                args.waveforms + "/" + filename for filename in filenames
            ]
            feat_list_train = [
                args.feats + "/" + filename.replace(".wav", ".h5")
                for filename in filenames
            ]

        elif os.path.isfile(args.waveforms):
            wav_list_train = read_txt(args.waveforms)
            feat_list_train = read_txt(args.feats)
        else:
            logging.error("--waveforms should be directory or list.")
            sys.exit(1)
        assert len(wav_list_train) == len(feat_list_train)
        logging.info("number of training data = %d." % len(wav_list_train))
        generator = data_generator(
            wav_list_train,
            feat_list_train,
            receptive_field=self.model.receptive_field,
            batch_length=args.batch_length,
            batch_size=args.batch_size,
            feature_type=args.feature_type,
            wav_transform=wav_transform,
            feat_transform=feat_transform,
            shuffle=True,
            upsampling_factor=args.upsampling_factor,
            use_upsampling_layer=args.use_upsampling_layer,
            use_speaker_code=args.use_speaker_code,
            use_pulse=args.use_pulse)

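        # NOTE: wav_list_test / feat_list_test below are not defined in this
        # snippet; they are assumed to be a held-out split prepared the same
        # way as the training lists above.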
        test_generator = data_generator(
            wav_list_test[:args.batch_size],
            feat_list_test[:args.batch_size],
            receptive_field=self.model.receptive_field,
            batch_length=args.batch_length,
            batch_size=args.batch_size,
            feature_type=args.feature_type,
            wav_transform=wav_transform,
            feat_transform=feat_transform,
            shuffle=False,
            upsampling_factor=args.upsampling_factor,
            use_upsampling_layer=args.use_upsampling_layer,
            use_speaker_code=args.use_speaker_code,
            use_pulse=args.use_pulse)

        # charge minibatch in queue
        while not generator.queue.full():
            time.sleep(0.1)

        return generator, test_generator
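
# NOTE: the `.queue` attribute polled in get_dataloader() (and the `.next()`
# used in the training loops) comes from wrapping the raw generator in a
# background-prefetch helper; a minimal sketch of such a wrapper:
import queue
import threading

class BackgroundGenerator(object):
    """Prefetch items from a generator in a background thread."""

    def __init__(self, generator, max_prefetch=16):
        self.queue = queue.Queue(max_prefetch)
        self.generator = generator
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        for item in self.generator:
            self.queue.put(item)
        self.queue.put(None)  # sentinel marking exhaustion

    def next(self):
        item = self.queue.get()
        if item is None:
            raise StopIteration
        return item

    __next__ = next

    def __iter__(self):
        return self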
Example #10
def decode_generator(feat_list,
                     batch_size=32,
                     feature_type="world",
                     wav_transform=None,
                     feat_transform=None,
                     upsampling_factor=80,
                     use_upsampling_layer=True,
                     use_speaker_code=False,
                     pulse=True):
    """GENERATE DECODING BATCH.

    Args:
        feat_list (list): List of feature files.
        batch_size (int): Batch size in decoding.
        feature_type (str): Feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.
        pulse (bool): Unused in this variant.

    Returns:
        generator: Generator instance.

    """
    # ---------------------------
    # sample-by-sample generation
    # ---------------------------
    if batch_size == 1:
        for featfile in feat_list:
            x = np.zeros((1))
            h = read_hdf5(featfile, "/" + feature_type)
            if not use_upsampling_layer:
                h = extend_time(h, upsampling_factor)
            if use_speaker_code:
                sc = read_hdf5(featfile, "/speaker_code")
                sc = np.tile(sc, [h.shape[0], 1])
                h = np.concatenate([h, sc], axis=1)

            # perform pre-processing
            if wav_transform is not None:
                x = wav_transform(x)
            if feat_transform is not None:
                h = feat_transform(h)

            # convert to torch variable
            x = torch.from_numpy(x).long()
            h = torch.from_numpy(h).float()
            x = x.unsqueeze(0)  # 1 => 1 x 1
            h = h.transpose(0, 1).unsqueeze(0)  # T x C => 1 x C x T

            # send to cuda
            if torch.cuda.is_available():
                x = x.cuda()
                h = h.cuda()

            # get target length and file id
            if not use_upsampling_layer:
                n_samples = h.size(2) - 1
            else:
                n_samples = h.size(2) * upsampling_factor - 1
            feat_id = os.path.basename(featfile).replace(".h5", "")

            yield feat_id, (x, h, n_samples)

    # ----------------
    # batch generation
    # ----------------
    else:
        # sort with the feature length
        shape_list = [shape_hdf5(f, "/" + feature_type)[0] for f in feat_list]
        idx = np.argsort(shape_list)
        feat_list = [feat_list[i] for i in idx]

        # divide into batch list
        n_batch = math.ceil(len(feat_list) / batch_size)
        batch_lists = np.array_split(feat_list, n_batch)
        batch_lists = [f.tolist() for f in batch_lists]

        for batch_list in batch_lists:
            batch_x = []
            batch_h = []
            n_samples_list = []
            feat_ids = []
            for featfile in batch_list:
                # make seed waveform and load aux feature
                x = np.zeros((1))
                h = read_hdf5(featfile, "/" + feature_type)
                if not use_upsampling_layer:
                    h = extend_time(h, upsampling_factor)
                if use_speaker_code:
                    sc = read_hdf5(featfile, "/speaker_code")
                    sc = np.tile(sc, [h.shape[0], 1])
                    h = np.concatenate([h, sc], axis=1)

                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                # append to list
                batch_x += [x]
                batch_h += [h]
                if not use_upsampling_layer:
                    n_samples_list += [h.shape[0] - 1]
                else:
                    n_samples_list += [h.shape[0] * upsampling_factor - 1]
                feat_ids += [os.path.basename(featfile).replace(".h5", "")]

            # convert list to ndarray
            batch_x = np.stack(batch_x, axis=0)
            batch_h = pad_list(batch_h)

            # convert to torch variable
            batch_x = torch.from_numpy(batch_x).long()
            batch_h = torch.from_numpy(batch_h).float().transpose(1, 2)

            # send to cuda
            if torch.cuda.is_available():
                batch_x = batch_x.cuda()
                batch_h = batch_h.cuda()

            yield feat_ids, (batch_x, batch_h, n_samples_list)
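
# NOTE: shape_hdf5() used for length-sorting above is project-local; a minimal
# sketch that reads a dataset's shape without loading its contents:
import h5py

def shape_hdf5(path, key):
    """Return the shape of a dataset in an hdf5 file."""
    with h5py.File(path, "r") as f:
        return f[key].shape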
Example #11
def train_generator(wav_list,
                    feat_list,
                    receptive_field,
                    batch_length=None,
                    batch_size=1,
                    feature_type="world",
                    wav_transform=None,
                    feat_transform=None,
                    pulse_transform=p_trans_binary_multi_channel,
                    shuffle=True,
                    upsampling_factor=80,
                    use_upsampling_layer=True,
                    use_speaker_code=False,
                    use_pulse=True):
    """GENERATE TRAINING BATCH.

    Args:
        wav_list (list): List of wav files.
        feat_list (list): List of feat files.
        receptive_field (int): Size of receptive field.
        batch_length (int): Batch length (if set None, utterance batch will be used.).
        batch_size (int): Batch size (if batch_length = None, batch_size will be 1.).
        feature_type (str): Auxiliary feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        shuffle (bool): Whether to shuffle the file list.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.
        use_pulse (bool): Whether to use pulse signal.

    Returns:
        generator: Generator instance.

    """
    # shuffle list
    if shuffle:
        n_files = len(wav_list)
        idx = np.random.permutation(n_files)
        wav_list = [wav_list[i] for i in idx]
        feat_list = [feat_list[i] for i in idx]

    # check batch_length
    if batch_length is not None and use_upsampling_layer:
        batch_mod = (receptive_field + batch_length) % upsampling_factor
        if batch_mod > 0:
            logging.warning(
                "batch length is decreased due to upsampling (%d -> %d)" %
                (batch_length, batch_length - batch_mod))
        batch_length -= batch_mod

    # show warning
    if batch_length is None and batch_size > 1:
        logging.warning("in utterance batch mode, batchsize will be 1.")

    while True:
        batch_x, batch_p, batch_h, batch_t = [], [], [], []
        # process over all files
        for wavfile, featfile in zip(wav_list, feat_list):
            # load waveform (16-bit PCM scaled to [-1, 1)) and aux features
            fs, data = wf.read(wavfile)
            x = data.astype(np.float32) / 32768
            h = read_hdf5(featfile, "/" + feature_type)
            p = read_hdf5(featfile, "/world_pulse")
            if pulse_transform is not None:
                p = pulse_transform(p)

            if not use_upsampling_layer:
                h = extend_time(h, upsampling_factor)

            if use_speaker_code:
                sc = read_hdf5(featfile, "/speaker_code")
                sc = np.tile(sc, [h.shape[0], 1])
                h = np.concatenate([h, sc], axis=1)

            # check both lengths are same
            logging.debug("before x length = %d" % x.shape[0])
            logging.debug("before h length = %d" % h.shape[0])
            if use_upsampling_layer:
                x, h = validate_length(x, h, upsampling_factor)
            else:
                x, h = validate_length(x, h)
            logging.debug("after x length = %d" % x.shape[0])
            logging.debug("after h length = %d" % h.shape[0])

            # ---------------------------------------
            # use mini batch without upsampling layer
            # ---------------------------------------
            if batch_length is not None and not use_upsampling_layer:
                raise NotImplementedError

            # ------------------------------------
            # use mini batch with upsampling layer
            # ------------------------------------
            elif batch_length is not None and use_upsampling_layer:
                # make buffer array
                if "x_buffer" not in locals():
                    x_buffer = np.empty((0), dtype=np.float32)
                    p_buffer = np.empty((0, p.shape[1]), dtype=np.float32)
                    h_buffer = np.empty((0, h.shape[1]), dtype=np.float32)
                x_buffer = np.concatenate([x_buffer, x], axis=0)
                p_buffer = np.concatenate([p_buffer, p], axis=0)
                h_buffer = np.concatenate([h_buffer, h], axis=0)

                while len(h_buffer) > (receptive_field +
                                       batch_length) // upsampling_factor:
                    # set batch size
                    h_bs = (receptive_field +
                            batch_length) // upsampling_factor
                    x_bs = h_bs * upsampling_factor + 1
                    p_bs = h_bs * upsampling_factor + 1

                    # get pieces
                    h_ = h_buffer[:h_bs]
                    x_ = x_buffer[:x_bs]
                    p_ = p_buffer[:p_bs]

                    # perform pre-processing
                    if wav_transform is not None:
                        x_ = wav_transform(x_)
                    if feat_transform is not None:
                        h_ = feat_transform(h_)

                    if use_pulse:
                        h_ = np.concatenate(
                            [h_[:, 0:1], h_[:, 2:]], axis=1
                        )  # remove cont_f0_lpf (vuv[1]+mcep[25]+ap_code[1])
                        # h_ = np.concatenate([h_[:, 0:1], h_[:, -1:]], axis=1)  # remove cont_f0_lpf and mcep (vuv[1]+ap_code[1])
                        # mcep = h_[:, 1:-2]  # extract mcep

                    # convert to torch variable
                    x_ = torch.from_numpy(x_).long()
                    p_ = torch.from_numpy(p_).float()
                    h_ = torch.from_numpy(h_).float()

                    # remove the last and first sample for training
                    batch_h += [h_.transpose(0, 1)]  # (D x T)
                    batch_x += [x_[:-1]]  # (T)
                    batch_p += [p_[:-1].transpose(0, 1)]  # (C x T)
                    batch_t += [x_[1:]]  # (T)

                    # set shift size
                    h_ss = batch_length // upsampling_factor
                    x_ss = h_ss * upsampling_factor
                    p_ss = h_ss * upsampling_factor

                    # update buffer
                    h_buffer = h_buffer[h_ss:]
                    x_buffer = x_buffer[x_ss:]
                    p_buffer = p_buffer[p_ss:]

                    # return mini batch
                    if len(batch_x) == batch_size:
                        batch_x = torch.stack(batch_x)
                        batch_p = torch.stack(batch_p)
                        batch_h = torch.stack(batch_h)
                        batch_t = torch.stack(batch_t)

                        # send to cuda
                        if torch.cuda.is_available():
                            batch_x = batch_x.cuda()
                            batch_p = batch_p.cuda()
                            batch_h = batch_h.cuda()
                            batch_t = batch_t.cuda()

                        yield (batch_x, batch_h, batch_p), batch_t

                        batch_x, batch_h, batch_p, batch_t = [], [], [], []

            # --------------------------------------------
            # use utterance batch without upsampling layer
            # --------------------------------------------
            elif batch_length is None and not use_upsampling_layer:
                raise NotImplementedError

            # -----------------------------------------
            # use utterance batch with upsampling layer
            # -----------------------------------------
            else:
                raise NotImplementedError

        # re-shuffle
        if shuffle:
            idx = np.random.permutation(n_files)
            wav_list = [wav_list[i] for i in idx]
            feat_list = [feat_list[i] for i in idx]
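
# Illustrative call of the pulse-conditioned generator above; the file paths
# and receptive_field value are placeholders, and feat_transform is omitted
# for brevity:
gen = train_generator(
    ["data/0001.wav"], ["feats/0001.h5"],
    receptive_field=1024,
    batch_length=20000,
    batch_size=2,
    wav_transform=lambda x: encode_mu_law(x, 256),
    use_pulse=True)
(batch_x, batch_h, batch_p), batch_t = next(gen)
# batch_x / batch_t: (B x T) mu-law indices (inputs / one-step-ahead targets)
# batch_h: (B x D x T') conditioning frames, batch_p: (B x C x T) pulse channels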