def noise_shaping(wav_list, args):
    """APPLY NOISE SHAPING BASED ON MLSA FILTER."""
    # load coefficients of the filter
    if check_hdf5(args.stats, "/mlsa/coef"):
        mlsa_coef = read_hdf5(args.stats, "/mlsa/coef")
        alpha = read_hdf5(args.stats, "/mlsa/alpha")
    else:
        raise KeyError("\"/mlsa/coef\" is not found in %s." % args.stats)
    if args.inv:
        mlsa_coef *= -1.0

    # define synthesizer
    shiftl = int(args.fs / 1000 * args.shiftms)
    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(order=mlsa_coef.shape[0] - 1, alpha=alpha),
        hopsize=shiftl)

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")
        x = np.float64(x)

        # check sampling frequency
        if not fs == args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # replicate coef for time-invariant filtering
        num_frames = int(len(x) / shiftl) + 1
        mlsa_coefs = np.float64(np.tile(mlsa_coef, [num_frames, 1]))

        # synthesize and write
        x_ns = synthesizer.synthesis(x, mlsa_coefs)
        write_name = args.outdir + "/" + os.path.basename(wav_name)
        wavfile.write(write_name, args.fs, np.int16(x_ns))
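
# Usage sketch for noise_shaping() above. The paths and Namespace fields below
# are hypothetical; only the fields the function actually reads are populated.
def _example_noise_shaping():
    """Hedged example: run noise shaping over one file with assumed paths."""
    from argparse import Namespace
    args = Namespace(stats="exp/stats.h5", outdir="exp/wav_ns",
                     fs=16000, shiftms=5, inv=False)
    noise_shaping(["data/wav/sample.wav"], args)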
def test_preprocessing(feature_type):
    # make arguments
    args = make_args(feature_type=feature_type)

    # prepare dummy wav files
    wavdir = "tmp/wav"
    if not os.path.exists(wavdir):
        os.makedirs(wavdir)
    for i in range(5):
        make_dummy_wav(wavdir + "/%d.wav" % i, 8000, args.fs)

    # feature extraction
    wav_list = find_files(wavdir, "*.wav")
    if not os.path.exists(args.wavdir):
        os.makedirs(args.wavdir)
    if args.feature_type == "world":
        world_feature_extract(wav_list, args)
    elif args.feature_type == "melspc":
        melspectrogram_extract(wav_list, args)
    else:
        melcepstrum_extract(wav_list, args)

    # calculate statistics
    file_list = find_files(args.hdf5dir, "*.h5")
    calc_stats(file_list, args)

    # noise shaping
    if feature_type != "melspc":
        wav_list = find_files(args.wavdir, "*.wav")
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
        if not check_hdf5(args.stats, "/mlsa/coef"):
            avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
            if args.feature_type == "world":
                avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
            mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha)
            write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
            write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)
        noise_shaping(wav_list, args)

    # remove temporary files
    shutil.rmtree("tmp")
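
# make_dummy_wav() is assumed by the test above; a plausible sketch writing
# `length` random 16-bit PCM samples at sampling rate `fs`. The signature is
# inferred from the call site and is not necessarily the repo's implementation.
def _example_make_dummy_wav(path, length, fs):
    """Hedged sketch: write random int16 noise as a wav file."""
    import numpy as np
    from scipy.io import wavfile
    x = np.random.randint(-32768, 32768, size=length, dtype=np.int16)
    wavfile.write(path, fs, x)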
def calc_stats(file_list, args):
    """CALCULATE STATISTICS."""
    scaler = StandardScaler()

    # process over all of the data
    for i, filename in enumerate(file_list):
        logging.info("now processing %s (%d/%d)" % (filename, i + 1, len(file_list)))
        feat = read_hdf5(filename, "/" + args.feature_type)
        scaler.partial_fit(feat)

    # neutralize normalization of the U/V term (mean 0, scale 1
    # so the binary flag passes through the scaler unchanged)
    mean = scaler.mean_
    scale = scaler.scale_
    if args.feature_type == "world":
        mean[0] = 0.0
        scale[0] = 1.0

    # write to hdf5
    write_hdf5(args.stats, "/" + args.feature_type + "/mean", np.float32(mean))
    write_hdf5(args.stats, "/" + args.feature_type + "/scale", np.float32(scale))
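
# StandardScaler.partial_fit accumulates running mean/variance, so feeding
# utterances one by one matches fitting the concatenated features; a small
# self-contained check on synthetic data (not repo features):
def _example_partial_fit_equivalence():
    """Streaming stats equal full-batch stats up to numerical precision."""
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    chunks = [np.random.randn(100, 28) for _ in range(5)]
    streaming = StandardScaler()
    for c in chunks:
        streaming.partial_fit(c)
    full = StandardScaler().fit(np.concatenate(chunks, axis=0))
    assert np.allclose(streaming.mean_, full.mean_)
    assert np.allclose(streaming.scale_, full.scale_)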
def train_generator(wav_list, feat_list, receptive_field,
                    batch_length=None,
                    batch_size=1,
                    feature_type="world",
                    wav_transform=None,
                    feat_transform=None,
                    shuffle=True,
                    upsampling_factor=80,
                    use_upsampling_layer=True,
                    use_speaker_code=False):
    """GENERATE TRAINING BATCH.

    Args:
        wav_list (list): List of wav files.
        feat_list (list): List of feat files.
        receptive_field (int): Size of receptive field.
        batch_length (int): Batch length (if set None, utterance batch will be used).
        batch_size (int): Batch size (if batch_length = None, batch_size will be 1).
        feature_type (str): Auxiliary feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        shuffle (bool): Whether to shuffle the file list.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.

    Returns:
        generator: Generator instance.

    """
    # shuffle list
    if shuffle:
        n_files = len(wav_list)
        idx = np.random.permutation(n_files)
        wav_list = [wav_list[i] for i in idx]
        feat_list = [feat_list[i] for i in idx]

    # check batch_length
    if batch_length is not None and use_upsampling_layer:
        batch_mod = (receptive_field + batch_length) % upsampling_factor
        logging.warning("batch length is decreased due to upsampling (%d -> %d)" % (
            batch_length, batch_length - batch_mod))
        batch_length -= batch_mod

    # show warning
    if batch_length is None and batch_size > 1:
        logging.warning("in utterance batch mode, batch_size will be 1.")

    while True:
        batch_x, batch_h, batch_t = [], [], []
        # process over all of the files
        for wavfile, featfile in zip(wav_list, feat_list):
            # load waveform and aux feature
            x, fs = sf.read(wavfile, dtype=np.float32)
            h = read_hdf5(featfile, "/" + feature_type)
            if not use_upsampling_layer:
                h = extend_time(h, upsampling_factor)
            if use_speaker_code:
                sc = read_hdf5(featfile, "/speaker_code")
                sc = np.tile(sc, [h.shape[0], 1])
                h = np.concatenate([h, sc], axis=1)

            # check that both lengths are consistent
            logging.debug("before x length = %d" % x.shape[0])
            logging.debug("before h length = %d" % h.shape[0])
            if use_upsampling_layer:
                x, h = validate_length(x, h, upsampling_factor)
            else:
                x, h = validate_length(x, h)
            logging.debug("after x length = %d" % x.shape[0])
            logging.debug("after h length = %d" % h.shape[0])

            # ---------------------------------------
            # use mini batch without upsampling layer
            # ---------------------------------------
            if batch_length is not None and not use_upsampling_layer:
                # make buffer arrays
                if "x_buffer" not in locals():
                    x_buffer = np.empty((0), dtype=np.float32)
                    h_buffer = np.empty((0, h.shape[1]), dtype=np.float32)
                x_buffer = np.concatenate([x_buffer, x], axis=0)
                h_buffer = np.concatenate([h_buffer, h], axis=0)
                while len(x_buffer) > receptive_field + batch_length:
                    # get pieces
                    x_ = x_buffer[:receptive_field + batch_length]
                    h_ = h_buffer[:receptive_field + batch_length]

                    # perform pre-processing
                    if wav_transform is not None:
                        x_ = wav_transform(x_)
                    if feat_transform is not None:
                        h_ = feat_transform(h_)

                    # convert to torch tensors
                    x_ = torch.from_numpy(x_).long()
                    h_ = torch.from_numpy(h_).float()

                    # input is x[:-1], target is x[1:] (one-sample shift)
                    batch_x += [x_[:-1]]  # (T)
                    batch_h += [h_[:-1].transpose(0, 1)]  # (D x T)
                    batch_t += [x_[1:]]  # (T)

                    # update buffers
                    x_buffer = x_buffer[batch_length:]
                    h_buffer = h_buffer[batch_length:]

                    # return mini batch
                    if len(batch_x) == batch_size:
                        batch_x = torch.stack(batch_x)
                        batch_h = torch.stack(batch_h)
                        batch_t = torch.stack(batch_t)

                        # send to cuda
                        if torch.cuda.is_available():
                            batch_x = batch_x.cuda()
                            batch_h = batch_h.cuda()
                            batch_t = batch_t.cuda()

                        yield (batch_x, batch_h), batch_t
                        batch_x, batch_h, batch_t = [], [], []

            # ------------------------------------
            # use mini batch with upsampling layer
            # ------------------------------------
            elif batch_length is not None and use_upsampling_layer:
                # make buffer arrays
                if "x_buffer" not in locals():
                    x_buffer = np.empty((0), dtype=np.float32)
                    h_buffer = np.empty((0, h.shape[1]), dtype=np.float32)
                x_buffer = np.concatenate([x_buffer, x], axis=0)
                h_buffer = np.concatenate([h_buffer, h], axis=0)
                while len(h_buffer) > (receptive_field + batch_length) // upsampling_factor:
                    # set batch size
                    h_bs = (receptive_field + batch_length) // upsampling_factor
                    x_bs = h_bs * upsampling_factor + 1

                    # get pieces
                    h_ = h_buffer[:h_bs]
                    x_ = x_buffer[:x_bs]

                    # perform pre-processing
                    if wav_transform is not None:
                        x_ = wav_transform(x_)
                    if feat_transform is not None:
                        h_ = feat_transform(h_)

                    # convert to torch tensors
                    x_ = torch.from_numpy(x_).long()
                    h_ = torch.from_numpy(h_).float()

                    # input is x[:-1], target is x[1:] (one-sample shift)
                    batch_h += [h_.transpose(0, 1)]  # (D x T)
                    batch_x += [x_[:-1]]  # (T)
                    batch_t += [x_[1:]]  # (T)

                    # set shift sizes
                    h_ss = batch_length // upsampling_factor
                    x_ss = h_ss * upsampling_factor

                    # update buffers
                    h_buffer = h_buffer[h_ss:]
                    x_buffer = x_buffer[x_ss:]

                    # return mini batch
                    if len(batch_x) == batch_size:
                        batch_x = torch.stack(batch_x)
                        batch_h = torch.stack(batch_h)
                        batch_t = torch.stack(batch_t)

                        # send to cuda
                        if torch.cuda.is_available():
                            batch_x = batch_x.cuda()
                            batch_h = batch_h.cuda()
                            batch_t = batch_t.cuda()

                        yield (batch_x, batch_h), batch_t
                        batch_x, batch_h, batch_t = [], [], []

            # --------------------------------------------
            # use utterance batch without upsampling layer
            # --------------------------------------------
            elif batch_length is None and not use_upsampling_layer:
                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                # convert to torch tensors
                x = torch.from_numpy(x).long()
                h = torch.from_numpy(h).float()

                # input is x[:-1], target is x[1:] (one-sample shift)
                batch_x = x[:-1].unsqueeze(0)  # (1 x T)
                batch_h = h[:-1].transpose(0, 1).unsqueeze(0)  # (1 x D x T)
                batch_t = x[1:].unsqueeze(0)  # (1 x T)

                # send to cuda
                if torch.cuda.is_available():
                    batch_x = batch_x.cuda()
                    batch_h = batch_h.cuda()
                    batch_t = batch_t.cuda()

                yield (batch_x, batch_h), batch_t

            # -----------------------------------------
            # use utterance batch with upsampling layer
            # -----------------------------------------
            else:
                # remove the last frame
                h = h[:-1]
                x = x[:-upsampling_factor + 1]

                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                # convert to torch tensors
                x = torch.from_numpy(x).long()
                h = torch.from_numpy(h).float()

                # input is x[:-1], target is x[1:] (one-sample shift)
                batch_h = h.transpose(0, 1).unsqueeze(0)  # (1 x D x T')
                batch_x = x[:-1].unsqueeze(0)  # (1 x T)
                batch_t = x[1:].unsqueeze(0)  # (1 x T)

                # send to cuda
                if torch.cuda.is_available():
                    batch_x = batch_x.cuda()
                    batch_h = batch_h.cuda()
                    batch_t = batch_t.cuda()

                yield (batch_x, batch_h), batch_t

        # re-shuffle
        if shuffle:
            idx = np.random.permutation(n_files)
            wav_list = [wav_list[i] for i in idx]
            feat_list = [feat_list[i] for i in idx]
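
# Consumer sketch for train_generator(): one teacher-forced step. The lists
# and model are hypothetical; the tensor shapes mirror the comments above.
def _example_consume_train_generator(wav_list, feat_list, model):
    """Hedged example: pull a single mini batch and run the model."""
    gen = train_generator(wav_list, feat_list,
                          receptive_field=model.receptive_field,
                          batch_length=20000, batch_size=2)
    (batch_x, batch_h), batch_t = next(gen)  # x: (B, T), h: (B, D, T'), t: (B, T)
    batch_output = model(batch_x, batch_h)   # assumed to return (B, T, n_quantize)
    return batch_output, batch_t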
def main():
    """RUN TRAINING."""
    parser = argparse.ArgumentParser()
    # path setting
    parser.add_argument("--waveforms", required=True, type=str,
                        help="directory or list of wav files")
    parser.add_argument("--feats", required=True, type=str,
                        help="directory or list of aux feat files")
    parser.add_argument("--stats", required=True, type=str,
                        help="hdf5 file including statistics")
    parser.add_argument("--expdir", required=True, type=str,
                        help="directory to save the model")
    parser.add_argument("--feature_type", default="world",
                        choices=["world", "melspc"], type=str,
                        help="feature type")
    # network structure setting
    parser.add_argument("--n_quantize", default=256, type=int,
                        help="number of quantization levels")
    parser.add_argument("--n_aux", default=28, type=int,
                        help="number of dimensions of aux feats")
    parser.add_argument("--n_resch", default=512, type=int,
                        help="number of channels of residual output")
    parser.add_argument("--n_skipch", default=256, type=int,
                        help="number of channels of skip output")
    parser.add_argument("--dilation_depth", default=10, type=int,
                        help="depth of dilation")
    parser.add_argument("--dilation_repeat", default=1, type=int,
                        help="number of repeats of dilation")
    parser.add_argument("--kernel_size", default=2, type=int,
                        help="kernel size of dilated causal convolution")
    parser.add_argument("--upsampling_factor", default=80, type=int,
                        help="upsampling factor of aux features")
    parser.add_argument("--use_upsampling_layer", default=True, type=strtobool,
                        help="flag to use upsampling layer")
    parser.add_argument("--use_speaker_code", default=False, type=strtobool,
                        help="flag to use speaker code")
    # network training setting
    parser.add_argument("--lr", default=1e-4, type=float,
                        help="learning rate")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="weight decay coefficient")
    parser.add_argument("--batch_length", default=20000, type=int,
                        help="batch length (if set 0, utterance batch will be used)")
    parser.add_argument("--batch_size", default=1, type=int,
                        help="batch size (if utterance batch is used, batch_size will be 1)")
    parser.add_argument("--iters", default=200000, type=int,
                        help="number of iterations")
    # other setting
    parser.add_argument("--checkpoint_interval", default=10000, type=int,
                        help="interval between model checkpoints")
    parser.add_argument("--intervals", default=100, type=int,
                        help="log interval")
    parser.add_argument("--seed", default=1, type=int,
                        help="seed number")
    parser.add_argument("--resume", default=None, nargs="?", type=str,
                        help="model path to restart training")
    parser.add_argument("--n_gpus", default=1, type=int,
                        help="number of gpus")
    parser.add_argument("--verbose", default=1, type=int,
                        help="log level")
    args = parser.parse_args()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARNING,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # make experimental directory
    if not os.path.exists(args.expdir):
        os.makedirs(args.expdir)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # fix slow computation of dilated conv
    # https://github.com/pytorch/pytorch/issues/15054#issuecomment-450191923
    torch.backends.cudnn.benchmark = True

    # save args as conf
    torch.save(args, args.expdir + "/model.conf")

    # define network
    if args.use_upsampling_layer:
        upsampling_factor = args.upsampling_factor
    else:
        upsampling_factor = 0
    model = WaveNet(
        n_quantize=args.n_quantize,
        n_aux=args.n_aux,
        n_resch=args.n_resch,
        n_skipch=args.n_skipch,
        dilation_depth=args.dilation_depth,
        dilation_repeat=args.dilation_repeat,
        kernel_size=args.kernel_size,
        upsampling_factor=upsampling_factor)
    logging.info(model)
    model.apply(initialize)
    model.train()

    if args.n_gpus > 1:
        device_ids = range(args.n_gpus)
        model = torch.nn.DataParallel(model, device_ids)
        model.receptive_field = model.module.receptive_field
        if args.n_gpus > args.batch_size:
            logging.warning("batch size is less than number of gpus.")

    # define optimizer and loss
    optimizer = torch.optim.Adam(
        model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    # define transforms
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + args.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + args.feature_type + "/scale")
    wav_transform = transforms.Compose([
        lambda x: encode_mu_law(x, args.n_quantize)])
    feat_transform = transforms.Compose([
        lambda x: scaler.transform(x)])

    # define generator
    if os.path.isdir(args.waveforms):
        filenames = sorted(find_files(args.waveforms, "*.wav", use_dir_name=False))
        wav_list = [args.waveforms + "/" + filename for filename in filenames]
        feat_list = [args.feats + "/" + filename.replace(".wav", ".h5")
                     for filename in filenames]
    elif os.path.isfile(args.waveforms):
        wav_list = read_txt(args.waveforms)
        feat_list = read_txt(args.feats)
    else:
        logging.error("--waveforms should be directory or list.")
        sys.exit(1)
    assert len(wav_list) == len(feat_list)
    logging.info("number of training data = %d." % len(wav_list))
    generator = train_generator(
        wav_list, feat_list,
        receptive_field=model.receptive_field,
        batch_length=args.batch_length,
        batch_size=args.batch_size,
        feature_type=args.feature_type,
        wav_transform=wav_transform,
        feat_transform=feat_transform,
        shuffle=True,
        upsampling_factor=args.upsampling_factor,
        use_upsampling_layer=args.use_upsampling_layer,
        use_speaker_code=args.use_speaker_code)

    # charge minibatches in the queue
    # NOTE: the generator is assumed to be wrapped by a background-prefetching
    # class exposing .queue and .next(); a bare Python generator has neither.
    while not generator.queue.full():
        time.sleep(0.1)

    # resume model and optimizer
    if args.resume is not None and len(args.resume) != 0:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
        iterations = checkpoint["iterations"]
        if args.n_gpus > 1:
            model.module.load_state_dict(checkpoint["model"])
        else:
            model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        logging.info("restored from %d-iter checkpoint." % iterations)
    else:
        iterations = 0

    # check gpu and then send to gpu
    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()
        for state in optimizer.state.values():
            for key, value in state.items():
                if torch.is_tensor(value):
                    state[key] = value.cuda()
    else:
        logging.error("gpu is not available. please check the setting.")
        sys.exit(1)

    # train
    loss = 0
    total = 0
    for i in six.moves.range(iterations, args.iters):
        start = time.time()
        (batch_x, batch_h), batch_t = generator.next()
        batch_output = model(batch_x, batch_h)
        batch_loss = criterion(
            batch_output[:, model.receptive_field:].contiguous().view(-1, args.n_quantize),
            batch_t[:, model.receptive_field:].contiguous().view(-1))
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        loss += batch_loss.item()
        total += time.time() - start
        logging.debug("batch loss = %.3f (%.3f sec / batch)" % (
            batch_loss.item(), time.time() - start))

        # report progress
        if (i + 1) % args.intervals == 0:
            logging.info("(iter:%d) average loss = %.6f (%.3f sec / batch)" % (
                i + 1, loss / args.intervals, total / args.intervals))
            logging.info("estimated required time = "
                         "{0.days:02}:{0.hours:02}:{0.minutes:02}:{0.seconds:02}"
                         .format(relativedelta(
                             seconds=int((args.iters - (i + 1)) * (total / args.intervals)))))
            loss = 0
            total = 0

        # save intermediate model
        if (i + 1) % args.checkpoint_interval == 0:
            if args.n_gpus > 1:
                save_checkpoint(args.expdir, model.module, optimizer, i + 1)
            else:
                save_checkpoint(args.expdir, model, optimizer, i + 1)

    # save final model
    if args.n_gpus > 1:
        torch.save({"model": model.module.state_dict()},
                   args.expdir + "/checkpoint-final.pkl")
    else:
        torch.save({"model": model.state_dict()},
                   args.expdir + "/checkpoint-final.pkl")
    logging.info("final checkpoint created.")
def decode_generator(feat_list,
                     batch_size=32,
                     feature_type="world",
                     wav_transform=None,
                     feat_transform=None,
                     pulse_transform=p_trans_binary_multi_channel,
                     upsampling_factor=80,
                     use_upsampling_layer=True,
                     use_speaker_code=False,
                     use_pulse=True,
                     f0_transform=None):
    """GENERATE DECODING BATCH.

    Args:
        feat_list (list): List of feature files.
        batch_size (int): Batch size in decoding.
        feature_type (str): Feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        pulse_transform (func): Preprocessing function for the pulse signal.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.
        use_pulse (bool): Whether to use the pulse signal.
        f0_transform (func): Transform applied to F0 before pulse synthesis.

    Returns:
        generator: Generator instance.

    """
    # ---------------------------
    # sample-by-sample generation
    # ---------------------------
    if batch_size == 1:
        raise NotImplementedError

    # ----------------
    # batch generation
    # ----------------
    else:
        # sort by feature length
        shape_list = [shape_hdf5(f, "/" + feature_type)[0] for f in feat_list]
        idx = np.argsort(shape_list)
        feat_list = [feat_list[i] for i in idx]

        # divide into batch list
        n_batch = math.ceil(len(feat_list) / batch_size)
        batch_lists = np.array_split(feat_list, n_batch)
        batch_lists = [f.tolist() for f in batch_lists]

        for batch_list in batch_lists:
            batch_x = []
            batch_h = []
            batch_p = []
            n_samples_list = []
            feat_ids = []
            for featfile in batch_list:
                # make seed waveform and load aux feature
                x = np.zeros((1))
                h = read_hdf5(featfile, "/" + feature_type)
                if f0_transform is not None:
                    f0 = read_hdf5(featfile, "/world_f0")
                    f0 = f0_transform(f0)
                    # NOTE: args is assumed to be available at module scope here
                    fs = args.fs
                    p = pw.synthesize_pulse_new(
                        f0, fs, frame_period=args.shiftms).astype(np.int32)
                    __p = read_hdf5(featfile, "/world_pulse")
                    assert len(p) == len(__p)
                else:
                    p = read_hdf5(featfile, "/world_pulse")
                if pulse_transform is not None:
                    p = pulse_transform(p)
                    assert p.max() <= 1.0
                if not use_upsampling_layer:
                    h = extend_time(h, upsampling_factor)
                if use_speaker_code:
                    sc = read_hdf5(featfile, "/speaker_code")
                    sc = np.tile(sc, [h.shape[0], 1])
                    h = np.concatenate([h, sc], axis=1)

                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)
                if use_pulse:
                    # remove cont_f0_lpf
                    h = np.concatenate([h[:, 0:1], h[:, 2:]], axis=1)

                # append to list
                batch_x += [x]
                batch_h += [h]
                batch_p += [p]
                if not use_upsampling_layer:
                    n_samples_list += [h.shape[0] - 1]
                else:
                    n_samples_list += [h.shape[0] * upsampling_factor - 1]
                feat_ids += [os.path.basename(featfile).replace(".h5", "")]

            # convert list to ndarray
            batch_x = np.stack(batch_x, axis=0)
            len_p_max = max([len(p) for p in batch_p])
            batch_p = [pad_along_axis(p, len_p_max, axis=0) for p in batch_p]
            batch_p = np.stack(batch_p)
            batch_h = pad_list(batch_h)

            # convert to torch tensors
            batch_x = torch.from_numpy(batch_x).long()  # (B, 1)
            batch_p = torch.from_numpy(batch_p).float().transpose(1, 2)  # (B, C, T)
            batch_h = torch.from_numpy(batch_h).float().transpose(1, 2)  # (B, C, T')

            # send to cuda
            if torch.cuda.is_available():
                batch_x = batch_x.cuda()
                batch_h = batch_h.cuda()
                batch_p = batch_p.cuda()

            yield feat_ids, (batch_x, batch_h, batch_p, n_samples_list)
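
# pad_along_axis() is assumed above for right-padding the pulse signals to a
# common length; a plausible zero-padding sketch (hypothetical helper, the
# repo's implementation may differ):
def _example_pad_along_axis(a, target_length, axis=0):
    """Hedged sketch: zero-pad array `a` along `axis` up to `target_length`."""
    import numpy as np
    pad_size = target_length - a.shape[axis]
    if pad_size <= 0:
        return a
    npad = [(0, 0)] * a.ndim
    npad[axis] = (0, pad_size)
    return np.pad(a, npad, mode="constant", constant_values=0)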
def main(args):
    """RUN DECODING."""
    # set log level
    if args.verbose == 1:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(
            level=logging.WARNING,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # check arguments
    if args.stats is None:
        args.stats = os.path.dirname(args.checkpoint) + "/stats.h5"
    if args.config is None:
        args.config = os.path.dirname(args.checkpoint) + "/model.conf"
    if not os.path.exists(args.stats):
        raise FileNotFoundError("statistics file is missing (%s)." % args.stats)
    if not os.path.exists(args.config):
        raise FileNotFoundError("config file is missing (%s)." % args.config)

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # fix seed
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # fix slow computation of dilated conv
    # https://github.com/pytorch/pytorch/issues/15054#issuecomment-450191923
    torch.backends.cudnn.benchmark = True

    # load config
    config = torch.load(args.config)

    # get file list
    if os.path.isdir(args.feats):
        feat_list = sorted(find_files(args.feats, "*.h5"))
    elif os.path.isfile(args.feats):
        feat_list = read_txt(args.feats)
    else:
        logging.error("--feats should be directory or list.")
        sys.exit(1)

    # prepare the file list for parallel decoding
    feat_lists = np.array_split(feat_list, args.n_gpus)
    feat_lists = [f_list.tolist() for f_list in feat_lists]

    # define transforms
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + config.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + config.feature_type + "/scale")
    wav_transform = transforms.Compose(
        [lambda x: encode_mu_law(x, config.n_quantize)])
    feat_transform = transforms.Compose([lambda x: scaler.transform(x)])
    f0_transform = transforms.Compose(
        [partial(shift_semi_tone_f0_pulse, shift=args.f0_shift)])

    # define gpu decode function
    def gpu_decode(feat_list, gpu):
        # set default gpu and do not track gradient
        torch.cuda.set_device(gpu)
        torch.set_grad_enabled(False)

        # define model and load parameters
        if config.use_upsampling_layer:
            upsampling_factor = config.upsampling_factor
        else:
            upsampling_factor = 0
        if args.use_pulse:
            _WaveNet = WaveNetPulse
        else:
            _WaveNet = WaveNet
        # NOTE: n_aux is hard-coded here, overriding the value stored in model.conf
        config.n_aux = 28
        model = _WaveNet(
            n_quantize=config.n_quantize,
            n_aux=config.n_aux,
            n_resch=config.n_resch,
            n_skipch=config.n_skipch,
            dilation_depth=config.dilation_depth,
            dilation_repeat=config.dilation_repeat,
            kernel_size=config.kernel_size,
            upsampling_factor=upsampling_factor)
        model.load_state_dict(
            torch.load(args.checkpoint,
                       map_location=lambda storage, loc: storage)["model"])
        model.eval()
        model.cuda()
        logging.info("use_pulse = %s" % args.use_pulse)

        # define generator
        generator = decode_generator(
            feat_list,
            batch_size=args.batch_size,
            feature_type=config.feature_type,
            wav_transform=wav_transform,
            feat_transform=feat_transform,
            f0_transform=f0_transform,
            upsampling_factor=config.upsampling_factor,
            use_upsampling_layer=config.use_upsampling_layer,
            use_speaker_code=config.use_speaker_code,
            use_pulse=args.use_pulse)

        # decode
        if args.batch_size > 1:
            for feat_ids, (batch_x, batch_h, batch_p, n_samples_list) in generator:
                logging.info("decoding start")
                samples_list = model.batch_fast_generate(
                    batch_x, batch_h, n_samples_list, batch_p,
                    intervals=args.intervals)
                for feat_id, samples in zip(feat_ids, samples_list):
                    wav = decode_mu_law(samples, config.n_quantize)
                    sf.write(args.outdir + "/" + feat_id + ".wav",
                             wav, args.fs, "PCM_16")
                    logging.info("wrote %s.wav in %s." % (feat_id, args.outdir))
        else:
            raise NotImplementedError

    # parallel decode
    processes = []
    for gpu, feat_list in enumerate(feat_lists):
        p = mp.Process(target=gpu_decode, args=(feat_list, gpu,))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()
def main():
    """RUN NOISE SHAPING IN PARALLEL."""
    parser = argparse.ArgumentParser(
        description="apply noise shaping to wav files.")
    parser.add_argument(
        "--waveforms", default=None,
        help="directory or list of filenames of input wavfiles")
    parser.add_argument(
        "--stats", default=None,
        help="filename of hdf5 format")
    parser.add_argument(
        "--outdir", default=None,
        help="directory to save preprocessed wav files")
    parser.add_argument(
        "--fs", default=16000, type=int,
        help="sampling frequency")
    parser.add_argument(
        "--shiftms", default=5, type=float,
        help="frame shift in msec")
    parser.add_argument(
        "--feature_type", default="world",
        choices=["world", "mcep", "melspc"], type=str,
        help="feature type")
    parser.add_argument(
        "--mcep_dim_start", default=2, type=int,
        help="start index of mel cepstrum")
    parser.add_argument(
        "--mcep_dim_end", default=27, type=int,
        help="end index of mel cepstrum")
    parser.add_argument(
        "--mcep_alpha", default=0.41, type=float,
        help="alpha of mel cepstrum")
    parser.add_argument(
        "--mag", default=0.5, type=float,
        help="magnification of noise shaping")
    parser.add_argument(
        "--verbose", default=1, type=int,
        help="log message level")
    parser.add_argument(
        '--n_jobs', default=10, type=int,
        help="number of parallel jobs")
    parser.add_argument(
        '--inv', default=False, type=strtobool,
        help="if True, inverse filtering will be performed")
    args = parser.parse_args()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARNING,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # divide list
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]

    # calculate MLSA coef and save it
    if not check_hdf5(args.stats, "/mlsa/coef"):
        avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
        if args.feature_type == "world":
            avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
        mlsa_coef = convert_mcep_to_mlsa_coef(avg_mcep, args.mag, args.mcep_alpha)
        write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
        write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)

    # multi processing
    processes = []
    if args.feature_type == "melspc":
        # TODO(kan-bayashi): implement noise shaping using mel-spectrogram
        raise NotImplementedError("currently, only world and mcep are supported.")
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()
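
# convert_mcep_to_mlsa_coef() builds the time-invariant MLSA filter from the
# average mel-cepstrum; a plausible sketch under the assumption that the
# filter is the mag-scaled average mel-cepstrum with the gain term zeroed,
# converted to MLSA digital-filter coefficients. The repo's implementation
# may differ in detail.
def _example_convert_mcep_to_mlsa_coef(avg_mcep, mag, alpha):
    """Hedged sketch: scale average mel-cepstrum into MLSA filter coef."""
    import numpy as np
    import pysptk
    mcep = mag * np.asarray(avg_mcep, dtype=np.float64)
    mcep[0] = 0.0  # assumption: drop the log-gain term so only the envelope shapes noise
    return pysptk.mc2b(mcep, alpha)  # mel-cepstrum -> MLSA filter coefficients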
def get_dataloader(self):
    args = self.args

    # define transforms
    scaler = StandardScaler()
    scaler.mean_ = read_hdf5(args.stats, "/" + args.feature_type + "/mean")
    scaler.scale_ = read_hdf5(args.stats, "/" + args.feature_type + "/scale")
    wav_transform = transforms.Compose(
        [lambda x: encode_mu_law(x, args.n_quantize)])
    feat_transform = transforms.Compose([lambda x: scaler.transform(x)])

    # define generator
    if os.path.isdir(args.waveforms):
        filenames = sorted(
            find_files(args.waveforms, "*.wav", use_dir_name=False))
        wav_list_train = [
            args.waveforms + "/" + filename for filename in filenames]
        feat_list_train = [
            args.feats + "/" + filename.replace(".wav", ".h5")
            for filename in filenames]
    elif os.path.isfile(args.waveforms):
        wav_list_train = read_txt(args.waveforms)
        feat_list_train = read_txt(args.feats)
    else:
        logging.error("--waveforms should be directory or list.")
        sys.exit(1)
    assert len(wav_list_train) == len(feat_list_train)
    logging.info("number of training data = %d." % len(wav_list_train))

    # NOTE: no held-out lists are built above (the originals referenced
    # undefined wav_list_test/feat_list_test); as a working assumption the
    # heads of the training lists are reused as a fixed sanity-check batch.
    wav_list_test = wav_list_train
    feat_list_test = feat_list_train

    generator = data_generator(
        wav_list_train,
        feat_list_train,
        receptive_field=self.model.receptive_field,
        batch_length=args.batch_length,
        batch_size=args.batch_size,
        feature_type=args.feature_type,
        wav_transform=wav_transform,
        feat_transform=feat_transform,
        shuffle=True,
        upsampling_factor=args.upsampling_factor,
        use_upsampling_layer=args.use_upsampling_layer,
        use_speaker_code=args.use_speaker_code,
        use_pulse=args.use_pulse)
    test_generator = data_generator(
        wav_list_test[:args.batch_size],
        feat_list_test[:args.batch_size],
        receptive_field=self.model.receptive_field,
        batch_length=args.batch_length,
        batch_size=args.batch_size,
        feature_type=args.feature_type,
        wav_transform=wav_transform,
        feat_transform=feat_transform,
        shuffle=False,
        upsampling_factor=args.upsampling_factor,
        use_upsampling_layer=args.use_upsampling_layer,
        use_speaker_code=args.use_speaker_code,
        use_pulse=args.use_pulse)

    # charge minibatches in the queue
    while not generator.queue.full():
        time.sleep(0.1)

    return generator, test_generator
def decode_generator(feat_list,
                     batch_size=32,
                     feature_type="world",
                     wav_transform=None,
                     feat_transform=None,
                     upsampling_factor=80,
                     use_upsampling_layer=True,
                     use_speaker_code=False,
                     pulse=True):
    """GENERATE DECODING BATCH.

    Args:
        feat_list (list): List of feature files.
        batch_size (int): Batch size in decoding.
        feature_type (str): Feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.
        pulse (bool): Unused in this generator.

    Returns:
        generator: Generator instance.

    """
    # ---------------------------
    # sample-by-sample generation
    # ---------------------------
    if batch_size == 1:
        for featfile in feat_list:
            x = np.zeros((1))
            h = read_hdf5(featfile, "/" + feature_type)
            if not use_upsampling_layer:
                h = extend_time(h, upsampling_factor)
            if use_speaker_code:
                sc = read_hdf5(featfile, "/speaker_code")
                sc = np.tile(sc, [h.shape[0], 1])
                h = np.concatenate([h, sc], axis=1)

            # perform pre-processing
            if wav_transform is not None:
                x = wav_transform(x)
            if feat_transform is not None:
                h = feat_transform(h)

            # convert to torch tensors
            x = torch.from_numpy(x).long()
            h = torch.from_numpy(h).float()
            x = x.unsqueeze(0)  # 1 => 1 x 1
            h = h.transpose(0, 1).unsqueeze(0)  # T x C => 1 x C x T

            # send to cuda
            if torch.cuda.is_available():
                x = x.cuda()
                h = h.cuda()

            # get target length and file id
            if not use_upsampling_layer:
                n_samples = h.size(2) - 1
            else:
                n_samples = h.size(2) * upsampling_factor - 1
            feat_id = os.path.basename(featfile).replace(".h5", "")

            yield feat_id, (x, h, n_samples)

    # ----------------
    # batch generation
    # ----------------
    else:
        # sort by feature length
        shape_list = [shape_hdf5(f, "/" + feature_type)[0] for f in feat_list]
        idx = np.argsort(shape_list)
        feat_list = [feat_list[i] for i in idx]

        # divide into batch list
        n_batch = math.ceil(len(feat_list) / batch_size)
        batch_lists = np.array_split(feat_list, n_batch)
        batch_lists = [f.tolist() for f in batch_lists]

        for batch_list in batch_lists:
            batch_x = []
            batch_h = []
            n_samples_list = []
            feat_ids = []
            for featfile in batch_list:
                # make seed waveform and load aux feature
                x = np.zeros((1))
                h = read_hdf5(featfile, "/" + feature_type)
                if not use_upsampling_layer:
                    h = extend_time(h, upsampling_factor)
                if use_speaker_code:
                    sc = read_hdf5(featfile, "/speaker_code")
                    sc = np.tile(sc, [h.shape[0], 1])
                    h = np.concatenate([h, sc], axis=1)

                # perform pre-processing
                if wav_transform is not None:
                    x = wav_transform(x)
                if feat_transform is not None:
                    h = feat_transform(h)

                # append to list
                batch_x += [x]
                batch_h += [h]
                if not use_upsampling_layer:
                    n_samples_list += [h.shape[0] - 1]
                else:
                    n_samples_list += [h.shape[0] * upsampling_factor - 1]
                feat_ids += [os.path.basename(featfile).replace(".h5", "")]

            # convert list to ndarray
            batch_x = np.stack(batch_x, axis=0)
            batch_h = pad_list(batch_h)

            # convert to torch tensors
            batch_x = torch.from_numpy(batch_x).long()
            batch_h = torch.from_numpy(batch_h).float().transpose(1, 2)

            # send to cuda
            if torch.cuda.is_available():
                batch_x = batch_x.cuda()
                batch_h = batch_h.cuda()

            yield feat_ids, (batch_x, batch_h, n_samples_list)
def train_generator(wav_list, feat_list, receptive_field,
                    batch_length=None,
                    batch_size=1,
                    feature_type="world",
                    wav_transform=None,
                    feat_transform=None,
                    pulse_transform=p_trans_binary_multi_channel,
                    shuffle=True,
                    upsampling_factor=80,
                    use_upsampling_layer=True,
                    use_speaker_code=False,
                    use_pulse=True):
    """GENERATE TRAINING BATCH.

    Args:
        wav_list (list): List of wav files.
        feat_list (list): List of feat files.
        receptive_field (int): Size of receptive field.
        batch_length (int): Batch length (if set None, utterance batch will be used).
        batch_size (int): Batch size (if batch_length = None, batch_size will be 1).
        feature_type (str): Auxiliary feature type.
        wav_transform (func): Preprocessing function for waveform.
        feat_transform (func): Preprocessing function for aux feats.
        pulse_transform (func): Preprocessing function for the pulse signal.
        shuffle (bool): Whether to shuffle the file list.
        upsampling_factor (int): Upsampling factor.
        use_upsampling_layer (bool): Whether to use upsampling layer.
        use_speaker_code (bool): Whether to use speaker code.
        use_pulse (bool): Whether to use the pulse signal.

    Returns:
        generator: Generator instance.

    """
    # shuffle list
    if shuffle:
        n_files = len(wav_list)
        idx = np.random.permutation(n_files)
        wav_list = [wav_list[i] for i in idx]
        feat_list = [feat_list[i] for i in idx]

    # check batch_length
    if batch_length is not None and use_upsampling_layer:
        batch_mod = (receptive_field + batch_length) % upsampling_factor
        logging.warning(
            "batch length is decreased due to upsampling (%d -> %d)"
            % (batch_length, batch_length - batch_mod))
        batch_length -= batch_mod

    # show warning
    if batch_length is None and batch_size > 1:
        logging.warning("in utterance batch mode, batch_size will be 1.")

    while True:
        batch_x, batch_p, batch_h, batch_t = [], [], [], []
        # process over all of the files
        for wavfile, featfile in zip(wav_list, feat_list):
            # load waveform (16 bit PCM assumed) and aux feature
            fs, data = wf.read(wavfile)
            x = data.astype(np.float64) / 32768
            h = read_hdf5(featfile, "/" + feature_type)
            p = read_hdf5(featfile, "/world_pulse")
            if pulse_transform is not None:
                p = pulse_transform(p)
            if not use_upsampling_layer:
                h = extend_time(h, upsampling_factor)
            if use_speaker_code:
                sc = read_hdf5(featfile, "/speaker_code")
                sc = np.tile(sc, [h.shape[0], 1])
                h = np.concatenate([h, sc], axis=1)

            # check that both lengths are consistent
            logging.debug("before x length = %d" % x.shape[0])
            logging.debug("before h length = %d" % h.shape[0])
            if use_upsampling_layer:
                x, h = validate_length(x, h, upsampling_factor)
            else:
                x, h = validate_length(x, h)
            logging.debug("after x length = %d" % x.shape[0])
            logging.debug("after h length = %d" % h.shape[0])

            # ---------------------------------------
            # use mini batch without upsampling layer
            # ---------------------------------------
            if batch_length is not None and not use_upsampling_layer:
                raise NotImplementedError

            # ------------------------------------------------------------
            # use mini batch with upsampling layer (only implemented path)
            # ------------------------------------------------------------
            elif batch_length is not None and use_upsampling_layer:
                # make buffer arrays
                if "x_buffer" not in locals():
                    x_buffer = np.empty((0), dtype=np.float32)
                    p_buffer = np.empty((0, p.shape[1]), dtype=np.float32)
                    h_buffer = np.empty((0, h.shape[1]), dtype=np.float32)
                x_buffer = np.concatenate([x_buffer, x], axis=0)
                p_buffer = np.concatenate([p_buffer, p], axis=0)
                h_buffer = np.concatenate([h_buffer, h], axis=0)
                while len(h_buffer) > (receptive_field + batch_length) // upsampling_factor:
                    # set batch sizes
                    h_bs = (receptive_field + batch_length) // upsampling_factor
                    x_bs = h_bs * upsampling_factor + 1
                    p_bs = h_bs * upsampling_factor + 1

                    # get pieces
                    h_ = h_buffer[:h_bs]
                    x_ = x_buffer[:x_bs]
                    p_ = p_buffer[:p_bs]

                    # perform pre-processing
                    if wav_transform is not None:
                        x_ = wav_transform(x_)
                    if feat_transform is not None:
                        h_ = feat_transform(h_)
                    if use_pulse:
                        # remove cont_f0_lpf (keep vuv[1] + mcep[25] + ap_code[1])
                        h_ = np.concatenate([h_[:, 0:1], h_[:, 2:]], axis=1)

                    # convert to torch tensors
                    x_ = torch.from_numpy(x_).long()
                    p_ = torch.from_numpy(p_).float()
                    h_ = torch.from_numpy(h_).float()

                    # input is x[:-1], target is x[1:] (one-sample shift)
                    batch_h += [h_.transpose(0, 1)]  # (D x T)
                    batch_x += [x_[:-1]]  # (T)
                    batch_p += [p_[:-1].transpose(0, 1)]  # (C x T)
                    batch_t += [x_[1:]]  # (T)

                    # set shift sizes
                    h_ss = batch_length // upsampling_factor
                    x_ss = h_ss * upsampling_factor
                    p_ss = h_ss * upsampling_factor

                    # update buffers
                    h_buffer = h_buffer[h_ss:]
                    x_buffer = x_buffer[x_ss:]
                    p_buffer = p_buffer[p_ss:]

                    # return mini batch
                    if len(batch_x) == batch_size:
                        batch_x = torch.stack(batch_x)
                        batch_p = torch.stack(batch_p)
                        batch_h = torch.stack(batch_h)
                        batch_t = torch.stack(batch_t)

                        # send to cuda
                        if torch.cuda.is_available():
                            batch_x = batch_x.cuda()
                            batch_p = batch_p.cuda()
                            batch_h = batch_h.cuda()
                            batch_t = batch_t.cuda()

                        yield (batch_x, batch_h, batch_p), batch_t
                        batch_x, batch_h, batch_p, batch_t = [], [], [], []

            # --------------------------------------------
            # use utterance batch without upsampling layer
            # --------------------------------------------
            elif batch_length is None and not use_upsampling_layer:
                raise NotImplementedError

            # -----------------------------------------
            # use utterance batch with upsampling layer
            # -----------------------------------------
            else:
                raise NotImplementedError

        # re-shuffle
        if shuffle:
            idx = np.random.permutation(n_files)
            wav_list = [wav_list[i] for i in idx]
            feat_list = [feat_list[i] for i in idx]
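
# Consumer sketch for the pulse-conditioned generator above. The lists and
# model are hypothetical; batches now carry the pulse channel as a third
# input, and the WaveNetPulse call signature is assumed from the decode loop.
def _example_consume_pulse_generator(wav_list, feat_list, model):
    """Hedged example: one training step with pulse conditioning."""
    gen = train_generator(wav_list, feat_list,
                          receptive_field=model.receptive_field,
                          batch_length=20000, batch_size=2, use_pulse=True)
    (batch_x, batch_h, batch_p), batch_t = next(gen)
    # x: (B, T), h: (B, D, T'), p: (B, C, T), t: (B, T)
    batch_output = model(batch_x, batch_h, batch_p)
    return batch_output, batch_t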