def my_app(config: DictConfig) -> None:
    """Extract static features for every utterance in ``config.utt_list``.

    WORLD stream layout is derived from the acoustic config, then one
    extraction job per utterance is fanned out over a process pool.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    utt_list_path = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    utt_ids = load_utt_list(utt_list_path)

    # Stream sizes / dynamic-feature flags for the WORLD feature layout.
    stream_sizes, has_dynamic_features = get_world_stream_info(
        config.acoustic.sample_rate,
        config.acoustic.mgc_order,
        config.acoustic.num_windows,
        config.acoustic.vibrato_mode,
    )

    os.makedirs(out_dir, exist_ok=True)
    with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
        pending = []
        for utt_id in utt_ids:
            pending.append(
                executor.submit(
                    _extract_static_features,
                    in_dir,
                    out_dir,
                    utt_id,
                    config.acoustic.num_windows,
                    stream_sizes,
                    has_dynamic_features,
                )
            )
        # Consume results so worker exceptions are re-raised here.
        for future in tqdm(pending):
            future.result()
def my_app(config: DictConfig) -> None:
    """Training entry point: build model/optimizer/scheduler from the
    Hydra config, optionally resume from a checkpoint, then run the
    training loop.
    """
    global logger
    logger = getLogger(config.verbose)
    # FIX: ``DictConfig.pretty()`` is deprecated and removed in OmegaConf 2.x;
    # use ``OmegaConf.to_yaml`` like the other entry points in this file.
    logger.info(OmegaConf.to_yaml(config))

    if use_cuda:
        from torch.backends import cudnn
        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if config.train.use_detect_anomaly:
        # Slows training; useful for debugging NaN gradients.
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)

    # Optimizer (class chosen by name from torch.optim)
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(),
                                **config.train.optim.optimizer.params)

    # Scheduler (class chosen by name from torch.optim.lr_scheduler)
    lr_scheduler_class = getattr(optim.lr_scheduler,
                                 config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer,
                                      **config.train.optim.lr_scheduler.params)

    data_loaders = get_data_loaders(config)

    # Resume: restore model weights and, optionally, optimizer/scheduler state.
    if config.train.resume.checkpoint is not None and \
            len(config.train.resume.checkpoint) > 0:
        logger.info("Load weights from {}".format(config.train.resume.checkpoint))
        checkpoint = torch.load(to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    # Save model definition alongside checkpoints so inference can rebuild it.
    out_dir = to_absolute_path(config.train.out_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(join(out_dir, "model.yaml"), "w") as f:
        OmegaConf.save(config.model, f)

    # Run training loop
    train_loop(config, device, model, optimizer, lr_scheduler, data_loaders)
def my_app(config: DictConfig) -> None:
    """Apply (or invert) feature normalization for a whole directory.

    Loads a pre-fitted scaler and maps every file under ``config.in_dir``
    into ``config.out_dir`` using a worker pool.
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    source_dir = to_absolute_path(config.in_dir)
    target_dir = to_absolute_path(config.out_dir)
    os.makedirs(target_dir, exist_ok=True)

    # Scaler produced by the fitting step (joblib-serialized).
    scaler = joblib.load(to_absolute_path(config.scaler_path))

    apply_normalization_dir2dir(source_dir, target_dir, scaler,
                                config.inverse, config.num_workers)
def my_app(config: DictConfig) -> None:
    """Fit a feature scaler incrementally over a list of .npy files.

    Reads each path from ``config.list_path``, feeds the array to the
    scaler's ``partial_fit`` (so the full dataset never has to fit in
    memory), then serializes the fitted scaler to ``config.out_path``.
    """
    global logger
    logger = getLogger(config.verbose)
    # FIX: ``DictConfig.pretty()`` is deprecated and removed in OmegaConf 2.x;
    # use ``OmegaConf.to_yaml`` like the other entry points in this file.
    logger.info(OmegaConf.to_yaml(config))

    list_path = to_absolute_path(config.list_path)
    out_path = to_absolute_path(config.out_path)
    scaler = hydra.utils.instantiate(config.scaler)

    with open(list_path) as f:
        for path in f:
            c = np.load(to_absolute_path(path.strip()))
            scaler.partial_fit(c)
    joblib.dump(scaler, out_path)

    if config.verbose > 0:
        # Log the fitted statistics for whichever scaler type was configured.
        if isinstance(scaler, StandardScaler):
            logger.info("mean:\n{}".format(scaler.mean_))
            logger.info("std:\n{}".format(np.sqrt(scaler.var_)))
        if isinstance(scaler, MinMaxScaler):
            logger.info("data min:\n{}".format(scaler.data_min_))
            logger.info("data max:\n{}".format(scaler.data_max_))
def my_app(config: DictConfig) -> None:
    """Run a trained model over every .npy feature file in ``config.in_dir``
    and save the de-normalized (and, if needed, MLPG-reduced) outputs.
    """
    global logger
    logger = getLogger(config.verbose)
    # FIX: ``DictConfig.pretty()`` is deprecated and removed in OmegaConf 2.x;
    # use ``OmegaConf.to_yaml`` like the other entry points in this file.
    logger.info(OmegaConf.to_yaml(config))
    device = torch.device("cuda" if use_cuda else "cpu")

    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])
    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                # FIX: window count was hard-coded to 3; take it from the
                # model config (as the probabilistic variant of this
                # function already does) so models trained with a
                # different num_windows are handled correctly.
                windows = get_windows(model_config.num_windows)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows,
                    model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
def acoustic2world(config: DictConfig, path_timing, path_acoustic, path_f0, path_spcetrogram, path_aperiodicity): """ Acousticの行列のCSVを読んで、WAVファイルとして出力する。 """ # loggerの設定 global logger # pylint: disable=global-statement logger = getLogger(config.verbose) logger.info(OmegaConf.to_yaml(config)) # load labels and question duration_modified_labels = hts.load(path_timing).round_() # CUDAが使えるかどうか # device = 'cuda' if torch.cuda.is_available() else 'cpu' # 各種設定を読み込む typ = 'acoustic' model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml)) # hedファイルを読み取る。 question_path = to_absolute_path(config.question_path) # hts2wav.pyだとこう↓----------------- # これだと各モデルに別個のhedを適用できる。 # if config[typ].question_path is None: # config[typ].question_path = config.question_path # -------------------------------------- # hedファイルを辞書として読み取る。 binary_dict, continuous_dict = hts.load_question_set( question_path, append_hat_for_LL=False) # pitch indices in the input features pitch_idx = len(binary_dict) + 1 # pitch_indices = np.arange(len(binary_dict), len(binary_dict)+3) # pylint: disable=no-member # Acousticの数値を読み取る acoustic_features = np.loadtxt(path_acoustic, delimiter=',', dtype=np.float64) # AcousticからWORLD用のパラメータを取り出す。 f0, spectrogram, aperiodicity = gen_world_params( duration_modified_labels, acoustic_features, binary_dict, continuous_dict, model_config.stream_sizes, model_config.has_dynamic_features, subphone_features=config.acoustic.subphone_features, pitch_idx=pitch_idx, num_windows=model_config.num_windows, post_filter=config.acoustic.post_filter, sample_rate=config.sample_rate, frame_period=config.frame_period, relative_f0=config.acoustic.relative_f0, vibrato_scale=1.0, vuv_threshold=0.3) # csvファイルとしてf0の行列を出力 for path, array in ((path_f0, f0), (path_spcetrogram, spectrogram), (path_aperiodicity, aperiodicity)): np.savetxt(path, array, fmt='%.16f', delimiter=',')
def _score2duration(config: DictConfig, labels):
    """Generate a duration label from a full-score (and timelag) label.

    Args:
        config: Hydra/OmegaConf configuration (uses the ``duration`` section).
        labels: HTS labels to predict durations for.

    Returns:
        The duration prediction from ``predict_duration`` (tuple or ndarray).
    """
    # -----------------------------------------------------
    # Begin: content adapted from nnsvs.bin.synthesis.my_app()
    # -----------------------------------------------------
    # Logger setup (module-level logger shared with the other entry points).
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    typ = 'duration'
    # Use CUDA when available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Stand-in for maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # Stand-in for maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)
    # Load the duration model's configuration and weights.
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    # NOTE(review): checkpoint/scaler paths are used without
    # to_absolute_path here — presumably set_checkpoint/set_normalization_stat
    # already produce absolute paths; confirm before relying on relative CWD.
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # End: content adapted from nnsvs.bin.synthesis.my_app()
    # -----------------------------------------------------

    # -----------------------------------------------------
    # Begin: content adapted from nnsvs.bin.synthesis.synthesis()
    # -----------------------------------------------------
    # Reading full_score_lab here is not needed by the current duration model:
    # labels = hts.load(score_path).round_()
    # timelag = hts.load(timelag_path).round_()

    # Path of the question (.hed) file.
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py this is done per-model instead, which would allow a
    # separate hed file for each model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path

    # Parse the hed file into question dictionaries.
    binary_dict, continuous_dict = \
        hts.load_question_set(question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # f0 conditioning setting.
    log_f0_conditioning = config.log_f0_conditioning

    # Apply the duration model.
    duration = predict_duration(device, labels, model, model_config,
                                in_scaler, out_scaler, binary_dict,
                                continuous_dict, pitch_indices,
                                log_f0_conditioning,
                                force_clip_input_features=False)
    # Return the duration (tuple or ndarray).
    return duration
def timing2acoustic(config: DictConfig, timing_path, acoustic_path):
    """Read a full-context timing label and write the predicted acoustic
    feature matrix as a CSV file.

    Args:
        config: Hydra/OmegaConf configuration (uses the ``acoustic`` section).
        timing_path: input HTS timing label file.
        acoustic_path: output CSV path for the acoustic feature matrix.
    """
    # -----------------------------------------------------
    # Begin: content adapted from nnsvs.bin.synthesis.my_app()
    # -----------------------------------------------------
    # Logger setup (module-level logger shared with the other entry points).
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    typ = 'acoustic'
    # Use CUDA when available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Stand-in for maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # Stand-in for maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)
    # Load the acoustic model's configuration and weights.
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    # NOTE(review): checkpoint/scaler paths are used without
    # to_absolute_path here — presumably set_checkpoint/set_normalization_stat
    # already produce absolute paths; confirm before relying on relative CWD.
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # End: content adapted from nnsvs.bin.synthesis.my_app()
    # -----------------------------------------------------

    # -----------------------------------------------------
    # Begin: content adapted from nnsvs.bin.synthesis.synthesis()
    # -----------------------------------------------------
    # Load the duration-modified labels (rounded to integer frame times).
    duration_modified_labels = hts.load(timing_path).round_()

    # Path of the question (.hed) file.
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py this is done per-model instead, which would allow a
    # separate hed file for each model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path

    # Parse the hed file into question dictionaries.
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # f0 conditioning setting.
    log_f0_conditioning = config.log_f0_conditioning

    # Predict the acoustic features from the timing labels.
    acoustic_features = predict_acoustic(device, duration_modified_labels,
                                         model, model_config, in_scaler,
                                         out_scaler, binary_dict,
                                         continuous_dict,
                                         config.acoustic.subphone_features,
                                         pitch_indices, log_f0_conditioning)

    # Write the acoustic feature matrix as a CSV file.
    np.savetxt(acoustic_path, acoustic_features, delimiter=',')
def my_app(config: DictConfig) -> None:
    """Prepare training features for the timelag, duration and acoustic
    models, and save them as .npy files under ``config.out_dir``.
    """
    global logger
    logger = getLogger(config.verbose)
    # FIX: ``DictConfig.pretty()`` is deprecated and removed in OmegaConf 2.x;
    # use ``OmegaConf.to_yaml`` like the other entry points in this file.
    logger.info(OmegaConf.to_yaml(config))
    utt_list = to_absolute_path(config.utt_list)
    out_dir = to_absolute_path(config.out_dir)
    question_path_general = to_absolute_path(config.question_path)

    # Time-lag model
    # in: musical/linguistic context
    # out: time-lag (i.e. onset time deviation)
    if config.timelag.question_path is not None:
        question_path = config.timelag.question_path
    else:
        question_path = question_path_general
    in_timelag_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        add_frame_features=False,
        subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_timelag_source = TimeLagFeatureSource(
        utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        to_absolute_path(config.timelag.label_phone_align_dir))
    in_timelag = FileSourceDataset(in_timelag_source)
    out_timelag = FileSourceDataset(out_timelag_source)

    # Duration model
    # in: musical/linguistic context
    # out: duration
    if config.duration.question_path is not None:
        question_path = config.duration.question_path
    else:
        question_path = question_path_general
    in_duration_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.duration.label_dir),
        add_frame_features=False,
        subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_duration_source = DurationFeatureSource(
        utt_list, to_absolute_path(config.duration.label_dir))
    in_duration = FileSourceDataset(in_duration_source)
    out_duration = FileSourceDataset(out_duration_source)

    # Acoustic model
    # in: musical/linguistic context
    # out: acoustic features
    if config.acoustic.question_path is not None:
        question_path = config.acoustic.question_path
    else:
        question_path = question_path_general
    in_acoustic_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.acoustic.label_dir),
        question_path,
        add_frame_features=True,
        subphone_features=config.acoustic.subphone_features,
        log_f0_conditioning=config.log_f0_conditioning)
    out_acoustic_source = WORLDAcousticSource(
        utt_list,
        to_absolute_path(config.acoustic.wav_dir),
        to_absolute_path(config.acoustic.label_dir),
        question_path,
        use_harvest=config.acoustic.use_harvest,
        f0_ceil=config.acoustic.f0_ceil,
        f0_floor=config.acoustic.f0_floor,
        frame_period=config.acoustic.frame_period,
        mgc_order=config.acoustic.mgc_order,
        num_windows=config.acoustic.num_windows,
        relative_f0=config.acoustic.relative_f0)
    in_acoustic = FileSourceDataset(in_acoustic_source)
    out_acoustic = FileSourceDataset(out_acoustic_source)

    # Save as files
    in_timelag_root = join(out_dir, "in_timelag")
    out_timelag_root = join(out_dir, "out_timelag")
    in_duration_root = join(out_dir, "in_duration")
    out_duration_root = join(out_dir, "out_duration")
    in_acoustic_root = join(out_dir, "in_acoustic")
    out_acoustic_root = join(out_dir, "out_acoustic")
    for d in [in_timelag_root, out_timelag_root, in_duration_root,
              out_duration_root, in_acoustic_root, out_acoustic_root]:
        if not os.path.exists(d):
            logger.info("mkdirs: {}".format(d))
            os.makedirs(d)

    # Save features for timelag model
    if config.timelag.enabled:
        logger.info("Timelag linguistic feature dim: {}".format(
            in_timelag[0].shape[1]))
        logger.info("Timelag feature dim: {}".format(out_timelag[0].shape[1]))
        for idx in tqdm(range(len(in_timelag))):
            x, y = in_timelag[idx], out_timelag[idx]
            name = splitext(basename(in_timelag.collected_files[idx][0]))[0]
            xpath = join(in_timelag_root, name + "-feats.npy")
            ypath = join(out_timelag_root, name + "-feats.npy")
            np.save(xpath, x, allow_pickle=False)
            np.save(ypath, y, allow_pickle=False)

    # Save features for duration model
    if config.duration.enabled:
        logger.info("Duration linguistic feature dim: {}".format(
            in_duration[0].shape[1]))
        logger.info("Duration feature dim: {}".format(out_duration[0].shape[1]))
        for idx in tqdm(range(len(in_duration))):
            x, y = in_duration[idx], out_duration[idx]
            name = splitext(basename(in_duration.collected_files[idx][0]))[0]
            xpath = join(in_duration_root, name + "-feats.npy")
            ypath = join(out_duration_root, name + "-feats.npy")
            np.save(xpath, x, allow_pickle=False)
            np.save(ypath, y, allow_pickle=False)

    # Save features for acoustic model (also stores the raw waveform).
    if config.acoustic.enabled:
        logger.info("Acoustic linguistic feature dim: {}".format(
            in_acoustic[0].shape[1]))
        logger.info("Acoustic feature dim: {}".format(
            out_acoustic[0][0].shape[1]))
        for idx in tqdm(range(len(in_acoustic))):
            x, (y, wave) = in_acoustic[idx], out_acoustic[idx]
            name = splitext(basename(in_acoustic.collected_files[idx][0]))[0]
            xpath = join(in_acoustic_root, name + "-feats.npy")
            ypath = join(out_acoustic_root, name + "-feats.npy")
            wpath = join(out_acoustic_root, name + "-wave.npy")
            np.save(xpath, x, allow_pickle=False)
            np.save(ypath, y, allow_pickle=False)
            np.save(wpath, wave, allow_pickle=False)
def my_app(config: DictConfig) -> None:
    """End-to-end synthesis entry point.

    Loads the timelag, duration and acoustic models with their scalers,
    then synthesizes a waveform either for every utterance in
    ``config.utt_list`` or for a single ``config.label_path``.
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device(config.device)

    maybe_set_checkpoints_(config)
    maybe_set_normalization_stats_(config)

    # timelag: model config, weights (loaded to CPU storage), and I/O scalers
    timelag_config = OmegaConf.load(to_absolute_path(
        config.timelag.model_yaml))
    timelag_model = hydra.utils.instantiate(timelag_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.timelag.checkpoint),
                            map_location=lambda storage, loc: storage)
    timelag_model.load_state_dict(checkpoint["state_dict"])
    timelag_in_scaler = joblib.load(
        to_absolute_path(config.timelag.in_scaler_path))
    timelag_out_scaler = joblib.load(
        to_absolute_path(config.timelag.out_scaler_path))
    timelag_model.eval()

    # duration: same loading pattern as timelag
    duration_config = OmegaConf.load(
        to_absolute_path(config.duration.model_yaml))
    duration_model = hydra.utils.instantiate(duration_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.duration.checkpoint),
                            map_location=lambda storage, loc: storage)
    duration_model.load_state_dict(checkpoint["state_dict"])
    duration_in_scaler = joblib.load(
        to_absolute_path(config.duration.in_scaler_path))
    duration_out_scaler = joblib.load(
        to_absolute_path(config.duration.out_scaler_path))
    duration_model.eval()

    # acoustic model: same loading pattern
    acoustic_config = OmegaConf.load(
        to_absolute_path(config.acoustic.model_yaml))
    acoustic_model = hydra.utils.instantiate(acoustic_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.acoustic.checkpoint),
                            map_location=lambda storage, loc: storage)
    acoustic_model.load_state_dict(checkpoint["state_dict"])
    acoustic_in_scaler = joblib.load(
        to_absolute_path(config.acoustic.in_scaler_path))
    acoustic_out_scaler = joblib.load(
        to_absolute_path(config.acoustic.out_scaler_path))
    acoustic_model.eval()

    # Run synthesis for each utt.
    question_path = to_absolute_path(config.question_path)
    if config.utt_list is not None:
        # Batch mode: one wav per utterance id listed in the file.
        in_dir = to_absolute_path(config.in_dir)
        out_dir = to_absolute_path(config.out_dir)
        os.makedirs(out_dir, exist_ok=True)
        with open(to_absolute_path(config.utt_list)) as f:
            # Skip blank lines.
            lines = list(filter(lambda s: len(s.strip()) > 0, f.readlines()))
        logger.info("Processes %s utterances...", len(lines))
        for idx in tqdm(range(len(lines))):
            utt_id = lines[idx].strip()
            label_path = join(in_dir, f"{utt_id}.lab")
            if not exists(label_path):
                raise RuntimeError(
                    f"Label file does not exist: {label_path}")
            wav = synthesis(config, device, label_path, question_path,
                            timelag_model, timelag_config, timelag_in_scaler,
                            timelag_out_scaler, duration_model,
                            duration_config, duration_in_scaler,
                            duration_out_scaler, acoustic_model,
                            acoustic_config, acoustic_in_scaler,
                            acoustic_out_scaler)
            # Clamp to the int16 range before optional gain normalization.
            wav = np.clip(wav, -32768, 32767)
            if config.gain_normalize:
                wav = wav / np.max(np.abs(wav)) * 32767
            out_wav_path = join(out_dir, f"{utt_id}.wav")
            wavfile.write(out_wav_path, rate=config.sample_rate,
                          data=wav.astype(np.int16))
    else:
        # Single-file mode: synthesize one label file to one wav.
        assert config.label_path is not None
        logger.info("Process the label file: %s", config.label_path)
        label_path = to_absolute_path(config.label_path)
        out_wav_path = to_absolute_path(config.out_wav_path)
        wav = synthesis(config, device, label_path, question_path,
                        timelag_model, timelag_config, timelag_in_scaler,
                        timelag_out_scaler, duration_model, duration_config,
                        duration_in_scaler, duration_out_scaler,
                        acoustic_model, acoustic_config, acoustic_in_scaler,
                        acoustic_out_scaler)
        # Always peak-normalize to full int16 range in single-file mode.
        wav = wav / np.max(np.abs(wav)) * (2**15 - 1)
        wavfile.write(out_wav_path, rate=config.sample_rate,
                      data=wav.astype(np.int16))
def my_app(config: DictConfig) -> None:
    """Prepare training features for the timelag, duration and acoustic
    models and save them as .npy files, parallelized with a process pool.
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    utt_list = to_absolute_path(config.utt_list)
    out_dir = to_absolute_path(config.out_dir)
    question_path_general = to_absolute_path(config.question_path)

    # Time-lag model
    # in: musical/linguistic context
    # out: time-lag (i.e. onset time deviation)
    # Each model may override the shared question (.hed) file.
    if config.timelag.question_path is not None:
        question_path = config.timelag.question_path
    else:
        question_path = question_path_general
    in_timelag_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        add_frame_features=False,
        subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_timelag_source = TimeLagFeatureSource(
        utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        to_absolute_path(config.timelag.label_phone_align_dir))
    in_timelag = FileSourceDataset(in_timelag_source)
    out_timelag = FileSourceDataset(out_timelag_source)

    # Duration model
    # in: musical/linguistic context
    # out: duration
    if config.duration.question_path is not None:
        question_path = config.duration.question_path
    else:
        question_path = question_path_general
    in_duration_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.duration.label_dir),
        add_frame_features=False,
        subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_duration_source = DurationFeatureSource(
        utt_list, to_absolute_path(config.duration.label_dir))
    in_duration = FileSourceDataset(in_duration_source)
    out_duration = FileSourceDataset(out_duration_source)

    # Acoustic model
    # in: musical/linguistic context
    # out: acoustic features
    if config.acoustic.question_path is not None:
        question_path = config.acoustic.question_path
    else:
        question_path = question_path_general
    in_acoustic_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.acoustic.label_dir),
        question_path,
        add_frame_features=True,
        subphone_features=config.acoustic.subphone_features,
        log_f0_conditioning=config.log_f0_conditioning)
    out_acoustic_source = WORLDAcousticSource(
        utt_list,
        to_absolute_path(config.acoustic.wav_dir),
        to_absolute_path(config.acoustic.label_dir),
        question_path,
        use_harvest=config.acoustic.use_harvest,
        f0_ceil=config.acoustic.f0_ceil,
        f0_floor=config.acoustic.f0_floor,
        frame_period=config.acoustic.frame_period,
        mgc_order=config.acoustic.mgc_order,
        num_windows=config.acoustic.num_windows,
        relative_f0=config.acoustic.relative_f0)
    in_acoustic = FileSourceDataset(in_acoustic_source)
    out_acoustic = FileSourceDataset(out_acoustic_source)

    # Save as files
    in_timelag_root = join(out_dir, "in_timelag")
    out_timelag_root = join(out_dir, "out_timelag")
    in_duration_root = join(out_dir, "in_duration")
    out_duration_root = join(out_dir, "out_duration")
    in_acoustic_root = join(out_dir, "in_acoustic")
    out_acoustic_root = join(out_dir, "out_acoustic")
    for d in [
            in_timelag_root, out_timelag_root, in_duration_root,
            out_duration_root, in_acoustic_root, out_acoustic_root
    ]:
        if not os.path.exists(d):
            logger.info("mkdirs: %s", format(d))
            os.makedirs(d)

    # Save features for timelag model (one worker job per utterance;
    # future.result() re-raises any worker exception here).
    if config.timelag.enabled:
        logger.info("Timelag linguistic feature dim: %s",
                    str(in_timelag[0].shape[1]))
        logger.info("Timelag feature dim: %s", str(out_timelag[0].shape[1]))
        with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
            futures = [
                executor.submit(_prepare_timelag_feature, in_timelag_root,
                                out_timelag_root, in_timelag, out_timelag,
                                idx) for idx in range(len(in_timelag))
            ]
            for future in tqdm(futures):
                future.result()

    # Save features for duration model
    if config.duration.enabled:
        logger.info("Duration linguistic feature dim: %s",
                    str(in_duration[0].shape[1]))
        logger.info("Duration feature dim: %s", str(out_duration[0].shape[1]))
        with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
            futures = [
                executor.submit(_prepare_duration_feature, in_duration_root,
                                out_duration_root, in_duration, out_duration,
                                idx) for idx in range(len(in_duration))
            ]
            for future in tqdm(futures):
                future.result()

    # Save features for acoustic model
    if config.acoustic.enabled:
        logger.info("Acoustic linguistic feature dim: %s",
                    str(in_acoustic[0].shape[1]))
        logger.info("Acoustic feature dim: %s",
                    str(out_acoustic[0][0].shape[1]))
        with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
            futures = [
                executor.submit(_prepare_acoustic_feature, in_acoustic_root,
                                out_acoustic_root, in_acoustic, out_acoustic,
                                idx) for idx in range(len(in_acoustic))
            ]
            for future in tqdm(futures):
                future.result()
def hts2wav(config: DictConfig, label_path: str = None,
            out_wav_path: str = None) -> None:
    """Read settings from the config and generate a WAV file from a lab file.

    Differences from the original ``my_app``:
    - allows specifying the bit depth (via ``generate_wav_file``);
    - processes a single file instead of using ``utt_list``;
    - disables gain normalization in single-file mode.

    Also writes intermediate files next to the wav: a simplified timing
    label (``*_timing.lab``) and raw float64 dumps of f0/mgc/bap.

    Args:
        config: Hydra/OmegaConf configuration.
        label_path: input lab file; falls back to ``config.label_path``.
        out_wav_path: output wav path; falls back to ``config.out_wav_path``.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    # Use CUDA when available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Resolve checkpoint / normalization-stat paths for all models.
    maybe_set_checkpoints_(config)
    maybe_set_normalization_stats_(config)

    # Root directory holding the per-model files.
    model_root = config.model_dir

    # timelag: model config, weights (loaded to CPU storage), I/O scalers
    timelag_config = OmegaConf.load(join(model_root, "timelag", "model.yaml"))
    timelag_model = hydra.utils.instantiate(timelag_config.netG).to(device)
    checkpoint = torch.load(config.timelag.checkpoint,
                            map_location=lambda storage, loc: storage)
    timelag_model.load_state_dict(checkpoint['state_dict'])
    timelag_in_scaler = joblib.load(config.timelag.in_scaler_path)
    timelag_out_scaler = joblib.load(config.timelag.out_scaler_path)
    timelag_model.eval()

    # duration: same loading pattern
    duration_config = OmegaConf.load(
        join(model_root, "duration", "model.yaml"))
    duration_model = hydra.utils.instantiate(duration_config.netG).to(device)
    checkpoint = torch.load(config.duration.checkpoint,
                            map_location=lambda storage, loc: storage)
    duration_model.load_state_dict(checkpoint['state_dict'])
    duration_in_scaler = joblib.load(config.duration.in_scaler_path)
    duration_out_scaler = joblib.load(config.duration.out_scaler_path)
    duration_model.eval()

    # acoustic model: same loading pattern
    acoustic_config = OmegaConf.load(
        join(model_root, "acoustic", "model.yaml"))
    acoustic_model = hydra.utils.instantiate(acoustic_config.netG).to(device)
    checkpoint = torch.load(config.acoustic.checkpoint,
                            map_location=lambda storage, loc: storage)
    acoustic_model.load_state_dict(checkpoint['state_dict'])
    acoustic_in_scaler = joblib.load(config.acoustic.in_scaler_path)
    acoustic_out_scaler = joblib.load(config.acoustic.out_scaler_path)
    acoustic_model.eval()

    # synthesize wav file from lab file.
    # Input label file: fall back to the config when not given.
    if label_path is None:
        assert config.label_path is not None
        label_path = config.label_path
    logger.info('Process the label file: %s', label_path)

    # Output wav path: fall back to the config when not given.
    if out_wav_path is None:
        out_wav_path = config.out_wav_path

    # Parameter estimation
    logger.info('Synthesize the wav file: %s', out_wav_path)
    duration_modified_labels, f0, sp, bap, wav = synthesis(
        config, device, label_path,
        timelag_model, timelag_config, timelag_in_scaler, timelag_out_scaler,
        duration_model, duration_config, duration_in_scaler,
        duration_out_scaler,
        acoustic_model, acoustic_config, acoustic_in_scaler,
        acoustic_out_scaler)
    # FIX: removed stray debug statement
    # ``print(type(duration_modified_labels))`` left over from development.

    # Intermediate outputs: simplified timing label with just the phoneme
    # extracted from each full-context label ("-phoneme+" -> "phoneme").
    with open(out_wav_path.replace('.wav', '_timing.lab'), 'wt') as f_lab:
        lines = str(duration_modified_labels).splitlines()
        s = ''
        for line in lines:
            t_start, t_end, context = line.split()
            context = context[context.find('-') + 1: context.find('+')]
            s += f'{t_start} {t_end} {context}\n'
        f_lab.write(s)
    # Raw float64 dumps of the WORLD parameter tracks.
    with open(out_wav_path.replace('.wav', '.f0'), 'wb') as f_f0:
        f0.astype(np.float64).tofile(f_f0)
    with open(out_wav_path.replace('.wav', '.mgc'), 'wb') as f_mgc:
        sp.astype(np.float64).tofile(f_mgc)
    with open(out_wav_path.replace('.wav', '.bap'), 'wb') as f_bap:
        bap.astype(np.float64).tofile(f_bap)

    # Write the WAV file with the configured sample rate and bit depth.
    generate_wav_file(config, wav, out_wav_path)
    logger.info('Synthesized the wav file: %s', out_wav_path)
def my_app(config: DictConfig) -> None:
    """Run a trained model over every .npy feature file in ``config.in_dir``
    and save the de-normalized (and, where the model config says so,
    MLPG-reduced) outputs to ``config.out_dir``.

    Handles both deterministic models and probabilistic models (which
    return a mean and a sigma used as per-frame variances for MLPG).
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)
    # Model definition + weights (checkpoint loaded to CPU storage).
    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])
    # Output-feature scaler used to undo training-time normalization.
    scaler = joblib.load(to_absolute_path(config.out_scaler_path))
    in_feats = FileSourceDataset(NpyFileSource(in_dir))
    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            # (T, D_in) -> (1, T, D_in); the model takes padded batches
            # plus a list of true lengths.
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            if model.prediction_type() == PredictionType.PROBABILISTIC:
                max_mu, max_sigma = model.inference(feats, [feats.shape[1]])
                if np.any(model_config.has_dynamic_features):
                    # Apply denormalization
                    # (B, T, D_out) -> (T, D_out)
                    # Variances are rescaled by the scaler's variance since
                    # sigma was predicted in normalized space.
                    max_sigma_sq = max_sigma.squeeze(
                        0).cpu().data.numpy()**2 * scaler.var_
                    max_mu = scaler.inverse_transform(
                        max_mu.squeeze(0).cpu().data.numpy())
                    # Apply MLPG
                    # (T, D_out) -> (T, static_dim)
                    out = multi_stream_mlpg(
                        max_mu, max_sigma_sq,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)
                else:
                    # (T, D_out)
                    out = max_mu.squeeze(0).cpu().data.numpy()
                    out = scaler.inverse_transform(out)
            else:
                out = model.inference(
                    feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
                out = scaler.inverse_transform(out)
                # Apply MLPG if necessary
                if np.any(model_config.has_dynamic_features):
                    out = multi_stream_mlpg(
                        out, scaler.var_,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)
            # Keep the source file's name for the output .npy.
            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
def setup_cyclegan(config, device, collate_fn=collate_fn_default):
    """Setup for training CycleGAN

    Builds the paired generators/discriminators (A->B and B->A), their
    optimizers and LR schedulers, data loaders, logging backend
    (mlflow or tensorboard), and the input/output feature scalers.

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing model, optimizer,
            learning rate scheduler, data loaders, tensorboard writer,
            logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    logger.info(f"PyTorch version: {torch.__version__}")

    # cudnn flags only make sense when CUDA is actually available.
    if torch.cuda.is_available():
        from torch.backends import cudnn
        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")
    # AMP grad scaler is created only when mixed precision is requested;
    # callers receive None otherwise.
    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model G: both translation directions share the same netG config.
    netG_A2B = hydra.utils.instantiate(config.model.netG).to(device)
    netG_B2A = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info(
        "[Generator] Number of trainable params: {:.3f} million".format(
            num_trainable_params(netG_A2B) / 1000000.0))
    logger.info(netG_A2B)
    # Optimizer and LR scheduler for G (one optimizer over both generators)
    optG, schedulerG = _instantiate_optim_cyclegan(config.train.optim.netG,
                                                   netG_A2B, netG_B2A)

    # Model D: one discriminator per domain, same netD config.
    netD_A = hydra.utils.instantiate(config.model.netD).to(device)
    netD_B = hydra.utils.instantiate(config.model.netD).to(device)
    logger.info(
        "[Discriminator] Number of trainable params: {:.3f} million".format(
            num_trainable_params(netD_A) / 1000000.0))
    logger.info(netD_A)
    # Optimizer and LR scheduler for D
    optD, schedulerD = _instantiate_optim_cyclegan(config.train.optim.netD,
                                                   netD_A, netD_B)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)
    set_epochs_based_on_max_steps_(config.train,
                                   len(data_loaders["train_no_dev"]), logger)

    # Resume
    # TODO
    # _resume(logger, config.train.resume.netG, netG, optG, schedulerG)
    # _resume(logger, config.train.resume.netD, netD, optD, schedulerD)

    if config.data_parallel:
        netG_A2B = nn.DataParallel(netG_A2B)
        netG_B2A = nn.DataParallel(netG_B2A)
        netD_A = nn.DataParallel(netD_A)
        netD_B = nn.DataParallel(netD_B)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers: sklearn scalers loaded from disk are re-wrapped in the
    # project's own scaler classes; non-sklearn input scalers pass through.
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        if isinstance(in_scaler, SKMinMaxScaler):
            in_scaler = MinMaxScaler(
                in_scaler.min_,
                in_scaler.scale_,
                in_scaler.data_min_,
                in_scaler.data_max_,
            )
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_,
                                    out_scaler.scale_)
    else:
        out_scaler = None

    return (
        (netG_A2B, netG_B2A, optG, schedulerG),
        (netD_A, netD_B, optD, schedulerD),
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
def setup(config, device, collate_fn=collate_fn_default):
    """Setup for training

    Builds the model, optimizer, LR scheduler, data loaders, logging
    backend (mlflow or tensorboard), and the input/output feature scalers,
    optionally resuming from a checkpoint.

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing model, optimizer,
            learning rate scheduler, data loaders, tensorboard writer,
            logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))
    logger.info(f"PyTorch version: {torch.__version__}")

    # cudnn flags only make sense when CUDA is actually available.
    if torch.cuda.is_available():
        from torch.backends import cudnn
        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")
    # AMP grad scaler is created only when mixed precision is requested;
    # callers receive None otherwise.
    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info("Number of trainable params: {:.3f} million".format(
        num_trainable_params(model) / 1000000.0))
    logger.info(model)

    # Optimizer
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(),
                                **config.train.optim.optimizer.params)

    # Scheduler
    lr_scheduler_class = getattr(optim.lr_scheduler,
                                 config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer,
                                      **config.train.optim.lr_scheduler.params)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)
    set_epochs_based_on_max_steps_(config.train,
                                   len(data_loaders["train_no_dev"]), logger)

    # Resume
    if (config.train.resume.checkpoint is not None
            and len(config.train.resume.checkpoint) > 0):
        logger.info("Load weights from %s", config.train.resume.checkpoint)
        checkpoint = torch.load(
            to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    if config.data_parallel:
        model = nn.DataParallel(model)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        # Fix (consistency with setup_cyclegan): only re-wrap sklearn
        # MinMaxScaler instances; previously any loaded scaler was assumed
        # to expose sklearn MinMaxScaler attributes and a different scaler
        # type would crash with AttributeError.
        if isinstance(in_scaler, SKMinMaxScaler):
            in_scaler = MinMaxScaler(in_scaler.min_, in_scaler.scale_,
                                     in_scaler.data_min_,
                                     in_scaler.data_max_)
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_,
                                    out_scaler.scale_)
    else:
        out_scaler = None

    return (
        model,
        optimizer,
        lr_scheduler,
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
def my_app(config: DictConfig) -> None:
    """Generate static acoustic features for each utterance in a list.

    For every utterance id, runs the trained model on its input features,
    reduces the (static + dynamic) output streams to static-only features,
    optionally applies variance-scaling (GV) postfiltering to the mgc and
    bap streams, optionally normalizes, and saves the result as
    ``<utt_id>-feats.npy`` in ``config.out_dir``.

    Args:
        config (DictConfig): hydra configuration (paths, model spec,
            scaler path, postfilter/normalization flags).
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    utt_ids = load_utt_list(utt_list)
    os.makedirs(out_dir, exist_ok=True)

    # Load the trained model (architecture from YAML, weights from
    # checkpoint; map_location keeps CPU-only loading possible).
    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(
        to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])
    model.eval()

    # Build a StandardScaler restricted to the static dimensions by
    # slicing the static parts out of the full (static+dynamic) scaler
    # statistics, stream by stream.
    out_scaler = joblib.load(to_absolute_path(config.out_scaler_path))
    mean_ = get_static_features(
        out_scaler.mean_.reshape(1, 1, out_scaler.mean_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    mean_ = np.concatenate(mean_, -1).reshape(1, -1)
    var_ = get_static_features(
        out_scaler.var_.reshape(1, 1, out_scaler.var_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    var_ = np.concatenate(var_, -1).reshape(1, -1)
    scale_ = get_static_features(
        out_scaler.scale_.reshape(1, 1, out_scaler.scale_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    scale_ = np.concatenate(scale_, -1).reshape(1, -1)
    static_scaler = StandardScaler(mean_, var_, scale_)

    static_stream_sizes = get_static_stream_sizes(
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        model_config.num_windows,
    )

    for utt_id in tqdm(utt_ids):
        in_feats = (torch.from_numpy(
            np.load(join(in_dir, utt_id + "-feats.npy"))).unsqueeze(0).to(device))
        static_feats = _gen_static_features(model, model_config, in_feats,
                                            out_scaler)

        # Stream boundaries within the static feature vector.
        # NOTE(review): assumes stream order [mgc, ..., ..., bap, ...] so
        # stream 0 is mgc and stream 3 is bap — confirm against the
        # world stream configuration.
        mgc_end_dim = static_stream_sizes[0]
        bap_start_dim = sum(static_stream_sizes[:3])
        bap_end_dim = sum(static_stream_sizes[:4])

        if config.gv_postfilter:
            # Variance-scaling (GV-like) postfilter on the mgc stream
            static_feats[:, :mgc_end_dim] = variance_scaling(
                static_scaler.var_.reshape(-1)[:mgc_end_dim],
                static_feats[:, :mgc_end_dim],
                offset=config.mgc_offset,
            )
            # ... and on the bap stream
            static_feats[:, bap_start_dim:bap_end_dim] = variance_scaling(
                static_scaler.var_.reshape(-1)[bap_start_dim:bap_end_dim],
                static_feats[:, bap_start_dim:bap_end_dim],
                offset=config.bap_offset,
            )

        # Optional normalization with the static-only scaler
        if config.normalize:
            static_feats = static_scaler.transform(static_feats)

        out_path = join(out_dir, f"{utt_id}-feats.npy")
        np.save(out_path, static_feats.astype(np.float32), allow_pickle=False)