Example #1
def my_app(config: DictConfig) -> None:
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)

    utt_ids = load_utt_list(utt_list)

    stream_sizes, has_dynamic_features = get_world_stream_info(
        config.acoustic.sample_rate,
        config.acoustic.mgc_order,
        config.acoustic.num_windows,
        config.acoustic.vibrato_mode,
    )

    os.makedirs(out_dir, exist_ok=True)
    with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
        futures = [
            executor.submit(
                _extract_static_features,
                in_dir,
                out_dir,
                utt_id,
                config.acoustic.num_windows,
                stream_sizes,
                has_dynamic_features,
            ) for utt_id in utt_ids
        ]
        for future in tqdm(futures):
            future.result()
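
A minimal sketch (not part of the original source) of how a Hydra entry point like my_app is typically wired up; the config_path and config_name values are assumptions about the project layout.

import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path="conf", config_name="config")  # hypothetical layout
def my_app(config: DictConfig) -> None:
    print(OmegaConf.to_yaml(config))  # body as in the examples in this section


if __name__ == "__main__":
    my_app()  # Hydra composes the config and injects it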
Example #2
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    if use_cuda:
        from torch.backends import cudnn
        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)

    # Optimizer
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(), **config.train.optim.optimizer.params)

    # Scheduler
    lr_scheduler_class = getattr(optim.lr_scheduler, config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer, **config.train.optim.lr_scheduler.params)

    data_loaders = get_data_loaders(config)

    # Resume
    if config.train.resume.checkpoint is not None and len(config.train.resume.checkpoint) > 0:
        logger.info("Load weights from {}".format(config.train.resume.checkpoint))
        checkpoint = torch.load(to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    # Save model definition
    out_dir = to_absolute_path(config.train.out_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(join(out_dir, "model.yaml"), "w") as f:
        OmegaConf.save(config.model, f)

    # Run training loop
    train_loop(config, device, model, optimizer, lr_scheduler, data_loaders)
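
The getattr-based optimizer and scheduler setup above can be exercised in isolation; a short sketch where the class names "Adam" and "StepLR" stand in for config.train.optim.*.name values.

import torch
from torch import optim

model = torch.nn.Linear(4, 4)

# Resolve the classes by name, as the config-driven code above does
optimizer_class = getattr(optim, "Adam")
optimizer = optimizer_class(model.parameters(), lr=1e-3)

lr_scheduler_class = getattr(optim.lr_scheduler, "StepLR")
lr_scheduler = lr_scheduler_class(optimizer, step_size=10, gamma=0.5)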
Example #3
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    scaler_path = to_absolute_path(config.scaler_path)
    scaler = joblib.load(scaler_path)
    inverse = config.inverse
    num_workers = config.num_workers

    os.makedirs(out_dir, exist_ok=True)
    apply_normalization_dir2dir(in_dir, out_dir, scaler, inverse, num_workers)
Example #4
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    list_path = to_absolute_path(config.list_path)
    out_path = to_absolute_path(config.out_path)

    scaler = hydra.utils.instantiate(config.scaler)
    with open(list_path) as f:
        for path in f:
            c = np.load(to_absolute_path(path.strip()))
            scaler.partial_fit(c)
        joblib.dump(scaler, out_path)

    if config.verbose > 0:
        if isinstance(scaler, StandardScaler):
            logger.info("mean:\n{}".format(scaler.mean_))
            logger.info("std:\n{}".format(np.sqrt(scaler.var_)))
        if isinstance(scaler, MinMaxScaler):
            logger.info("data min:\n{}".format(scaler.data_min_))
            logger.info("data max:\n{}".format(scaler.data_max_))
Example #5
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()

            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                windows = get_windows(3)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows, model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
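
The map_location lambda used when loading the checkpoint above forces all tensors onto the CPU regardless of the device they were saved from; the string form below is equivalent and arguably clearer (the path is hypothetical):

import torch

checkpoint = torch.load("checkpoints/latest.pth", map_location="cpu")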
Example #6
File: world.py  Project: oatsu-gh/ENUNU
def acoustic2world(config: DictConfig, path_timing, path_acoustic, path_f0,
                   path_spectrogram, path_aperiodicity):
    """
    Read the acoustic feature matrix from CSV and write the WORLD parameters
    (f0, spectrogram, aperiodicity) as CSV files.
    """
    # Logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    # load labels and question
    duration_modified_labels = hts.load(path_timing).round_()

    # Whether CUDA is available
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the various settings
    typ = 'acoustic'
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))

    # Read the hed (question) file.
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py it is done like this ↓ -----------------
    # (that way, a separate hed file can be applied to each model)
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------

    # Parse the hed file into dictionaries.
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    pitch_idx = len(binary_dict) + 1
    # pitch_indices = np.arange(len(binary_dict), len(binary_dict)+3)

    # pylint: disable=no-member
    # Read the acoustic feature values
    acoustic_features = np.loadtxt(path_acoustic,
                                   delimiter=',',
                                   dtype=np.float64)

    # Extract the WORLD parameters from the acoustic features.
    f0, spectrogram, aperiodicity = gen_world_params(
        duration_modified_labels,
        acoustic_features,
        binary_dict,
        continuous_dict,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        subphone_features=config.acoustic.subphone_features,
        pitch_idx=pitch_idx,
        num_windows=model_config.num_windows,
        post_filter=config.acoustic.post_filter,
        sample_rate=config.sample_rate,
        frame_period=config.frame_period,
        relative_f0=config.acoustic.relative_f0,
        vibrato_scale=1.0,
        vuv_threshold=0.3)

    # Write the f0, spectrogram, and aperiodicity matrices as CSV files
    for path, array in ((path_f0, f0), (path_spectrogram, spectrogram),
                        (path_aperiodicity, aperiodicity)):
        np.savetxt(path, array, fmt='%.16f', delimiter=',')
Example #7
def _score2duration(config: DictConfig, labels):
    """
    full_score と timelag ラベルから durationラベルを生成する。
    """
    # -----------------------------------------------------
    # Start of the content of nnsvs.bin.synthesis.my_app() -
    # -----------------------------------------------------
    # Logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'duration'
    # Whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # In place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # In place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # Load the various settings
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # End of the content of nnsvs.bin.synthesis.my_app() ---
    # -----------------------------------------------------

    # -----------------------------------------------------
    # Start of the content of nnsvs.bin.synthesis.synthesis()
    # -----------------------------------------------------
    # Read full_score_lab.
    # labels = hts.load(score_path).round_()
    # Not used by the current duration model
    # timelag = hts.load(timelag_path).round_()

    # Read the hed (question) file.
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py it is done like this ↓ -----------------
    # (that way, a separate hed file can be applied to each model)
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------
    # Parse the hed file into dictionaries.
    binary_dict, continuous_dict = \
        hts.load_question_set(question_path, append_hat_for_LL=False)
    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # Read the f0 conditioning setting.
    log_f0_conditioning = config.log_f0_conditioning

    # Apply the duration model
    duration = predict_duration(device,
                                labels,
                                model,
                                model_config,
                                in_scaler,
                                out_scaler,
                                binary_dict,
                                continuous_dict,
                                pitch_indices,
                                log_f0_conditioning,
                                force_clip_input_features=False)
    # Return the duration as a tuple or ndarray
    return duration
Example #8
File: acoustic.py  Project: oatsu-gh/ENUNU
def timing2acoustic(config: DictConfig, timing_path, acoustic_path):
    """
    フルラベルを読み取って、音響特長量のファイルを出力する。
    """
    # -----------------------------------------------------
    # Start of the content of nnsvs.bin.synthesis.my_app() -
    # -----------------------------------------------------
    # Logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'acoustic'
    # Whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # In place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # In place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # Load the various settings
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)

    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # End of the content of nnsvs.bin.synthesis.my_app() ---
    # -----------------------------------------------------

    # -----------------------------------------------------
    # Start of the content of nnsvs.bin.synthesis.synthesis()
    # -----------------------------------------------------
    # Read the full-context (timing) label.
    duration_modified_labels = hts.load(timing_path).round_()

    # Read the hed (question) file.
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py it is done like this ↓ -----------------
    # (that way, a separate hed file can be applied to each model)
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------
    # Parse the hed file into dictionaries.
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # Read the f0 conditioning setting.
    log_f0_conditioning = config.log_f0_conditioning
    acoustic_features = predict_acoustic(device, duration_modified_labels,
                                         model, model_config, in_scaler,
                                         out_scaler, binary_dict,
                                         continuous_dict,
                                         config.acoustic.subphone_features,
                                         pitch_indices, log_f0_conditioning)

    # Write the acoustic feature matrix as a CSV file
    np.savetxt(acoustic_path, acoustic_features, delimiter=',')
Example #9
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    utt_list = to_absolute_path(config.utt_list)
    out_dir = to_absolute_path(config.out_dir)
    question_path_general = to_absolute_path(config.question_path)

    # Time-lag model
    # in: musical/linguistic context
    # out: time-lag (i.e. onset time deviation)
    if config.timelag.question_path is not None:
        question_path = config.timelag.question_path
    else:
        question_path = question_path_general
    in_timelag_source = MusicalLinguisticSource(utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        add_frame_features=False, subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_timelag_source = TimeLagFeatureSource(utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        to_absolute_path(config.timelag.label_phone_align_dir))

    in_timelag = FileSourceDataset(in_timelag_source)
    out_timelag = FileSourceDataset(out_timelag_source)

    # Duration model
    # in: musical/linguistic context
    # out: duration
    if config.duration.question_path is not None:
        question_path = config.duration.question_path
    else:
        question_path = question_path_general
    in_duration_source = MusicalLinguisticSource(utt_list,
        to_absolute_path(config.duration.label_dir),
        add_frame_features=False, subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_duration_source = DurationFeatureSource(
        utt_list, to_absolute_path(config.duration.label_dir))

    in_duration = FileSourceDataset(in_duration_source)
    out_duration = FileSourceDataset(out_duration_source)

    # Acoustic model
    # in: musical/linguistic context
    # out: acoustic features
    if config.acoustic.question_path is not None:
        question_path = config.acoustic.question_path
    else:
        question_path = question_path_general
    in_acoustic_source = MusicalLinguisticSource(utt_list,
        to_absolute_path(config.acoustic.label_dir), question_path,
        add_frame_features=True, subphone_features=config.acoustic.subphone_features,
        log_f0_conditioning=config.log_f0_conditioning)
    out_acoustic_source = WORLDAcousticSource(utt_list,
        to_absolute_path(config.acoustic.wav_dir), to_absolute_path(config.acoustic.label_dir),
        question_path, use_harvest=config.acoustic.use_harvest,
        f0_ceil=config.acoustic.f0_ceil, f0_floor=config.acoustic.f0_floor,
        frame_period=config.acoustic.frame_period, mgc_order=config.acoustic.mgc_order,
        num_windows=config.acoustic.num_windows,
        relative_f0=config.acoustic.relative_f0)
    in_acoustic = FileSourceDataset(in_acoustic_source)
    out_acoustic = FileSourceDataset(out_acoustic_source)

    # Save as files
    in_timelag_root = join(out_dir, "in_timelag")
    out_timelag_root = join(out_dir, "out_timelag")
    in_duration_root = join(out_dir, "in_duration")
    out_duration_root = join(out_dir, "out_duration")
    in_acoustic_root = join(out_dir, "in_acoustic")
    out_acoustic_root = join(out_dir, "out_acoustic")

    for d in [in_timelag_root, out_timelag_root, in_duration_root, out_duration_root,
            in_acoustic_root, out_acoustic_root]:
        if not os.path.exists(d):
            logger.info("mkdirs: {}".format(d))
            os.makedirs(d)

    # Save features for timelag model
    if config.timelag.enabled:
        logger.info("Timelag linguistic feature dim: {}".format(in_timelag[0].shape[1]))
        logger.info("Timelag feature dim: {}".format(out_timelag[0].shape[1]))
        for idx in tqdm(range(len(in_timelag))):
            x, y = in_timelag[idx], out_timelag[idx]
            name = splitext(basename(in_timelag.collected_files[idx][0]))[0]
            xpath = join(in_timelag_root, name + "-feats.npy")
            ypath = join(out_timelag_root, name + "-feats.npy")
            np.save(xpath, x, allow_pickle=False)
            np.save(ypath, y, allow_pickle=False)

    # Save features for duration model
    if config.duration.enabled:
        logger.info("Duration linguistic feature dim: {}".format(in_duration[0].shape[1]))
        logger.info("Duration feature dim: {}".format(out_duration[0].shape[1]))
        for idx in tqdm(range(len(in_duration))):
            x, y = in_duration[idx], out_duration[idx]
            name = splitext(basename(in_duration.collected_files[idx][0]))[0]
            xpath = join(in_duration_root, name + "-feats.npy")
            ypath = join(out_duration_root, name + "-feats.npy")
            np.save(xpath, x, allow_pickle=False)
            np.save(ypath, y, allow_pickle=False)

    # Save features for acoustic model
    if config.acoustic.enabled:
        logger.info("Acoustic linguistic feature dim: {}".format(in_acoustic[0].shape[1]))
        logger.info("Acoustic feature dim: {}".format(out_acoustic[0][0].shape[1]))
        for idx in tqdm(range(len(in_acoustic))):
            x, (y, wave) = in_acoustic[idx], out_acoustic[idx]
            name = splitext(basename(in_acoustic.collected_files[idx][0]))[0]
            xpath = join(in_acoustic_root, name + "-feats.npy")
            ypath = join(out_acoustic_root, name + "-feats.npy")
            wpath = join(out_acoustic_root, name + "-wave.npy")
            np.save(xpath, x, allow_pickle=False)
            np.save(ypath, y, allow_pickle=False)
            np.save(wpath, wave, allow_pickle=False)
Example #10
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device(config.device)

    maybe_set_checkpoints_(config)
    maybe_set_normalization_stats_(config)

    # timelag
    timelag_config = OmegaConf.load(to_absolute_path(
        config.timelag.model_yaml))
    timelag_model = hydra.utils.instantiate(timelag_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.timelag.checkpoint),
                            map_location=lambda storage, loc: storage)
    timelag_model.load_state_dict(checkpoint["state_dict"])
    timelag_in_scaler = joblib.load(
        to_absolute_path(config.timelag.in_scaler_path))
    timelag_out_scaler = joblib.load(
        to_absolute_path(config.timelag.out_scaler_path))
    timelag_model.eval()

    # duration
    duration_config = OmegaConf.load(
        to_absolute_path(config.duration.model_yaml))
    duration_model = hydra.utils.instantiate(duration_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.duration.checkpoint),
                            map_location=lambda storage, loc: storage)
    duration_model.load_state_dict(checkpoint["state_dict"])
    duration_in_scaler = joblib.load(
        to_absolute_path(config.duration.in_scaler_path))
    duration_out_scaler = joblib.load(
        to_absolute_path(config.duration.out_scaler_path))
    duration_model.eval()

    # acoustic model
    acoustic_config = OmegaConf.load(
        to_absolute_path(config.acoustic.model_yaml))
    acoustic_model = hydra.utils.instantiate(acoustic_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.acoustic.checkpoint),
                            map_location=lambda storage, loc: storage)
    acoustic_model.load_state_dict(checkpoint["state_dict"])
    acoustic_in_scaler = joblib.load(
        to_absolute_path(config.acoustic.in_scaler_path))
    acoustic_out_scaler = joblib.load(
        to_absolute_path(config.acoustic.out_scaler_path))
    acoustic_model.eval()

    # Run synthesis for each utt.
    question_path = to_absolute_path(config.question_path)

    if config.utt_list is not None:
        in_dir = to_absolute_path(config.in_dir)
        out_dir = to_absolute_path(config.out_dir)
        os.makedirs(out_dir, exist_ok=True)
        with open(to_absolute_path(config.utt_list)) as f:
            lines = list(filter(lambda s: len(s.strip()) > 0, f.readlines()))
            logger.info("Processes %s utterances...", len(lines))
            for idx in tqdm(range(len(lines))):
                utt_id = lines[idx].strip()
                label_path = join(in_dir, f"{utt_id}.lab")
                if not exists(label_path):
                    raise RuntimeError(
                        f"Label file does not exist: {label_path}")

                wav = synthesis(config, device, label_path, question_path,
                                timelag_model, timelag_config,
                                timelag_in_scaler, timelag_out_scaler,
                                duration_model, duration_config,
                                duration_in_scaler, duration_out_scaler,
                                acoustic_model, acoustic_config,
                                acoustic_in_scaler, acoustic_out_scaler)
                wav = np.clip(wav, -32768, 32767)
                if config.gain_normalize:
                    wav = wav / np.max(np.abs(wav)) * 32767

                out_wav_path = join(out_dir, f"{utt_id}.wav")
                wavfile.write(out_wav_path,
                              rate=config.sample_rate,
                              data=wav.astype(np.int16))
    else:
        assert config.label_path is not None
        logger.info("Process the label file: %s", config.label_path)
        label_path = to_absolute_path(config.label_path)
        out_wav_path = to_absolute_path(config.out_wav_path)

        wav = synthesis(config, device, label_path, question_path,
                        timelag_model, timelag_config, timelag_in_scaler,
                        timelag_out_scaler, duration_model, duration_config,
                        duration_in_scaler, duration_out_scaler,
                        acoustic_model, acoustic_config, acoustic_in_scaler,
                        acoustic_out_scaler)
        wav = wav / np.max(np.abs(wav)) * (2**15 - 1)
        wavfile.write(out_wav_path,
                      rate=config.sample_rate,
                      data=wav.astype(np.int16))
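
The final scaling step maps the waveform's absolute peak to int16 full scale (2**15 - 1 = 32767) before writing the WAV file. A small sketch of that normalization, with a zero-peak guard added here as a defensive assumption:

import numpy as np

def peak_normalize_int16(wav: np.ndarray) -> np.ndarray:
    peak = np.max(np.abs(wav))
    if peak > 0:  # guard against silent input (not in the original)
        wav = wav / peak * (2 ** 15 - 1)
    return wav.astype(np.int16)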
Example #11
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    utt_list = to_absolute_path(config.utt_list)
    out_dir = to_absolute_path(config.out_dir)
    question_path_general = to_absolute_path(config.question_path)

    # Time-lag model
    # in: musical/linguistic context
    # out: time-lag (i.e. onset time deviation)
    if config.timelag.question_path is not None:
        question_path = config.timelag.question_path
    else:
        question_path = question_path_general

    in_timelag_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.timelag.label_phone_score_dir),
        add_frame_features=False,
        subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_timelag_source = TimeLagFeatureSource(
        utt_list, to_absolute_path(config.timelag.label_phone_score_dir),
        to_absolute_path(config.timelag.label_phone_align_dir))

    in_timelag = FileSourceDataset(in_timelag_source)
    out_timelag = FileSourceDataset(out_timelag_source)

    # Duration model
    # in: musical/linguistic context
    # out: duration
    if config.duration.question_path is not None:
        question_path = config.duration.question_path
    else:
        question_path = question_path_general

    in_duration_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.duration.label_dir),
        add_frame_features=False,
        subphone_features=None,
        question_path=question_path,
        log_f0_conditioning=config.log_f0_conditioning)
    out_duration_source = DurationFeatureSource(
        utt_list, to_absolute_path(config.duration.label_dir))

    in_duration = FileSourceDataset(in_duration_source)
    out_duration = FileSourceDataset(out_duration_source)

    # Acoustic model
    # in: musical/linguistic context
    # out: acoustic features
    if config.acoustic.question_path is not None:
        question_path = config.acoustic.question_path
    else:
        question_path = question_path_general
    in_acoustic_source = MusicalLinguisticSource(
        utt_list,
        to_absolute_path(config.acoustic.label_dir),
        question_path,
        add_frame_features=True,
        subphone_features=config.acoustic.subphone_features,
        log_f0_conditioning=config.log_f0_conditioning)
    out_acoustic_source = WORLDAcousticSource(
        utt_list,
        to_absolute_path(config.acoustic.wav_dir),
        to_absolute_path(config.acoustic.label_dir),
        question_path,
        use_harvest=config.acoustic.use_harvest,
        f0_ceil=config.acoustic.f0_ceil,
        f0_floor=config.acoustic.f0_floor,
        frame_period=config.acoustic.frame_period,
        mgc_order=config.acoustic.mgc_order,
        num_windows=config.acoustic.num_windows,
        relative_f0=config.acoustic.relative_f0)
    in_acoustic = FileSourceDataset(in_acoustic_source)
    out_acoustic = FileSourceDataset(out_acoustic_source)

    # Save as files
    in_timelag_root = join(out_dir, "in_timelag")
    out_timelag_root = join(out_dir, "out_timelag")
    in_duration_root = join(out_dir, "in_duration")
    out_duration_root = join(out_dir, "out_duration")
    in_acoustic_root = join(out_dir, "in_acoustic")
    out_acoustic_root = join(out_dir, "out_acoustic")

    for d in [
            in_timelag_root, out_timelag_root, in_duration_root,
            out_duration_root, in_acoustic_root, out_acoustic_root
    ]:
        if not os.path.exists(d):
            logger.info("mkdirs: %s", format(d))
            os.makedirs(d)

    # Save features for timelag model
    if config.timelag.enabled:
        logger.info("Timelag linguistic feature dim: %s",
                    str(in_timelag[0].shape[1]))
        logger.info("Timelag feature dim: %s", str(out_timelag[0].shape[1]))
        with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
            futures = [
                executor.submit(_prepare_timelag_feature, in_timelag_root,
                                out_timelag_root, in_timelag, out_timelag, idx)
                for idx in range(len(in_timelag))
            ]
            for future in tqdm(futures):
                future.result()

    # Save features for duration model
    if config.duration.enabled:
        logger.info("Duration linguistic feature dim: %s",
                    str(in_duration[0].shape[1]))
        logger.info("Duration feature dim: %s", str(out_duration[0].shape[1]))
        with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
            futures = [
                executor.submit(_prepare_duration_feature, in_duration_root,
                                out_duration_root, in_duration, out_duration,
                                idx) for idx in range(len(in_duration))
            ]
            for future in tqdm(futures):
                future.result()

    # Save features for acoustic model
    if config.acoustic.enabled:
        logger.info("Acoustic linguistic feature dim: %s",
                    str(in_acoustic[0].shape[1]))
        logger.info("Acoustic feature dim: %s",
                    str(out_acoustic[0][0].shape[1]))
        with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
            futures = [
                executor.submit(_prepare_acoustic_feature, in_acoustic_root,
                                out_acoustic_root, in_acoustic, out_acoustic,
                                idx) for idx in range(len(in_acoustic))
            ]
            for future in tqdm(futures):
                future.result()
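
The executor pattern used above submits every job first and then calls result() in submission order; this both drives the tqdm progress bar and re-raises any worker exception in the parent process. A minimal standalone sketch:

from concurrent.futures import ProcessPoolExecutor

from tqdm import tqdm

def work(i):  # hypothetical worker function
    return i * i

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(work, i) for i in range(100)]
        for future in tqdm(futures):
            future.result()  # blocks; propagates worker exceptions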
Example #12
def hts2wav(config: DictConfig, label_path: str = None, out_wav_path: str = None) -> None:
    """
    configファイルから各種設定を取得し、labファイルをもとにWAVファイルを生成する。

    もとの my_app との相違点:
        - ビット深度指定をできるようにした。
        - utt_list を使わず単一ファイルのみにした。
        - 単一ファイルのときの音量ノーマライズを無効にした。
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    # Check whether CUDA is available on the GPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Set the paths to the trained checkpoint files of the models.
    maybe_set_checkpoints_(config)
    maybe_set_normalization_stats_(config)

    # Load the model-related files.
    model_root = config.model_dir
    # timelag
    timelag_config = OmegaConf.load(join(model_root, "timelag", "model.yaml"))
    timelag_model = hydra.utils.instantiate(timelag_config.netG).to(device)
    checkpoint = torch.load(config.timelag.checkpoint,
                            map_location=lambda storage, loc: storage)
    timelag_model.load_state_dict(checkpoint['state_dict'])
    timelag_in_scaler = joblib.load(config.timelag.in_scaler_path)
    timelag_out_scaler = joblib.load(config.timelag.out_scaler_path)
    timelag_model.eval()

    # duration
    duration_config = OmegaConf.load(join(model_root, "duration", "model.yaml"))
    duration_model = hydra.utils.instantiate(duration_config.netG).to(device)
    checkpoint = torch.load(config.duration.checkpoint,
                            map_location=lambda storage, loc: storage)
    duration_model.load_state_dict(checkpoint['state_dict'])
    duration_in_scaler = joblib.load(config.duration.in_scaler_path)
    duration_out_scaler = joblib.load(config.duration.out_scaler_path)
    duration_model.eval()

    # acoustic model
    acoustic_config = OmegaConf.load(join(model_root, "acoustic", "model.yaml"))
    acoustic_model = hydra.utils.instantiate(acoustic_config.netG).to(device)
    checkpoint = torch.load(config.acoustic.checkpoint,
                            map_location=lambda storage, loc: storage)
    acoustic_model.load_state_dict(checkpoint['state_dict'])
    acoustic_in_scaler = joblib.load(config.acoustic.in_scaler_path)
    acoustic_out_scaler = joblib.load(config.acoustic.out_scaler_path)
    acoustic_model.eval()

    # Show the settings
    # print(OmegaConf.to_yaml(config))
    # Synthesize a wav file from the lab file.
    # Specify the input label file.
    if label_path is None:
        assert config.label_path is not None
        label_path = config.label_path
    logger.info('Process the label file: %s', label_path)

    # Settings for the output wav file.
    if out_wav_path is None:
        out_wav_path = config.out_wav_path

    # Parameter estimation
    logger.info('Synthesize the wav file: %s', out_wav_path)
    duration_modified_labels, f0, sp, bap, wav = synthesis(
        config, device, label_path,
        timelag_model, timelag_config, timelag_in_scaler, timelag_out_scaler,
        duration_model, duration_config, duration_in_scaler, duration_out_scaler,
        acoustic_model, acoustic_config, acoustic_in_scaler, acoustic_out_scaler)

    # Write intermediate files
    print(type(duration_modified_labels))
    with open(out_wav_path.replace('.wav', '_timing.lab'), 'wt') as f_lab:
        lines = str(duration_modified_labels).splitlines()
        s = ''
        for line in lines:
            t_start, t_end, context = line.split()
            context = context[context.find('-') + 1: context.find('+')]
            s += f'{t_start} {t_end} {context}\n'
        f_lab.write(s)
    with open(out_wav_path.replace('.wav', '.f0'), 'wb') as f_f0:
        f0.astype(np.float64).tofile(f_f0)
    with open(out_wav_path.replace('.wav', '.mgc'), 'wb') as f_mgc:
        sp.astype(np.float64).tofile(f_mgc)
    with open(out_wav_path.replace('.wav', '.bap'), 'wb') as f_bap:
        bap.astype(np.float64).tofile(f_bap)
    # Write the WAV file with the specified sample rate and bit depth
    generate_wav_file(config, wav, out_wav_path)

    logger.info('Synthesized the wav file: %s', out_wav_path)
Example #13
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)

            if model.prediction_type() == PredictionType.PROBABILISTIC:

                max_mu, max_sigma = model.inference(feats, [feats.shape[1]])

                if np.any(model_config.has_dynamic_features):
                    # Apply denormalization
                    # (B, T, D_out) -> (T, D_out)
                    max_sigma_sq = max_sigma.squeeze(
                        0).cpu().data.numpy()**2 * scaler.var_
                    max_mu = scaler.inverse_transform(
                        max_mu.squeeze(0).cpu().data.numpy())
                    # Apply MLPG
                    # (T, D_out) -> (T, static_dim)
                    out = multi_stream_mlpg(
                        max_mu, max_sigma_sq,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)

                else:
                    # (T, D_out)
                    out = max_mu.squeeze(0).cpu().data.numpy()
                    out = scaler.inverse_transform(out)
            else:
                out = model.inference(
                    feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
                out = scaler.inverse_transform(out)

                # Apply MLPG if necessary
                if np.any(model_config.has_dynamic_features):
                    out = multi_stream_mlpg(
                        out, scaler.var_,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
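
multi_stream_mlpg applies maximum likelihood parameter generation (MLPG) per feature stream to turn static+delta predictions into a smooth static trajectory. A minimal single-stream sketch using nnmnkwii, the library nnsvs builds on; the shapes and the standard static/delta/delta-delta window coefficients are assumptions for illustration:

import numpy as np
from nnmnkwii import paramgen

windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
]
T, static_dim = 100, 60
means = np.random.rand(T, static_dim * len(windows))
variances = np.ones_like(means)

# Solve for the most likely static trajectory: (T, static_dim)
static_traj = paramgen.mlpg(means, variances, windows)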
Example #14
File: train_util.py  Project: r9y9/nnsvs
def setup_cyclegan(config, device, collate_fn=collate_fn_default):
    """Setup for training CycleGAN

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing the generator and discriminator models with
            their optimizers and schedulers, gradient scaler, data loaders,
            tensorboard writer, logger, and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    logger.info(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        from torch.backends import cudnn

        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model G
    netG_A2B = hydra.utils.instantiate(config.model.netG).to(device)
    netG_B2A = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info(
        "[Generator] Number of trainable params: {:.3f} million".format(
            num_trainable_params(netG_A2B) / 1000000.0))
    logger.info(netG_A2B)
    # Optimizer and LR scheduler for G
    optG, schedulerG = _instantiate_optim_cyclegan(config.train.optim.netG,
                                                   netG_A2B, netG_B2A)

    # Model D
    netD_A = hydra.utils.instantiate(config.model.netD).to(device)
    netD_B = hydra.utils.instantiate(config.model.netD).to(device)
    logger.info(
        "[Discriminator] Number of trainable params: {:.3f} million".format(
            num_trainable_params(netD_A) / 1000000.0))
    logger.info(netD_A)
    # Optimizer and LR scheduler for D
    optD, schedulerD = _instantiate_optim_cyclegan(config.train.optim.netD,
                                                   netD_A, netD_B)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)

    set_epochs_based_on_max_steps_(config.train,
                                   len(data_loaders["train_no_dev"]), logger)

    # Resume
    # TODO
    # _resume(logger, config.train.resume.netG, netG, optG, schedulerG)
    # _resume(logger, config.train.resume.netD, netD, optD, schedulerD)

    if config.data_parallel:
        netG_A2B = nn.DataParallel(netG_A2B)
        netG_B2A = nn.DataParallel(netG_B2A)
        netD_A = nn.DataParallel(netD_A)
        netD_B = nn.DataParallel(netD_B)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        if isinstance(in_scaler, SKMinMaxScaler):
            in_scaler = MinMaxScaler(
                in_scaler.min_,
                in_scaler.scale_,
                in_scaler.data_min_,
                in_scaler.data_max_,
            )
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_,
                                    out_scaler.scale_)
    else:
        out_scaler = None

    return (
        (netG_A2B, netG_B2A, optG, schedulerG),
        (netD_A, netD_B, optD, schedulerD),
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
Example #15
File: train_util.py  Project: r9y9/nnsvs
def setup(config, device, collate_fn=collate_fn_default):
    """Setup for training

    Args:
        config (dict): configuration for training
        device (torch.device): device to use for training
        collate_fn (callable, optional): collate function. Defaults to collate_fn_default.

    Returns:
        (tuple): tuple containing model, optimizer, learning rate scheduler,
            gradient scaler, data loaders, tensorboard writer, logger,
            and scalers.
    """
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    logger.info(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        from torch.backends import cudnn

        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")
        if torch.backends.cudnn.version() is not None:
            logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    if "use_amp" in config.train and config.train.use_amp:
        logger.info("Use mixed precision training")
        grad_scaler = GradScaler()
    else:
        grad_scaler = None

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)
    logger.info("Number of trainable params: {:.3f} million".format(
        num_trainable_params(model) / 1000000.0))
    logger.info(model)

    # Optimizer
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(),
                                **config.train.optim.optimizer.params)

    # Scheduler
    lr_scheduler_class = getattr(optim.lr_scheduler,
                                 config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer,
                                      **config.train.optim.lr_scheduler.params)

    # DataLoader
    data_loaders = get_data_loaders(config.data, collate_fn, logger)

    set_epochs_based_on_max_steps_(config.train,
                                   len(data_loaders["train_no_dev"]), logger)

    # Resume
    if (config.train.resume.checkpoint is not None
            and len(config.train.resume.checkpoint) > 0):
        logger.info("Load weights from %s", config.train.resume.checkpoint)
        checkpoint = torch.load(
            to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    if config.data_parallel:
        model = nn.DataParallel(model)

    # Mlflow
    if config.mlflow.enabled:
        mlflow.set_tracking_uri("file://" + get_original_cwd() + "/mlruns")
        mlflow.set_experiment(config.mlflow.experiment)
        # NOTE: disable tensorboard if mlflow is enabled
        writer = None
        logger.info("Using mlflow instead of tensorboard")
    else:
        # Tensorboard
        writer = SummaryWriter(to_absolute_path(config.train.log_dir))

    # Scalers
    if "in_scaler_path" in config.data and config.data.in_scaler_path is not None:
        in_scaler = joblib.load(to_absolute_path(config.data.in_scaler_path))
        in_scaler = MinMaxScaler(in_scaler.min_, in_scaler.scale_,
                                 in_scaler.data_min_, in_scaler.data_max_)
    else:
        in_scaler = None
    if "out_scaler_path" in config.data and config.data.out_scaler_path is not None:
        out_scaler = joblib.load(to_absolute_path(config.data.out_scaler_path))
        out_scaler = StandardScaler(out_scaler.mean_, out_scaler.var_,
                                    out_scaler.scale_)
    else:
        out_scaler = None

    return (
        model,
        optimizer,
        lr_scheduler,
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    )
Example #16
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)

    utt_ids = load_utt_list(utt_list)

    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(
        to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])
    model.eval()

    out_scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    mean_ = get_static_features(
        out_scaler.mean_.reshape(1, 1, out_scaler.mean_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    mean_ = np.concatenate(mean_, -1).reshape(1, -1)
    var_ = get_static_features(
        out_scaler.var_.reshape(1, 1, out_scaler.var_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    var_ = np.concatenate(var_, -1).reshape(1, -1)
    scale_ = get_static_features(
        out_scaler.scale_.reshape(1, 1, out_scaler.scale_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    scale_ = np.concatenate(scale_, -1).reshape(1, -1)
    static_scaler = StandardScaler(mean_, var_, scale_)

    static_stream_sizes = get_static_stream_sizes(
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        model_config.num_windows,
    )

    for utt_id in tqdm(utt_ids):
        in_feats = (torch.from_numpy(
            np.load(join(in_dir,
                         utt_id + "-feats.npy"))).unsqueeze(0).to(device))
        static_feats = _gen_static_features(model, model_config, in_feats,
                                            out_scaler)

        mgc_end_dim = static_stream_sizes[0]
        bap_start_dim = sum(static_stream_sizes[:3])
        bap_end_dim = sum(static_stream_sizes[:4])

        if config.gv_postfilter:
            # mgc
            static_feats[:, :mgc_end_dim] = variance_scaling(
                static_scaler.var_.reshape(-1)[:mgc_end_dim],
                static_feats[:, :mgc_end_dim],
                offset=config.mgc_offset,
            )
            # bap
            static_feats[:, bap_start_dim:bap_end_dim] = variance_scaling(
                static_scaler.var_.reshape(-1)[bap_start_dim:bap_end_dim],
                static_feats[:, bap_start_dim:bap_end_dim],
                offset=config.bap_offset,
            )

        if config.normalize:
            static_feats = static_scaler.transform(static_feats)
        out_path = join(out_dir, f"{utt_id}-feats.npy")
        np.save(out_path, static_feats.astype(np.float32), allow_pickle=False)
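
The variance_scaling postfilter above is a GV (global variance) style enhancement: each dimension's utterance-level variance is pushed toward a reference (training) variance to counteract over-smoothing. A hedged sketch of the general technique; the exact nnsvs implementation may differ in details such as offset handling:

import numpy as np

def variance_scaling_sketch(ref_var, feats, offset=2):
    # Rescale each dimension's deviation from its mean so that its variance
    # matches ref_var; leave the first `offset` dimensions untouched.
    mu = feats.mean(axis=0)
    scaled = mu + (feats - mu) * np.sqrt(ref_var / feats.var(axis=0))
    scaled[:, :offset] = feats[:, :offset]
    return scaled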