Example #1
    def _process_single_file(data):
        # parse inputs for each audio
        audio_name, mel_name, audio, mel = data

        # normalize: scaler.transform applies the scaling learned at fit time
        # (for a MinMaxScaler this is mel = mel * scaler.scale_ + scaler.min_)
        mel = scaler.transform(mel)

        # save
        if config["format"] == "hdf5":
            write_hdf5(
                os.path.join(args.dumpdir, os.path.basename(audio_name)),
                "wave", audio.astype(np.float32))
            write_hdf5(
                os.path.join(args.dumpdir, os.path.basename(mel_name)),
                "feats", mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, os.path.basename(audio_name)),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.dumpdir, os.path.basename(mel_name)),
                    mel.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
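
A minimal sketch of what the normalize step above relies on, assuming scaler is a fitted sklearn MinMaxScaler (the docstring fragment this snippet quoted comes from its transform method); all names and shapes here are illustrative:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler(feature_range=(0, 1))
    train_mel = np.random.randn(100, 80)  # hypothetical (frames, num_mels) features
    scaler.fit(train_mel)

    mel = np.random.randn(10, 80)
    normalized = scaler.transform(mel)          # the library call used in the snippet
    manual = mel * scaler.scale_ + scaler.min_  # the manual form quoted in the docstring
    assert np.allclose(normalized, manual)
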
Example #2
    def _process_single_file(data):
        # parse inputs
        if args.scp is not None:
            utt_id, (fs, audio) = data
            audio = audio.astype(np.float32)
            audio /= (1 << (16 - 1))  # assume that wav is PCM 16 bit
        else:
            name, (audio, fs) = data
            utt_id = os.path.basename(name).replace(".wav", "")

        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(audio,
                                            top_db=config["trim_threshold_in_db"],
                                            frame_length=config["trim_frame_size"],
                                            hop_length=config["trim_hop_size"])

        # extract feature
        mel = logmelfilterbank(audio, fs,
                               fft_size=config["fft_size"],
                               hop_size=config["hop_size"],
                               win_length=config["win_length"],
                               window=config["window"],
                               num_mels=config["num_mels"],
                               fmin=config["fmin"],
                               fmax=config["fmax"])

        # make sure the audio length and feature length are matched
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[:len(mel) * config["hop_size"]]
        assert len(mel) * config["hop_size"] == len(audio)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
            if np.abs(audio).max() > 1.0:
                logging.warn(f"{utt_id} causes clipping. "
                             f"it is better to re-consider global gain scale.")
                return

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave", audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats", mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32), allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32), allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
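
Two conventions from the snippet above, checked in isolation with made-up numbers (only numpy is assumed; the fft_size, hop_size, and frame-count values are illustrative):

    import numpy as np

    # 16-bit PCM scaling: int16 samples lie in [-32768, 32767], so dividing by
    # (1 << (16 - 1)) == 32768 maps them into [-1.0, 1.0)
    pcm = np.array([-32768, 0, 16384], dtype=np.int16)
    audio = pcm.astype(np.float32) / (1 << (16 - 1))
    print(audio)  # [-1.   0.   0.5]

    # length matching: pad the tail by fft_size, then cut to a whole number of frames
    fft_size, hop_size, num_frames = 1024, 256, 196
    audio = np.random.randn(50000).astype(np.float32)
    audio = np.pad(audio, (0, fft_size), mode="edge")
    audio = audio[:num_frames * hop_size]
    assert len(audio) == num_frames * hop_size
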
Example #3
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description=
        "Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
    )
    parser.add_argument(
        "--wav-scp",
        "--scp",
        default=None,
        type=str,
        help=
        "kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help=
        "kaldi-style segments file. if used, you must specify both scp and segments."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help=
        "directory including wav files. you need to specify either scp or rootdir."
    )
    parser.add_argument("--dumpdir",
                        type=str,
                        required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.wav_scp is not None and args.rootdir is not None) or \
            (args.wav_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --wav-scp.")

    # get dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir,
            "*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )
    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):
        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config["sampling_rate"]
            hop_size = config["hop_size"]
        else:
            # NOTE(kan-bayashi): this procedure enables training the model with different
            #   sampling rates for feature and audio, e.g., training with mel extracted
            #   from 16 kHz audio and a 24 kHz waveform as the target
            x = librosa.resample(audio, orig_sr=fs, target_sr=config["sampling_rate_for_feats"])
            sampling_rate = config["sampling_rate_for_feats"]
            assert config["hop_size"] * config["sampling_rate_for_feats"] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config["hop_size"] * config["sampling_rate_for_feats"] // fs

        # extract feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config["fft_size"],
                               win_length=config["win_length"],
                               window=config["window"],
                               num_mels=config["num_mels"],
                               fmin=config["fmin"],
                               fmax=config["fmax"])

        # make sure the audio length and feature length are matched
        audio = np.pad(audio, (0, config["fft_size"]), mode="reflect")
        audio = audio[:len(mel) * config["hop_size"]]
        assert len(mel) * config["hop_size"] == len(audio)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
        if np.abs(audio).max() >= 1.0:
            logging.warn(f"{utt_id} causes clipping. "
                         f"it is better to re-consider global gain scale.")
            continue

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave",
                       audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats",
                       mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
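
The sampling_rate_for_feats branch above rescales hop_size so that one mel frame keeps covering the same stretch of signal at both rates; a worked example with plausible (not snippet-supplied) numbers:

    fs = 24000                        # target audio sampling rate
    sampling_rate_for_feats = 16000   # rate the features are extracted at
    hop_size = 300                    # hop in samples at fs (12.5 ms)

    # the divisibility check guarantees an integer hop at the feature rate
    assert hop_size * sampling_rate_for_feats % fs == 0
    feats_hop = hop_size * sampling_rate_for_feats // fs
    print(feats_hop)  # 200 samples at 16 kHz, i.e. the same 12.5 ms per frame
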
Example #4
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py).")
    parser.add_argument("--rootdir", default=None, type=str,
                        help="directory including feature files to be normalized. "
                             "you need to specify either *-scp or rootdir.")
    parser.add_argument("--wav-scp", default=None, type=str,
                        help="kaldi-style wav.scp file. "
                             "you need to specify either *-scp or rootdir.")
    parser.add_argument("--feats-scp", default=None, type=str,
                        help="kaldi-style feats.scp file. "
                             "you need to specify either *-scp or rootdir.")
    parser.add_argument("--segments", default=None, type=str,
                        help="kaldi-style segments file.")
    parser.add_argument("--dumpdir", type=str, required=True,
                        help="directory to dump normalized feature files.")
    parser.add_argument("--stats", type=str, required=True,
                        help="statistics file.")
    parser.add_argument("--skip-wav-copy", default=False, action="store_true",
                        help="whether to skip the copy of wav files.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--ftype", default='mel', type=str,
                        help="feature type")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging. (default=1)")

    # runtime mode
    args = parser.parse_args()

    # interactive mode
    # args = argparse.ArgumentParser()
    # args.wav_scp = None
    # args.feats_scp = None
    # args.segments = None
    # args.dumpdir = ""
    # args.skip_wav_copy = True
    # args.config = 'egs/so_emo_female/voc1/conf/multi_band_melgan.v2.yaml'
    # args.ftype = 'spec'
    # args.verbose = 1

    # args.rootdir = '/data/evs/VCTK/VCTK-wgan/spec'
    # args.stats = '/data/evs/VCTK/VCTK-wgan/spec/mel_mean_std.npy'

    # args.rootdir = '/data/evs/Arctic/spec'
    # args.stats = '/data/evs/Arctic/spec/spec_mean_std.npy'

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.feats_scp is not None and args.rootdir is not None) or \
            (args.feats_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence
    if args.dumpdir != "":
        if not os.path.exists(args.dumpdir):
            os.makedirs(args.dumpdir, exist_ok=True)

    # get dataset
    if args.rootdir is not None:
        if config["format"] == "hdf5":
            audio_query, mel_query = "*.h5", "*.h5"
            audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            audio_query, mel_query, spc_query = "*.wav.npy", "*.mel.npy", "*.spec.npy"
            audio_load_fn = np.load
            mel_load_fn = np.load
            spc_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        if not args.skip_wav_copy:
            dataset = AudioMelDataset(
                root_dir=args.rootdir,
                audio_query=audio_query,
                mel_query=mel_query,
                audio_load_fn=audio_load_fn,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
        else:
            # NOTE: spc_query/spc_load_fn are only defined for the npy format above,
            # so this branch assumes config["format"] == "npy".
            dataset1 = MelDatasetNew(
                root_dir=args.rootdir,
                mel_query=mel_query,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
            dataset2 = SpcDatasetNew(
                root_dir=args.rootdir,
                spc_query=spc_query,
                spc_load_fn=spc_load_fn,
                return_utt_id=True,
            )
    else:
        if not args.skip_wav_copy:
            dataset = AudioMelSCPDataset(
                wav_scp=args.wav_scp,
                feats_scp=args.feats_scp,
                segments=args.segments,
                return_utt_id=True,
            )
        else:
            dataset = MelSCPDataset(
                feats_scp=args.feats_scp,
                return_utt_id=True,
            )
    logging.info(f"The number of files in mel dataset = {len(dataset1)}.")
    logging.info(f"The number of files in spc dataset = {len(dataset2)}.")

    # restore scaler
    scaler = StandardScaler()
    if config["format"] == "hdf5":
        scaler.mean_ = read_hdf5(args.stats, "mean")
        scaler.scale_ = read_hdf5(args.stats, "scale")
    elif config["format"] == "npy":
        scaler.mean_ = np.load(args.stats)[0]
        scaler.scale_ = np.load(args.stats)[1]
    else:
        raise ValueError("support only hdf5 or npy format.")
    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    # process each file (with --rootdir and --skip-wav-copy, pick the dataset by feature type)
    if args.rootdir is not None and args.skip_wav_copy:
        if args.ftype == 'mel':
            dataset = dataset1
        elif args.ftype == 'spec':
            dataset = dataset2
        else:
            raise ValueError("support only mel or spec feature type.")

    for items in tqdm(dataset):
        if not args.skip_wav_copy:
            utt_id, audio, feat = items
        else:
            utt_id, feat, feat_file = items

        # normalize
        feat = scaler.transform(feat)
        # feat = (feat - scaler.mean_) / scaler.scale_ # this is identical to scaler.transform(feat)

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                       "feats", feat.astype(np.float32))
            if not args.skip_wav_copy:
                write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"),
                           "wave", audio.astype(np.float32))
        elif config["format"] == "npy":
            if args.dumpdir == "":
                feat_file = feat_file.replace('.npy', '')

                np.save((feat_file + "-norm.npy"),
                        feat.astype(np.float32), allow_pickle=False)
                if not args.skip_wav_copy:
                    print("Please include --skip_wav_copy in arguments")

            else:
                np.save(os.path.join(args.dumpdir, f"{utt_id}.npy"),
                        feat.astype(np.float32), allow_pickle=False)
                if not args.skip_wav_copy:
                    np.save(os.path.join(args.dumpdir, f"{utt_id}.wav.npy"),
                            audio.astype(np.float32), allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")
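
The "restore scaler" pattern above can be exercised on its own; a sketch assuming a stats file laid out as np.stack([mean, scale]) the way the compute-statistics scripts later in this list produce it (file name and dimensions are illustrative):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    mean = np.random.randn(80).astype(np.float32)
    scale = np.abs(np.random.randn(80)).astype(np.float32) + 0.1
    np.save("stats.npy", np.stack([mean, scale], axis=0), allow_pickle=False)

    scaler = StandardScaler()
    stats = np.load("stats.npy")
    scaler.mean_, scaler.scale_ = stats[0], stats[1]
    scaler.n_features_in_ = scaler.mean_.shape[0]  # needed from scikit-learn 0.23.0

    feat = np.random.randn(120, 80).astype(np.float32)
    # transform() matches the commented-out manual line in the snippet above
    assert np.allclose(scaler.transform(feat), (feat - scaler.mean_) / scaler.scale_)
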
Example #5
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help="directory including feature files to be normalized. "
        "you need to specify either *-scp or rootdir.",
    )
    parser.add_argument(
        "--wav-scp",
        default=None,
        type=str,
        help="kaldi-style wav.scp file. "
        "you need to specify either *-scp or rootdir.",
    )
    parser.add_argument(
        "--feats-scp",
        default=None,
        type=str,
        help="kaldi-style feats.scp file. "
        "you need to specify either *-scp or rootdir.",
    )
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help="kaldi-style segments file.",
    )
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.",
    )
    parser.add_argument(
        "--stats",
        type=str,
        required=True,
        help="statistics file.",
    )
    parser.add_argument(
        "--skip-wav-copy",
        default=False,
        action="store_true",
        help="whether to skip the copy of wav files.",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="yaml format configuration file."
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.feats_scp is not None and args.rootdir is not None) or (
        args.feats_scp is None and args.rootdir is None
    ):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset
    if args.rootdir is not None:
        if config["format"] == "hdf5":
            audio_query, mel_query = "*.h5", "*.h5"
            audio_load_fn = lambda x: read_hdf5(x, "wave")  # NOQA
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            audio_query, mel_query = "*-wave.npy", "*-feats.npy"
            audio_load_fn = np.load
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        if not args.skip_wav_copy:
            dataset = AudioMelDataset(
                root_dir=args.rootdir,
                audio_query=audio_query,
                mel_query=mel_query,
                audio_load_fn=audio_load_fn,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
        else:
            dataset = MelDataset(
                root_dir=args.rootdir,
                mel_query=mel_query,
                mel_load_fn=mel_load_fn,
                return_utt_id=True,
            )
    else:
        if not args.skip_wav_copy:
            dataset = AudioMelSCPDataset(
                wav_scp=args.wav_scp,
                feats_scp=args.feats_scp,
                segments=args.segments,
                return_utt_id=True,
            )
        else:
            dataset = MelSCPDataset(
                feats_scp=args.feats_scp,
                return_utt_id=True,
            )
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    scaler = StandardScaler()
    if config["format"] == "hdf5":
        scaler.mean_ = read_hdf5(args.stats, "mean")
        scaler.scale_ = read_hdf5(args.stats, "scale")
    elif config["format"] == "npy":
        scaler.mean_ = np.load(args.stats)[0]
        scaler.scale_ = np.load(args.stats)[1]
    else:
        raise ValueError("support only hdf5 or npy format.")
    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    # process each file
    for items in tqdm(dataset):
        if not args.skip_wav_copy:
            utt_id, audio, mel = items
        else:
            utt_id, mel = items

        # normalize
        mel = scaler.transform(mel)

        # save
        if config["format"] == "hdf5":
            write_hdf5(
                os.path.join(args.dumpdir, f"{utt_id}.h5"),
                "feats",
                mel.astype(np.float32),
            )
            if not args.skip_wav_copy:
                write_hdf5(
                    os.path.join(args.dumpdir, f"{utt_id}.h5"),
                    "wave",
                    audio.astype(np.float32),
                )
        elif config["format"] == "npy":
            np.save(
                os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                mel.astype(np.float32),
                allow_pickle=False,
            )
            if not args.skip_wav_copy:
                np.save(
                    os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False,
                )
        else:
            raise ValueError("support only hdf5 or npy format.")
Example #6
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features "
        "(See detail in parallel_wavegan/bin/compute_statistics.py).")
    parser.add_argument(
        "--feats-scp",
        "--scp",
        default=None,
        type=str,
        help="kaldi-style feats.scp file. "
        "you need to specify either feats-scp or rootdir.",
    )
    parser.add_argument(
        "--rootdir",
        type=str,
        help="directory including feature files. "
        "you need to specify either feats-scp or rootdir.",
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="yaml format configuration file.",
    )
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to save statistics.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.feats_scp is not None
            and args.rootdir is not None) or (args.feats_scp is None
                                              and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset
    if args.feats_scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset = MelDataset(args.rootdir,
                             mel_query=mel_query,
                             mel_load_fn=mel_load_fn)
    else:
        dataset = MelSCPDataset(args.feats_scp)
    logging.info(f"The number of files = {len(dataset)}.")

    # calculate statistics
    scaler = StandardScaler()
    for mel in tqdm(dataset):
        scaler.partial_fit(mel)

    if config["format"] == "hdf5":
        write_hdf5(
            os.path.join(args.dumpdir, "stats.h5"),
            "mean",
            scaler.mean_.astype(np.float32),
        )
        write_hdf5(
            os.path.join(args.dumpdir, "stats.h5"),
            "scale",
            scaler.scale_.astype(np.float32),
        )
    else:
        stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
        np.save(
            os.path.join(args.dumpdir, "stats.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )
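
StandardScaler.partial_fit, used in the loop above, accumulates mean and scale incrementally, so computing statistics over a whole dataset never requires loading every feature file at once; a quick self-check with fake utterances (shapes illustrative):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    chunks = [np.random.randn(n, 80) for n in (100, 50, 200)]  # three fake utterances

    streaming = StandardScaler()
    for mel in chunks:
        streaming.partial_fit(mel)

    batch = StandardScaler().fit(np.concatenate(chunks, axis=0))
    assert np.allclose(streaming.mean_, batch.mean_)
    assert np.allclose(streaming.scale_, batch.scale_)
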
Example #7
import os
from glob import glob

import numpy as np
from tqdm import tqdm

from parallel_wavegan.utils import write_hdf5

dataname = 'man1028'
data_dir = '/workspace/ssd3/train_pwg_pindao/1'
root_dir = '/workspace/pwg/egs/csmsc/voc1/dump_man1028/'
datasets = ['train_nodev', 'dev', 'eval']
mels_dir = f"{data_dir}/mels"
wave_dir = f"{data_dir}/audio"
dev_num = 50
eval_num = 50
num_split = 16

mels = sorted(glob(f"{mels_dir}/*.npy"))  # sort so the train/dev/eval split is deterministic
mels_dataset = [
    mels[:len(mels) - dev_num - eval_num], mels[-dev_num - eval_num:-eval_num],
    mels[-eval_num:]
]
for ind, dataset in enumerate(datasets):
    for cnt, melnpy in tqdm(enumerate(mels_dataset[ind])):
        job = (cnt % num_split) + 1
        dump_dir = f'{root_dir}/{dataset}/norm/dump.{job}'
        os.makedirs(dump_dir, exist_ok=True)
        utt_id = '_'.join(melnpy.split('.')[0].split('-')[1:])
        utt_id = dataname + utt_id
        wavenpy = melnpy.replace('mels', 'audio').replace('mel', 'audio')
        mel = np.load(melnpy)
        audio = np.load(wavenpy)
    assert audio.shape[0] == mel.shape[0] * 300  # 300 == hop size in samples
        write_hdf5(os.path.join(dump_dir, f"{utt_id}.h5"), "wave",
                   audio.astype(np.float32))
        write_hdf5(os.path.join(dump_dir, f"{utt_id}.h5"), "feats",
                   mel.astype(np.float32))
Example #8
import sys
sys.path.insert(0, '../../../')
import os
from parallel_wavegan.utils import write_hdf5
import numpy as np

inputnpy = sys.argv[1]

if os.path.isdir(inputnpy):
    for filebase in os.listdir(inputnpy):
        if not filebase.endswith('.npy'):
            continue
        dirname = inputnpy
        filename = filebase.split('.')[0]

        print(os.path.join(dirname, filebase))
        mel = np.load(os.path.join(dirname, filebase))
        write_hdf5(os.path.join(dirname, f"{filename}.h5"), "feats", mel.astype(np.float32))
else:
    dirname = os.path.dirname(inputnpy)
    filename = os.path.basename(inputnpy).split('.')[0]

    mel = np.load(inputnpy)
    write_hdf5(os.path.join(dirname, f"{filename}.h5"), "feats", mel.astype(np.float32))
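
A quick round-trip check for the converter above, assuming read_hdf5 from parallel_wavegan.utils and an illustrative file path; the .h5 is expected next to the input once the script has run on it:

    import numpy as np
    from parallel_wavegan.utils import read_hdf5

    mel = np.load("/data/feats/utt001.npy")  # hypothetical input file
    # after running the script on /data/feats/utt001.npy:
    restored = read_hdf5("/data/feats/utt001.h5", "feats")
    assert np.allclose(mel.astype(np.float32), restored)
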
Example #9
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess audio and extract features "
                    "(see detail in parallel_wavegan/bin/preprocess.py).")
    parser.add_argument("--wav-scp", "--scp", default=None, type=str,
                        help="kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument("--segments", default=None, type=str,
                        help="kaldi-style segments file. if used, you must specify both scp and segments.")
    parser.add_argument("--rootdir", default=None, type=str,
                        help="directory including wav files. you need to specify either scp or rootdir.")
    parser.add_argument("--dumpdir", type=str, required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging.")
    args = parser.parse_args()


    # setting logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('skip DEBUG/INFO messages')


    # loading config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # checking arguments
    if (args.wav_scp is not None and args.rootdir is not None) or \
            (args.wav_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --wav_scp or --rootdir")


    # getting dataset
    if args.rootdir is not None:
        dataset = AudioDataset(
            args.rootdir,"*.wav",
            audio_load_fn=sf.read,
            return_utt_id=True,
        )

    else:
        dataset = AudioSCPDataset(
            args.wav_scp,
            segments=args.segments,
            return_utt_id=True,
            return_sampling_rate=True,
        )



    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir, exist_ok=True)

    # process each data
    for utt_id, (audio, fs) in tqdm(dataset):

        # checking
        assert len(audio.shape) == 1, f"{utt_id} is multichannel signal."
        assert np.abs(audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."
        assert fs == config['sampling_rate'], f"{utt_id} has different sampling rate."

        # trim silence
        if config['trim_silence']:
            audio,_ = librosa.effects.trim(audio,
                                           top_db=config['trim_threshold_in_db'],
                                           frame_length=config['trim_frame_size'],
                                           hop_length=config['trim_hop_size'])

        if "sampling_rate_for_feats" not in config:
            x = audio
            sampling_rate = config['sampling_rate']
            hop_size = config['hop_size']

        else:  # here we can train the model with a different sampling rate for feature and audio
            x = librosa.resample(audio, orig_sr=fs, target_sr=config['sampling_rate_for_feats'])
            sampling_rate = config['sampling_rate_for_feats']
            assert config['hop_size'] * config['sampling_rate_for_feats'] % fs == 0, \
                "hop_size must be int value. please check sampling_rate_for_feats is correct."
            hop_size = config['hop_size'] * config['sampling_rate_for_feats'] // fs

        # extracting feature
        mel = logmelfilterbank(x,
                               sampling_rate=sampling_rate,
                               hop_size=hop_size,
                               fft_size=config['fft_size'],
                               win_length=config['win_length'],
                               window=config['window'],
                               num_mels=config['num_mels'],
                               fmin=config['fmin'],
                               fmax=config['fmax'])

        # making sure the audio length and feature length are matched
        audio = np.pad(audio, (0, config['fft_size']), mode="edge")
        audio = audio[:len(mel) * config['hop_size']]
        assert len(mel) * config['hop_size'] == len(audio)


        # apply global gain 
        if config['global_gain_scale'] > 0.0:
            audio *= config['global_gain_scale']

        if np.abs(audio).max() >= 1.0:
            logging.warn(f"{utt_id} causes clipping. "
                         f"it is better to reconsider global gain scale.")

            continue
                    
        if config['format'] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir,f"{utt_id}.h5"), "wave", audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats", mel.astype(np.float32))

        elif config['format'] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32), allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32), allow_pickle=False)

        else:
            raise ValueError('support only hdf5 or npy format.')
Example #10
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features "
                    "(See detail in parallel_wavegan/bin/compute_statistics.py).")
    parser.add_argument("--feats-scp", "--scp", default=None, type=str,
                        help="kaldi-style feats.scp file. "
                             "you need to specify either feats-scp or rootdir.")
    parser.add_argument("--rootdir", type=str, required=True,
                        help="directory including feature files. "
                             "you need to specify either feats-scp or rootdir.")
    parser.add_argument("--config", type=str, required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="directory to save statistics. if not provided, "
                             "stats will be saved in the above root directory. (default=None)")
    parser.add_argument("--ftype", default='mel', type=str,
                        help="feature type")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level. higher is more logging. (default=1)")

    # runtime mode
    args = parser.parse_args()

    # interactive mode
    # args = argparse.ArgumentParser()
    # args.feats_scp = None
    # args.config = 'egs/so_emo_female/voc1/conf/multi_band_melgan.v2.yaml'
    # args.verbose = 1
    # args.ftype = 'spec'

    # args.rootdir = '/data/evs/VCTK/VCTK-wgan/spec'
    # args.rootdir = '/data/evs/Arctic/spec'

    # args.dumpdir = os.path.join(args.rootdir, "")


    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('Skip DEBUG/INFO messages')

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.feats_scp is not None and args.rootdir is not None) or \
            (args.feats_scp is None and args.rootdir is None):
        raise ValueError("Please specify either --rootdir or --feats-scp.")

    # check directory existence
    if args.dumpdir is None:
        args.dumpdir = os.path.dirname(args.rootdir)
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset
    if args.feats_scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*.mel.npy"
            mel_load_fn = np.load
            spc_query = "*.spec.npy"
            spc_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        # NOTE: spc_query/spc_load_fn are only defined for the npy format above,
        # so building the spc dataset assumes config["format"] == "npy".
        dataset1 = MelDataset(
            args.rootdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn)
        dataset2 = SpcDataset(
            args.rootdir,
            spc_query=spc_query,
            spc_load_fn=spc_load_fn)
    else:
        dataset = MelSCPDataset(args.feats_scp)
    logging.info(f"The number of files in mel dataset = {len(dataset1)}.")
    logging.info(f"The number of files in spc dataset = {len(dataset2)}.")

    # calculate statistics
    scaler = StandardScaler()
    if args.feats_scp is not None:
        for mel in tqdm(dataset):
            scaler.partial_fit(mel)
    elif args.ftype == 'mel':
        for mel in tqdm(dataset1):
            scaler.partial_fit(mel)
    elif args.ftype == 'spec':
        for spc in tqdm(dataset2):
            scaler.partial_fit(spc)
    else:
        raise ValueError("support only mel or spec feature type.")

    if config["format"] == "hdf5":
        write_hdf5(os.path.join(args.dumpdir, "{}_mean_std.h5".format(args.ftype)),
                   "mean", scaler.mean_.astype(np.float32))
        write_hdf5(os.path.join(args.dumpdir, "{}_mean_std.h5".format(args.ftype)),
                   "scale", scaler.scale_.astype(np.float32))
    else:
        stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
        np.save(os.path.join(args.dumpdir, "{}_mean_std.npy".format(args.ftype)),
                stats.astype(np.float32), allow_pickle=False)
Example #11
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of dumped raw features.")
    parser.add_argument("--rootdir",
                        default=None,
                        type=str,
                        required=True,
                        help="Direcotry including feature files.")
    parser.add_argument("--dumpdir",
                        default=None,
                        type=str,
                        help="Direcotry to save statistics.")
    parser.add_argument("--config",
                        default="hparam.yml",
                        type=str,
                        required=True,
                        help="Yaml format configuration file.")
    parser.add_argument("--verbose",
                        type=int,
                        default=1,
                        help="logging level (higher is more logging)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('skip DEBUG/INFO messages')

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check directory existence
    if args.dumpdir is None:
        args.dumpdir = os.path.dirname(args.rootdir)
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir)

    # get dataset
    if config["format"] == "hdf5":
        mel_query = "*.h5"
        mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
    elif config["format"] == "npy":
        mel_query = "*-feats.npy"
        mel_load_fn = np.load
    else:
        raise ValueError("support only hdf5 or npy format.")
    dataset = MelDataset(args.rootdir,
                         mel_query=mel_query,
                         mel_load_fn=mel_load_fn)
    logging.info(f"the number of files = {len(dataset)}.")

    # calculate statistics
    scaler = StandardScaler()
    for mel in tqdm(dataset):
        scaler.partial_fit(mel)

    if config["format"] == "hdf5":
        write_hdf5(os.path.join(args.dumpdir, "stats.h5"), "mean",
                   scaler.mean_.astype(np.float32))
        write_hdf5(os.path.join(args.dumpdir, "stats.h5"), "scale",
                   scaler.scale_.astype(np.float32))
    else:
        stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
        np.save(os.path.join(args.dumpdir, "stats.npy"),
                stats.astype(np.float32),
                allow_pickle=False)
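
The hdf5 stats written above live in one file under two keys; a round-trip sketch with write_hdf5/read_hdf5 from parallel_wavegan.utils (file name and dimension are illustrative):

    import numpy as np
    from parallel_wavegan.utils import write_hdf5, read_hdf5

    mean = np.zeros(80, dtype=np.float32)
    scale = np.ones(80, dtype=np.float32)
    write_hdf5("stats.h5", "mean", mean)
    write_hdf5("stats.h5", "scale", scale)  # appends a second dataset to the same file
    assert np.allclose(read_hdf5("stats.h5", "mean"), mean)
    assert np.allclose(read_hdf5("stats.h5", "scale"), scale)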