예제 #1
0
def test_KaldiReader(tmpdir, filetype):
    """Round-trip two entries through the writer and reader helpers.

    Two arrays are written under the keys ``aaa`` and ``bbb`` with
    ``file_writer_helper`` and then read back with ``file_reader_helper``,
    both as data and as shapes, through the ark file and through the scp
    file.

    Args:
        tmpdir: Temporary directory fixture that holds the files written
            by the test.
        filetype: File type handed to both helpers.  Types containing
            ``'sound'`` are written as ``(rate, array)`` pairs.
    """
    ark = str(tmpdir.join('a.foo'))
    scp = str(tmpdir.join('a.scp'))
    # Keep the num-frames side file under tmpdir as well; a bare
    # 'out.txt' would presumably be created in the current working
    # directory and left behind after the test run.
    num_frames = str(tmpdir.join('out.txt'))
    fs = 16000

    with file_writer_helper(wspecifier=f'ark,scp:{ark},{scp}',
                            filetype=filetype,
                            write_num_frames=f'ark,t:{num_frames}',
                            compress=False,
                            compression_method=2,
                            pcm_format='wav') as writer:

        if 'sound' in filetype:
            # Sound entries are 1-D int16 signals written with their rate.
            aaa = np.random.randint(-10, 10, 100, dtype=np.int16)
            bbb = np.random.randint(-10, 10, 50, dtype=np.int16)
            writer['aaa'] = fs, aaa
            writer['bbb'] = fs, bbb
        else:
            aaa = np.random.randn(10, 10)
            bbb = np.random.randn(13, 5)
            writer['aaa'] = aaa
            writer['bbb'] = bbb
        valid = {'aaa': aaa, 'bbb': bbb}

    # 1. Test ark read (skipped for the plain 'sound' filetype)
    if filetype != 'sound':
        for key, value in file_reader_helper(f'ark:{ark}',
                                             filetype=filetype,
                                             return_shape=False):
            if 'sound' in filetype:
                assert_scipy_wav_style(value)
                # Drop the rate, keep the signal.
                value = value[1]
            np.testing.assert_array_equal(value, valid[key])
    # 2. Test scp read
    for key, value in file_reader_helper(f'scp:{scp}',
                                         filetype=filetype,
                                         return_shape=False):
        if 'sound' in filetype:
            assert_scipy_wav_style(value)
            value = value[1]
        np.testing.assert_array_equal(value, valid[key])

    # 3. Test ark shape read (skipped for the plain 'sound' filetype)
    if filetype != 'sound':
        for key, value in file_reader_helper(f'ark:{ark}',
                                             filetype=filetype,
                                             return_shape=True):
            if 'sound' in filetype:
                # Second element is compared against the array shape.
                value = value[1]
            np.testing.assert_array_equal(value, valid[key].shape)
    # 4. Test scp shape read
    for key, value in file_reader_helper(f'scp:{scp}',
                                         filetype=filetype,
                                         return_shape=True):
        if 'sound' in filetype:
            value = value[1]
        np.testing.assert_array_equal(value, valid[key].shape)
예제 #2
0
def main():
    """Apply CMVN to every entry of ``args.rspecifier`` and write the result.

    The statistics come either from a table (when the stats argument
    contains ``":"``), in which case they are looked up per utterance id,
    or from a single file, in which case one set of statistics stored
    under the key ``None`` is used for every utterance.
    """
    args = get_parser().parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    stats_source = args.stats_rspecifier_or_rxfilename
    per_key_stats = ":" in stats_source

    if per_key_stats:
        # "npy" is swapped for "hdf5" when the stats are read as a table.
        if args.stats_filetype == "npy":
            table_filetype = "hdf5"
        else:
            table_filetype = args.stats_filetype
        stats_dict = dict(file_reader_helper(stats_source, table_filetype))
    elif args.stats_filetype == "mat":
        stats_dict = {None: kaldiio.load_mat(stats_source)}
    else:
        stats_dict = {None: numpy.load(stats_source)}

    cmvn = CMVN(
        stats=stats_dict,
        norm_means=args.norm_means,
        norm_vars=args.norm_vars,
        utt2spk=args.utt2spk,
        spk2utt=args.spk2utt,
        reverse=args.reverse,
    )

    writer_options = dict(
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    )
    with file_writer_helper(args.wspecifier, **writer_options) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # Sound entries arrive as Tuple[int, ndarray]; the rate
                # is not needed here.
                _, mat = mat
            stats_key = utt if per_key_stats else None
            writer[utt] = cmvn(mat, stats_key)
예제 #3
0
def main():
    """Write one ``"<utt> <d0>,<d1>,..."`` line per entry to ``args.out``.

    Without a preprocessing config the reader is asked for shapes only;
    with one, each entry is loaded, transformed, and the shape of the
    transformed array is reported.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    preprocessing = None
    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))

    # The matrix itself is not needed when nothing is applied to it, so
    # ask file_reader_helper for the shape instead.
    # This makes sense only with filetype="hdf5".
    shape_only = preprocessing is None
    reader = file_reader_helper(args.rspecifier, args.filetype,
                                return_shape=shape_only)
    for utt, mat in reader:
        if shape_only:
            if len(mat) == 2 and isinstance(mat[1], tuple):
                # Sound entry: Tuple[int, Tuple[int, ...]]
                _, mat = mat
            dims = mat
        else:
            if is_scipy_wav_style(mat):
                # Sound entry: Tuple[int, ndarray]
                _, mat = mat
            mat = preprocessing(mat, uttid_list=utt)
            dims = mat.shape
        shape_str = ','.join(str(dim) for dim in dims)
        args.out.write('{} {}\n'.format(utt, shape_str))
def main():
    """Synthesize a waveform for every entry of ``args.rspecifier``.

    Each feature matrix is optionally converted from log-mel to linear
    spectrogram (when ``args.n_mels`` is given), passed through
    ``griffin_lim`` and written to ``<outdir>/<utt_id>.wav`` as int16
    samples at ``args.fs``.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    )
    logging.info(get_commandline_args())

    # Create the output directory.  exist_ok avoids the check-then-create
    # race when several jobs start with the same outdir.
    os.makedirs(args.outdir, exist_ok=True)

    for idx, (utt_id, lmspc) in enumerate(
            file_reader_helper(args.rspecifier, args.filetype), 1):
        if args.n_mels is not None:
            spc = logmelspc_to_linearspc(lmspc,
                                         fs=args.fs,
                                         n_mels=args.n_mels,
                                         n_fft=args.n_fft,
                                         fmin=args.fmin,
                                         fmax=args.fmax)
        else:
            # Input is used as the spectrogram without conversion.
            spc = lmspc
        y = griffin_lim(spc,
                        n_fft=args.n_fft,
                        n_shift=args.n_shift,
                        win_length=args.win_length,
                        window=args.window,
                        n_iters=args.iters)
        logging.info("(%d) %s", idx, utt_id)
        # NOTE(review): the int16 scaling assumes y lies within [-1, 1];
        # larger magnitudes would not fit in int16 -- confirm upstream.
        write(os.path.join(args.outdir, "%s.wav" % utt_id), args.fs,
              (y * np.iinfo(np.int16).max).astype(np.int16))
예제 #5
0
def main():
    """Copy entries from ``args.rspecifier`` to ``args.wspecifier``.

    Each entry is optionally passed through the preprocessing pipeline
    described by ``args.preprocess_conf``.  For the ``"sound"`` and
    ``"sound.hdf5"`` output types the entry is written as a
    ``(rate, array)`` pair, which requires the input entry to carry a
    sampling rate.

    Raises:
        RuntimeError: If a sound output type is requested for an entry
            that was not read as a ``(rate, array)`` pair.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))
    else:
        preprocessing = None

    write_as_sound = args.out_filetype in ["sound.hdf5", "sound"]

    with file_writer_helper(
            args.wspecifier,
            filetype=args.out_filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method,
    ) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            # Reset per utterance so a rate can never leak over from an
            # earlier entry.
            rate = None
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                rate, mat = mat

            if preprocessing is not None:
                mat = preprocessing(mat, uttid_list=utt)

            # shape = (Time, Channel)
            if write_as_sound:
                if rate is None:
                    # Previously this surfaced as an unbound-variable
                    # error; report the actual cause instead.
                    raise RuntimeError(
                        "Cannot write --out-filetype {}: no sampling rate "
                        "is available for utterance {}".format(
                            args.out_filetype, utt))
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt] = (rate, mat)
            else:
                writer[utt] = mat
예제 #6
0
def main():
    """Generate a waveform with WaveNet for every entry of ``args.rspecifier``.

    The model config (``model.conf``) and feature statistics
    (``stats.h5``) are loaded from the directory containing
    ``args.model``.  Each feature matrix is normalized, used as the
    auxiliary input of ``model.generate``, mu-law decoded, passed through
    the MLSA filter and written to ``<outdir>/<utt_id>.wav`` as int16
    samples at ``args.fs``.
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    # Create the output directory.  exist_ok avoids the check-then-create
    # race when several jobs start with the same outdir.
    os.makedirs(args.outdir, exist_ok=True)

    # load model config
    # NOTE(review): torch.load unpickles the file; only point this at
    # model directories from a trusted source.
    model_dir = os.path.dirname(args.model)
    train_args = torch.load(os.path.join(model_dir, "model.conf"))

    # load statistics
    scaler = StandardScaler()
    # The file is only read here, so open it read-only explicitly instead
    # of relying on the h5py version's default mode.
    with h5py.File(os.path.join(model_dir, "stats.h5"), "r") as f:
        scaler.mean_ = f["/melspc/mean"][()]
        scaler.scale_ = f["/melspc/scale"][()]
        # TODO(kan-bayashi): include following info as default
        coef = f["/mlsa/coef"][()]
        alpha = f["/mlsa/alpha"][()]

    # define MLSA filter for noise shaping
    mlsa_filter = TimeInvariantMLSAFilter(
        coef=coef,
        alpha=alpha,
        n_shift=args.n_shift,
    )

    # define model and load parameters
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = WaveNet(
        n_quantize=train_args.n_quantize,
        n_aux=train_args.n_aux,
        n_resch=train_args.n_resch,
        n_skipch=train_args.n_skipch,
        dilation_depth=train_args.dilation_depth,
        dilation_repeat=train_args.dilation_repeat,
        kernel_size=train_args.kernel_size,
        upsampling_factor=train_args.upsampling_factor,
    )
    # Weights are loaded onto the CPU first and moved to the device after.
    model.load_state_dict(torch.load(args.model, map_location="cpu")["model"])
    model.eval()
    model.to(device)

    for idx, (utt_id, lmspc) in enumerate(
            file_reader_helper(args.rspecifier, args.filetype), 1):
        logging.info("(%d) %s", idx, utt_id)

        # perform preprocessing
        x = encode_mu_law(np.zeros(
            (1)), mu=train_args.n_quantize)  # quantize initial seed waveform
        h = scaler.transform(lmspc)  # normalize features

        # convert to tensor
        x = torch.tensor(x, dtype=torch.long, device=device)  # (1,)
        h = torch.tensor(h, dtype=torch.float, device=device)  # (T, n_aux)

        # get length of waveform
        n_samples = (h.shape[0] - 1) * args.n_shift + args.n_fft

        # generate
        start_time = time.time()
        with torch.no_grad():
            y = model.generate(x, h, n_samples, interval=100)
        logging.info("generation speed = %s (sec / sample)",
                     (time.time() - start_time) / (len(y) - 1))
        y = decode_mu_law(y, mu=train_args.n_quantize)

        # apply mlsa filter for noise shaping
        y = mlsa_filter(y)

        # save as .wav file
        write(
            os.path.join(args.outdir, "%s.wav" % utt_id),
            args.fs,
            (y * np.iinfo(np.int16).max).astype(np.int16),
        )
예제 #7
0
def main():
    """Accumulate CMVN statistics and write them out.

    The grouping key depends on the output argument: when it contains
    ``":"`` the statistics are kept per speaker (if ``args.spk2utt`` is
    given) or per utterance and written as a table; otherwise a single
    global set of statistics is accumulated under the key ``None`` and
    saved to one file.

    For every key the result is an array whose row 0 holds the feature
    sums followed by the frame count, and whose row 1 holds the squared
    feature sums followed by ``0.0``.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    is_wspecifier = ":" in args.wspecifier_or_wxfilename

    if not is_wspecifier:
        logging.info("Performing as global CMVN mode")
        if args.spk2utt is not None:
            logging.warning("spk2utt is not used for global CMVN mode")

        def utt2spk(x):
            # Every utterance falls into the single global group.
            return None

        if args.out_filetype == "hdf5":
            logging.warning("--out-filetype hdf5 is not allowed for "
                            "Global CMVN mode, changing to npy")
            args.out_filetype = "npy"

    else:
        if args.spk2utt is None:
            logging.info("Performing as utterance CMVN mode")

            def utt2spk(x):
                # Each utterance is its own group.
                return x

        else:
            logging.info("Performing as speaker CMVN mode")
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    # "<spk> <utt1> <utt2> ..." -> one entry per utterance
                    speaker, utt_field = line.rstrip().split(None, 1)
                    utt2spk_dict.update(
                        dict.fromkeys(utt_field.split(), speaker))

            def utt2spk(x):
                return utt2spk_dict[x]

        if args.out_filetype == "npy":
            logging.warning("--out-filetype npy is allowed only for "
                            "Global CMVN mode, changing to hdf5")
            args.out_filetype = "hdf5"

    preprocessing = None
    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    n_utts = 0
    for utt, matrix in file_reader_helper(args.rspecifier, args.in_filetype):
        n_utts += 1
        if is_scipy_wav_style(matrix):
            # Sound entries arrive as Tuple[int, ndarray]; drop the rate.
            _, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Allocate the accumulators the first time a group is seen.
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix**2).sum(axis=0)
    logging.info("Processed {} utterances".format(n_utts))
    assert n_utts > 0, n_utts

    cmvn_stats = {}
    for spk, n_frames in counts.items():
        feat_sum = sum_feats[spk]
        feat_shape = feat_sum.shape
        stats_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        stats = np.empty(stats_shape, dtype=np.float64)

        # Row 0: feature sums, then the frame count in the last slot.
        stats[0, :-1] = feat_sum
        stats[0, -1] = n_frames
        # Row 1: squared feature sums, then an unused 0.0 in the last slot.
        stats[1, :-1] = square_sum_feats[spk]
        stats[1, -1] = 0.0

        # The mean and std can be recovered as follows,
        # >>> N = stats[0, -1]
        # >>> mean = stats[0, :-1] / N
        # >>> std = np.sqrt(stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(args.wspecifier_or_wxfilename,
                                filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat
        return

    # Global CMVN
    matrix = cmvn_stats[None]
    if args.out_filetype == "npy":
        np.save(args.wspecifier_or_wxfilename, matrix)
    elif args.out_filetype == "mat":
        # Kaldi supports only matrix or vector
        kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
    else:
        raise RuntimeError("Not supporting: --out-filetype {}".format(
            args.out_filetype))
예제 #8
0
def test_KaldiReader(tmpdir, filetype):
    """Round-trip two entries through the writer and reader helpers.

    Two arrays are written under the keys ``aaa`` and ``bbb`` with
    ``file_writer_helper`` and then read back with ``file_reader_helper``,
    both as data and as shapes, through the ark file and through the scp
    file.

    Args:
        tmpdir: Temporary directory fixture that holds the files written
            by the test.
        filetype: File type handed to both helpers.  Types containing
            ``"sound"`` are written as ``(rate, array)`` pairs.
    """
    ark = str(tmpdir.join("a.foo"))
    scp = str(tmpdir.join("a.scp"))
    # Keep the num-frames side file under tmpdir as well; a bare
    # "out.txt" would presumably be created in the current working
    # directory and left behind after the test run.
    num_frames = str(tmpdir.join("out.txt"))
    fs = 16000

    with file_writer_helper(
        wspecifier=f"ark,scp:{ark},{scp}",
        filetype=filetype,
        write_num_frames=f"ark,t:{num_frames}",
        compress=False,
        compression_method=2,
        pcm_format="wav",
    ) as writer:

        if "sound" in filetype:
            # Sound entries are 1-D int16 signals written with their rate.
            aaa = np.random.randint(-10, 10, 100, dtype=np.int16)
            bbb = np.random.randint(-10, 10, 50, dtype=np.int16)
            writer["aaa"] = fs, aaa
            writer["bbb"] = fs, bbb
        else:
            aaa = np.random.randn(10, 10)
            bbb = np.random.randn(13, 5)
            writer["aaa"] = aaa
            writer["bbb"] = bbb
        valid = {"aaa": aaa, "bbb": bbb}

    # 1. Test ark read (skipped for the plain "sound" filetype)
    if filetype != "sound":
        for key, value in file_reader_helper(
            f"ark:{ark}", filetype=filetype, return_shape=False
        ):
            if "sound" in filetype:
                assert_scipy_wav_style(value)
                # Drop the rate, keep the signal.
                value = value[1]
            np.testing.assert_array_equal(value, valid[key])
    # 2. Test scp read
    for key, value in file_reader_helper(
        f"scp:{scp}", filetype=filetype, return_shape=False
    ):
        if "sound" in filetype:
            assert_scipy_wav_style(value)
            value = value[1]
        np.testing.assert_array_equal(value, valid[key])

    # 3. Test ark shape read (skipped for the plain "sound" filetype)
    if filetype != "sound":
        for key, value in file_reader_helper(
            f"ark:{ark}", filetype=filetype, return_shape=True
        ):
            if "sound" in filetype:
                # Second element is compared against the array shape.
                value = value[1]
            np.testing.assert_array_equal(value, valid[key].shape)
    # 4. Test scp shape read
    for key, value in file_reader_helper(
        f"scp:{scp}", filetype=filetype, return_shape=True
    ):
        if "sound" in filetype:
            value = value[1]
        np.testing.assert_array_equal(value, valid[key].shape)