示例#1
0
def run(args):
    """
    Copy matrices from args.src_scp into the archive args.dst_ark,
    optionally transforming each one and/or stacking them all.

    args.op is a comma-separated list of operations that are applied to
    every matrix in the order given: "trans" (transpose), "log" (log of the
    matrix floored at EPSILON) and "minus" (1 - matrix). When "stack" is
    listed, nothing is written per key; instead all matrices are stacked
    vertically and written once under filekey(args.dst_ark).

    Raises:
        RuntimeError: if a non-empty item of args.op is not in supported_op
    """
    if args.src == "npy":
        src_reader = NumpyReader(args.src_scp)
    else:
        src_reader = MatReader(args.src_scp, args.key)

    ops = args.op.split(",")
    # empty items (e.g. produced by an empty option string) are tolerated
    for op in filter(None, ops):
        if op not in supported_op:
            raise RuntimeError(f"Unknown operation: {op}")

    # per-matrix transforms; any other op (such as "stack") is a no-op here
    transforms = {
        "trans": np.transpose,
        "log": lambda m: np.log(np.maximum(m, EPSILON)),
        "minus": lambda m: 1 - m,
    }
    do_stack = "stack" in ops
    pending = []
    total = 0
    with ArchiveWriter(args.dst_ark, args.scp) as writer:
        for total, (key, mat) in enumerate(src_reader, start=1):
            for op in ops:
                transform = transforms.get(op)
                if transform is not None:
                    mat = transform(mat)
            if do_stack:
                pending.append(mat)
            else:
                writer.write(key, mat)
        if do_stack:
            merged = np.vstack(pending)
            writer.write(filekey(args.dst_ark), merged)
            logger.info("Merge {0} matrix into archive {1}, shape as "
                        "{2[0]}x{2[1]}".format(total, args.dst_ark,
                                               merged.shape))
    if not do_stack:
        logger.info("Copy {0} matrix into archive {1}".format(
            total, args.dst_ark))
示例#2
0
def run(args):
    """
    Compute a mask for every utterance of args.speech_scp that also has an
    entry in args.noise_scp and write it to args.mask_ark.

    When args.cutoff is positive, mask values above it are clipped to the
    cutoff (the number of clipped items is logged) and negative values are
    floored at zero. Speech utterances without a noise entry are skipped
    silently.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }
    speech_reader = SpectrogramReader(args.speech_scp, **stft_kwargs)
    bnoise_reader = SpectrogramReader(args.noise_scp, **stft_kwargs)

    upper = args.cutoff
    num_utts = 0
    with ArchiveWriter(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in bnoise_reader:
                continue
            num_utts += 1
            mask = compute_mask(speech, bnoise_reader[key], args.mask)
            if upper > 0:
                # count before clipping, otherwise nothing exceeds the cutoff
                num_items = np.sum(mask > upper)
                mask = np.minimum(mask, upper)
                if num_items:
                    logger.info("Clip {:d} items for utterance {}".format(
                        num_items, key))
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(num_utts))
示例#3
0
def run(args):
    """
    Copy matrices from args.src_scp into the archive args.dst_ark.

    Each matrix is optionally transposed (args.transpose), replaced by the
    log of itself floored at EPSILON (args.apply_log) and replaced by
    1 - matrix (args.minus_by_one), in that order. With args.merge set, the
    matrices are not written per key but stacked vertically and written once
    under filekey(args.dst_ark).
    """
    if args.matlab:
        src_reader = MatReader(args.src_scp, args.key)
    else:
        src_reader = NumpyReader(args.src_scp)

    collected = []
    total = 0
    with ArchiveWriter(args.dst_ark, args.scp) as writer:
        for total, (key, mat) in enumerate(src_reader, start=1):
            if args.transpose:
                mat = np.transpose(mat)
            if args.apply_log:
                mat = np.log(np.maximum(mat, EPSILON))
            if args.minus_by_one:
                mat = 1 - mat
            if args.merge:
                collected.append(mat)
            else:
                writer.write(key, mat)
        if args.merge:
            merged = np.vstack(collected)
            writer.write(filekey(args.dst_ark), merged)
            logger.info(
                "Merge {0} matrix into archive {1}, shape as {2[0]}x{2[1]}".
                format(total, args.dst_ark, merged.shape))
    if not args.merge:
        logger.info("Copy {0} matrix into archive {1}".format(
            total, args.dst_ark))
示例#4
0
def run(args):
    """
    Compute directional features for the utterances in args.wav_scp and
    write them to args.dup_ark.

    The steer vector index is looked up per utterance in --utt2idx when that
    option is given (utterances missing from it are skipped with a warning).
    Otherwise the comma-separated indices in --doa-idx are used for every
    utterance; with several indices the per-index features are stacked and
    reshaped so that they sit side by side along the last axis.

    Raises:
        RuntimeError: if no pair could be parsed from args.df_pair
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    stft_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.utt2idx:
        utt2idx = ScpReader(args.utt2idx, value_processor=int)
        doa_idx = None
        logger.info(f"Using --utt2idx={args.utt2idx}")
    else:
        utt2idx = None
        logger.info(f"Using --doa-idx={args.doa_idx}")
        # the indices are identical for every utterance: parse them once
        doa_idx = [int(v) for v in args.doa_idx.split(",")]

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # report the option that was actually parsed (args has no "pair")
        raise RuntimeError(
            f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    # A x M x F
    steer_vector = np.load(args.steer_vector)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, stft in stft_reader:
            # sv: M x F
            if utt2idx is None:
                dfs = [
                    directional_feats(stft, steer_vector[i], df_pair=df_pair)
                    for i in doa_idx
                ]
                if len(dfs) == 1:
                    df = dfs[0]
                else:
                    # N x T x F
                    dfs = np.stack(dfs)
                    # => T x (N * F)
                    df = dfs.transpose(1, 0, 2).reshape(dfs.shape[1], -1)
            elif key in utt2idx:
                # stft: M x F x T
                df = directional_feats(stft,
                                       steer_vector[utt2idx[key]],
                                       df_pair=df_pair)
            else:
                logger.warning(f"Missing utt2idx for utterance {key}")
                continue
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(stft_reader):d}")
示例#5
0
def run(args):
    """
    Compute the averaged gcc_phat_diag response over the microphone pairs
    in args.diag_pair for every utterance of args.wav_scp and write it to
    args.srp_ark.

    args.diag_pair is a ";"-separated list of "i,j" index pairs. For each
    pair, gcc_phat_diag is evaluated on stft_mat[i] and stft_mat[j]; the
    results are averaged over all pairs.

    Raises:
        RuntimeError: if no pair could be parsed from args.diag_pair, or if
            the averaged result of an utterance contains NaN values
    """
    srp_pair = [
        tuple(map(int, p.split(","))) for p in args.diag_pair.split(";")
    ]
    if not srp_pair:
        # report the option that was actually parsed (args has no "pair")
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    # loop invariant: number of frequency bins handed to gcc_phat_diag
    num_bins = num_ffts // 2 + 1
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            # N x T x F
            srp = [
                gcc_phat_diag(stft_mat[i],
                              stft_mat[j],
                              min(i, j) * np.pi * 2 / args.n,
                              args.d,
                              num_bins=num_bins,
                              sr=args.sr,
                              num_doa=args.num_doa) for (i, j) in srp_pair
            ]
            srp = sum(srp) / len(srp_pair)
            num_nan = int(np.sum(np.isnan(srp)))
            if num_nan:
                # the previous format string had an unbalanced "}", which
                # made .format() itself fail with ValueError
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, num_nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processd {:d} utterances done".format(len(reader)))
示例#6
0
def run(args):
    """
    Compute a mask for every utterance of args.speech_scp that also has an
    entry in args.denorm_scp and write it to args.mask_ark.

    3-D spectrograms are reduced to their first entry along axis 0 before
    the mask is computed. When args.cutoff is positive, values above it are
    clipped to the cutoff; negative values are always floored at zero. Both
    clipping steps are logged. Utterances without a denorm entry are skipped
    with a warning.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }

    speech_reader = SpectrogramReader(args.speech_scp, **stft_kwargs)
    denorm_reader = SpectrogramReader(args.denorm_scp, **stft_kwargs)

    num_utts = 0
    cutoff = args.cutoff
    with ArchiveWriter(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in denorm_reader:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "Missing bg-noise for utterance {}".format(key))
                continue
            num_utts += 1
            denorm = denorm_reader[key]
            mask = compute_mask(speech[0] if speech.ndim == 3 else speech,
                                denorm[0] if denorm.ndim == 3 else denorm,
                                args.mask)
            # iam, psm, psa
            if cutoff > 0:
                # count before clipping, otherwise nothing exceeds the cutoff
                num_items = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_items:
                    percent = float(num_items) / mask.size
                    logger.info(
                        "Clip {:d}({:.2f}) items over {:.2f} for utterance {}"
                        .format(num_items, percent, cutoff, key))
            # psm, psa
            below_zero = mask < 0
            num_items = np.sum(below_zero)
            if num_items:
                percent = float(num_items) / mask.size
                average = np.sum(mask[below_zero]) / num_items
                logger.info(
                    "Clip {:d}({:.2f}, {:.2f}) items below zero for utterance {}"
                    .format(num_items, percent, average, key))
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """
    Read magnitude spectrogram features for every utterance of args.wav_scp
    and write them to args.dup_ark.

    Log, power and normalization are controlled by args.apply_log,
    args.apply_pow and args.normalize. For 3-D features only the first entry
    along axis 0 is written.
    """
    reader_conf = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "apply_log": args.apply_log,
        "apply_pow": args.apply_pow,
        "normalize": args.normalize,
        "apply_abs": True,
        "transpose": True  # T x F
    }
    reader = SpectrogramReader(args.wav_scp, **reader_conf)

    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, feats in reader:
            # default using ch1 in multi-channel case
            if feats.ndim == 3:
                feats = feats[0]
            writer.write(key, feats)
    logger.info("Process {:d} utterances".format(len(reader)))
示例#8
0
def run(args):
    """
    Compute mask-based directional features for the utterances in
    args.wav_scp and write them to args.dup_ark.

    For every utterance that has a TF-mask in args.mask_scp, the mask
    (capped at 1) is used to estimate a speech covariance matrix, from which
    a steer vector is derived and passed to directional_feats together with
    the pairs parsed from args.df_pair. Utterances without a mask are
    skipped with a warning.

    Raises:
        RuntimeError: if no pair could be parsed from args.df_pair
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # Only round the FFT size up when the reader is asked to do the same.
    # NOTE(review): assumes the reader uses frame_len as the FFT size when
    # round_power_of_two is off - confirm against SpectrogramReader.
    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # report the option that was actually parsed (args has no "pair")
        raise RuntimeError(
            f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T, df_pair=df_pair)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
示例#9
0
def run(args):
    """
    Compute spatial features with compute_spatial_feats for every utterance
    of args.wav_scp and write them to args.dup_ark.

    The number of processed utterances is logged together with the
    upper-cased args.type.
    """
    reader_conf = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    reader = SpectrogramReader(args.wav_scp, **reader_conf)

    total = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        # spectrogram: shape NxTxF, written features: T x F
        for total, (key, spectrogram) in enumerate(reader, start=1):
            writer.write(key, compute_spatial_feats(args, spectrogram))
    logger.info("Processed {} for {:d} utterances".format(
        args.type.upper(), total))
示例#10
0
def run(args):
    """
    Read magnitude spectrogram features for every utterance of args.wav_scp
    and write them unchanged to args.dup_ark.

    Log, power and normalization are controlled by args.apply_log,
    args.apply_pow and args.normalize.
    """
    reader_conf = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "apply_log": args.apply_log,
        "apply_pow": args.apply_pow,
        "normalize": args.normalize,
        "apply_abs": True,
        "transpose": True  # T x F
    }
    reader = SpectrogramReader(args.wav_scp, **reader_conf)

    total = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for total, (key, feats) in enumerate(reader, start=1):
            writer.write(key, feats)
    logger.info("Process {:d} utterances".format(total))
示例#11
0
def run(args):
    """
    Compute spatial features with compute_spatial_feats for every utterance
    of args.wav_scp and write them to args.dup_ark.

    Progress is logged every 1000 utterances; the final log line reports the
    total together with the upper-cased args.type.
    """
    reader_conf = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    reader = SpectrogramReader(args.wav_scp, **reader_conf)

    num_utts = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        # spectrogram: shape NxTxF, written features: T x F
        for num_utts, (key, spectrogram) in enumerate(reader, start=1):
            writer.write(key, compute_spatial_feats(args, spectrogram))
            if num_utts % 1000 == 0:
                logger.info(f"Processed {num_utts} utterance...")
    logger.info(f"Processed {args.type.upper()} for {num_utts} utterances")
示例#12
0
def run(args):
    """
    Compute mask-based directional features for the utterances in
    args.wav_scp and write them to args.dup_ark.

    For every utterance that has a TF-mask in args.mask_scp, the mask
    (capped at 1) is used to estimate a speech covariance matrix, from which
    a steer vector is derived and passed to directional_feats. Utterances
    without a mask are skipped with a warning.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # Only round the FFT size up when the reader is asked to do the same.
    # NOTE(review): assumes the reader uses frame_len as the FFT size when
    # round_power_of_two is off - confirm against SpectrogramReader.
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    num_bins = num_ffts // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """
    Compute mask-based directional features for the utterances in
    args.wav_scp and write them to args.dup_ark.

    Masks are read from args.mask_scp with NumpyReader when args.numpy is
    set, otherwise with ScriptReader, and are transposed first when
    args.trans is set. Each mask (capped at 1) is used to estimate a speech
    covariance matrix, from which a steer vector is derived and passed to
    directional_feats. Utterances without a mask are skipped with a warning.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
示例#14
0
def run(args):
    """
    Compute mel filter-bank features for every utterance of args.wav_scp
    and write them to args.dup_ark.

    The magnitude spectrogram (power and normalization controlled by
    args.apply_pow / args.normalize) is multiplied by a mel weight matrix
    built from args.num_bins, args.min_freq and args.max_freq; the log,
    floored at EPSILON, is applied afterwards when args.apply_log is set.
    For 3-D spectrograms only the first entry along axis 0 is used.

    Raises:
        RuntimeError: if args.max_freq exceeds half of args.samp_freq
    """
    mel_kwargs = {
        "n_mels": args.num_bins,
        "fmin": args.min_freq,
        "fmax": args.max_freq,
        "htk": True
    }
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "apply_log": False,  # log is applied on the mel outputs instead
        "apply_pow": args.apply_pow,
        "normalize": args.normalize,
        "apply_abs": True,
        "transpose": False  # F x T
    }

    if args.max_freq > args.samp_freq // 2:
        raise RuntimeError("Max frequency for mel exceeds sample frequency")
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # N x F
    # sr / n_fft are passed by keyword so the call keeps working where they
    # are keyword-only.
    # NOTE(review): assumes audio_lib is librosa (keyword-only since 0.10) -
    # confirm against the file's imports.
    mel_weights = audio_lib.filters.mel(sr=args.samp_freq,
                                        n_fft=nfft(args.frame_length),
                                        **mel_kwargs)

    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spectrum in spectrogram_reader:
            if spectrum.ndim == 3:
                spectrum = spectrum[0]
            # N x F * F x T = N * T => T x N
            fbank = np.transpose(np.dot(mel_weights, spectrum))
            if args.apply_log:
                fbank = np.log(np.maximum(fbank, EPSILON))
            writer.write(key, fbank)
    logger.info("Process {:d} utterances".format(len(spectrogram_reader)))