示例#1
0
def run(args):
    """Write a dominant-speaker label map for every mixture utterance.

    Readers are created with ``apply_abs=True``. For each mixture, bins
    lying more than ``args.beta`` dB below the utterance maximum are
    labelled -1 (silence); every other bin holds the index of the speaker
    reader whose value at that bin is the largest.

    Raises:
        RuntimeError: if ``args.spks`` lists fewer than two entries.
    """
    # shape: T x F
    reader_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,
        apply_abs=True,
    )
    speaker_scps = args.spks.split(",")
    if len(speaker_scps) < 2:
        raise RuntimeError("Please give at least 2 speakers")
    mixtures = SpectrogramReader(args.mix, **reader_conf)
    speakers = [SpectrogramReader(scp, **reader_conf) for scp in speaker_scps]

    with NumpyWriter(args.dir) as writer:
        for key, mix in mixtures:
            num_frames, num_bins = mix.shape
            masks = np.zeros_like(mix, dtype=np.float32)
            # silence (-1): bins far below the loudest bin, measured in dB;
            # EPSILON floors the input so log10 never sees zero
            mix_db = 20 * np.log10(np.maximum(mix, EPSILON))
            silent = mix_db < (np.max(mix_db) - args.beta)
            masks[silent] = -1
            coverage = np.sum(silent) * 100 / (num_frames * num_bins)
            logger.info("For {}, silence covered {:.2f}%".format(
                key, coverage))
            # remaining bins: index of the speaker with the largest value
            per_speaker = np.stack([spk[key] for spk in speakers])
            dominant = np.argmax(per_speaker, axis=0)
            active = ~silent
            masks[active] = dominant[active]
            writer.write(key, masks)
    logger.info("Processed {:d} utterances done".format(len(mixtures)))
示例#2
0
def run(args):
    """Multiply each spectrogram by its mask and write the resulting wave.

    Utterances without an entry in the mask reader are skipped. The mask is
    optionally transposed (``args.transpose``) and must then have the same
    shape as the spectrogram.

    Raises:
        ValueError: if mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    specs_reader = SpectrogramReader(args.wav_scp, **stft_conf)
    if args.numpy:
        masks = NumpyReader(args.mask_scp)
    else:
        masks = ScriptReader(args.mask_scp)

    processed = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in specs_reader:
            if key not in masks:
                continue
            processed += 1
            mask = masks[key]
            if args.transpose:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            # only ask for the original length when it must be kept
            nsamps = None
            if args.keep_length:
                nsamps = specs_reader.nsamps(key)
            norm = specs_reader.samp_norm(key)
            masked = specs * mask
            samps = istft(masked, **stft_conf, norm=norm, nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        processed, len(specs_reader)))
示例#3
0
def run(args):
    """Multiply each spectrogram by its mask and hand the result to istft.

    Utterances without an entry in the mask reader are skipped. The mask is
    optionally transposed (``args.transpose``) and must then have the same
    shape as the spectrogram. ``istft`` receives the destination path
    ``<dst_dir>/<key>.wav`` as its first argument (presumably it writes the
    wave file itself -- confirm against its definition).

    Raises:
        ValueError: if mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader = NumpyReader(args.mask_scp) if args.numpy else ScriptReader(
        args.mask_scp)

    num_utts = 0
    # NOTE: a trailing comma here used to turn the sample rate into a
    # 1-tuple, which was then forwarded to istft as ``fs``.
    fs = args.samp_freq
    for key, specs in spectrogram_reader:
        if key not in mask_reader:
            continue
        num_utts += 1
        mask = mask_reader[key]
        if args.transpose:
            mask = np.transpose(mask)
        logger.info("Processing utterance {}...".format(key))
        if mask.shape != specs.shape:
            raise ValueError(
                "Dimention mismatch between mask and spectrogram"
                "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                .format(mask.shape, specs.shape))
        nsamps = spectrogram_reader.nsamps(
            key) if args.keep_length else None
        istft(os.path.join(args.dst_dir, "{}.wav".format(key)),
              specs * mask,
              **stft_kwargs,
              fs=fs,
              nsamps=nsamps)
    logger.info("Processed {} utterances".format(num_utts))
示例#4
0
def run(args):
    """Compute a mask for every utterance present in both readers.

    When ``args.cutoff`` is positive the mask is limited to the range
    ``[0, cutoff]`` and the number of entries above the cutoff is logged;
    otherwise the mask is written exactly as ``compute_mask`` returned it.
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
    )

    speech_reader = SpectrogramReader(args.speech_scp, **stft_conf)
    noise_reader = SpectrogramReader(args.noise_scp, **stft_conf)

    processed = 0
    cutoff = args.cutoff
    with ArchiveWriter(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in noise_reader:
                continue
            processed += 1
            mask = compute_mask(speech, noise_reader[key], args.mask)
            if cutoff > 0:
                # count before clipping, otherwise nothing exceeds the cutoff
                num_clipped = np.sum(mask > cutoff)
                if num_clipped:
                    logger.info("Clip {:d} items for utterance {}".format(
                        num_clipped, key))
                mask = np.maximum(np.minimum(mask, cutoff), 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(processed))
示例#5
0
def run(args):
    """Run a DSBeamformer with one fixed DoA over every utterance.

    A non-positive ``args.doa`` is shifted by +180 before validation; the
    resulting angle has to lie within [0, 180].

    Raises:
        RuntimeError: if the (shifted) DoA is outside [0, 180].
    """
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )
    topo = [float(pos) for pos in args.linear_topo.split(",")]
    if args.doa > 0:
        doa = args.doa
    else:
        doa = 180 + args.doa
    if doa < 0 or doa > 180:
        # the message reports the value as given on the command line
        raise RuntimeError("Illegal value for DoA: {:.2f}".format(args.doa))

    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_conf)
    beamformer = DSBeamformer(topo)
    logger.info("Initialize {:d} channel DSBeamformer".format(len(topo)))

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in reader:
            stft_enh = beamformer.run(doa,
                                      stft_src,
                                      c=args.speed,
                                      sample_rate=args.fs)
            power = reader.power(key)
            writer.write(key, istft(stft_enh, **stft_conf, power=power))
    logger.info("Processed {:d} utterances".format(len(reader)))
示例#6
0
def run(args):
    """Apply fixed beamforming weights to every utterance.

    ``args.weights`` is loaded with ``np.load``. A 2-D array yields a single
    beamformer used for all utterances; otherwise one beamformer is built
    per leading-axis entry and ``args.beam`` must map each utterance key to
    the integer index of the beam to use.

    Raises:
        RuntimeError: if the weights hold several beams but ``args.beam``
            is not given.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    # F x N or B x F x N
    weights = np.load(args.weights)
    if weights.ndim == 2:
        beamformer = FixedBeamformer(weights)
        beam_index = None
    else:
        if not args.beam:
            raise RuntimeError(
                "--beam must be assigned, as there are multiple beams")
        beamformer = [FixedBeamformer(w) for w in weights]
        # convert each scp value to int (the previous lambda returned the
        # ``int`` type itself, which cannot be used as a list index)
        beam_index = ScpReader(args.beam, value_processor=int)
    with WaveWriter(args.dst_dir) as writer:
        for key, stft_mat in spectrogram_reader:
            logger.info(f"Processing utterance {key}...")
            # Branch on whether a beam table exists. Testing ``beamformer``
            # for truthiness also matched the single-beamformer case and
            # then indexed ``beam_index`` while it was None.
            if beam_index is not None:
                beam = beam_index[key]
                stft_enh = beamformer[beam].run(stft_mat)
            else:
                stft_enh = beamformer.run(stft_mat)
            norm = spectrogram_reader.maxabs(key)
            samps = inverse_stft(stft_enh, **stft_kwargs, norm=norm)
            writer.write(key, samps)
    logger.info(f"Processed {len(spectrogram_reader):d} utterances")
示例#7
0
def run(args):
    """Separate each utterance with auxiva and write one wav per source.

    Output files are named ``<key>.SRC<n>.wav`` (n starting at 1) inside
    ``args.dst_dir``.
    """
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=True,  # F x T instead of T x F
    )

    reader = SpectrogramReader(args.wav_scp, **stft_conf)

    for key, spectrogram in reader:
        logger.info("Processing utterance {}...".format(key))
        separated = auxiva(spectrogram, args.epochs)
        num_sources = separated.shape[0]
        for src_id in range(1, num_sources + 1):
            samps = istft(separated[src_id - 1],
                          **stft_conf,
                          norm=reader.samp_norm(key))
            dst_name = "{}.SRC{:d}.wav".format(key, src_id)
            write_wav(os.path.join(args.dst_dir, dst_name),
                      samps,
                      fs=args.fs)
    logger.info("Processed {:d} utterances".format(len(reader)))
示例#8
0
def run(args):
    """Mask each spectrogram, optionally borrowing phase from a reference.

    Utterances without a mask are skipped. A mask whose first dimension
    equals the spectrogram's second dimension is transposed. When
    ``args.phase_ref`` is set, the output is built from the spectrogram
    magnitude, the mask, and the phase of the reference spectrogram;
    otherwise the spectrogram is simply multiplied by the mask.

    Raises:
        ValueError: if mask and spectrogram shapes differ.
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )
    specs_reader = SpectrogramReader(
        args.wav_scp,
        **stft_conf,
        round_power_of_two=args.round_power_of_two)
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_conf,
            round_power_of_two=args.round_power_of_two)
        logger.info("Using phase reference from {}".format(args.phase_ref))
    else:
        phase_reader = None
    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = reader_by_fmt[args.fmt](args.mask_scp)

    processed = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, specs in specs_reader:
            # specs: T x F
            if key not in mask_reader:
                continue
            processed += 1
            mask = mask_reader[key]
            # make sure the mask is T x F as well
            num_bins = specs.shape[1]
            if mask.shape[0] == num_bins:
                mask = np.transpose(mask)
            logger.info("Processing utterance {}...".format(key))
            if mask.shape != specs.shape:
                raise ValueError(
                    "Dimention mismatch between mask and spectrogram"
                    "({0[0]} x {0[1]} vs {1[0]} x {1[1]}), need check configures"
                    .format(mask.shape, specs.shape))
            nsamps = None
            if args.keep_length:
                nsamps = specs_reader.nsamps(key)
            norm = specs_reader.samp_norm(key)
            if phase_reader is None:
                enhanced = specs * mask
            else:
                # use phase from ref
                ref_angle = np.angle(phase_reader[key])
                enhanced = np.abs(specs) * mask * np.exp(ref_angle * 1j)
            samps = istft(enhanced, **stft_conf, norm=norm, nsamps=nsamps)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances over {:d}".format(
        processed, len(specs_reader)))
示例#9
0
def run(args):
    """Run a linear or circular SD beamformer over every utterance.

    The DoA is either read per utterance from ``args.utt2doa`` or taken
    from ``args.doa`` for all utterances. With per-utterance DoAs,
    utterances that are missing from the table or fail ``check_doa`` are
    skipped.
    """
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    if args.geometry == "linear":
        topo = str2tuple(args.linear_topo)
        beamformer = LinearSDBeamformer(topo)
        logger.info(f"Initialize LinearSDBeamformer for array: {topo}")
    else:
        beamformer = CircularSDBeamformer(args.circular_radius,
                                          args.circular_around,
                                          center=args.circular_center)
        logger.info("Initialize CircularSDBeamformer for "
                    f"radius = {args.circular_radius}, "
                    f"center = {args.circular_center}")

    if args.utt2doa:
        doa = None
        utt2doa = ScpReader(args.utt2doa, value_processor=lambda x: float(x))
        logger.info(f"Use --utt2doa={args.utt2doa} for each utterance")
    else:
        utt2doa = None
        doa = args.doa
        # NOTE(review): an invalid global DoA is only logged here and is
        # still used below -- confirm this is intended.
        if not check_doa(args.geometry, doa):
            logger.info(f"Invalid doa {doa:.2f} for {args.geometry} array")
        logger.info(f"Use --doa={doa:.2f} for all utterances")

    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_conf)

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_src in reader:
            if utt2doa:
                if key not in utt2doa:
                    continue
                doa = utt2doa[key]
                if not check_doa(args.geometry, doa):
                    logger.info(f"Invalid DoA {doa:.2f} for utterance {key}")
                    continue
            stft_enh = beamformer.run(doa, stft_src, c=args.speed, sr=args.sr)
            num_done += 1
            norm = reader.maxabs(key)
            writer.write(key, inverse_stft(stft_enh, **stft_conf, norm=norm))
    logger.info(f"Processed {num_done} utterances over {len(reader)}")
示例#10
0
def run(args):
    """Run a LinearDSBeamformer over every utterance.

    The DoA comes either from ``args.utt2doa`` (per utterance) or from
    ``args.doa`` (shared). Negative angles are shifted by +180 and the
    result must lie within [0, 180]. An invalid shared DoA aborts; an
    invalid or missing per-utterance DoA only skips that utterance.

    Raises:
        RuntimeError: if the shared ``--doa`` is out of range.
    """
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    if args.utt2doa:
        doa = None
        utt2doa = ScpReader(args.utt2doa, value_processor=lambda x: float(x))
        logger.info(f"Use utt2doa {args.utt2doa} for each utterance")
    else:
        utt2doa = None
        doa = args.doa
        if doa < 0:
            doa = 180 + doa
        if doa < 0 or doa > 180:
            raise RuntimeError(f"Invalid doa {doa:.2f} for --doa")
        logger.info(f"Use DoA {doa:.2f} for all utterances")

    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_conf)

    num_done = 0
    topo = str2tuple(args.linear_topo)
    beamformer = LinearDSBeamformer(topo)
    logger.info(f"Initialize channel LinearDSBeamformer for array: {topo}")

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in reader:
            if utt2doa:
                if key not in utt2doa:
                    continue
                doa = utt2doa[key]
                # same normalisation and range check as for --doa
                if doa < 0:
                    doa = 180 + doa
                if doa < 0 or doa > 180:
                    logger.info(f"Invalid doa {doa:.2f} for utterance {key}")
                    continue
            stft_enh = beamformer.run(doa, stft_src, c=args.speed, sr=args.fs)
            num_done += 1
            norm = reader.maxabs(key)
            writer.write(key, inverse_stft(stft_enh, **stft_conf, norm=norm))
    logger.info(f"Processed {num_done} utterances over {len(reader)}")
示例#11
0
def run(args):
    """Apply fixed beamformer weights loaded from a .mat file.

    The weights are looked up in the ``loadmat`` result under
    ``args.weight_key``; the enhanced signal is written without
    normalization.

    Raises:
        KeyError: if ``args.weight_key`` is not present in the loaded file.
    """
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )
    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_conf)
    mat_content = loadmat(args.weights)
    if args.weight_key not in mat_content:
        raise KeyError("Weight key error: no '{}' in {}".format(
            args.weight_key, args.weights))
    weights = mat_content[args.weight_key]

    beamformer = FixedBeamformer(weights)
    with WaveWriter(args.dump_dir) as writer:
        for key, stft_mat in reader:
            logger.info("Processing utterance {}...".format(key))
            stft_enh = beamformer.run(stft_mat)
            # do not normalize
            writer.write(key, istft(stft_enh, **stft_conf))
    logger.info("Processed {:d} utterances".format(len(reader)))
示例#12
0
def run(args):
    """Dereverberate each utterance with wpe and write one wav per channel.

    Output files are named ``<key>.CH<n>.wav`` (n starting at 1) inside
    ``args.dst_dir``, which is created when it does not exist.
    """
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=True,  # T x F
    )
    wpe_conf = dict(
        taps=args.taps,
        delay=args.delay,
        iters=args.iters,
        psd_context=args.context,
    )
    reader = SpectrogramReader(args.wav_scp, **stft_conf)

    if not os.path.exists(args.dst_dir):
        os.makedirs(args.dst_dir)

    for key, reverbed in reader:
        # N x T x F => F x N x T, which is the layout handed to wpe
        freq_major = np.transpose(reverbed, [2, 0, 1])
        # wpe output F x N x T => N x T x F
        dereverb = np.transpose(wpe(freq_major, **wpe_conf), [1, 2, 0])
        # write for each channel
        for chid, channel in enumerate(dereverb, start=1):
            samps = istft(channel, **stft_conf)
            dst_name = "{}.CH{:d}.wav".format(key, chid)
            write_wav(os.path.join(args.dst_dir, dst_name),
                      samps,
                      fs=args.samp_freq)
    logger.info("Processed {:d} utterances".format(len(reader)))
示例#13
0
def run(args):
    """Save a figure for the spectrogram of every utterance.

    Figures go to ``args.cache_dir`` (created if needed); dots in the
    utterance key are replaced by dashes in the output name. For 3-D input
    a non-negative ``args.index`` selects the entry to plot.
    """
    out_dir = Path(args.cache_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    reader_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_abs=True,
        apply_log=True,
        transpose=True,
    )
    reader = SpectrogramReader(args.wav_scp, **reader_conf)

    for key, mat in reader:
        pick_one = mat.ndim == 3 and args.index >= 0
        if pick_one:
            mat = mat[args.index]
        figure_path = out_dir / key.replace(".", "-")
        save_figure(key,
                    mat,
                    figure_path,
                    cmap=args.cmap,
                    hop=args.frame_hop,
                    sr=args.sr,
                    title=args.title)
示例#14
0
def run(args):
    """Train a CgmmTrainer per utterance and save the masks as .npy files.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. A RuntimeError raised while training or saving is logged and
    the utterance is counted as failed.
    """
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    if not os.path.exists(args.dst_dir):
        os.makedirs(args.dst_dir)

    reader = SpectrogramReader(args.wav_scp, **stft_conf)

    trained = 0
    for key, stft in reader:
        mask_file = os.path.join(args.dst_dir, "{}.npy".format(key))
        if os.path.exists(mask_file):
            logger.info("Training utterance {} ... Skip".format(key))
            continue
        # stft: N x F x T
        trainer = CgmmTrainer(stft)
        try:
            speech_masks = trainer.train(args.num_epochs)
            trained += 1
            # np.save adds the ".npy" suffix to the given path
            np.save(os.path.join(args.dst_dir, key),
                    speech_masks.astype(np.float32))
            logger.info("Training utterance {} ... Done".format(key))
        except RuntimeError:
            logger.warn("Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        trained, len(reader)))
def run(args):
    """Train a CgmmTrainer per utterance, optionally from an initial mask.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. When ``args.init_mask`` is given and contains the utterance,
    that mask is passed to the trainer as ``Ms``. A RuntimeError raised
    while training or writing is logged and the utterance counts as failed.
    """
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    reader = SpectrogramReader(args.wav_scp, **stft_conf)
    if args.init_mask:
        init_masks = ScriptReader(args.init_mask)
    else:
        init_masks = None

    trained = 0
    with NumpyWriter(args.dst_dir) as writer:
        for key, stft in reader:
            mask_file = os.path.join(args.dst_dir, "{}.npy".format(key))
            if os.path.exists(mask_file):
                logger.info("Training utterance {} ... Skip".format(key))
                continue
            init_mask = None
            if init_masks and key in init_masks:
                init_mask = init_masks[key]
                logger.info("Using external speech mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft, Ms=init_mask)
            try:
                speech_masks = trainer.train(args.num_epochs)
                trained += 1
                writer.write(key, speech_masks.astype(np.float32))
                logger.info("Training utterance {} ... Done".format(key))
            except RuntimeError:
                logger.warn("Training utterance {} ... Failed".format(key))
    logger.info("Train {:d} utterances over {:d}".format(
        trained, len(reader)))
示例#16
0
def run(args):
    """Compute and write a mask for each utterance found in both readers.

    For 3-D spectrograms only the first entry is used. With a positive
    ``args.cutoff`` the mask is limited from above; negative entries are
    always replaced by zero. Both clipping steps are logged with counts.
    Utterances missing from the second reader produce a warning.
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
    )

    speech_reader = SpectrogramReader(args.speech_scp, **stft_conf)
    denorm_reader = SpectrogramReader(args.denorm_scp, **stft_conf)

    processed = 0
    cutoff = args.cutoff
    writer_cls = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}[args.format]

    def first_channel(mat):
        # 3-D input: keep only the leading entry
        return mat[0] if mat.ndim == 3 else mat

    with writer_cls(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in denorm_reader:
                logger.warn("Missing bg-noise for utterance {}".format(key))
                continue
            processed += 1
            denorm = denorm_reader[key]
            mask = compute_mask(first_channel(speech),
                                first_channel(denorm),
                                args.mask)
            # iam, psm, psa
            if cutoff > 0:
                num_over = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_over:
                    percent = float(num_over) / mask.size
                    logger.info(
                        "Clip {:d}({:.2f}) items over {:.2f} for utterance {}"
                        .format(num_over, percent, cutoff, key))
            # psm, psa
            num_below = np.sum(mask < 0)
            if num_below:
                percent = float(num_below) / mask.size
                average = np.sum(mask[mask < 0]) / num_below
                logger.info(
                    "Clip {:d}({:.2f}, {:.2f}) items below zero for utterance {}"
                    .format(num_below, percent, average, key))
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(processed))
示例#17
0
def run(args):
    """Compute and write a mask for each clean/noisy utterance pair.

    For 3-D spectrograms only the first entry is used. With a positive
    ``args.cutoff`` the mask is limited from above; negative entries are
    always replaced by zero. Both clipping steps are logged with counts.
    Clean utterances without a noisy counterpart produce a warning.
    """
    # shape: T x F, complex
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
    )

    clean_reader = SpectrogramReader(args.clean_scp, **stft_conf)
    noisy_reader = SpectrogramReader(args.noisy_scp, **stft_conf)

    processed = 0
    cutoff = args.cutoff
    writer_cls = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}[args.format]

    with writer_cls(args.mask_ark, args.scp) as writer:
        for key, clean in clean_reader:
            if key not in noisy_reader:
                logger.warn(f"Missing bg-noise for utterance {key}")
                continue
            processed += 1
            noisy = noisy_reader[key]
            if clean.ndim == 3:
                clean = clean[0]
            if noisy.ndim == 3:
                noisy = noisy[0]
            mask = compute_mask(clean, noisy, args.mask)
            # iam, psm, psa
            if cutoff > 0:
                num_over = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_over:
                    percent = float(num_over) / mask.size
                    logger.info(f"Clip {num_over:d}({percent:.2f}) items over "
                                f"{cutoff:.2f} for utterance {key}")
            # psm, psa
            negative = mask < 0
            num_below = np.sum(negative)
            if num_below:
                percent = float(num_below) / mask.size
                average = np.sum(mask[negative]) / num_below
                logger.info(f"Clip {num_below}({percent:.2f}, {average:.2f}) "
                            f"items below zero for utterance {key}")
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info(f"Processed {processed} utterances")
def run(args):
    """Run a mask-driven beamformer (mvdr / gevd / pmwf) per utterance.

    Utterances without a speech mask are skipped. ``istft`` receives the
    destination path ``<dst_dir>/<key>.wav`` as its first argument. When
    ``args.mask`` is set, the beamformer output is additionally multiplied
    by the transposed speech mask.
    """
    stft_conf = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=False,  # F x T
    )
    reader = SpectrogramReader(args.wav_scp, **stft_conf)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    # all candidates are instantiated up front, then one is selected
    candidates = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins),
    }
    beamformer = candidates[args.beamformer]

    processed = 0
    for key, stft_mat in reader:
        if key not in mask_reader:
            continue
        processed += 1
        norm = reader.samp_norm(key)
        logger.info("Processing utterance {}(norm to {:.2f})...".format(
            key, norm))
        # prefer T x F
        speech_mask = mask_reader[key]
        if args.trans:
            speech_mask = np.transpose(speech_mask)
        # stft_enh, stft_mat: F x T
        stft_enh = beamformer.run(speech_mask,
                                  stft_mat,
                                  normalize=args.postf)
        # masking beamformer output if necessary
        if args.mask:
            stft_enh = stft_enh * np.transpose(speech_mask)
        dst_path = os.path.join(args.dst_dir, '{}.wav'.format(key))
        istft(dst_path,
              stft_enh,
              norm=norm,
              fs=args.samp_freq,
              **stft_conf)
    logger.info("Processed {:d} utterances out of {:d}".format(
        processed, len(reader)))
示例#19
0
def run(args):
    """Enhance multi-channel utterances with facted_wpd.

    Every spectrogram must be 3-D. Utterances on which ``facted_wpd``
    raises ``np.linalg.LinAlgError`` are skipped with a warning. When
    ``args.dump_mask`` is set, ``tf_mask[..., 0]`` is saved next to the
    waves as ``<dst_dir>/<key>.npy``.

    Raises:
        RuntimeError: if a spectrogram is not 3-D.
    """
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=True,  # T x F
    )
    wpd_conf = dict(
        wpd_iters=args.wpd_iters,
        cgmm_iters=args.cgmm_iters,
        update_alpha=args.update_alpha,
        context=args.context,
        taps=args.taps,
        delay=args.delay,
    )
    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_conf)

    finished = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, obs in reader:
            logger.info(f"Processing utt {key}...")
            if obs.ndim != 3:
                raise RuntimeError(f"Expected 3D array, but got {obs.ndim}")
            try:
                # N x T x F => T x F
                tf_mask, wpd_enh = facted_wpd(obs, **wpd_conf)
            except np.linalg.LinAlgError:
                logger.warn(f"{key}: Failed cause LinAlgError in wpd")
                continue
            norm = reader.maxabs(key)
            # dump multi-channel
            writer.write(key, inverse_stft(wpd_enh, norm=norm, **stft_conf))
            if args.dump_mask:
                np.save(f"{args.dst_dir}/{key}", tf_mask[..., 0])
            # show progress cause slow speed
            finished += 1
            if finished % 100 == 0:
                logger.info(f"Processed {finished:d} utterances...")
    logger.info(f"Processed {finished:d} utterances over {len(reader):d}")
示例#20
0
def run(args):
    """Compute directional features for every utterance and archive them.

    The steer vector is either selected per utterance through
    ``args.utt2idx`` or, without it, through the comma-separated indices in
    ``args.doa_idx`` (shared by all utterances). With several shared
    indices the per-index features are stacked and flattened into a single
    2-D matrix per utterance. Utterances missing from ``utt2idx`` are
    skipped with a warning.

    Raises:
        RuntimeError: if no microphone pair could be parsed from
            ``args.df_pair``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    stft_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.utt2idx:
        utt2idx = ScpReader(args.utt2idx, value_processor=int)
        doa_idx = None
        logger.info(f"Using --utt2idx={args.utt2idx}")
    else:
        utt2idx = None
        # the shared indices never change, so parse them once up front
        doa_idx = [int(v) for v in args.doa_idx.split(",")]
        logger.info(f"Using --doa-idx={args.doa_idx}")

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # report the option that was actually parsed (args.df_pair); the
        # message used to read a non-existent ``args.pair`` attribute
        raise RuntimeError(
            f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    # A x M x F
    steer_vector = np.load(args.steer_vector)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, stft in stft_reader:
            # sv: M x F
            if utt2idx is None:
                dfs = [
                    directional_feats(stft, steer_vector[i], df_pair=df_pair)
                    for i in doa_idx
                ]
                if len(dfs) == 1:
                    df = dfs[0]
                else:
                    # N x T x F
                    dfs = np.stack(dfs)
                    # => T x (N * F)
                    df = dfs.transpose(1, 0, 2).reshape(dfs.shape[1], -1)
            elif key in utt2idx:
                # stft: M x F x T
                df = directional_feats(stft,
                                       steer_vector[utt2idx[key]],
                                       df_pair=df_pair)
            else:
                logger.warn(f"Missing utt2idx for utterance {key}")
                continue
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(stft_reader):d}")
示例#21
0
def run(args):
    """Train a CgmmTrainer per utterance and write the resulting masks.

    Utterances whose ``<key>.npy`` already exists in ``args.dst_dir`` are
    skipped. An optional external mask (numpy or kaldi format) is
    transposed on its last two axes and passed to the trainer as ``gamma``.
    Trained masks are transposed to K x T x F, optionally run through
    ``permu_aligner``, and reduced to the first class when
    ``args.num_classes`` is 2. A RuntimeError inside that stage is logged
    and the utterance counts as failed.
    """
    stft_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,
        transpose=False,
    )
    np.random.seed(args.seed)
    reader = SpectrogramReader(args.wav_scp, **stft_conf)
    reader_by_fmt = {"numpy": NumpyReader, "kaldi": ScriptReader}
    if args.init_mask:
        init_masks = reader_by_fmt[args.fmt](args.init_mask)
    else:
        init_masks = None

    trained = 0
    with NumpyWriter(args.dst_dir) as writer:
        out_dir = Path(args.dst_dir)
        for key, stft in reader:
            if (out_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            init_mask = None
            if init_masks and key in init_masks:
                init_mask = init_masks[key]
                # T x F => F x T (a leading axis, if any, stays in place)
                axes = None if init_mask.ndim == 2 else (0, 2, 1)
                init_mask = np.transpose(init_mask, axes)
                logger.info("Using external TF-mask to initialize cgmm")
            # stft: N x F x T
            trainer = CgmmTrainer(stft,
                                  args.num_classes,
                                  gamma=init_mask,
                                  update_alpha=args.update_alpha)
            try:
                # K x F x T => K x T x F
                masks = np.transpose(trainer.train(args.num_iters), (0, 2, 1))
                trained += 1
                if args.solve_permu:
                    masks = permu_aligner(masks)
                    logger.info("Permutation alignment done on each frequency")
                if args.num_classes == 2:
                    masks = masks[0]
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except RuntimeError:
                logger.warn(f"Training utterance {key} ... Failed")
    logger.info(f"Train {trained:d} utterances over {len(reader):d}")
示例#22
0
def run(args):
    """Dereverberate every utterance of ``args.wav_scp`` with WPE.

    Each utterance is read as an STFT, passed through either the local
    ``wpe`` routine or ``nara_wpe``'s ``wpe_v8`` (when ``args.nara_wpe`` is
    set), converted back to the time domain channel by channel and written
    through a ``WaveWriter`` rooted at ``args.dst_dir``.

    Args:
        args: parsed command line options. Reads ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``round_power_of_two``,
            ``num_iters``, ``context``, ``taps``, ``delay``, ``wav_scp``,
            ``dst_dir``, ``sr`` and ``nara_wpe``.
    """
    # transpose=True: the original notes this yields T x F spectrograms
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=True,
    )
    wpe_kwargs = dict(
        num_iters=args.num_iters,
        context=args.context,
        taps=args.taps,
        delay=args.delay,
    )
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.sr) as writer:
        for key, observed in spectrogram_reader:
            logger.info(f"Processing utt {key}...")
            # a 2-D input gets a leading channel axis so that the
            # three-axis transpose below always applies
            if observed.ndim == 2:
                observed = observed[None, ...]
            # N x T x F => F x N x T
            wpe_input = np.transpose(observed, (2, 0, 1))
            try:
                if args.nara_wpe:
                    # imported lazily: only needed when this backend is chosen
                    from nara_wpe.wpe import wpe_v8
                    enhanced = wpe_v8(wpe_input,
                                      taps=args.taps,
                                      delay=args.delay,
                                      iterations=args.num_iters,
                                      psd_context=args.context)
                else:
                    enhanced = wpe(wpe_input, **wpe_kwargs)
            except np.linalg.LinAlgError:
                logger.warn(f"{key}: Failed cause LinAlgError in wpe")
                continue
            # F x N x T => N x T x F, then one inverse STFT per channel
            per_channel = np.transpose(enhanced, (1, 2, 0))
            waveforms = [
                inverse_stft(channel, **stft_kwargs) for channel in per_channel
            ]
            writer.write(key, np.stack(waveforms))
            # WPE is slow, so report progress every 100 utterances
            num_done += 1
            if num_done % 100 == 0:
                logger.info(f"Processed {num_done:d} utterances...")
    logger.info(
        f"Processed {num_done:d} utterances over {len(spectrogram_reader):d}")
示例#23
0
def run(args):
    """Estimate TF-masks for each utterance with a ``CacgmmTrainer``.

    Utterances whose ``<key>.npy`` already exists under ``args.dst_dir`` are
    skipped. An optional external mask (``args.init_mask``, read in the
    format named by ``args.fmt``) is passed to the trainer as ``gamma``.
    Masks are permutation-aligned unless ``args.cgmm_init`` is set together
    with ``args.num_classes == 2``, then written as float32.

    Args:
        args: parsed command line options. Reads ``frame_len``,
            ``frame_hop``, ``round_power_of_two``, ``window``, ``center``,
            ``wav_scp``, ``fmt``, ``init_mask``, ``dst_dir``,
            ``num_classes``, ``cgmm_init`` and ``num_epoches``.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    mask_reader_cls = {"numpy": NumpyReader, "kaldi": ScriptReader}
    init_mask_reader = None
    if args.init_mask:
        init_mask_reader = mask_reader_cls[args.fmt](args.init_mask)

    # FFT size handed to the permutation aligner
    if args.round_power_of_two:
        n_fft = nfft(args.frame_len)
    else:
        n_fft = args.frame_len
    # permutation solver is loaded from pb_bss
    pb_perm_solver = load_module(pb_bss_align_url)
    aligner = pb_perm_solver.DHTVPermutationAlignment.from_stft_size(n_fft)

    num_done = 0
    with NumpyWriter(args.dst_dir) as writer:
        dst_dir = Path(args.dst_dir)
        for key, stft in spectrogram_reader:
            # do not retrain utterances that already have a dumped mask
            if (dst_dir / f"{key}.npy").exists():
                logger.info(f"Training utterance {key} ... Skip")
                continue
            # optional initial mask, K x F x T according to the original note
            init_mask = None
            if init_mask_reader and key in init_mask_reader:
                init_mask = init_mask_reader[key]
                logger.info("Using external mask to initialize cacgmm")
            # stft: N x F x T
            trainer = CacgmmTrainer(stft,
                                    args.num_classes,
                                    gamma=init_mask,
                                    cgmm_init=args.cgmm_init)
            try:
                # EM iterations
                masks = trainer.train(args.num_epoches)
                # alignment is skipped only for cgmm-initialised 2-class runs
                if not (args.cgmm_init and args.num_classes == 2):
                    masks = aligner(masks)
                    logger.info("Permutation align done for each frequency")
                num_done += 1
                writer.write(key, masks.astype(np.float32))
                logger.info(f"Training utterance {key} ... Done")
            except np.linalg.LinAlgError:
                logger.warn(f"Training utterance {key} ... Failed")
    logger.info(
        f"Train {num_done:d} utterances over {len(spectrogram_reader):d}")
示例#24
0
def run(args):
    """Separate every utterance of ``args.wav_scp`` with ``auxiva``.

    Each separated source is converted back to a waveform, scaled with the
    value returned by ``spectrogram_reader.maxabs(key)`` and saved as
    ``<key>.src<i>.wav`` (1-based ``i``) under ``args.dst_dir``.

    Args:
        args: parsed command line options. Reads ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``round_power_of_two``,
            ``wav_scp``, ``epochs``, ``dst_dir`` and ``fs``.
    """
    # NOTE(review): the original comment says transpose=True gives F x T
    # here, while sibling scripts describe it as T x F -- confirm.
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=True,
    )

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )
    for key, spectrogram in spectrogram_reader:
        logger.info(f"Processing utterance {key}...")
        separated = auxiva(spectrogram, args.epochs)
        norm = spectrogram_reader.maxabs(key)
        # one output file per separated source (leading axis)
        for idx, source in enumerate(separated):
            waveform = inverse_stft(source, **stft_kwargs, norm=norm)
            write_wav(Path(args.dst_dir) / f"{key}.src{idx + 1}.wav",
                      waveform,
                      fs=args.fs)
    logger.info(f"Processed {len(spectrogram_reader)} utterances")
示例#25
0
def run(args):
    """Separate mixtures with masks computed from reference spectrograms.

    For each mixture utterance that is present in every reference reader,
    ``compute_mask`` produces one mask per speaker. The masked mixture is
    inverted with ``istft`` and written to
    ``<args.dump_dir>/spk<i>/<key>.wav`` (1-based ``i``).

    Args:
        args: parsed command line options. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``mask``, ``mix_scp``,
            ``round_power_of_two``, ``ref_scp`` (comma separated),
            ``keep_length``, ``dump_dir`` and ``fs``.
    """
    # complex STFT configuration shared by readers and the inverse transform
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
    )
    logger.info("Using mask: {}".format(args.mask.upper()))
    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )
    ref_scp_list = args.ref_scp.split(",")
    logger.info("Number of speakers: {:d}".format(len(ref_scp_list)))
    # NOTE(review): the reference readers do not receive round_power_of_two,
    # unlike the mixture reader -- confirm the default matches.
    targets_reader = []
    for scp in ref_scp_list:
        targets_reader.append(SpectrogramReader(scp, **stft_kwargs))

    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        nsamps = None
        if args.keep_length:
            nsamps = mixture_reader.nsamps(key)
        # every speaker must provide this utterance
        if any(key not in reader for reader in targets_reader):
            logger.info("Skip utterance {}, missing targets".format(key))
            continue
        num_utts += 1
        references = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, references, args.mask)
        for spk_id, mask in enumerate(spk_masks, start=1):
            waveform = istft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            dst_path = os.path.join(args.dump_dir,
                                    "spk{:d}/{}.wav".format(spk_id, key))
            write_wav(dst_path, waveform, fs=args.fs)
    logger.info("Processed {} utterance!".format(num_utts))
示例#26
0
def run(args):
    """Separate mixtures with masks computed from reference spectrograms.

    For each mixture utterance that is present in every reference reader,
    ``compute_mask`` produces one mask per speaker. The masked mixture is
    inverted with ``inverse_stft`` and written to
    ``<args.dump_dir>/spk<i>/<key>.wav`` (1-based ``i``).

    Args:
        args: parsed command line options. Reads ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``mask``, ``mix_scp``,
            ``round_power_of_two``, ``ref_scp`` (comma separated),
            ``keep_length``, ``dump_dir`` and ``sr``.
    """
    # complex STFT configuration shared by readers and the inverse transform
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )
    logger.info(f"Using mask: {args.mask.upper()}")
    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )
    ref_scp_list = args.ref_scp.split(",")
    logger.info(f"Number of speakers: {len(ref_scp_list)}")
    # NOTE(review): the reference readers do not receive round_power_of_two,
    # unlike the mixture reader -- confirm the default matches.
    targets_reader = []
    for scp in ref_scp_list:
        targets_reader.append(SpectrogramReader(scp, **stft_kwargs))

    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        nsamps = None
        if args.keep_length:
            nsamps = mixture_reader.nsamps(key)
        # every speaker must provide this utterance
        if any(key not in reader for reader in targets_reader):
            logger.info(f"Skip utterance {key}, missing targets")
            continue
        num_utts += 1
        references = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, references, args.mask)
        for index, mask in enumerate(spk_masks):
            waveform = inverse_stft(mixture * mask,
                                    **stft_kwargs,
                                    nsamps=nsamps)
            dst_path = os.path.join(args.dump_dir, f"spk{index + 1}/{key}.wav")
            write_wav(dst_path, waveform, sr=args.sr)
    logger.info(f"Processed {num_utts} utterance")
示例#27
0
def run(args):
    """Turn spectral features back into waveforms.

    Features are read from ``args.feat_scp`` in the format named by
    ``args.fmt``. Log and power compression are undone when
    ``args.apply_log`` / ``args.apply_pow`` are set. Without a phase
    reference the waveform comes from ``griffin_lim``; with
    ``args.phase_ref`` the phase of the reference STFT (first channel if it
    is 3-D) is combined with the features and inverted by ``inverse_stft``.

    Args:
        args: parsed command line options. Reads ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``fmt``, ``feat_scp``,
            ``phase_ref``, ``round_power_of_two``, ``dump_dir``, ``sr``,
            ``normalize``, ``apply_log``, ``apply_pow`` and ``epoches``.

    Raises:
        KeyError: if a phase reference is in use and lacks an utterance key.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
    )

    reader_cls = {"numpy": NumpyReader, "kaldi": ScriptReader}
    feature_reader = reader_cls[args.fmt](args.feat_scp)

    phase_reader = None
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info(f"Using phase reference from {args.phase_ref}")

    with WaveWriter(args.dump_dir, fs=args.sr,
                    normalize=args.normalize) as writer:
        for key, spec in feature_reader:
            logger.info(f"Processing utterance {key}...")
            # undo log compression
            if args.apply_log:
                spec = np.exp(spec)
            # power spectrum => magnitude spectrum
            if args.apply_pow:
                spec = np.sqrt(spec)
            if phase_reader is not None:
                if key not in phase_reader:
                    raise KeyError(f"Missing key {key} in phase reader")
                ref = phase_reader[key]
                # a 3-D reference contributes its first entry only
                if ref.ndim == 3:
                    ref = ref[0]
                angle = np.angle(ref)
                phase = np.exp(angle * 1j)
                samps = inverse_stft(spec * phase, **stft_kwargs, norm=0.8)
            else:
                # no reference phase available: iterate Griffin-Lim
                samps = griffin_lim(spec,
                                    epoches=args.epoches,
                                    transpose=True,
                                    norm=0.8,
                                    **stft_kwargs)
            writer.write(key, samps)
    logger.info(f"Processed {len(feature_reader)} utterance done")
示例#28
0
def run(args):
    """Compute GCC-PHAT features averaged over diagonal microphone pairs.

    ``args.diag_pair`` is parsed as ``"i,j;k,l;..."``. For every utterance
    in ``args.wav_scp`` the ``gcc_phat_diag`` result of each pair is
    averaged and written to the archive ``args.srp_ark`` (index
    ``args.scp``).

    Args:
        args: parsed command line options. Reads ``diag_pair``,
            ``frame_len``, ``frame_hop``, ``round_power_of_two``,
            ``window``, ``center``, ``wav_scp``, ``srp_ark``, ``scp``,
            ``n``, ``d``, ``sr`` and ``num_doa``.

    Raises:
        RuntimeError: if ``args.diag_pair`` contains no pair, or if the
            averaged feature matrix of an utterance contains NaN.
        ValueError: if a non-blank pair segment is not made of integers.
    """
    # Blank segments (empty option, trailing ";") are skipped so that an
    # empty configuration reaches the explicit check below instead of
    # failing inside int("").
    srp_pair = [
        tuple(map(int, p.split(","))) for p in args.diag_pair.split(";")
        if p.strip()
    ]
    if not srp_pair:
        # the option is stored as args.diag_pair (args.pair does not exist
        # in the code above)
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    # number of frequency bins of the one-sided spectrum
    num_bins = num_ffts // 2 + 1
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            # stft_mat: N x T x F
            # NOTE(review): the third argument looks like the angular
            # position of the lower-indexed microphone on a ring of
            # args.n elements, and args.d like its size -- confirm
            # against gcc_phat_diag.
            srp = [
                gcc_phat_diag(stft_mat[i],
                              stft_mat[j],
                              min(i, j) * np.pi * 2 / args.n,
                              args.d,
                              num_bins=num_bins,
                              sr=args.sr,
                              num_doa=args.num_doa) for (i, j) in srp_pair
            ]
            srp = sum(srp) / len(srp_pair)
            num_nan = int(np.sum(np.isnan(srp)))
            if num_nan:
                # the original format string had an unbalanced "}" and
                # raised ValueError instead of this RuntimeError
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, num_nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processd {:d} utterances done".format(len(reader)))
def run(args):
    """Dump spectrogram features of ``args.wav_scp`` into an archive.

    Features are read through a ``SpectrogramReader`` configured with
    ``apply_abs=True`` and ``transpose=True`` and written to
    ``args.dup_ark`` (index ``args.scp``). When a feature array is 3-D only
    its first entry is written.

    Args:
        args: parsed command line options. Reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``apply_log``,
            ``apply_pow``, ``normalize``, ``wav_scp``, ``dup_ark`` and
            ``scp``.
    """
    # transpose=True: T x F per the original note
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,
        apply_log=args.apply_log,
        apply_pow=args.apply_pow,
        normalize=args.normalize,
        apply_abs=True,
        transpose=True,
    )
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, feats in reader:
            # multi-channel input: keep the first channel only
            if feats.ndim == 3:
                feats = feats[0]
            writer.write(key, feats)
    logger.info("Process {:d} utterances".format(len(reader)))
示例#30
0
def run(args):
    """Dereverberate every utterance of ``args.wav_scp`` with WPE.

    Each utterance is read as an STFT, processed by ``wpe``, converted back
    to the time domain channel by channel with ``istft`` and written through
    a ``WaveWriter`` rooted at ``args.dst_dir``. Utterances for which ``wpe``
    raises ``numpy.linalg.LinAlgError`` are logged and skipped.

    Args:
        args: parsed command line options. Reads ``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``round_power_of_two``,
            ``num_iters``, ``context``, ``taps``, ``delay``, ``wav_scp``,
            ``dst_dir`` and ``samp_fs``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "num_iters": args.num_iters,
        "context": args.context,
        "taps": args.taps,
        "delay": args.delay
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_fs) as writer:
        for key, reverbed in spectrogram_reader:
            logger.info("Processing utt {}...".format(key))
            # A 2-D (single-channel) spectrogram would make the three-axis
            # transpose below fail; promote it to 1 x T x F first.
            if reverbed.ndim == 2:
                reverbed = reverbed[None, ...]
            # N x T x F => F x N x T
            reverbed = np.transpose(reverbed, (2, 0, 1))
            try:
                # F x N x T
                dereverb = wpe(reverbed, **wpe_kwargs)
            except np.linalg.LinAlgError:
                logger.warn("{}: Failed cause LinAlgError in wpe".format(key))
                continue
            # F x N x T => N x T x F
            dereverb = np.transpose(dereverb, (1, 2, 0))
            # dump multi-channel: one inverse STFT per channel
            samps = np.stack(
                [istft(spectra, **stft_kwargs) for spectra in dereverb])
            writer.write(key, samps)
            # show progress cause slow speed
            num_done += 1
            if not num_done % 100:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(spectrogram_reader)))