예제 #1
0
def extract_spec(filename,
                 samp_freq,
                 frame_length_ms=25,
                 frame_shift_ms=10,
                 round_to_power_of_two=True,
                 snip_edges=True):
    '''
    Extract a spectrogram from a wav file using kaldi.

    Side effects: writes a one-line script file 'wav.scp' and a text
    archive 'spec.ark' in the current working directory, and calls
    po.parse_args() with no arguments (presumably this reads the process
    command line -- confirm against the ParseOptions wrapper).

    args:
        filename: wav file path
        samp_freq: target sample frequency; the sample frequency of the
            wav file must be an integer multiple of it
        frame_length_ms: copied to frame_opts.frame_length_ms
        frame_shift_ms: copied to frame_opts.frame_shift_ms
        round_to_power_of_two: copied to frame_opts.round_to_power_of_two
        snip_edges: copied to frame_opts.snip_edges
    return:
        spectrogram: (frame, freq) numpy array of the last processed
            utterance, or None when no utterance was processed (for
            example when the only utterance is shorter than the
            "min-duration" option).
    '''
    # get rspec and wspec
    with open('wav.scp', 'w') as scp_file:
        scp_file.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'
    # set po
    usage = """Extract MFCC features.Usage: example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()
    # set options
    spec_opts = SpectrogramOptions()
    spec_opts.frame_opts.samp_freq = samp_freq
    spec_opts.frame_opts.frame_length_ms = frame_length_ms
    spec_opts.frame_opts.frame_shift_ms = frame_shift_ms
    spec_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    spec_opts.frame_opts.snip_edges = snip_edges
    # NOTE(review): the options are registered after parse_args() has
    # already run; kept as in the original -- confirm this is intended.
    spec_opts.register(po)
    spec = Spectrogram(spec_opts)
    sf = spec_opts.frame_opts.samp_freq
    # Stays None if the loop below never computes features, so the final
    # return cannot hit an unbound local.
    f_array = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert wav.samp_freq >= sf, \
                "wav sample frequency is lower than the requested one"
            assert wav.samp_freq % sf == 0, \
                "wav sample frequency is not a multiple of the requested one"
            samples = wav.data()
            # Downsample by keeping every k-th column, k = wav rate / target.
            samples = samples[:, ::int(wav.samp_freq / sf)]
            # Average over axis 0 (presumably the channel axis -- confirm).
            averaged = SubVector(mean(samples, axis=0))
            # Third argument 1.0 is passed through unchanged from the
            # original (presumably the VTLN warp factor -- confirm).
            feats = spec.compute_features(averaged, sf, 1.0)
            f_array = np.array(feats)
            writer[key] = feats
    return f_array
예제 #2
0
    return num_success != 0


if __name__ == "__main__":

    # Help text handed to the option parser.
    usage = """Compute VAD.

    Usage:  compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
    """

    po = ParseOptions(usage)

    # Each call below registers one command-line option as
    # (name, default value, help text).
    po.register_float(
        "min-duration",
        0.0,
        "Minimum duration of segments to process in seconds (default: 0.0).",
    )
    po.register_int(
        "channel",
        -1,
        "Channel to extract (-1 -> mono (default), 0 -> left, 1 -> right)",
    )
    po.register_int("frame-window", 25,
                    "Length of frame window in ms (default: 25)")
    po.register_int("frame-shift", 10,
                    "Length of frame shift in ms (default: 10)")
    # The help text previously advertised 256 although the registered
    # default is 512; the text now matches the value actually used.
    po.register_int("nfft", 512, "Number of DFT points (default: 512)")
    po.register_int(
        "arma-order",
        5,
예제 #3
0
          file=sys.stderr)

    return num_success != 0


if __name__ == "__main__":

    # Help text handed to the option parser.
    usage = """Compute VAD.

  Usage:  compute-vad [options...] <wav-rspecifier> <feats-wspecifier>
  """

    po = ParseOptions(usage)

    # Each call below registers one command-line option as
    # (name, default value, help text). Adjacent string literals are
    # concatenated into a single help message.
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    # Framing options, both expressed in milliseconds per their help text.
    po.register_int("frame-window", 25, "Length of frame window in ms "
                    "default is 25ms")
    po.register_int("frame-shift", 10, "Length of frame shift in ms "
                    "default is 10ms")
    po.register_int("nfft", 256, "Number of DFT points " "default is 256")
    po.register_int(
        "arma-order", 5, "Length of ARMA window that will be applied "
        "to the spectrogram")
    po.register_int("ltsv-ctx-window", 50,
                    "Context window for LTSV computation "
                    "default is 50")
예제 #4
0
    # Log format that embeds the package version, module, function, file
    # and line number ahead of each message; only INFO and above is shown.
    logging.basicConfig(format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
                               "%(filename)s:%(lineno)s) %(message)s"
                               .format(__version__), level=logging.INFO)

    # Help text handed to the option parser.
    usage = """Decode features using GMM-based model.

    Usage:  gmm-decode-faster.py [options] model-in fst-in features-rspecifier
                words-wspecifier [alignments-wspecifier [lattice-wspecifier]]

    Note: lattices, if output, will just be linear sequences;
          use gmm-latgen-faster if you want "real" lattices.
    """
    po = ParseOptions(usage)
    # Let the decoder options object add its own command-line options.
    # NOTE(review): meaning of the second argument (True) is not visible
    # here -- confirm against FasterDecoderOptions.register.
    decoder_opts = FasterDecoderOptions()
    decoder_opts.register(po, True)
    # Script-level options: (name, default value, help text).
    po.register_float("acoustic-scale", 0.1,
                      "Scaling factor for acoustic likelihoods")
    po.register_bool("allow-partial", True,
                     "Produce output even when final state was not reached")
    po.register_str("word-symbol-table", "",
                    "Symbol table for words [for debug output]");
    opts = po.parse_args()

    # Between 4 and 6 positional arguments are accepted; anything else
    # prints the usage text and exits.
    # NOTE(review): sys.exit() with no argument exits with status 0 even
    # though this is a usage error -- confirm this is intended.
    if po.num_args() < 4 or po.num_args() > 6:
        po.print_usage()
        sys.exit()

    # Positional arguments are fetched with 1-based indices.
    model_rxfilename = po.get_arg(1)
    fst_rxfilename = po.get_arg(2)
    feature_rspecifier = po.get_arg(3)
    words_wspecifier = po.get_arg(4)
    # Fifth argument is read via get_opt_arg (presumably tolerant of a
    # missing argument -- confirm).
    alignment_wspecifier = po.get_opt_arg(5)
예제 #5
0
if __name__ == '__main__':
    # Help text handed to the option parser.
    usage = """Create MFCC feature files.

    Usage:  compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier>
    """
    po = ParseOptions(usage)

    # Let the MFCC options object add its own command-line options.
    mfcc_opts = MfccOptions()
    mfcc_opts.register(po)

    # Script-level options: (name, default value, help text). Adjacent
    # string literals are concatenated, so every fragment that is followed
    # by another one must end with a space.
    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature "
        "file [CMS]; not recommended to do it this way.")
    po.register_float(
        "vtln-warp", 1.0, "Vtln warp factor (only applicable "
        "if vtln-map not specified)")
    po.register_str(
        "vtln-map", "", "Map from utterance or speaker-id to "
        "vtln warp factor (rspecifier)")
    po.register_str(
        "utt2spk", "", "Utterance to speaker-id map rspecifier "
        "(if doing VTLN and you have warps per speaker)")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")

    opts = po.parse_args()
예제 #6
0
if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages: level 20 (the
    # numeric value of logging.INFO) is displayed as 'LOG', and the format
    # embeds the package version, module, function, file and line number.
    from kaldi import __version__
    logging.addLevelName(20, 'LOG')
    logging.basicConfig(
        format='%(levelname)s (%(module)s[{}]:%(funcName)s():'
        '%(filename)s:%(lineno)s) %(message)s'.format(__version__),
        level=logging.INFO)

    # Help text handed to the option parser.
    usage = """Extract segments from a large audio file in WAV format.
    Usage:
        extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>
    """
    po = ParseOptions(usage)
    # Options: (name, default value, help text).
    po.register_float(
        "min-segment-length", 0.1, "Minimum segment length "
        "in seconds (reject shorter segments)")
    # NOTE(review): this name uses an underscore while the option above
    # uses hyphens -- confirm the parser treats both spellings alike.
    po.register_float(
        "max_overshoot", 0.5, "End segments overshooting audio "
        "by less than this (in seconds) are truncated, "
        "else rejected.")

    opts = po.parse_args()
    # Exactly three positional arguments are required; otherwise print the
    # usage text and exit.
    # NOTE(review): sys.exit() with no argument exits with status 0 even
    # though this is a usage error -- confirm this is intended.
    if po.num_args() != 3:
        po.print_usage()
        sys.exit()

    # Positional arguments are fetched with 1-based indices.
    wav_rspecifier = po.get_arg(1)
    segments_rxfilename = po.get_arg(2)
    wav_wspecifier = po.get_arg(3)
예제 #7
0
    Usage: copy-matrix [options] <matrix-in-rspecifier> <matrix-out-wspecifier>
    or     copy-matrix [options] <matrix-in-rxfilename> <matrix-out-wxfilename>

    e.g.
        copy-matrix --binary=false 1.mat -
        copy-matrix ark:2.trans ark,t:-
    """

    # Build the option parser from the usage text defined above.
    po = ParseOptions(usage)

    # Each call below registers one command-line option as
    # (name, default value, help text).
    po.register_bool(
        "binary", True,
        "Write in binary mode (only relevant if output is a wxfilename)")
    po.register_float(
        "scale", 1.0,
        "This option can be used to scale the matrices being copied.")
    po.register_bool(
        "apply-log", False,
        "This option can be used to apply log on the matrices. Must be avoided if matrix has negative quantities."
    )
    po.register_bool("apply-exp", False,
                     "This option can be used to apply exp on the matrices")
    po.register_float(
        "apply-power", 1.0,
        "This option can be used to apply a power on the matrices")
    po.register_bool(
        "apply-softmax-per-row", False,
        "This option can be used to apply softmax per row of the matrices")

    # Parse the options registered above; the result is kept in opts.
    opts = po.parse_args()
    sw02005-A sw02005 A
    sw02005-B sw02005 B
    interpreted as <utterance-id> <call-id> <side> and for each <call-id>
    that has two sides, does the 'only-the-louder' computation, else does
    per-utterance stats in the normal way.
    Note: loudness is judged by the first feature component, either energy or c0
    only applicable to MFCCs or PLPs (this code could be modified to handle filterbanks).

    Usage: compute-cmvn-stats-two-channel [options] <reco2file-and-channel> <feats-rspecifier> <stats-wspecifier>
    e.g.: compute-cmvn-stats-two-channel data/train_unseg/reco2file_and_channel scp:data/train_unseg/feats.scp ark,t:-
    """

    # Build the option parser from the usage text defined above.
    po = ParseOptions(usage)

    # Single script-level option: (name, default value, help text).
    po.register_float(
        "quieter_channel_weight", 0.01, "For the quieter channel,"
        " apply this weight to the stats, so that we still get "
        "stats if one channel always dominates.")

    opts = po.parse_args()

    # Exactly three positional arguments are required; otherwise print the
    # usage text and exit with status 1.
    if po.num_args() != 3:
        po.print_usage()
        sys.exit(1)

    # Positional arguments are fetched with 1-based indices.
    reco2file_and_channel_rxfilename = po.get_arg(1)
    feats_rspecifier = po.get_arg(2)
    stats_wspecifier = po.get_arg(3)

    # Hand the three arguments and the parsed options to the worker.
    compute_cmvn_stats_two_channel(reco2file_and_channel_rxfilename,
                                   feats_rspecifier, stats_wspecifier, opts)
예제 #9
0
    # Each call below registers one command-line option as
    # (name, default value, help text). Adjacent string literals are
    # concatenated, so every fragment that is followed by another one must
    # end with a space.
    po.register_int("sampling-rate", 16000,
                    "Sampling rate of waveforms and labels.")
    po.register_int(
        "signal-window-length", 200,
        "Window length in ms (what will be presented to the network).")
    po.register_int("label-window-length", 25,
                    "Window length of alignments / labels in ms.")
    po.register_int("label-window-shift", 10,
                    "Window shift of alignments / labels in ms.")
    po.register_bool(
        "subtract-mean", False, "Subtract mean of each feature "
        "file [CMS]; not recommended to do it this way.")
    po.register_int(
        "channel", -1, "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_float(
        "min-duration", 0.0, "Minimum duration of segments "
        "to process (in seconds).")

    opts = po.parse_args()

    # Exactly two positional arguments are required; otherwise print the
    # usage text and exit.
    # NOTE(review): sys.exit() with no argument exits with status 0 even
    # though this is a usage error -- confirm this is intended.
    if po.num_args() != 2:
        po.print_usage()
        sys.exit()

    # Positional arguments are fetched with 1-based indices.
    wav_rspecifier = po.get_arg(1)
    feats_wspecifier = po.get_arg(2)

    # mfcc_opts is created earlier in this script, outside this fragment.
    compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts)