示例#1
0
def compute_mfcc_feats(wav: WaveData, mfcc_opts: MfccOptions) -> Matrix:
    """Compute MFCC features for a Kaldi WaveData object.

    Args:
        wav: Input waveform as a Kaldi WaveData.
        mfcc_opts: MfccOptions controlling the extraction. Options worth
            reviewing:
            - use_energy: often set to False, since raw energy carries
              little linguistic information.
            - frame_opts.allow_downsample: often set to True so input audio
              can be brought down to the model's default rate (16 kHz).
            - frame_opts.frame_shift_ms: a smaller shift (e.g. 5 ms) can be
              useful for speech-synthesis purposes.
            - frame_opts.snip_edges: often set to False for a deterministic
              way to compute the number of frames.

    Returns:
        A T*D MFCC feature matrix (one row per frame).
    """
    DEFAULT_VTLN_WARP = 1.0  # default: no vocal-tract-length normalization
    FIRST_CHANNEL = 0        # only the first channel is used

    extractor = Mfcc(mfcc_opts)
    samples = extractor.compute_features(wav.data()[FIRST_CHANNEL],
                                         wav.samp_freq, DEFAULT_VTLN_WARP)
    return samples
示例#2
0
def extract_mfcc(filename,
                 samp_freq,
                 frame_length_ms=25,
                 frame_shift_ms=10,
                 num_ceps=23,
                 round_to_power_of_two=True,
                 snip_edges=True):
    """Extract MFCC features from a wav file using Kaldi.

    Args:
        filename: Path to the input wav file.
        samp_freq: Target sample frequency; the wav may be an integer
            multiple of this rate and is decimated down to it.
        frame_length_ms: Analysis frame length in milliseconds.
        frame_shift_ms: Frame shift in milliseconds.
        num_ceps: Number of cepstral coefficients per frame.
        round_to_power_of_two: Whether the FFT size is rounded up to a
            power of two.
        snip_edges: Kaldi's snip-edges frame-count behavior.

    Returns:
        mfcc: A (frames, num_ceps) numpy array for the last processed
        utterance, or None if no utterance passed the min-duration check.

    Side effects:
        Writes 'wav.scp' in the working directory and the features to
        'spec.ark' (text ark).
    """
    # Write a one-entry scp so Kaldi's table reader can locate the wav.
    with open('wav.scp', 'w') as f:
        f.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'
    # Command-line option parsing (min-duration defaults to 0.0).
    usage = """Extract MFCC features.Usage: example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()
    # Configure feature extraction.
    mfcc_opts = MfccOptions()
    mfcc_opts.frame_opts.samp_freq = samp_freq
    # BUG FIX: these keyword arguments were previously accepted but never
    # applied to mfcc_opts, so callers could not actually change them.
    mfcc_opts.frame_opts.frame_length_ms = frame_length_ms
    mfcc_opts.frame_opts.frame_shift_ms = frame_shift_ms
    mfcc_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    mfcc_opts.frame_opts.snip_edges = snip_edges
    mfcc_opts.num_ceps = num_ceps
    mfcc_opts.register(po)
    mfcc = Mfcc(mfcc_opts)
    sf = mfcc_opts.frame_opts.samp_freq
    # BUG FIX: initialize so the return below cannot raise UnboundLocalError
    # when every utterance is skipped by the min-duration filter.
    f_array = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            # The wav rate must be an integer multiple of the target rate
            # so simple stride-based decimation below is valid.
            assert wav.samp_freq >= sf
            assert wav.samp_freq % sf == 0
            s = wav.data()
            s = s[:, ::int(wav.samp_freq / sf)]
            # Down-mix all channels by averaging before feature extraction.
            m = SubVector(mean(s, axis=0))
            f = mfcc.compute_features(m, sf, 1.0)
            f_array = np.array(f)
            print(f_array.shape)
            writer[key] = f
    return f_array
示例#3
0
def lid_module(key, audio_file, start, end):
    """Identify the language of an audio segment.

    Pipes the segment through sox (mono, 8 kHz, trimmed to the requested
    span), extracts hi-res MFCCs, applies CMVN, and scores the result with
    the module-level ``network_eval`` model.

    Args:
        key: Utterance id for the generated scp entry.
        audio_file: Path to the source wav file.
        start: Segment start time in seconds.
        end: Segment end time in seconds.

    Returns:
        The predicted language label from the module-level ``i2l`` map.
    """
    # ==================================
    #       Get data and process it.
    # ==================================
    # NOTE(review): the rspecifier is a raw shell pipeline — shell-injection
    # risk if key/audio_file come from untrusted input; confirm upstream.
    duration = float(end) - float(start)
    wav_rspec = ("scp:echo {k} 'sox -V0 -t wav {path} -c 1 -r 8000 "
                 "-t wav - trim {s} {d}|' |").format(
                     k=key, path=audio_file, s=str(start), d=duration)
    extractor = Mfcc(hires_mfcc_opts)
    wave = SequentialWaveReader(wav_rspec).value()
    feats = extractor.compute_features(wave.data()[0], wave.samp_freq, 1.0)
    normalized = feats.numpy() - CMVN
    # Shape to (1, feat_dim, frames, 1) for the network.
    batch = np.expand_dims(np.expand_dims(normalized.T, 0), -1)
    scores = network_eval.predict(batch)
    return i2l[scores.argmax()]
示例#4
0
def lid_module(key, audio_file, start, end):
    """Compute a language-ID embedding for a segment of an audio file.

    Pipes the segment through sox (mono, 16 kHz, trimmed to the requested
    span), extracts hi-res MFCCs, applies CMVN, crops/pads the features to
    a fixed 384-frame window, and feeds them to the module-level
    ``nn_LID_model_DA`` embedding network.

    NOTE(review): this function prints shapes but never returns ``emb`` —
    confirm whether a trailing ``return`` was lost.

    Args:
        key: Utterance id for the generated scp entry.
        audio_file: Path to the source wav file.
        start: Segment start time in seconds (string or number).
        end: Segment end time in seconds (string or number).
    """
    # ==================================
    #       Get data and process it.
    # ==================================
    # Piped rspecifier: sox converts to 1 channel @ 16 kHz and trims the
    # requested span before Kaldi reads the result.
    wav_spc = "scp:echo " + key + " 'sox -V0 -t wav " + audio_file + " -c 1 -r 16000 -t wav - trim " + str(
        start) + " " + str(
        float(end) - float(start)) + "|' |"
    hires_mfcc = Mfcc(hires_mfcc_opts)  # hires_mfcc_opts: module-level options
    wav = SequentialWaveReader(wav_spc).value()  # first utterance in the pipe
    hi_feat = hires_mfcc.compute_features(wav.data()[0], wav.samp_freq, 1.0)
    hi_feat = hi_feat.numpy() - CMVN  # CMVN: module-level mean stats
    X = hi_feat.T  # transpose to (feat_dim, frames)
    print(X.shape)
    # Fix the time axis at 384 frames: crop if longer, zero-pad if shorter.
    if X.shape[1] >= 384:
        X = np.expand_dims(X[:,:384], 0)
    else:
        # assumes feat_dim == 40 — TODO confirm against hires_mfcc_opts
        padded_x = torch.zeros(40, 384)
        padded_x[:,:X.shape[1]]	 = torch.from_numpy(X)
        # NOTE(review): padded_x is a torch tensor passed through numpy and
        # then torch.from_numpy again below — confirm this round-trip works
        # with the installed numpy/torch versions.
        X = np.expand_dims(padded_x, 0)
    print(X.shape)
    emb = nn_LID_model_DA.emb(torch.from_numpy(X))[0]
    print(emb.shape)
    def compute_feat_KALDI(self, wav):
        """Compute MFCC features for a waveform using Kaldi.

        Args:
            wav: Waveform samples in a form accepted by
                ``Mfcc.compute_features`` (e.g. a 1-D vector of samples).

        Returns:
            The Kaldi MFCC feature matrix for ``wav``.

        Raises:
            ValueError: If feature extraction fails; the underlying error
                is logged and chained as the cause.
        """
        try:
            po = ParseOptions("")
            mfcc_opts = MfccOptions()
            mfcc_opts.use_energy = False  # keep C0 instead of raw energy
            mfcc_opts.frame_opts.samp_freq = self.sr
            # Kaldi expects milliseconds; the configuration stores seconds.
            mfcc_opts.frame_opts.frame_length_ms = self.frame_length_s*1000
            mfcc_opts.frame_opts.frame_shift_ms = self.frame_shift_s*1000
            mfcc_opts.frame_opts.allow_downsample = False
            mfcc_opts.mel_opts.num_bins = self.num_bins
            mfcc_opts.mel_opts.low_freq = self.low_freq
            mfcc_opts.mel_opts.high_freq = self.high_freq
            mfcc_opts.num_ceps = self.num_ceps
            mfcc_opts.register(po)

            # Create MFCC object; vtln_warp 1.0 means no VTLN.
            mfccObj = Mfcc(mfcc_opts)
            mfccKaldi = mfccObj.compute_features(wav, self.sr, 1.0)
        except Exception as e:
            self.log.error(e)
            # BUG FIX: chain the original exception (`from e`) so the real
            # cause is preserved in the traceback instead of being dropped.
            raise ValueError(
                "Speaker diarization failed while extracting features!!!") from e
        else:
            return mfccKaldi
示例#6
0
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Compute MFCC features for every utterance in a Kaldi wav table.

    Mirrors Kaldi's compute-mfcc-feats binary: reads waveforms from
    ``wav_rspecifier``, computes MFCCs (with optional per-utterance VTLN
    warps and mean subtraction), and writes the feature matrices to
    ``feats_wspecifier``. Diagnostics go to stderr.

    Args:
        wav_rspecifier: Kaldi rspecifier for the input waveforms.
        feats_wspecifier: Kaldi wspecifier for the output feature matrices.
        opts: Options object with ``vtln_map``, ``utt2spk``, ``channel``,
            ``min_duration``, ``vtln_warp`` and ``subtract_mean``.
        mfcc_opts: MfccOptions for feature extraction.

    Returns:
        True if at least one utterance was processed successfully.
    """
    mfcc = Mfcc(mfcc_opts)

    if opts.vtln_map:
        vtln_map_reader = RandomAccessFloatReaderMapped(
            opts.vtln_map, opts.utt2spk)
    elif opts.utt2spk:
        print("utt2spk option is needed only if vtln-map option is specified.",
              file=sys.stderr)

    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
         MatrixWriter(feats_wspecifier) as writer:
        for num_utts, (key, wave) in enumerate(reader, 1):
            # Skip segments shorter than the configured minimum duration.
            if wave.duration < opts.min_duration:
                print("File: {} is too short ({} sec): producing no output.".
                      format(key, wave.duration),
                      file=sys.stderr)
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # BUG FIX: the message had "{}" placeholders but no .format()
                # call, so the literal braces were printed unfilled.
                print(
                    "File with id {} has {} channels but you specified "
                    "channel {}, producing no output.".format(
                        key, num_chan, opts.channel),
                    file=sys.stderr)
                continue
            channel = 0 if opts.channel == -1 else opts.channel

            # Resolve the VTLN warp: per-utterance/speaker map if given,
            # otherwise the global default.
            if opts.vtln_map:
                if key not in vtln_map_reader:
                    print("No vtln-map entry for utterance-id (or speaker-id)",
                          key,
                          file=sys.stderr)
                    continue
                vtln_warp = vtln_map_reader[key]
            else:
                vtln_warp = opts.vtln_warp

            try:
                feats = mfcc.compute_features(wave.data()[channel],
                                              wave.samp_freq, vtln_warp)
            except Exception:
                # BUG FIX: narrowed from a bare `except:`, which would also
                # swallow KeyboardInterrupt and SystemExit.
                print("Failed to compute features for utterance",
                      key,
                      file=sys.stderr)
                continue

            # Optional per-utterance cepstral mean subtraction.
            if opts.subtract_mean:
                mean = Vector(feats.num_cols)
                mean.add_row_sum_mat_(1.0, feats)
                mean.scale_(1.0 / feats.num_rows)
                for i in range(feats.num_rows):
                    feats[i].add_vec_(-1.0, mean)

            writer[key] = feats
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)

    if opts.vtln_map:
        vtln_map_reader.close()

    return num_success != 0