def compute_mfcc_feats(wav: WaveData, mfcc_opts: MfccOptions) -> Matrix:
    """Compute MFCC features given a Kaldi WaveData.

    Args:
        wav: A WaveData object.
        mfcc_opts: An MfccOptions object containing feature extraction
            options. A few notable options are,
            - use_energy: Generally I will use False, since the energy does
              not contain much linguistic information
            - frame_opts.allow_downsample: Generally I will set this to True,
              since the AM I use can only handle the default sampling
              frequency (16KHz)
            - frame_opts.frame_shift_ms: For speech synthesis purposes, might
              be good to have a smaller shift (e.g., 5ms)
            - frame_opts.snip_edges: Generally I will set this to False, just
              to have a deterministic way to compute the number of frames

    Returns:
        feats: A T*D MFCC feature matrix.
    """
    # Kaldi's defaults: no vocal-tract-length-normalization warp, and only
    # the first channel of a (possibly multi-channel) recording.
    default_vtln_warp = 1.0
    first_channel = 0

    extractor = Mfcc(mfcc_opts)
    return extractor.compute_features(
        wav.data()[first_channel], wav.samp_freq, default_vtln_warp)
def extract_mfcc(filename, samp_freq, frame_length_ms=25, frame_shift_ms=10,
                 num_ceps=23, round_to_power_of_two=True, snip_edges=True):
    '''
    extract mfcc using kaldi

    args:
        filename: wav file path
        samp_freq: sample frequence
        frame_length_ms: frame length in milliseconds
        frame_shift_ms: frame shift in milliseconds
        num_ceps: number of cepstral coefficients
        round_to_power_of_two: round FFT window size up to a power of two
        snip_edges: Kaldi snip-edges frame policy

    return:
        mfcc: (frame, fre) numpy array for the last utterance processed,
              or None when the scp produced no usable utterance
    '''
    # get rspec and wspec
    with open('wav.scp', 'w') as f:
        f.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'

    # set po
    usage = """Extract MFCC features.Usage:  example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")

    # set options
    # BUG FIX: the frame_* / snip_edges / round_to_power_of_two parameters
    # were previously accepted but never applied to mfcc_opts.
    mfcc_opts = MfccOptions()
    mfcc_opts.frame_opts.samp_freq = samp_freq
    mfcc_opts.frame_opts.frame_length_ms = frame_length_ms
    mfcc_opts.frame_opts.frame_shift_ms = frame_shift_ms
    mfcc_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    mfcc_opts.frame_opts.snip_edges = snip_edges
    mfcc_opts.num_ceps = num_ceps
    # BUG FIX: register options BEFORE parsing; registering after
    # po.parse_args() had no effect on the parsed values.
    mfcc_opts.register(po)
    opts = po.parse_args()

    mfcc = Mfcc(mfcc_opts)
    sf = mfcc_opts.frame_opts.samp_freq

    # BUG FIX: initialise so an empty reader no longer raises NameError
    # at the return statement.
    f_array = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert (wav.samp_freq >= sf)
            assert (wav.samp_freq % sf == 0)
            s = wav.data()
            # naive downsampling by striding; assumes integer rate ratio
            s = s[:, ::int(wav.samp_freq / sf)]
            # mix channels down to mono by averaging
            m = SubVector(mean(s, axis=0))
            f = mfcc.compute_features(m, sf, 1.0)
            f_array = np.array(f)
            writer[key] = f
    return f_array
def lid_module(key, audio_file, start, end):
    """Predict the language label for one audio segment.

    Builds a Kaldi pipe rspecifier that trims [start, end) out of
    audio_file via sox (mono, 8 kHz), extracts hi-res MFCCs, applies
    mean normalization, and scores the segment with the global network.
    """
    # ==================================
    # Get data and process it.
    # ==================================
    segment_len = float(end) - float(start)
    wav_spc = ("scp:echo " + key + " 'sox -V0 -t wav " + audio_file
               + " -c 1 -r 8000 -t wav - trim " + str(start) + " "
               + str(segment_len) + "|' |")

    extractor = Mfcc(hires_mfcc_opts)
    wav = SequentialWaveReader(wav_spc).value()
    feats = extractor.compute_features(wav.data()[0], wav.samp_freq, 1.0)

    # Mean-normalize with the precomputed global CMVN stats.
    normalized = feats.numpy() - CMVN

    # Network expects (1, feat_dim, frames, 1).
    batch = normalized.T[np.newaxis, :, :, np.newaxis]
    scores = network_eval.predict(batch)
    return i2l[scores.argmax()]
def lid_module(key, audio_file, start, end):
    """Compute a language-ID embedding for one audio segment.

    Trims [start, end) from audio_file with sox (mono, 16 kHz) via a Kaldi
    pipe rspecifier, extracts hi-res MFCCs, mean-normalizes them, then
    crops/pads the feature matrix to a fixed 384-frame window and feeds it
    to the global nn_LID_model_DA embedding network.

    NOTE(review): this function has no return statement — it only prints
    emb.shape. Possibly a truncated/work-in-progress version; confirm.
    """
    # ==================================
    # Get data and process it.
    # ==================================
    wav_spc = "scp:echo " + key + " 'sox -V0 -t wav " + audio_file + " -c 1 -r 16000 -t wav - trim " + str(
        start) + " " + str(
        float(end) - float(start)) + "|' |"
    hires_mfcc = Mfcc(hires_mfcc_opts)
    wav = SequentialWaveReader(wav_spc).value()
    # Features for the first (only, after sox -c 1) channel.
    hi_feat = hires_mfcc.compute_features(wav.data()[0], wav.samp_freq, 1.0)
    # Mean-normalize with precomputed global CMVN stats.
    hi_feat = hi_feat.numpy() - CMVN
    # Transpose to (feat_dim, frames); the torch.zeros(40, 384) below
    # implies feat_dim == 40 — TODO confirm against hires_mfcc_opts.
    X = hi_feat.T
    print(X.shape)
    if X.shape[1] >= 384:
        # Crop to the first 384 frames and add a batch dimension.
        X = np.expand_dims(X[:, :384], 0)
    else:
        # Zero-pad short segments up to 384 frames.
        # NOTE(review): padded_x is a torch tensor fed to np.expand_dims;
        # the result is converted back with torch.from_numpy below —
        # mixed numpy/torch round-trip; confirm dtypes match the model.
        padded_x = torch.zeros(40, 384)
        padded_x[:, :X.shape[1]] = torch.from_numpy(X)
        X = np.expand_dims(padded_x, 0)
    print(X.shape)
    # First element of the network's emb(...) output is the embedding.
    emb = nn_LID_model_DA.emb(torch.from_numpy(X))[0]
    print(emb.shape)
def compute_feat_KALDI(self, wav):
    """Compute MFCC features for a waveform using Kaldi.

    Args:
        wav: Waveform samples accepted by Mfcc.compute_features
             (e.g. a 1-D sample vector).

    Returns:
        mfccKaldi: T x num_ceps MFCC feature matrix.

    Raises:
        ValueError: if any step of the Kaldi feature extraction fails;
            the original exception is chained and logged.
    """
    try:
        po = ParseOptions("")
        mfcc_opts = MfccOptions()
        mfcc_opts.use_energy = False
        mfcc_opts.frame_opts.samp_freq = self.sr
        # Config stores seconds; Kaldi options take milliseconds.
        mfcc_opts.frame_opts.frame_length_ms = self.frame_length_s * 1000
        mfcc_opts.frame_opts.frame_shift_ms = self.frame_shift_s * 1000
        mfcc_opts.frame_opts.allow_downsample = False
        mfcc_opts.mel_opts.num_bins = self.num_bins
        mfcc_opts.mel_opts.low_freq = self.low_freq
        mfcc_opts.mel_opts.high_freq = self.high_freq
        mfcc_opts.num_ceps = self.num_ceps
        mfcc_opts.register(po)
        # Create MFCC object and obtain sample frequency
        mfccObj = Mfcc(mfcc_opts)
        mfccKaldi = mfccObj.compute_features(wav, self.sr, 1.0)
    except Exception as e:
        self.log.error(e)
        # BUG FIX: chain the cause so the original traceback is preserved.
        raise ValueError(
            "Speaker diarization failed while extracting features!!!") from e
    else:
        return mfccKaldi
def compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts):
    """Compute MFCC features for every utterance in a Kaldi wav table.

    Args:
        wav_rspecifier: Kaldi rspecifier for the input waveforms.
        feats_wspecifier: Kaldi wspecifier for the output feature matrices.
        opts: Parsed command-line options (channel, min_duration, vtln_map,
            utt2spk, vtln_warp, subtract_mean).
        mfcc_opts: MfccOptions controlling feature extraction.

    Returns:
        True if at least one utterance was processed successfully.
    """
    mfcc = Mfcc(mfcc_opts)

    if opts.vtln_map:
        vtln_map_reader = RandomAccessFloatReaderMapped(
            opts.vtln_map, opts.utt2spk)
    elif opts.utt2spk:
        print("utt2spk option is needed only if vtln-map option is specified.",
              file=sys.stderr)

    num_utts, num_success = 0, 0
    with SequentialWaveReader(wav_rspecifier) as reader, \
            MatrixWriter(feats_wspecifier) as writer:
        for num_utts, (key, wave) in enumerate(reader, 1):
            if wave.duration < opts.min_duration:
                print("File: {} is too short ({} sec): producing no output.".
                      format(key, wave.duration), file=sys.stderr)
                continue

            num_chan = wave.data().num_rows
            if opts.channel >= num_chan:
                # BUG FIX: the template string was printed without calling
                # .format(), so the literal "{}" placeholders were emitted.
                print(
                    "File with id {} has {} channels but you specified "
                    "channel {}, producing no output.".format(
                        key, num_chan, opts.channel), file=sys.stderr)
                continue
            channel = 0 if opts.channel == -1 else opts.channel

            if opts.vtln_map:
                if key not in vtln_map_reader:
                    print("No vtln-map entry for utterance-id (or speaker-id)",
                          key, file=sys.stderr)
                    continue
                vtln_warp = vtln_map_reader[key]
            else:
                vtln_warp = opts.vtln_warp

            try:
                feats = mfcc.compute_features(wave.data()[channel],
                                              wave.samp_freq, vtln_warp)
            # BUG FIX: bare "except:" also swallowed KeyboardInterrupt and
            # SystemExit; catch only real errors.
            except Exception:
                print("Failed to compute features for utterance", key,
                      file=sys.stderr)
                continue

            if opts.subtract_mean:
                # Per-utterance cepstral mean subtraction, in place.
                mean = Vector(feats.num_cols)
                mean.add_row_sum_mat_(1.0, feats)
                mean.scale_(1.0 / feats.num_rows)
                for i in range(feats.num_rows):
                    feats[i].add_vec_(-1.0, mean)

            writer[key] = feats
            num_success += 1

            if num_utts % 10 == 0:
                print("Processed {} utterances".format(num_utts),
                      file=sys.stderr)

    print("Done {} out of {} utterances".format(num_success, num_utts),
          file=sys.stderr)
    if opts.vtln_map:
        vtln_map_reader.close()
    return num_success != 0