def get_modspec(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every utterance of an scp file.

    Each line of ``args.scp`` is ``<uttid> <wav path>`` or
    ``<uttid> <shell command> |``; in the second form the command is run
    through the shell and its stdout is parsed as the wav data.

    Args:
        args: Namespace providing ``scp``, ``add_reverb``,
            ``set_unity_gain``, ``nmodulations``, ``order``,
            ``fduration_modspec``, ``frate`` and ``nfilters``.
        srate: Sampling rate every input file is required to have.
        window: Window function forwarded to ``getFrames``.

    Returns:
        dict mapping utterance id to an array of shape
        ``(n_frames, nfilters * n_coeffs)``; ``n_coeffs`` is
        ``nmodulations - 1`` when ``args.set_unity_gain`` is set (the first
        coefficient is dropped) and ``nmodulations`` otherwise.

    Raises:
        ValueError: If ``args.add_reverb`` is truthy but not one of
            ``small_room``, ``large_room`` or ``clean``.
        AssertionError: If a file's sampling rate differs from ``srate``.
    """
    scp_path = args.scp
    reverb_mode = args.add_reverb
    unity_gain = args.set_unity_gain
    n_mod = args.nmodulations
    lpc_order = args.order
    frame_dur = args.fduration_modspec
    frame_rate = args.frate
    n_filt = args.nfilters

    fbank = createFbank(n_filt, int(2 * frame_dur * srate), srate)

    # Room impulse responses selectable through args.add_reverb.
    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if reverb_mode:
        if reverb_mode in rir_files:
            _, rir = read(rir_files[reverb_mode])
            # Keep column 1 of the RIR file and scale it by 2**15.
            rir = rir[:, 1] / np.power(2, 15)
        elif reverb_mode == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # With unity gain the LPC gain is forced to 1 and the first modulation
    # coefficient is discarded, so one coefficient fewer is stored per band.
    n_coeffs = n_mod - 1 if unity_gain else n_mod
    row_len = n_filt * n_coeffs

    all_feats = {}
    with open(scp_path, 'r') as fid:
        for line in fid:
            fields = line.strip().split()
            uttid, source = fields[0], ' '.join(fields[1:])

            if source[-1] == '|':
                # Kaldi-style pipe: run the command and parse its stdout.
                proc = subprocess.run(source[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(source)
            assert sr == srate, 'Input file has different sampling rate.'

            # I want to work with numbers from 0 to 1 so....
            signal = signal / np.power(2, 15)
            if reverb_mode and reverb_mode != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frame_rate, frame_dur, window)))
            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * frame_dur))
            frame_num, _ = np.shape(cos_trans)

            feats = np.zeros((frame_num, row_len))
            print('%s: Computing Features for file: %s' %
                  (sys.argv[0], uttid))
            sys.stdout.flush()

            for frame_idx in range(frame_num):
                frame_feat = np.zeros([n_filt, n_coeffs])
                for band in range(n_filt):
                    band_dct = fbank[band, 0:-1] * cos_trans[frame_idx, :]
                    # Compute LPC coefficients
                    xlpc, gain = computeLpcFast(band_dct, lpc_order)
                    if unity_gain:
                        gain = 1
                    mod_spec = computeModSpecFromLpc(gain, xlpc, n_mod)
                    if unity_gain:
                        mod_spec = mod_spec[1:]
                    frame_feat[band, :] = mod_spec
                feats[frame_idx, :] = np.reshape(frame_feat, (1, row_len))

            all_feats[uttid] = feats
    return all_feats
def getFeats(args, srate=16000, window=np.hanning):
    """Extract modulation-spectrum features and write them with ``dict2Ark``.

    Each line of ``args.scp`` is ``<uttid> <source>``. With
    ``args.scp_type == 'wav'`` the source is a wav path, or a shell command
    ending in ``|`` whose stdout is parsed as wav data. With
    ``args.scp_type == 'segment'`` the source is handed to ``wav-copy`` and
    the command output is parsed instead.

    Utterances whose audio cannot be read are reported and skipped.

    Args:
        args: Namespace providing ``scp``, ``scp_type``, ``outfile``,
            ``add_reverb``, ``coeff_0``, ``coeff_n``, ``order``,
            ``fduration``, ``frate``, ``nfilters``, ``kaldi_cmd``,
            ``fbank_type``, ``complex_modulation``, ``keep_even``,
            ``absolute_value``, ``compensate_noise`` and ``no_window``.
            ``fbank_type`` is a comma separated string: ``mel,<warp>`` or
            ``cochlear,<om_w>,<alp>,<fixed>,<bet>,<warp>``.
        srate: Sampling rate required of 'wav' inputs.
        window: Window function forwarded to ``getFrames``; replaced by
            ``sq_wind`` when ``args.no_window`` is set.

    Returns:
        None. The per-utterance feature dict is passed to
        ``dict2Ark(all_feats, outfile, kaldi_cmd)``.

    Raises:
        ValueError: On a malformed ``fbank_type``, an unknown reverberation
            type or an unknown ``scp_type``.
        AssertionError: If a 'wav' input's sampling rate differs from
            ``srate``.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    add_reverb = args.add_reverb
    coeff_0 = args.coeff_0
    coeff_n = args.coeff_n
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd
    complex_mod = args.complex_modulation
    keep_even = args.keep_even
    absolute_value = args.absolute_value
    compensate_noise = args.compensate_noise

    # Set up the filter bank. The complex (ifft) path uses half the
    # transform length of the DCT path.
    fbank_type = args.fbank_type.strip().split(',')
    if complex_mod:
        dur = int(fduration * srate)
    else:
        dur = int(2 * fduration * srate)

    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters,
                            dur,
                            srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError(
                'Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print(
                '%s: Alpha is fixed and will not change as a function of the center frequency...'
                % sys.argv[0])
        fbank = createFbankCochlear(nfilters,
                                    dur,
                                    srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError(
            'Invalid type of filter bank, use mel or cochlear with proper configuration'
        )

    # Number of coefficients stored per filter band.
    coeff_num = coeff_n - coeff_0 + 1
    # keep_even keeps every other selected coefficient; the starting offset
    # depends on the parity of coeff_0.
    parity_slice = slice(1, None, 2) if coeff_0 % 2 == 0 else slice(0, None, 2)
    if keep_even:
        feat_len = np.arange(0, coeff_num)[parity_slice].shape[0]
    elif complex_mod and not absolute_value:
        # Real and imaginary parts are stored side by side.
        feat_len = 2 * coeff_num
    else:
        feat_len = coeff_num

    if compensate_noise:
        if complex_mod:
            fmax = coeff_num / fduration
        else:
            fmax = coeff_num / (2 * fduration)
        # Linear weighting applied to the modulation spectrum.
        faxis = np.linspace(0, fmax, coeff_n)

    if args.no_window:
        print('%s: Using square windows' % sys.argv[0])
        window = sq_wind

    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if add_reverb:
        if add_reverb in rir_files:
            _, rir = read(rir_files[add_reverb])
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    all_feats = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                is_pipe = inwav[-1] == '|'
            elif scp_type != 'segment':
                raise ValueError(
                    'Invalid type of scp type, it should be either wav or segment'
                )

            # Reading is best effort: an unreadable utterance is skipped.
            try:
                if scp_type == 'segment':
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd,
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                elif is_pipe:
                    proc = subprocess.run(inwav[:-1],
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                else:
                    sr, signal = read(inwav)
            except Exception as err:
                print('%s: Skipping %s, could not read audio (%s)' %
                      (sys.argv[0], uttid, err))
                sys.stdout.flush()
                continue

            # Only meaningful after a successful read; otherwise ``sr``
            # would be unbound or left over from the previous utterance.
            if scp_type == 'wav':
                assert sr == srate, 'Input file has different sampling rate.'

            # NOTE: unlike sibling extractors, the signal is not divided by
            # 2**15 here (that scaling is disabled in this variant).
            if add_reverb and add_reverb != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frate, fduration, window)))

            if complex_mod:
                cos_trans = freqAnalysis.ifft(time_frames)
                cos_trans = cos_trans[:, :int(fduration * srate / 2)]
            else:
                cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                    2 * int(srate * fduration))

            frame_num, _ = np.shape(cos_trans)
            feats = np.zeros((frame_num, nfilters * feat_len))

            print('%s: Computing Features for file: %s, also %d' %
                  (sys.argv[0], uttid, time_frames.shape[0]))
            sys.stdout.flush()

            for i in range(frame_num):
                each_feat = np.zeros([nfilters, feat_len])
                for j in range(nfilters):
                    band_dct = fbank[j, 0:-1] * cos_trans[i, :]

                    # Compute LPC coefficients and the modulation spectrum.
                    if complex_mod:
                        xlpc, gg = computeLpcFast(band_dct,
                                                  order,
                                                  keepreal=False)
                        mod_spec = computeModSpecFromLpc(gg, xlpc, coeff_n)
                    else:
                        xlpc, gg = computeLpcFast(band_dct, order)
                        mod_spec = np.real(
                            computeModSpecFromLpc(gg, xlpc, coeff_n))

                    if compensate_noise:
                        mod_spec = mod_spec * faxis

                    selected = mod_spec[coeff_0 - 1:coeff_n]
                    if absolute_value:
                        coeffs = np.abs(selected)
                    elif complex_mod:
                        coeffs = np.append(np.real(selected),
                                           np.imag(selected))
                    else:
                        coeffs = selected

                    if keep_even:
                        each_feat[j, :] = coeffs[parity_slice]
                    else:
                        each_feat[j, :] = coeffs

                feats[i, :] = np.reshape(each_feat, (1, nfilters * feat_len))

            all_feats[uttid] = feats

    dict2Ark(all_feats, outfile, kaldi_cmd)
def getFeats(args, srate=16000, window=np.hanning):
    """Extract modulation-spectrum features per segment and write an ark.

    ``args.scp`` lists ``<wav_id> <wav path or "command |">`` and
    ``args.segment`` lists ``<seg_id> <wav_id> <t_begin> <t_end>`` (times
    are multiplied by the sampling rate to get sample offsets). A wav is
    only re-read when a segment refers to a different ``wav_id`` than the
    previous segment did.

    Args:
        args: Namespace providing ``scp``, ``segment``, ``outfile``,
            ``add_reverb``, ``set_unity_gain``, ``nmodulations``, ``order``,
            ``fduration``, ``frate``, ``nfilters`` and ``kaldi_cmd``.
        srate: Sampling rate every wav is required to have.
        window: Window function forwarded to ``getFrames``.

    Returns:
        None. Features keyed by segment id are passed to
        ``dict2Ark(all_feats, outfile, kaldi_cmd)``.

    Raises:
        ValueError: On an unknown reverberation type, or when a segment
            names a ``wav_id`` that is absent from the scp file.
        AssertionError: If a wav's sampling rate differs from ``srate``.
    """
    scp_path = args.scp
    segments_path = args.segment
    ark_path = args.outfile
    reverb_mode = args.add_reverb
    unity_gain = args.set_unity_gain
    n_mod = args.nmodulations
    lpc_order = args.order
    frame_dur = args.fduration
    frame_rate = args.frate
    n_filt = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    fbank = createFbank(n_filt, int(2 * frame_dur * srate), srate)

    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if reverb_mode:
        if reverb_mode in rir_files:
            _, rir = read(rir_files[reverb_mode])
            rir = rir[:, 1] / np.power(2, 15)
        elif reverb_mode == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Load location and ids of all wav files.
    wav_ids = []
    wav_locs = []
    with open(scp_path, 'r') as fid:
        for line in fid:
            fields = line.strip().split()
            uttid, location = fields[0], ' '.join(fields[1:])
            wav_ids.append(uttid)
            wav_locs.append(location)

    loaded_wav_id = ''  # id of the wav file currently held in memory
    row_len = n_filt * n_mod
    all_feats = {}

    # Compute features for all the segments.
    with open(segments_path, 'r') as fid_s:
        for line_s in fid_s:
            seg_fields = line_s.strip().split()
            seg_id, wav_id = seg_fields[0], seg_fields[1]

            # Load the wav file unless it is the one already in memory.
            if loaded_wav_id != wav_id:
                loaded_wav_id = wav_id
                source = wav_locs[wav_ids.index(wav_id)]
                if source[-1] == '|':
                    proc = subprocess.run(source[:-1],
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal_big = read(io.BytesIO(proc.stdout))
                else:
                    sr, signal_big = read(source)
                assert sr == srate, 'Input file has different sampling rate.'

            first_sample = int(float(seg_fields[2]) * sr)
            last_sample = int(float(seg_fields[3]) * sr)
            signal = signal_big[first_sample:last_sample]
            signal = signal / np.power(2, 15)
            if reverb_mode and reverb_mode != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frame_rate, frame_dur, window)))
            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * frame_dur))
            frame_num, _ = np.shape(cos_trans)
            feats = np.zeros((frame_num, row_len))

            print('%s: Computing Features for file: %s and segment: %s' %
                  (sys.argv[0], wav_id, seg_id))
            sys.stdout.flush()

            for frame_idx in range(frame_num):
                frame_feat = np.zeros([n_filt, n_mod])
                for band in range(n_filt):
                    band_dct = fbank[band, 0:-1] * cos_trans[frame_idx, :]
                    # Compute LPC coefficients
                    xlpc, gain = computeLpcFast(band_dct, lpc_order)
                    if unity_gain:
                        gain = 1
                    frame_feat[band, :] = computeModSpecFromLpc(
                        gain, xlpc, n_mod)
                feats[frame_idx, :] = np.reshape(frame_feat, (1, row_len))

            all_feats[seg_id] = feats

    dict2Ark(all_feats, ark_path, kaldi_cmd)
def get_mfcc(args, srate=16000, window=np.hamming):
    """Compute 13-dimensional cepstral features for every utterance.

    For each frame the magnitude of ``fft`` is projected onto the filter
    bank, ``log10`` is taken, ``dct`` is applied along the filter axis and
    the first 13 coefficients are kept. When ``args.context`` is truthy the
    result is additionally passed through ``spliceFeats``.

    Args:
        args: Namespace providing ``scp``, ``add_reverb``, ``nfft``,
            ``context``, ``fduration_mfcc``, ``frate`` and ``nfilters``.
        srate: Sampling rate every input file is required to have.
        window: Window function forwarded to ``getFrames``.

    Returns:
        dict mapping utterance id to its feature matrix.

    Raises:
        ValueError: On an unknown reverberation type.
        AssertionError: If a file's sampling rate differs from ``srate``.
    """
    scp_path = args.scp
    reverb_mode = args.add_reverb
    n_fft = args.nfft
    splice_context = args.context
    frame_dur = args.fduration_mfcc
    frame_rate = args.frate
    n_filt = args.nfilters

    fbank = createFbank(n_filt, n_fft, srate)

    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if reverb_mode:
        if reverb_mode in rir_files:
            _, rir = read(rir_files[reverb_mode])
            rir = rir[:, 1] / np.power(2, 15)
        elif reverb_mode == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')
    sys.stdout.flush()

    all_feats = {}
    with open(scp_path, 'r') as fid:
        for line in fid:
            fields = line.strip().split()
            uttid, source = fields[0], ' '.join(fields[1:])
            print('%s: Computing Features for file: %s' %
                  (sys.argv[0], uttid))
            sys.stdout.flush()

            if source[-1] == '|':
                # Kaldi-style pipe: run the command and parse its stdout.
                proc = subprocess.run(source[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(source)
            assert sr == srate, 'Input file has different sampling rate.'

            signal = signal / np.power(2, 15)
            if reverb_mode and reverb_mode != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frame_rate, frame_dur, window)))

            # NOTE(review): int(nfft / 2 + 1) is passed as fft's second
            # positional argument. If ``fft`` follows the NumPy/SciPy
            # signature this sets the transform length rather than the
            # number of bins kept -- confirm this is intended.
            spectrum = np.abs(fft(time_frames, int(n_fft / 2 + 1), axis=1))
            mel_energy = np.log10(np.matmul(spectrum, np.transpose(fbank)))

            cepstra = dct(mel_energy, axis=1)
            cepstra = cepstra[:, 0:13]
            if splice_context:
                cepstra = spliceFeats(cepstra, splice_context)
            all_feats[uttid] = cepstra
    return all_feats
def extractModSpecFeatures(wavs,
                           outdir,
                           phone_map,
                           phn_file_dir,
                           get_phone_labels=True,
                           only_center=True,
                           around_center=1,
                           ignore_edge=False,
                           nmodulations=12,
                           order=50,
                           fduration=0.5,
                           frate=100,
                           nfft=512,
                           nfilters=15,
                           srate=16000,
                           window=np.hanning):
    '''Extract modulation spectral features and save them as one matrix.

    Two modes are supported:

    * ``only_center=True``: one feature row is computed for each of
      ``around_center`` consecutive frames positioned around the midpoint
      of every phone listed in the utterance's ``.PHN`` file. Utterances
      without a ``.PHN`` file are reported and skipped.
    * ``only_center=False``: one feature row per frame. With
      ``get_phone_labels`` the rows before the first phone, and any rows
      after the phone file is exhausted, stay zero.

    Args:
        wavs (string): Path of an scp file; each line is
            ``<uttid> <wav path or pipe-command>``.
        outdir (string): Path handed to ``np.save`` for the feature matrix;
            ``phone_list`` is saved in the same directory.
        phone_map (string): Path of the phone map file. The second column
            of every two-column line that does not contain ``sil``/``SIL``
            forms the (sorted, unique) phone list.
        phn_file_dir (string): Directory holding ``<uttid>.PHN`` files.
        get_phone_labels (bool): Append the phone's index in the phone list
            as the last column of every row.
        only_center (bool): Select the mode described above.
        around_center (int): Frames computed per phone in centre mode.
        ignore_edge (bool): Drop the first and last phone of every
            utterance in centre mode.
        nmodulations (int): Modulation coefficients per filter.
        order (int): LPC order passed to ``computeLpcFast``.
        fduration (float): Frame duration in seconds.
        frate (int): Frame rate in Hertz.
        nfft (int): Unused; kept for interface compatibility.
        nfilters (int): Number of filters.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function.

    Raises:
        AssertionError: If a file's sampling rate differs from ``srate``.

    Note:
        It is possible to use a Kaldi like style to read the audio using a
        "pipe-command" e.g.: "sph2pipe -f wav /path/file.wav |"
    '''
    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    # Get list of phonemes (sorted and unique, silence excluded).
    phones = set()
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            fields = line2.strip().split()
            if len(fields) == 2 and 'sil' not in fields \
                    and 'SIL' not in fields:
                phones.add(fields[1])
    phn_list = sorted(phones)
    phn_index = {phone: k for k, phone in enumerate(phn_list)}

    feat_dim = nmodulations * nfilters
    row_len = feat_dim + 1 if get_phone_labels else feat_dim
    dct_norm = np.sqrt(2 * int(srate * fduration))

    def frame_features(frame_dct):
        # Modulation spectrum of every filter band of one frame, flattened.
        each_feat = np.zeros([nfilters, nmodulations])
        for j in range(nfilters):
            band_dct = fbank[j, 0:-1] * frame_dct
            xlpc, gg = computeLpcFast(band_dct, order)
            each_feat[j, :] = computeModSpecFromLpc(gg, xlpc, nmodulations)
        return np.reshape(each_feat, feat_dim)

    # Collected per utterance and stacked once at the end.
    per_utt_feats = []

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            phn_path = os.path.join(phn_file_dir, uttid + '.PHN')

            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            # I want to work with numbers from 0 to 1 so....
            signal = signal / np.power(2, 15)

            if only_center:
                if not os.path.isfile(phn_path):
                    # Without phone boundaries there is nothing to
                    # compute for this utterance.
                    print('Skipping file: %s (missing %s)' %
                          (uttid, phn_path))
                    continue
                # Get all phones and their center.
                # NOTE(review): midpoints are taken straight from the .PHN
                # columns and later used as frame indices, whereas the
                # frame-wise mode divides the same columns by 160 --
                # confirm the units of the .PHN files.
                phone_mid = []
                phone_now = []
                with open(phn_path) as phn_file:
                    for line2 in phn_file:
                        phn_locs = line2.strip().split()
                        if phn_locs[2] in phn_index:
                            phone_now.append(phn_index[phn_locs[2]])
                            phone_mid.append(
                                (int(phn_locs[0]) + int(phn_locs[1])) / 2)
                if ignore_edge:
                    phone_mid = phone_mid[1:-1]
                    phone_now = phone_now[1:-1]
            elif get_phone_labels:
                with open(phn_path) as phn_file:
                    phn_lines = iter(phn_file.readlines())
                # The first phone defines where labelling starts.
                # 160 is samples per frame for srate=16000, frate=100.
                phn_locs = next(phn_lines, '').strip().split()
                phone_now = phn_locs[2]
                phone_end = int(int(phn_locs[1]) / 160)
                beg_frame = int(int(phn_locs[0]) / 160)
            else:
                beg_frame = 0

            time_frames = np.array(
                list(getFrames(signal, srate, frate, fduration, window)))
            cos_trans = freqAnalysis.dct(time_frames) / dct_norm

            print('Computing Features for file: %s' % uttid)

            if only_center:
                only_compute = len(phone_mid)
                feats = np.zeros([only_compute * around_center, row_len])
                half_span = int((around_center - 1) / 2)
                for kk in range(only_compute):
                    i_mid = int(np.floor(phone_mid[kk]))
                    for cont in range(around_center):
                        i = i_mid + cont - half_span
                        row = frame_features(cos_trans[i, :])
                        if get_phone_labels:
                            row = np.append(row, phone_now[kk])
                        feats[around_center * kk + cont, :] = row
            else:
                frame_num = np.shape(cos_trans)[0]
                feats = np.zeros([frame_num, row_len])
                for i in range(beg_frame, frame_num):
                    row = frame_features(cos_trans[i, :])
                    if get_phone_labels:
                        # Update to the current phoneme.
                        if i > phone_end:
                            phn_line = next(phn_lines, '')
                            if not phn_line:
                                break  # no more phones are remaining
                            phn_locs = phn_line.strip().split()
                            phone_now = phn_locs[2]
                            phone_end = int(int(phn_locs[1]) / 160)
                        row = np.append(row, phn_list.index(phone_now))
                    feats[i, :] = row

            per_utt_feats.append(feats)

    if per_utt_feats:
        all_feats = np.vstack(per_utt_feats)
    else:
        all_feats = np.empty((0, row_len))

    # Save the final BIG feature file
    np.save(outdir, all_feats)
    np.save(os.path.join(os.path.dirname(outdir), 'phone_list'), phn_list)
def extractFDLPTfPattern(wavs,
                         outdir,
                         phone_map,
                         phn_file_dir,
                         get_phone_labels=False,
                         only_center=False,
                         ignore_edge=False,
                         order=50,
                         fduration=0.5,
                         frate=100,
                         nfft=20,
                         nfilters=15,
                         srate=16000,
                         window=np.hanning):
    """Extract LPC-envelope (FDLP) patterns and save them as one matrix.

    Two modes are supported:

    * ``only_center=False``: one row of ``nfilters`` values per frame, each
      the ``log10`` of the mean envelope magnitude in a 320-point window
      around the middle of the ``freqz`` response.
    * ``only_center=True``: one row of ``nfilters * nfft`` values per
      phone, the ``log10`` envelope sampled at ``nfft`` points for the
      frame indexed by the phone midpoint. Utterances without a ``.PHN``
      file are reported and skipped.

    Args:
        wavs (string): Path of an scp file; each line is
            ``<uttid> <wav path or pipe-command ending in |>``.
        outdir (string): Path handed to ``np.save`` for the feature matrix;
            ``phone_list`` is saved in the same directory.
        phone_map (string): Path of the phone map file. The second column
            of every two-column line that does not contain ``sil``/``SIL``
            forms the (sorted, unique) phone list.
        phn_file_dir (string): Directory holding ``<uttid>.PHN`` files.
        get_phone_labels (bool): Append the phone index as the last column.
        only_center (bool): Select the mode described above.
        ignore_edge (bool): Drop the first and last phone of every
            utterance in centre mode.
        order (int): LPC order passed to ``computeLpcFast``.
        fduration (float): Frame duration in seconds.
        frate (int): Frame rate in Hertz.
        nfft (int): Envelope points per filter in centre mode.
        nfilters (int): Number of filters.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function.

    Raises:
        AssertionError: If a file's sampling rate differs from ``srate``.
    """
    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    # Get list of phonemes (sorted and unique, silence excluded).
    phones = set()
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            fields = line2.strip().split()
            if len(fields) == 2 and 'sil' not in fields \
                    and 'SIL' not in fields:
                phones.add(fields[1])
    phn_list = sorted(phones)
    phn_index = {phone: k for k, phone in enumerate(phn_list)}

    feat_dim = nfilters * nfft if only_center else nfilters
    row_len = feat_dim + 1 if get_phone_labels else feat_dim
    dct_norm = np.sqrt(2 * int(srate * fduration))

    # Collected per utterance and stacked once at the end.
    per_utt_feats = []

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            phn_path = os.path.join(phn_file_dir, uttid + '.PHN')

            if not only_center:
                if get_phone_labels:
                    if not os.path.isfile(phn_path):
                        # NOTE(review): a missing phone file ends
                        # processing of ALL remaining utterances here
                        # (kept as in the existing behaviour) -- confirm
                        # that skipping only this one is not intended.
                        print('Missing %s, stopping at file: %s' %
                              (phn_path, uttid))
                        break
                    with open(phn_path) as phn_file:
                        phn_lines = iter(phn_file.readlines())
                    # The first phone defines where labelling starts.
                    # NOTE(review): these first boundaries are used
                    # unscaled while later ``phone_end`` values are
                    # divided by 160 -- confirm which unit is intended.
                    phn_locs = next(phn_lines, '').strip().split()
                    phone_now = phn_locs[2]
                    phone_end = int(phn_locs[1])
                    beg_frame = int(phn_locs[0])
                else:
                    beg_frame = 0

            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            # I want to work with numbers from 0 to 1 so....
            signal = signal / np.power(2, 15)

            if only_center:
                if not os.path.isfile(phn_path):
                    # Without phone boundaries there is nothing to
                    # compute for this utterance.
                    print('Skipping file: %s (missing %s)' %
                          (uttid, phn_path))
                    continue
                # Get all phones and their center.
                phone_mid = []
                phone_now = []
                with open(phn_path) as phn_file:
                    for line2 in phn_file:
                        phn_locs = line2.strip().split()
                        if phn_locs[2] in phn_index:
                            phone_now.append(phn_index[phn_locs[2]])
                            phone_mid.append(
                                (int(phn_locs[0]) + int(phn_locs[1])) / 2)
                if ignore_edge:
                    phone_mid = phone_mid[1:-1]
                    phone_now = phone_now[1:-1]

            time_frames = np.array(
                list(getFrames(signal, srate, frate, fduration, window)))
            cos_trans = freqAnalysis.dct(time_frames) / dct_norm
            frame_num, ndct = np.shape(cos_trans)

            print('Computing Features for file: %s' % uttid)

            if only_center:
                only_compute = len(phone_mid)
                feats = np.zeros([only_compute, nfilters * nfft])
                for kk in range(only_compute):
                    i = int(np.floor(phone_mid[kk]))
                    each_feat = np.zeros([nfilters, nfft])
                    for j in range(nfilters):
                        band_dct = fbank[j, 0:-1] * cos_trans[i, :]
                        # Compute LPC coefficients
                        xlpc, gg = computeLpcFast(band_dct, order)
                        _, h = freqz(np.sqrt(gg), xlpc, nfft)
                        each_feat[j, :] = np.log10(np.abs(h))
                    feats[kk, :] = np.reshape(each_feat,
                                              (1, nfilters * nfft))
                if get_phone_labels:
                    feats = np.append(feats,
                                      np.reshape(phone_now,
                                                 (len(phone_now), 1)),
                                      axis=1)
            else:
                feats = np.zeros([frame_num, row_len])
                # Envelope window: 160 points either side of the middle.
                lo = int(ndct / 2 - 160)
                hi = int(ndct / 2 + 160)
                for i in range(beg_frame, frame_num):
                    each_feat = np.zeros(nfilters)
                    for j in range(nfilters):
                        band_dct = fbank[j, 0:-1] * cos_trans[i, :]
                        # Compute LPC coefficients
                        xlpc, gg = computeLpcFast(band_dct, order)
                        _, h = freqz(np.sqrt(gg), xlpc, ndct)
                        each_feat[j] = np.log10(np.mean(np.abs(h[lo:hi])))
                    if get_phone_labels:
                        # Update to the current phoneme.
                        if i > phone_end:
                            phn_line = next(phn_lines, '')
                            if not phn_line:
                                break  # no more phones are remaining
                            phn_locs = phn_line.strip().split()
                            phone_now = phn_locs[2]
                            phone_end = int(int(phn_locs[1]) / 160)
                        each_feat = np.append(each_feat,
                                              phn_list.index(phone_now))
                    feats[i, :] = each_feat

            per_utt_feats.append(feats)

    if per_utt_feats:
        all_feats = np.vstack(per_utt_feats)
    else:
        all_feats = np.empty((0, row_len))

    # Save the final BIG feature file
    np.save(outdir, all_feats)
    np.save(os.path.join(os.path.dirname(outdir), 'phone_list'), phn_list)
def extractMelEnergyFeats(args, srate=16000, window=np.hamming):
    """Extract the mel scale filter-bank energy features.

    For each frame the magnitude of ``fft`` is projected onto the filter
    bank, ``log10`` is taken, ``dct`` is applied along the filter axis and
    the first 13 coefficients are kept; ``spliceFeats`` is applied when
    ``args.context`` is truthy. Results go to ``getKaldiArk``.

    Args:
        args: Namespace providing ``scp``, ``outfile``, ``add_reverb``,
            ``nfft``, ``context``, ``fduration``, ``frate``, ``nfilters``
            and ``kaldi_cmd``.
        srate: Sampling rate every input file is required to have.
        window: Window function forwarded to ``getFrames``.

    Returns:
        None. Features keyed by utterance id are passed to
        ``getKaldiArk(all_feats, outfile, kaldi_cmd)``.

    Raises:
        ValueError: On an unknown reverberation type.
        AssertionError: If a file's sampling rate differs from ``srate``.
    """
    scp_path = args.scp
    ark_path = args.outfile
    reverb_mode = args.add_reverb
    n_fft = args.nfft
    splice_context = args.context
    frame_dur = args.fduration
    frame_rate = args.frate
    n_filt = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    fbank = createFbank(n_filt, n_fft, srate)

    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if reverb_mode:
        if reverb_mode in rir_files:
            _, rir = read(rir_files[reverb_mode])
            rir = rir[:, 1] / np.power(2, 15)
        elif reverb_mode == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')
    sys.stdout.flush()

    all_feats = {}
    with open(scp_path, 'r') as fid:
        for line in fid:
            fields = line.strip().split()
            uttid, source = fields[0], ' '.join(fields[1:])
            print('%s: Computing Features for file: %s' %
                  (sys.argv[0], uttid))
            sys.stdout.flush()

            if source[-1] == '|':
                proc = subprocess.run(source[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                wav_bytes = proc.stdout
                if source.startswith('ffmpeg'):
                    # Overwrite bytes 4..7 of the piped header with the
                    # actual payload size (total length minus 8), written
                    # as 4 little-endian bytes.
                    riff_chunk_size = len(wav_bytes) - 8
                    size_field = (riff_chunk_size & 0xFFFFFFFF).to_bytes(
                        4, 'little')
                    wav_bytes = wav_bytes[:4] + size_field + wav_bytes[8:]
                sr, signal = read(io.BytesIO(wav_bytes))
            else:
                sr, signal = read(source)
            assert sr == srate, 'Input file has different sampling rate.'

            signal = signal / np.power(2, 15)
            if reverb_mode and reverb_mode != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frame_rate, frame_dur, window)))

            spectrum = np.abs(fft(time_frames, int(n_fft / 2 + 1), axis=1))
            mel_energy = np.log10(np.matmul(spectrum, np.transpose(fbank)))

            cepstra = dct(mel_energy, axis=1)
            cepstra = cepstra[:, 0:13]
            if splice_context:
                cepstra = spliceFeats(cepstra, splice_context)
            all_feats[uttid] = cepstra

    getKaldiArk(all_feats, ark_path, kaldi_cmd)
def extractModSpecFeatures(args, srate=16000, window=np.hanning):
    '''Extract phone-centred modulation spectral features per utterance.

    For every utterance with a ``<uttid>.PHN`` file in
    ``args.phn_file_dir``, one feature row is computed per phone found in
    the phone list, using the frame indexed by the phone midpoint.
    Utterances without a ``.PHN`` file are reported and skipped. The
    resulting ``{uttid: features}`` dict is pickled to ``args.outfile``
    and the phone list is saved as ``phone_list`` in the same directory.

    Args:
        args: Namespace providing ``scp`` (scp file with lines
            ``<uttid> <wav path or pipe-command>``), ``outfile``,
            ``phn_file`` (phone map; the second column of two-column lines
            not containing ``sil``/``SIL`` forms the phone list),
            ``phn_file_dir``, ``get_phone_labels`` (append the phone index
            as last column), ``add_reverb``, ``set_unity_gain`` (force the
            LPC gain to 1), ``nmodulations``, ``order``, ``fduration``,
            ``frate`` and ``nfilters``.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function.

    Raises:
        ValueError: On an unknown reverberation type.
        AssertionError: If a file's sampling rate differs from ``srate``.

    Note:
        It is possible to use a Kaldi like style to read the audio using a
        "pipe-command" e.g.: "sph2pipe -f wav /path/file.wav |"
    '''
    wavs = args.scp
    outfile = args.outfile
    phone_map = args.phn_file
    phn_file_dir = args.phn_file_dir
    get_phone_labels = args.get_phone_labels
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    rir_files = {
        'small_room': './RIR/RIR_SmallRoom1_near_AnglA.wav',
        'large_room': './RIR/RIR_LargeRoom1_far_AnglA.wav',
    }
    if add_reverb:
        if add_reverb in rir_files:
            _, rir = read(rir_files[add_reverb])
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Get list of phonemes (sorted and unique, silence excluded).
    phones = set()
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            fields = line2.strip().split()
            if len(fields) == 2 and 'sil' not in fields \
                    and 'SIL' not in fields:
                phones.add(fields[1])
    phn_list = sorted(phones)
    phn_index = {phone: k for k, phone in enumerate(phn_list)}

    feat_dim = nmodulations * nfilters
    row_len = feat_dim + 1 if get_phone_labels else feat_dim
    dct_norm = np.sqrt(2 * int(srate * fduration))

    all_feats = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            # I want to work with numbers from 0 to 1 so....
            signal = signal / np.power(2, 15)
            if add_reverb and add_reverb != 'clean':
                signal = addReverb(signal, rir)

            phn_path = os.path.join(phn_file_dir, uttid + '.PHN')
            if not os.path.isfile(phn_path):
                # Without phone boundaries there is nothing to compute.
                print('Skipping file: %s (missing %s)' % (uttid, phn_path))
                sys.stdout.flush()
                continue

            # Get all phones and their center.
            # NOTE(review): midpoints are taken straight from the .PHN
            # columns and used as frame indices -- confirm the units of
            # the .PHN files match the frame index.
            phone_mid = []
            phone_now = []
            with open(phn_path) as phn_file:
                for line2 in phn_file:
                    phn_locs = line2.strip().split()
                    if phn_locs[2] in phn_index:
                        phone_now.append(phn_index[phn_locs[2]])
                        phone_mid.append(
                            (int(phn_locs[0]) + int(phn_locs[1])) / 2)

            time_frames = np.array(
                list(getFrames(signal, srate, frate, fduration, window)))
            cos_trans = freqAnalysis.dct(time_frames) / dct_norm

            only_compute = len(phone_mid)
            feats = np.zeros([only_compute, row_len])

            print('Computing Features for file: %s' % uttid)
            sys.stdout.flush()

            for kk in range(only_compute):
                i = int(np.floor(phone_mid[kk]))
                each_feat = np.zeros([nfilters, nmodulations])
                for j in range(nfilters):
                    band_dct = fbank[j, 0:-1] * cos_trans[i, :]
                    # Compute LPC coefficients
                    xlpc, gg = computeLpcFast(band_dct, order)
                    if set_unity_gain:
                        gg = 1
                    each_feat[j, :] = computeModSpecFromLpc(
                        gg, xlpc, nmodulations)
                row = np.reshape(each_feat, feat_dim)
                if get_phone_labels:
                    row = np.append(row, phone_now[kk])
                feats[kk, :] = row

            all_feats[uttid] = feats

    # Save the final BIG feature file
    with open(outfile, 'wb') as out_fid:
        pickle.dump(all_feats, out_fid)
    np.save(os.path.join(os.path.dirname(outfile), 'phone_list'), phn_list)
def getFeats(args, srate=16000, window=np.hamming):
    """Compute per-utterance features from LPC-based modulation spectra and
    write them out with ``dict2Ark``.

    For every utterance listed in ``args.scp`` the signal is read, optionally
    corrupted with noise / reverberation, cut into frames, transformed with a
    DCT, and for every filter-bank band an LPC model is fitted
    (``computeLpcFast``) and turned into a modulation spectrum
    (``computeModSpecFromLpc``).  The masked / liftered / weighted modulation
    spectrum is passed through ``fft`` -> ``exp`` -> ``abs`` and the resulting
    segments are overlap-added into a ``(nfilters, n_out_frames)`` matrix.
    The log of that matrix (clipped from below at 1e-14), transposed, is the
    stored feature for the utterance.

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``scp_type`` ('wav' or 'segment'), ``outfile``, ``coeff_num``,
            ``coeff_range`` ("low,high"), ``order``, ``fduration``, ``frate``,
            ``nfilters``, ``kaldi_cmd``, ``add_noise``, ``add_reverb``,
            ``lifter_config``, ``fbank_type``, ``odd_mod_zero``,
            ``overlap_fraction``, ``gamma_weight``, ``write_utt2num_frames``.
            ``args`` is not modified.
        srate: Expected sampling rate of the input audio.
        window: Callable ``window(n)`` returning the analysis window.

    Raises:
        ValueError: On an invalid filter-bank, reverberation or scp type.
        AssertionError: If a successfully read 'wav' entry has a sampling
            rate different from ``srate``.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    coeff_num = args.coeff_num
    coeff_range = args.coeff_range
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd
    add_noise = args.add_noise
    add_reverb = args.add_reverb

    if args.lifter_config:
        # The lifter is a single comma-separated line of floats.
        with open(args.lifter_config, 'r') as lifter_fid:
            lifter_config = lifter_fid.readline().strip().split(',')
        lifter_config = np.asarray([float(x) for x in lifter_config])

    # Set up mel-filterbank
    fbank_type = args.fbank_type.strip().split(',')
    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters, int(2 * fduration * srate), srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError('Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print('%s: Alpha is fixed and will not change as a function of the center frequency...' % sys.argv[0])
        fbank = createFbankCochlear(nfilters, int(2 * fduration * srate), srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError('Invalid type of filter bank, use mel or cochlear with proper configuration')

    # Ignore odd modulations
    if args.odd_mod_zero:
        print('%s: Ignoring odd modulations... ' % sys.argv[0])

    if add_noise:
        if add_noise == "clean" or add_noise == "diff":
            print('%s: No noise added!' % sys.argv[0])
        else:
            # Format: "<noise source>,<level>"; the level is passed to
            # add_noise_to_wav as a float.
            noise_info = add_noise.strip().split(',')
            noise = load_noise(noise_info[0])

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'medium_room':
            sr_r, rir = read('./RIR/RIR_MediumRoom1_far_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Set up mask: keep modulation coefficients in [lowpass, highpass].
    coeff_range = coeff_range.split(',')
    lowpass = int(coeff_range[0])
    highpass = int(coeff_range[1])
    mask = np.asarray(
        [1 if lowpass <= i <= highpass else 0 for i in range(coeff_num)])

    # Work on local copies so that `args` is left untouched and the function
    # can be called more than once with the same namespace.
    overlap_fraction = 1 - args.overlap_fraction
    gamma_weight = args.gamma_weight.strip().split(',')
    use_gamma = not gamma_weight[0] == "None"

    # Setup modulation weights
    if use_gamma:
        print('%s: Adding gamma filter on modulation frequencies...' % sys.argv[0])
        x = np.linspace(0, order - 1, order)
        scale = float(gamma_weight[0])
        shape = float(gamma_weight[1])
        pk_required = float(gamma_weight[2])
        res = 2 * fduration
        pk_required = pk_required * res
        pk = (shape - 1) * scale
        loc = -pk + pk_required
        # NOTE(review): mod_wts has `order` entries but multiplies a vector of
        # `coeff_num` entries below -- presumably the two are expected to be
        # equal; confirm against callers.
        mod_wts = stats.gamma.pdf(x, a=shape, loc=loc, scale=scale) * 3 * scale

    # Quantities that do not depend on the utterance.
    seg_len = int(np.round(fduration * frate))
    half_len = int(np.round(fduration * frate / 2))
    hop = int(np.round(fduration * frate * overlap_fraction))
    fft_len = 2 * int(fduration * frate)
    lfr = 1 / (overlap_fraction * fduration)

    all_feats = {}
    if args.write_utt2num_frames:
        all_lens = {}

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            # NOTE: scp entries are executed through the shell (Kaldi-style
            # pipes); the scp file must come from a trusted source.
            if scp_type == 'wav':
                is_pipe = inwav[-1] == '|'
                try:
                    if is_pipe:
                        proc = subprocess.run(inwav[:-1], shell=True,
                                              stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                    else:
                        sr, signal = read(inwav)
                    skip_rest = False
                except Exception:
                    skip_rest = True
                # Only validate the rate of audio that was actually read;
                # otherwise `sr` is unbound or left over from a previous file.
                if not skip_rest:
                    assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd, shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                    skip_rest = False
                except Exception:
                    skip_rest = True
            else:
                raise ValueError('Invalid type of scp type, it should be either wav or segment')

            if skip_rest:
                print('%s: Could not read audio for %s, skipping...'
                      % (sys.argv[0], uttid), file=sys.stderr)
                continue

            # NOTE: the samples are not rescaled by 2**15 in this function.
            if add_noise and not add_noise == "clean":
                if add_noise == "diff":
                    a = [1, 2, 3, 2, 0, -2, -5, -2, 0, 2, 3, 2, 1]
                    signal = convolve(signal, a, mode='same')
                else:
                    signal = add_noise_to_wav(signal, noise,
                                              float(noise_info[1]))

            if add_reverb and not add_reverb == 'clean':
                signal = addReverb(signal, rir)

            tframes = signal.shape[0]  # Number of samples in the signal
            time_frames = np.array(
                [frame for frame in getFrames(signal, srate, lfr, fduration, window)])
            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(2 * int(srate * fduration))
            frame_num, _ = np.shape(cos_trans)

            feats = np.zeros((nfilters, int(np.ceil(tframes * frate / srate))))
            ptr = 0

            print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
            sys.stdout.flush()

            hann = np.hanning(seg_len)
            ana_win = window(seg_len)

            for i in range(frame_num):
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    xlpc, gg = computeLpcFast(band_dct, order)  # Compute LPC coefficients
                    ms = computeModSpecFromLpc(gg, xlpc, coeff_num)
                    ms = ms * mask
                    if args.lifter_config:
                        ms = ms * lifter_config
                    if use_gamma:
                        ms = ms * mod_wts
                    if args.odd_mod_zero:
                        ms[1::2] = 0
                    ms = fft(ms, fft_len)
                    ms = np.abs(np.exp(ms))
                    # Replace the analysis window's shape by a Hanning shape
                    # before overlap-adding.
                    ms = ms[0:seg_len] * hann / ana_win

                    if i == 0:
                        # First frame: only its second half falls inside the
                        # output.
                        if feats.shape[1] < half_len:
                            feats[j, :] += ms[half_len:half_len + feats.shape[1]]
                        else:
                            feats[j, ptr:ptr + half_len] += ms[half_len:]
                    elif i == frame_num - 1 or i == frame_num - 2:
                        # Last two frames may run past the end of the output.
                        if ms.shape[0] >= feats.shape[1] - ptr:
                            feats[j, ptr:] += ms[:feats.shape[1] - ptr]
                        else:
                            feats[j, ptr:ptr + seg_len] += ms
                    else:
                        feats[j, ptr:ptr + seg_len] += ms

                if i == 0:
                    ptr = int(ptr + hop - half_len)
                else:
                    # A random 0/1 is added to the hop on every step, so the
                    # output is not reproducible unless `random` is seeded.
                    # NOTE(review): presumably compensates for hop rounding --
                    # confirm.
                    ptr = int(ptr + hop + randrange(2))

            all_feats[uttid] = np.log(
                np.clip(feats.T, a_max=None, a_min=0.00000000000001))
            if args.write_utt2num_frames:
                all_lens[uttid] = feats.shape[1]

    dict2Ark(all_feats, outfile, kaldi_cmd)

    if args.write_utt2num_frames:
        with open(outfile + '.len', 'w+') as file:
            for key, lens in all_lens.items():
                p = "{:s} {:d}".format(key, lens)
                file.write(p)
                file.write("\n")
def compute_mel_spectrum(args, srate=16000, window=np.hamming):
    """Compute filter-bank spectra for every utterance in ``args.scp`` and
    write them with ``get_kaldi_ark``.

    Each utterance is read, optionally corrupted with noise / reverberation,
    framed with ``getFrames``, and the magnitude of the first
    ``nfft / 2 + 1`` FFT bins is projected onto the filter bank.  Depending
    on ``args.spectrum_type`` the projection is stored as ``log10`` ("log")
    or squared ("power").

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``scp_type`` ('wav' or 'segment'), ``outfile``, ``add_noise``,
            ``nfft``, ``fduration``, ``frate``, ``nfilters``, ``add_reverb``,
            ``fbank_type``, ``spectrum_type``, ``write_utt2num_frames``.
        srate: Expected sampling rate of the input audio.
        window: Callable ``window(n)`` returning the analysis window.

    Raises:
        ValueError: On an invalid filter-bank, reverberation or scp type.
        AssertionError: If a successfully read 'wav' entry has a sampling
            rate different from ``srate``.
        SystemExit: If ``args.spectrum_type`` is neither "log" nor "power".
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    add_noise = args.add_noise
    nfft = args.nfft
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    add_reverb = args.add_reverb

    # Set up mel-filterbank
    fbank_type = args.fbank_type.strip().split(',')
    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters, nfft, srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError(
                'Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print(
                '%s: Alpha is fixed and will not change as a function of the center frequency...'
                % sys.argv[0])
        fbank = createFbankCochlear(nfilters, nfft, srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError(
            'Invalid type of filter bank, use mel or cochlear with proper configuration'
        )

    if add_noise:
        if not add_noise == "clean" and not add_noise == "diff":
            # Format: "<noise source>,<level>".
            noise_info = add_noise.strip().split(',')
            noise = load_noise(noise_info[0])

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'medium_room':
            sr_r, rir = read('./RIR/RIR_MediumRoom1_far_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    n_bins = int(nfft / 2 + 1)  # non-negative-frequency FFT bins
    all_feats = {}
    if args.write_utt2num_frames:
        all_lens = {}

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
            sys.stdout.flush()

            # NOTE: scp entries are executed through the shell (Kaldi-style
            # pipes); the scp file must come from a trusted source.
            if scp_type == 'wav':
                is_pipe = inwav[-1] == '|'
                try:
                    if is_pipe:
                        proc = subprocess.run(inwav[:-1], shell=True,
                                              stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                    else:
                        sr, signal = read(inwav)
                    skip_rest = False
                except Exception:
                    skip_rest = True
                # Only validate the rate of audio that was actually read;
                # otherwise `sr` is unbound or left over from a previous file.
                if not skip_rest:
                    assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd, shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                    skip_rest = False
                except Exception:
                    skip_rest = True
            else:
                raise ValueError(
                    'Invalid type of scp type, it should be either wav or segment'
                )

            if skip_rest:
                print('%s: Could not read audio for %s, skipping...'
                      % (sys.argv[0], uttid), file=sys.stderr)
                continue

            # NOTE: the samples are not rescaled by 2**15 in this function.
            if add_noise and not add_noise == "clean":
                if add_noise == "diff":
                    a = [1, 2, 3, 2, 0, -2, -5, -2, 0, 2, 3, 2, 1]
                    signal = convolve(signal, a, mode='same')
                else:
                    signal = add_noise_to_wav(signal, noise,
                                              float(noise_info[1]))

            if add_reverb and not add_reverb == 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate, fduration,
                                             window)
            ])

            if args.spectrum_type not in ("log", "power"):
                print("Spectrum type not supported! ")
                sys.exit(1)

            # Magnitude spectrum projected onto the filter bank.
            band_mag = np.matmul(
                np.abs(fft(time_frames, nfft, axis=1)[:, :n_bins]),
                np.transpose(fbank))
            if args.spectrum_type == "log":
                melEnergy_frames = np.log10(band_mag)
            else:
                melEnergy_frames = np.power(band_mag, 2)

            all_feats[uttid] = melEnergy_frames
            if args.write_utt2num_frames:
                all_lens[uttid] = melEnergy_frames.shape[0]

    get_kaldi_ark(all_feats, outfile)

    if args.write_utt2num_frames:
        with open(outfile + '.len', 'w+') as file:
            for key, lens in all_lens.items():
                p = "{:s} {:d}".format(key, lens)
                file.write(p)
                file.write("\n")
def extractMelEnergyFeats(args, srate=16000, window=np.hamming):
    """Extract log filter-bank energy frames for every phone segment.

    For each utterance in ``args.scp`` the matching ``.PHN`` file in
    ``args.phn_file_dir`` is read (each line: ``<begin> <end> <phone>``), the
    log10 filter-bank energies are computed, optionally spliced with
    ``spliceFeats``, and the rows ``begin:end`` of every phone are stacked.
    When ``args.get_phone_labels`` is set, the index of the phone in the
    sorted phone list is appended as an extra last column.

    Outputs: ``phone_list.npy`` in the current directory, ``<outfile>.pkl``
    (pickled ``{uttid: features}``) and ``<outfile>.scp`` (one
    ``"<uttid> <pkl path>"`` line per utterance).

    NOTE(review): another function with this same name is defined later in
    this file; at import time the later definition replaces this one.

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``outfile``, ``phn_file``, ``phn_file_dir``,
            ``get_phone_labels``, ``add_reverb``, ``nfft``, ``context``,
            ``fduration``, ``frate``, ``nfilters``.
        srate: Expected sampling rate of the input audio.
        window: Callable ``window(n)`` returning the analysis window.

    Raises:
        ValueError: On an invalid reverberation type, or (with phone labels)
            when a phone from the ``.PHN`` file is not in the phone list.
        AssertionError: If an input has a sampling rate other than ``srate``.
    """
    wavs = args.scp
    outfile = args.outfile
    phone_map = args.phn_file
    phn_file_dir = args.phn_file_dir
    get_phone_labels = args.get_phone_labels
    add_reverb = args.add_reverb
    nfft = args.nfft
    context = args.context
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters

    fbank = createFbank(nfilters, nfft, srate)

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')
    sys.stdout.flush()

    # Get list of phonemes: second column of every two-column line of the
    # phone map, skipping lines that mention silence.
    phn_list = []
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            line2 = line2.strip().split()
            if len(line2) == 2:
                if 'sil' not in line2 and 'SIL' not in line2:
                    phn_list.append(line2[1])
    phn_list = sorted(set(phn_list))
    np.save('phone_list', phn_list)

    all_feats = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            # Prefer the transcription named after the part of the utterance
            # id before the first underscore, when such a file exists.
            fname_phn = uttid + '.PHN'
            uttid_base = uttid.split('_')[0]
            fname_phn_base = uttid_base + '.PHN'
            if isfile(join(phn_file_dir, fname_phn_base)):
                fname_phn = fname_phn_base

            # Get all the locations of phonemes
            phone_now = []
            phone_end = []
            phone_beg = []
            if isfile(join(phn_file_dir, fname_phn)):
                print('%s: Computing Features for file: %s' %
                      (sys.argv[0], uttid))
                sys.stdout.flush()
                with open(join(phn_file_dir, fname_phn)) as phn_file:
                    for phn_line in phn_file:
                        phn_locs = phn_line.strip().split()
                        # Get phoneme information
                        phone_now.append(phn_locs[2])
                        phone_end.append(phn_locs[1])
                        phone_beg.append(phn_locs[0])

            if not phone_end:
                # Also reached when the .PHN file does not exist.
                print('%s: Corrupted Phone file.. hence skipped...' %
                      sys.argv[0])
                continue

            # NOTE: piped scp entries are executed through the shell; the scp
            # file must come from a trusted source.
            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1], shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            signal = signal / np.power(2, 15)

            if add_reverb and not add_reverb == 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate, fduration,
                                             window)
            ])

            # NOTE(review): the FFT length here is nfft / 2 + 1, whereas
            # compute_mel_spectrum takes an nfft-point FFT and keeps the first
            # nfft / 2 + 1 bins.  Left unchanged to keep existing features
            # reproducible -- confirm which one is intended.
            melEnergy_frames = np.log10(
                np.matmul(
                    np.abs(fft(time_frames, int(nfft / 2 + 1), axis=1)),
                    np.transpose(fbank)))

            if context:
                melEnergy_frames = spliceFeats(melEnergy_frames, context)

            # Collect the frames of every phone and stack them once.  The
            # column count follows the (possibly spliced) feature matrix, so
            # it is right with and without context / labels.
            segments = []
            for num, phn in enumerate(phone_now):
                now_frames = melEnergy_frames[
                    int(phone_beg[num]):int(phone_end[num]), :]
                if get_phone_labels:
                    ind = phn_list.index(phn)
                    fr_num = now_frames.shape[0]
                    now_frames = np.concatenate(
                        (now_frames, np.tile(ind, (fr_num, 1))), axis=1)
                segments.append(now_frames)
            all_feats[uttid] = np.vstack(segments)

    outfile = abspath(outfile)
    pkl_file = outfile + '.pkl'
    with open(pkl_file, 'wb') as pkl_fid:
        pickle.dump(all_feats, pkl_fid)

    with open(outfile + '.scp', 'w+') as file:
        for item in all_feats:
            file.write("%s %s\n" % (item, pkl_file))
def extractMelEnergyFeats(wavs,
                          outfile,
                          phone_map,
                          phn_file_dir,
                          fduration,
                          context,
                          frate,
                          nfft,
                          nfilters,
                          get_phone_labels=False,
                          add_reverb=True,
                          srate=16000,
                          window=np.hamming):
    """Extract one log filter-bank energy frame per phone (at its midpoint).

    For each utterance in ``wavs`` the matching ``<uttid>.PHN`` file is read
    (each line: ``<begin> <end> <phone>``).  For every phone found in the
    phone list, the row of the (optionally spliced) log10 filter-bank energy
    matrix at index ``(begin + end) / 2`` is kept.  The result
    ``{uttid: features}`` is pickled to ``outfile``.

    NOTE(review): this name is also defined earlier in this file with an
    ``args``-based signature; this later definition is the one that stays
    bound after import.

    Args:
        wavs: Path of the scp file (``<uttid> <wav path or pipe ending in |>``).
        outfile: Path of the pickle file to write.
        phone_map: Path of the phone map; the second column of every
            two-column line not mentioning 'sil'/'SIL' is a phone.
        phn_file_dir: Directory holding the ``.PHN`` files.
        fduration: Frame duration passed to ``getFrames``.
        context: Splicing context; falsy disables ``spliceFeats``.
        frate: Frame rate passed to ``getFrames``.
        nfft: FFT size used to build the filter bank.
        nfilters: Number of filters.
        get_phone_labels: Append the phone index as a last column.
        add_reverb: Convolve with the large-room impulse response.
        srate: Expected sampling rate of the input audio.
        window: Callable ``window(n)`` returning the analysis window.

    Raises:
        AssertionError: If an input has a sampling rate other than ``srate``.
    """
    fbank = createFbank(nfilters, nfft, srate)

    if add_reverb:
        sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
        rir = rir[:, 1] / np.power(2, 15)

    # Get list of phonemes
    phn_list = []
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            line2 = line2.strip().split()
            if len(line2) == 2:
                if 'sil' not in line2 and 'SIL' not in line2:
                    phn_list.append(line2[1])
    phn_list = sorted(set(phn_list))

    all_feats = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            fname_phn = uttid + '.PHN'

            # Get all the locations of phonemes
            phone_ids = []
            phone_mids = []
            if isfile(join(phn_file_dir, fname_phn)):
                print('%s: Computing Features for file: %s' %
                      (sys.argv[0], uttid))
                sys.stdout.flush()
                with open(join(phn_file_dir, fname_phn)) as phn_file:
                    for phn_line in phn_file:
                        phn_locs = phn_line.strip().split()
                        # Keep only phones present in the phone list.
                        if phn_locs[2] in phn_list:
                            phone_ids.append(phn_list.index(phn_locs[2]))
                            phone_mids.append(
                                int((int(phn_locs[0]) + int(phn_locs[1])) / 2))
            phone_now = np.asarray(phone_ids, dtype=int)
            phone_mid = np.asarray(phone_mids, dtype=int)

            if np.size(phone_mid) == 0:
                # Also reached when the .PHN file does not exist.
                print('%s: Corrupted Phone file.. hence skipped...' %
                      sys.argv[0])
                sys.stdout.flush()
                continue

            # NOTE: piped scp entries are executed through the shell; the scp
            # file must come from a trusted source.
            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1], shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            signal = signal / np.power(2, 15)

            if add_reverb:
                signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate, fduration,
                                             window)
            ])

            # NOTE(review): the FFT length here is nfft / 2 + 1, whereas
            # compute_mel_spectrum takes an nfft-point FFT and keeps the first
            # nfft / 2 + 1 bins.  Left unchanged to keep existing features
            # reproducible -- confirm which one is intended.
            melEnergy_frames = np.log10(
                np.matmul(
                    np.abs(fft(time_frames, int(nfft / 2 + 1), axis=1)),
                    np.transpose(fbank)))

            # Use the `context` parameter; there is no `args` in this scope.
            if context:
                melEnergy_frames = spliceFeats(melEnergy_frames, context)

            feats = melEnergy_frames[phone_mid, :]
            if get_phone_labels:
                feats = np.append(feats,
                                  phone_now.reshape(len(phone_now), 1),
                                  axis=1)
            all_feats[uttid] = feats

    with open(outfile, 'wb') as out_fid:
        pickle.dump(all_feats, out_fid)
def extractModSpecFeatures(args, srate=16000, window=np.hamming):
    """Extract LPC-based modulation-spectrum frames for every phone segment.

    For each utterance in ``args.scp`` the signal is framed, transformed with
    a DCT, and for every frame and filter-bank band an LPC model
    (``computeLpcFast``) is converted to ``args.nmodulations`` modulation
    coefficients (``computeModSpecFromLpc``), giving one row of
    ``nfilters * nmodulations`` values per frame.  The rows ``begin:end`` of
    every phone listed in the utterance's ``.PHN`` file (each line:
    ``<begin> <end> <phone>``) are stacked; with ``args.get_phone_labels``
    the phone's index in the sorted phone list is appended as a last column.

    Outputs: ``phone_list.npy`` in the current directory and a pickled
    ``{uttid: features}`` dictionary at ``args.outfile``.

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``outfile``, ``add_reverb``, ``set_unity_gain``,
            ``nmodulations``, ``order``, ``fduration``, ``frate``,
            ``nfilters``, ``phn_file``, ``phn_file_dir``,
            ``get_phone_labels``.
        srate: Expected sampling rate of the input audio.
        window: Callable ``window(n)`` returning the analysis window.

    Raises:
        ValueError: On an invalid reverberation type, or (with phone labels)
            when a phone from the ``.PHN`` file is not in the phone list.
        AssertionError: If an input has a sampling rate other than ``srate``.
    """
    wavs = args.scp
    outfile = args.outfile
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    phone_map = args.phn_file
    phn_file_dir = args.phn_file_dir
    get_phone_labels = args.get_phone_labels

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1] / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')
    sys.stdout.flush()

    # Get list of phonemes: second column of every two-column line of the
    # phone map, skipping lines that mention silence.
    phn_list = []
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            line2 = line2.strip().split()
            if len(line2) == 2:
                if 'sil' not in line2 and 'SIL' not in line2:
                    phn_list.append(line2[1])
    phn_list = sorted(set(phn_list))
    np.save('phone_list', phn_list)

    all_feats = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            # Prefer the transcription named after the utterance id without
            # its last two characters, when such a file exists.
            fname_phn = uttid + '.PHN'
            fname_phn_base = uttid[0:-2] + '.PHN'
            if isfile(join(phn_file_dir, fname_phn_base)):
                fname_phn = fname_phn_base

            # Get all the locations of phonemes
            phone_now = []
            phone_end = []
            phone_beg = []
            if isfile(join(phn_file_dir, fname_phn)):
                print('%s: Computing Features for file: %s' %
                      (sys.argv[0], uttid))
                sys.stdout.flush()
                with open(join(phn_file_dir, fname_phn)) as phn_file:
                    for phn_line in phn_file:
                        phn_locs = phn_line.strip().split()
                        # Get phoneme information
                        phone_now.append(phn_locs[2])
                        phone_end.append(phn_locs[1])
                        phone_beg.append(phn_locs[0])

            if not phone_end:
                # Also reached when the .PHN file does not exist.
                print('%s: Corrupted Phone file.. hence skipped...' %
                      sys.argv[0])
                continue

            # NOTE: piped scp entries are executed through the shell; the scp
            # file must come from a trusted source.
            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1], shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            signal = signal / np.power(2, 15)

            if add_reverb and not add_reverb == 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate, fduration,
                                             window)
            ])
            # NOTE(review): other extractors in this file call
            # freqAnalysis.dct; confirm the bare `dct` used here is the same
            # transform.
            cos_trans = dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))
            frame_num, _ = np.shape(cos_trans)

            # Main feature computation loop
            feats = np.zeros((frame_num, nfilters * nmodulations))
            sys.stdout.flush()
            for i in range(frame_num):
                each_feat = np.zeros([nfilters, nmodulations])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    xlpc, gg = computeLpcFast(
                        band_dct, order)  # Compute LPC coefficients
                    if set_unity_gain:
                        gg = 1
                    each_feat[j, :] = computeModSpecFromLpc(
                        gg, xlpc, nmodulations)
                # Flatten band-major: all modulations of band 0, then band 1...
                feats[i, :] = np.reshape(each_feat, nfilters * nmodulations)

            # Collect the frames of every phone and stack them once.
            segments = []
            for num, phn in enumerate(phone_now):
                now_frames = feats[int(phone_beg[num]):int(phone_end[num]), :]
                if get_phone_labels:
                    ind = phn_list.index(phn)
                    fr_num = now_frames.shape[0]
                    now_frames = np.concatenate(
                        (now_frames, np.tile(ind, (fr_num, 1))), axis=1)
                segments.append(now_frames)
            all_feats[uttid] = np.vstack(segments)

    with open(outfile, 'wb') as out_fid:
        pickle.dump(all_feats, out_fid)