def converter(self, batch, device=-1):
    """Turn a batch of ``(path, labels)`` pairs into network inputs and targets.

    Alternative to ``chainer.dataset.concat_examples``.

    For every example the array stored at ``path`` is loaded with ``np.load``
    and extended with its delta and delta-delta (both computed with a window
    of 3), concatenated along axis 1.  The original author's note describes
    the stored arrays as 40 log filterbank features, i.e. 40 * 3 columns after
    stacking -- NOTE(review): confirm against the data on disk.

    :param batch: iterable of ``(path, labels)`` pairs; ``labels[1]`` is the
        character-level label sequence returned as the target.
    :param device: accepted for interface compatibility but not used; arrays
        are always sent to ``self.device``.
    :return: ``(Xs, labels)`` -- a list of float32 feature arrays placed on
        ``self.device`` and a list of int32 character-label arrays.
    """
    Xs = []
    for path, _ in batch:
        static = np.load(path).astype(np.float32)
        first_order = delta(static, 3)
        second_order = delta(first_order, 3)
        stacked = np.concatenate((static, first_order, second_order), axis=1)
        Xs.append(to_device(self.device, stacked.astype(np.float32)))

    # Only the character labels (labels[1]) are used as targets; the word
    # labels (labels[0]) were previously converted and then thrown away.
    char_labels = [
        self.xp.asarray(labels[1]).astype(self.xp.int32) for _, labels in batch
    ]
    return Xs, char_labels
def test():
    """Run the saved Keras model on ``a.wav`` and print its predictions.

    MFCCs (window 0.02, step 0.01 as passed to ``mfcc_wav``) are extended with
    delta and delta-delta features, reshaped into sequences of 8 timesteps,
    and fed to the model restored from ``model.json`` / ``model.h5``.
    The prediction shape and the result of ``take_max`` are printed; nothing
    is returned.
    """
    # NOTE(review): the decoded samples are never used below -- mfcc_wav is
    # given the path directly.  The call is kept so behaviour is unchanged.
    wavfile.read('a.wav')

    mfcc_feat = mfcc_wav('a.wav', 0.02, 0.01)
    mfcc_delta = base.delta(mfcc_feat, 2)
    mfcc_delta_delta = base.delta(mfcc_delta, 2)
    features = np.hstack([mfcc_feat, mfcc_delta, mfcc_delta_delta])

    timesteps = 8
    print('Reshaping data for LSTM')
    features = reshape_data(features, timesteps)

    # Context manager so the file is closed even if reading fails.
    with open('model.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("model.h5")
    print("Loaded model from disk")

    loaded_model.compile(loss='categorical_crossentropy',
                         optimizer='rmsprop',
                         metrics=['accuracy'])
    prediction = loaded_model.predict(features)
    print(np.shape(prediction))
    print(take_max(prediction))
def dscc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
         nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
         ceplifter=22, appendEnergy=True,
         winfunc=lambda x: numpy.ones((x, ))):
    """Compute cepstral features from the delta of the filterbank energies.

    Pipeline, in order: filterbank energies -> delta (window 2) -> ``boxcox``
    -> natural log -> type-2 orthonormal DCT truncated to ``numcep``
    coefficients -> liftering -> optional replacement of coefficient 0 by the
    log frame energy -> delta (window 2).

    All parameters up to ``winfunc`` are forwarded positionally to
    ``base.fbank``; ``numcep``, ``ceplifter`` and ``appendEnergy`` control the
    cepstral stage.

    NOTE(review): the log is applied to whatever ``boxcox`` returns for the
    delta features; this assumes that value is a strictly positive array --
    confirm against the ``boxcox`` implementation in scope.

    :return: the delta of the liftered cepstra.
    """
    fbank_energies, energies = base.fbank(signal, samplerate, winlen, winstep,
                                          nfilt, nfft, lowfreq, highfreq,
                                          preemph, winfunc)
    # OBTAIN DELTA
    spectral_delta = base.delta(fbank_energies, 2)
    compressed = numpy.log(boxcox(spectral_delta))
    cepstra = dct(compressed, type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = base.lifter(cepstra, ceplifter)
    if appendEnergy:
        # replace first cepstral coefficient with log of frame energy
        cepstra[:, 0] = numpy.log(energies)
    # the original author was unsure whether a window of 2 is right here
    return base.delta(cepstra, 2)
def get_features(audio_file):
    '''Get features from a file.

    Reads ``audio_file`` through ``tf.gfile`` and returns, per frame, the log
    filterbank energies, their delta and delta-delta (window 2 each) and the
    frame energy as the last column.

    :param audio_file: path understood by ``tf.gfile.FastGFile``.
    :return: 2-D array with ``3 * FLAGS.nfilt + 1`` columns.
    '''
    # Use the handle as a context manager so it is closed after decoding;
    # previously it was left open.
    with tf.gfile.FastGFile(audio_file, 'rb') as audio_handle:
        signal, sample_rate = sf.read(audio_handle)

    feat, energy = fbank(signal, sample_rate, nfilt=FLAGS.nfilt)
    feat = np.log(feat)
    dfeat = delta(feat, 2)
    ddfeat = delta(dfeat, 2)
    return np.concatenate([feat, dfeat, ddfeat, np.expand_dims(energy, 1)],
                          axis=1)
def get_features(audio_file):
    """Return log-fbank, delta, delta-delta and energy features for a file.

    The audio is read with ``sf.read``; filterbank energies use
    ``FLAGS.nfilt`` filters.  Both derivative orders use a window of 2, and
    the frame energy is appended as the final column.
    """
    samples, rate = sf.read(audio_file)
    fbank_energies, frame_energy = fbank(samples, rate, nfilt=FLAGS.nfilt)

    log_fbank = np.log(fbank_energies)
    first_order = delta(log_fbank, 2)
    second_order = delta(first_order, 2)
    energy_column = np.expand_dims(frame_energy, 1)

    columns = [log_fbank, first_order, second_order, energy_column]
    return np.concatenate(columns, axis=1)
def extract_feature(wav_path):
    """Extract 39-dim MFCC features (13 static + 13 delta + 13 delta-delta).

    :param wav_path: path of the wav file to read.
    :return: array with one row per frame; static, delta and delta-delta
        coefficients are concatenated along axis 1.
    """
    sample_rate, samples = wav.read(wav_path)
    static = base.mfcc(
        samples,
        sample_rate,
        winlen=0.025,
        winstep=0.01,
        numcep=13,
        nfilt=26,
        preemph=0.97,
        appendEnergy=True,
    )
    first_order = base.delta(static, N=2)
    second_order = base.delta(first_order, N=2)
    return np.concatenate([static, first_order, second_order], axis=1)
def graves_2012(self, wav_path):
    """
    MFCC front end after Alex Graves: Sequence Transduction with Recurrent
    Neural Networks. CoRR abs/1211.3711 (2012).

    MFCC features
    Standard speech preprocessing was applied to transform the audio files
    into feature sequences. A 26 channel mel-frequency filter bank and a
    pre-emphasis coefficient of 0.97 were used to compute 12 mel-frequency
    cepstral coefficients plus an energy coefficient on 25ms Hamming windows
    at 10ms intervals. Delta coefficients were added to create input
    sequences of length 26 vectors.

    For CMVN
    All coefficients were normalised to have mean zero and standard deviation
    one over the training set.
    ==> please set --prep-cmvn-samples to -1.

    Options not mentioned in the paper (nfft, lowfreq, highfreq, ceplifter,
    etc.) are left at their defaults.

    :param wav_path: wav file path
    :return: a feature sequence
    """
    rate, sig = wav.read(Util.get_file_path(self.basepath, wav_path))

    # 12 cepstra per frame; the energy term is added by hand below
    cepstra = mfcc(signal=sig,
                   samplerate=rate,
                   numcep=12,
                   winlen=0.025,
                   nfilt=26,
                   winstep=0.01,
                   preemph=0.97,
                   appendEnergy=False,
                   winfunc=np.hamming)

    # energy column: sum of the squared coefficients of each frame
    frame_energy = np.sum(np.power(cepstra, 2), axis=-1)
    static = np.concatenate((np.expand_dims(frame_energy, 1), cepstra),
                            axis=-1)

    # append the delta of the 13-column static block
    return np.concatenate((static, delta(static, 1)), axis=1)
def computeDeltaDelta(nfilt=41):
    '''
    Compute the second derivative of the mel-fbanks representation.

    Reads its input from the module-level ``delta_buffer`` queue and puts the
    results on the ``delta_delta_buffer`` queue. The result for each frame is
    a tuple (fbanks+delta+deltaDelta, wave_form).

    The loop has no exit condition and re-checks ``delta_buffer.qsize()`` on
    every iteration, so the function never returns.

    Note: not used in this version of the pipeline

    :param nfilt: number of trailing entries removed from each buffered frame
        before the delta is computed.
    '''
    # a full delta window is N frames on each side plus the current frame
    num_frame = (fbanks_and_delta_window_size * 2 + 1)
    num_frame_VAD_window = (VAD_window_size * 2 + 1)
    while True:
        if delta_buffer.qsize() >= num_frame:
            last_N_frames = delta_buffer.get_last_n_frame(num_frame)
            # keep only the feature vector of each (features, wave_form) entry
            last_N_frames = [frame[0] for frame in last_N_frames]
            # NOTE(review): this drops the LAST nfilt entries of every frame.
            # If frames are laid out as [fbanks, delta] (which is what
            # computeDelta appears to enqueue), this keeps the fbanks and
            # discards the deltas, so the value computed below would be a
            # first derivative again -- confirm whether d[-nfilt:] was meant.
            last_N_frames = [d[:-nfilt] for d in last_N_frames]
            # entry at index N of the buffer; presumably the centre frame of
            # the window -- TODO confirm against the buffer's indexing
            frame_delta = np.squeeze(np.asarray((delta_buffer[fbanks_and_delta_window_size])[0]))
            wave_form = (delta_buffer[fbanks_and_delta_window_size])[1]
            delta_buffer.pull_last_n_frame(1)
            # delta over the whole window, keeping only the row at index N
            frame_delta_delta = delta(last_N_frames, fbanks_and_delta_window_size)[fbanks_and_delta_window_size]
            delta_delta_frame = np.concatenate((frame_delta, frame_delta_delta))
            delta_delta_buffer.put((delta_delta_frame, wave_form))
            if delta_delta_buffer.isFirst:
                delta_delta_buffer.isFirst = False
                # add the first frame n more times so that the VAD can
                # compute a result even for the first frame
                for _ in range(num_frame_VAD_window - 1):
                    delta_delta_buffer.put((delta_delta_frame, wave_form))
def mfcc_features(
        wavarr,
        win_len=5,  # window length for feature extraction in secs - run_orig.m
        win_overlap=0,  # overlap between adjacent windows - run_orig.m
        nfft=0,
        lowfreq=5,
        highfreq=1000,
        kDelta=False,
        logging=False
):
    """Compute MFCCs (and optionally their deltas) for one recording.

    The fixed settings mirror ``extractFeatures.m``: 13 cepstra, 20 filters,
    pre-emphasis 0.97, lifter 22, Hamming window, no energy replacement.

    :param wavarr: pair ``(rate, signal)``; ``signal`` must expose ``.size``.
    :param win_len: analysis window length, passed to ``base.mfcc`` as
        ``winlen``.
    :param win_overlap: subtracted from ``win_len`` to obtain the window step.
        NOTE(review): the original comment calls this a percentage, but it is
        used here in the same unit as ``win_len`` -- confirm with callers.
    :param nfft: FFT size; ``0`` means "derive it from the signal length" via
        ``fft.calculate_nfft``.
    :param lowfreq: lowest band edge of the mel filters (LF=5 in
        ``extractFeatures.m``).
    :param highfreq: highest band edge of the mel filters (HF=1000 in
        ``extractFeatures.m``).
    :param kDelta: when true, also compute delta features (window 2).
    :param logging: unused.
    :return: ``(mfcc_feat, d_mfcc_feat)``; ``d_mfcc_feat`` is ``None`` unless
        ``kDelta`` is true.
    """
    rate = wavarr[0]
    signal = wavarr[1]

    if nfft == 0:
        # FFT size as the padded next power-of-two
        nfft = fft.calculate_nfft(signal.size)

    mfcc_feat = base.mfcc(
        signal,
        rate,
        winlen=win_len,  # window_length*1000 in extractFeatures.m
        winstep=win_len - win_overlap,  # Ts = 10 in extractFeatures.m
        numcep=13,  # C=12 in extractFeatures.m
        nfilt=20,  # M=20 in extractFeatures.m
        nfft=nfft,
        # Honour the caller's band edges.  They used to be hard-coded to
        # 5 / 1000 here, silently ignoring the arguments; the defaults are
        # those same values, so default callers see no change.
        lowfreq=lowfreq,
        highfreq=highfreq,
        preemph=0.97,  # alpha=0.97 in extractFeatures.m
        ceplifter=22,  # L=22 in extractFeatures.m
        winfunc=np.hamming,  # @hamming
        appendEnergy=False  # do not replace c0 with the log frame energy
    )

    # compute delta features from the feature vectors, if requested
    d_mfcc_feat = base.delta(mfcc_feat, 2) if kDelta else None
    return mfcc_feat, d_mfcc_feat
def compute_fbanks_dataset(path="", nfilt=40):
    """Compute and save fbank + delta + delta-delta features for a folder.

    Every ``*.wav`` file directly inside ``path`` is processed.  The energy is
    appended to the filterbank features as an extra column, and the delta and
    delta-delta are computed with the module-level ``win_size`` window.  The
    result is written next to the source file with ``np.save`` (which appends
    ``.npy`` to the extension-less name).

    NOTE(review): ``logfbank`` is unpacked into two values here; confirm that
    the ``logfbank`` in scope returns ``(features, energy)``.

    :param path: directory to scan for wav files.
    :param nfilt: number of filters passed to ``logfbank``.
    """
    for wav_path in glob.glob(os.path.join(path, '*.wav')):
        sample_rate, audio_data = read(wav_path)
        fbanks, energy = logfbank(signal=audio_data,
                                  samplerate=sample_rate,
                                  nfilt=nfilt)
        # energy becomes one more feature column
        fbanks = np.concatenate(
            (fbanks, np.reshape(energy, (energy.shape[0], 1))), axis=1)
        fbanks_delta = delta(feat=fbanks, N=win_size)
        fbanks_delta_delta = delta(feat=fbanks_delta, N=win_size)
        audio_features = np.concatenate(
            (fbanks, fbanks_delta, fbanks_delta_delta), axis=1)

        # Separate name for the output stem instead of rebinding the loop
        # variable; print() with one argument works on Python 2 and 3 alike.
        output_stem, _ = os.path.splitext(wav_path)
        print(output_stem)
        np.save(output_stem, audio_features)
def computeDeltaDelta(nfilt=41):
    '''
    Compute the second derivative of the mel-fbanks representation.

    Reads its input from the module-level ``delta_buffer`` queue and puts the
    results on the ``delta_delta_buffer`` queue. The result for each frame is
    a tuple (fbanks+delta+deltaDelta, wave_form).

    The loop has no exit condition and re-checks ``delta_buffer.qsize()`` on
    every iteration, so the function never returns.

    Note: not used in this version of the pipeline

    :param nfilt: number of trailing entries removed from each buffered frame
        before the delta is computed.
    '''
    # a full delta window is N frames on each side plus the current frame
    num_frame = (fbanks_and_delta_window_size * 2 + 1)
    num_frame_VAD_window = (VAD_window_size * 2 + 1)
    while True:
        if delta_buffer.qsize() >= num_frame:
            last_N_frames = delta_buffer.get_last_n_frame(num_frame)
            # keep only the feature vector of each (features, wave_form) entry
            last_N_frames = [frame[0] for frame in last_N_frames]
            # NOTE(review): this drops the LAST nfilt entries of every frame.
            # If frames are laid out as [fbanks, delta] (which is what
            # computeDelta appears to enqueue), this keeps the fbanks and
            # discards the deltas, so the value computed below would be a
            # first derivative again -- confirm whether d[-nfilt:] was meant.
            last_N_frames = [d[:-nfilt] for d in last_N_frames]
            # entry at index N of the buffer; presumably the centre frame of
            # the window -- TODO confirm against the buffer's indexing
            frame_delta = np.squeeze(
                np.asarray((delta_buffer[fbanks_and_delta_window_size])[0]))
            wave_form = (delta_buffer[fbanks_and_delta_window_size])[1]
            delta_buffer.pull_last_n_frame(1)
            # delta over the whole window, keeping only the row at index N
            frame_delta_delta = delta(
                last_N_frames,
                fbanks_and_delta_window_size)[fbanks_and_delta_window_size]
            delta_delta_frame = np.concatenate(
                (frame_delta, frame_delta_delta))
            delta_delta_buffer.put((delta_delta_frame, wave_form))
            if delta_delta_buffer.isFirst:
                delta_delta_buffer.isFirst = False
                # add the first frame n more times so that the VAD can
                # compute a result even for the first frame
                for _ in range(num_frame_VAD_window - 1):
                    delta_delta_buffer.put((delta_delta_frame, wave_form))
def computeDelta():
    '''
    Compute the first derivative of the mel-fbanks representation.

    Reads its input from the module-level ``audio_buffer`` queue and puts the
    results on the ``delta_buffer`` queue. The result for each frame is a
    tuple (fbanks+delta, wave_form).

    The loop has no exit condition and re-checks ``audio_buffer.qsize()`` on
    every iteration, so the function never returns.

    Note: not used in this version of the pipeline
    '''
    # a full delta window is N frames on each side plus the current frame
    num_frame = (fbanks_and_delta_window_size * 2 + 1)
    while True:
        if audio_buffer.qsize() >= num_frame:
            last_N_frames = np.asarray(
                audio_buffer.get_last_n_frame(num_frame))
            # keep only the feature vector of each (features, wave_form) entry
            last_N_frames = [frame[0] for frame in last_N_frames]
            # entry at index N of the buffer; presumably the centre frame of
            # the window -- TODO confirm against the buffer's indexing
            fbanks = np.squeeze(
                np.asarray((audio_buffer[fbanks_and_delta_window_size])[0]))
            wave_form = (audio_buffer[fbanks_and_delta_window_size])[1]
            audio_buffer.pull_last_n_frame(1)
            # delta over the whole window, keeping only the row at index N
            frame_delta = delta(
                last_N_frames,
                fbanks_and_delta_window_size)[fbanks_and_delta_window_size]
            delta_frame = np.concatenate((fbanks, frame_delta))
            delta_buffer.put((delta_frame, wave_form))
            if delta_buffer.isFirst:
                # enqueue the first result N more times; presumably so the
                # next stage has a full window for its first frame -- TODO
                # confirm
                for _ in range(fbanks_and_delta_window_size):
                    delta_buffer.put((delta_frame, wave_form))
                delta_buffer.isFirst = False
def get_fbank_feature(wavsignal, fs):
    """
    Build FBANK features with their first- and second-order differences.

    Takes the numeric representation of a wav file and its sampling frequency
    and returns the FBANK features followed by the first-order and
    second-order differences, stacked column-wise.

    :param wavsignal: samples of the wav file
    :param fs: sampling frequency
    :return: array of FBANK + first difference + second difference
    """
    static = logfbank(wavsignal, fs,
                      nfilt=40, nfft=2048, winstep=0.025, winlen=0.05)
    first_diff = delta(static, 2)
    second_diff = delta(first_diff, 2)
    return np.column_stack((static, first_diff, second_diff))
def if_Miriam(ramkowanie, delta_memory):
    """Decide whether ``waves/answer.wav`` matches the stored speaker model.

    The recording is converted to 13 MFCCs plus delta and delta-delta and
    scored with the model unpickled from ``model.bin``; the score is printed.

    :param ramkowanie: frame (window) length, used both as ``winlen`` and to
        derive ``nfft``.
    :param delta_memory: window size passed to ``base.delta`` for both
        derivative orders.
    :return: ``True`` when the model score exceeds the threshold of -50,
        otherwise ``False``.
    """
    treshold = -50
    path = "waves/answer.wav"

    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # model.bin that this project produced itself.
    with open('model.bin', 'rb') as f:
        model = pickle.load(f)

    fs, data = wav.read(path)
    # Score the recording that was just read.  The features used to be
    # computed from `samples`, a name this function never defines, so it
    # either raised NameError or silently scored unrelated module-level data.
    MFCC = mfcc(data, fs, winlen=ramkowanie,
                nfft=round(ramkowanie * fs) + 1, numcep=13)
    first_order = base.delta(MFCC, delta_memory)
    second_order = base.delta(first_order, delta_memory)
    MFCC_and_deltas = numpy.c_[MFCC, first_order, second_order]

    score = model.score(MFCC_and_deltas)
    print(score)
    return bool(score > treshold)
def get_mfcc_feature(wavsignal, fs):
    """
    Build MFCC features with their first- and second-order differences.

    Takes the numeric representation of a wav file and its sampling frequency
    and returns the MFCC features followed by the first-order and
    second-order differences, stacked column-wise.

    :param wavsignal: samples of the wav file
    :param fs: sampling frequency
    :return: array of MFCC + first difference + second difference
    """
    static = mfcc(wavsignal, fs,
                  nfft=2048, nfilt=40, numcep=40, winlen=0.05, winstep=0.025)
    first_diff = delta(static, 2)
    second_diff = delta(first_diff, 2)
    return np.column_stack((static, first_diff, second_diff))
def compute_mfcc(wav_path, winstep=0.01):
    """Return MFCC, delta and delta-delta features with frames as columns.

    Each of the three feature blocks is transposed before the blocks are
    joined along axis 0, so the result has one column per frame.

    :param wav_path: path of the wav file to read.
    :param winstep: step between successive windows, forwarded to ``mfcc``.
    :return: the three transposed blocks concatenated along the first axis.
    """
    rate, sig = wav.read(wav_path)
    static = mfcc(signal=sig, samplerate=rate, appendEnergy=True,
                  winstep=winstep)
    first_order = delta(static, 2)
    second_order = delta(first_order, 2)

    # transpose each block, then stack static / delta / delta-delta
    blocks = (static, first_order, second_order)
    return np.concatenate([np.transpose(block) for block in blocks])
def extract_features(audio_data, samplerate):
    """Compute per-frame MFCCs and their deltas with manual framing.

    The signal is cut into 400-sample frames every 240 samples, each frame is
    multiplied by a Hamming window, and the first MFCC row returned for that
    frame is kept.  Only the first channel of multi-channel input is used.

    :param audio_data: array of samples, optionally with a channel axis.
    :param samplerate: sampling rate forwarded to ``mfcc``.
    :return: ``(mfcc_features, delta_features)`` -- a list with one MFCC
        vector per frame, and the delta (window 1) of that list.
    """
    frame_len = 400
    hop = 240
    window = numpy.hamming(frame_len)

    if len(audio_data.shape) >= 2:
        audio_data = audio_data[:, 0]

    frame_starts = range(0, audio_data.shape[0] - frame_len, hop)
    mfcc_features = [
        mfcc(numpy.multiply(window, audio_data[start:start + frame_len]),
             samplerate, nfft=2048)[0]
        for start in frame_starts
    ]
    return mfcc_features, delta(mfcc_features, 1)
def graves_2013(self, wav_path):
    """
    FBANK front end after Alex Graves, Abdel-rahman Mohamed, Geoffrey E.
    Hinton: Speech recognition with deep recurrent neural networks.
    ICASSP 2013: 6645-6649.

    FBANK features : ((40 fbank + 1 energy) * 3)
    The audio data was encoded using a Fourier-transform-based filter-bank
    with 40 coefficients (plus energy) distributed on a mel-scale, together
    with their first and second temporal derivatives. Each input vector was
    therefore size 123.

    For CMVN
    The data were normalised so that every element of the input vectors had
    zero mean and unit variance over the training set.

    The paper does not describe the window; a Hanning window is used here.
    Options not mentioned in the paper (nfft, lowfreq, highfreq, ceplifter,
    etc.) are left at their defaults.

    :param wav_path: wav file path
    :return: a feature sequence
    """
    (rate, sig) = wav.read(Util.get_file_path(self.basepath, wav_path))

    # computing features (the energy returned by fbank is not used)
    fbank_feat, _ = fbank(signal=sig, samplerate=rate, nfilt=40,
                          winfunc=np.hanning)

    # adding energy: sum of the squared filterbank values of each frame
    energy = np.expand_dims(np.sum(np.power(fbank_feat, 2), axis=-1), 1)
    fbank_e_feat = np.concatenate((energy, fbank_feat), axis=-1)

    # concatenating delta vectors
    delta_feat = delta(fbank_e_feat, 1)
    # The second temporal derivative is the delta of the delta.  The second
    # argument of delta() is the window half-width, not the derivative
    # order, so the previous delta(fbank_e_feat, 2) produced another first
    # derivative over a wider window.
    delta_delta_feat = delta(delta_feat, 1)

    return np.concatenate((fbank_e_feat, delta_feat, delta_delta_feat),
                          axis=1)
def get_features(signal, rate, normalize):
    """Extract MFCC + delta features, optionally mean/variance normalised.

    :param signal: audio samples.
    :param rate: sampling rate of ``signal``.
    :param normalize: normalisation is applied only when this is exactly
        ``True``.
    :return: array with the MFCCs (20 ms window, 10 ms step) and their
        deltas (window 5) concatenated along the last axis.
    """
    # extract features
    features = mfcc(signal, rate, winlen=0.020, winstep=0.010)
    delta_features = base.delta(features, 5)
    features = np.concatenate((features, delta_features), axis=-1)

    # perform normalization if asked to
    if normalize is True:
        mean_features = np.mean(features, axis=0)
        # Divide by the standard deviation so every column ends up with unit
        # variance.  Dividing by the variance (as before) over-shrinks
        # high-variance columns and inflates low-variance ones.
        std_features = np.std(features, axis=0)
        features = (features - mean_features) / std_features

    return features
def extract_mfcc(wave_files, encoded_labels, files_destination, labels_destination, mfcc_type):
    """Extract MFCC features for each wave file and write a label index.

    For every ``(wave_file, label)`` pair a progress line is printed and the
    audio is read with ``sf.read``.  For ``mfcc_type == 'rnn'`` the MFCCs and
    their deltas are each normalised to zero mean and unit standard deviation
    over the frames of that file and saved side by side as one ``.npy`` file.
    A tab-separated table of file names and labels is written at the end.

    :param wave_files: paths of the wave files; the file name is taken as the
        part after the last ``'/'``.
    :param encoded_labels: labels, paired with ``wave_files`` by position.
    :param files_destination: prefix that the output file name is appended to
        by plain string concatenation (so it should end with a separator).
    :param labels_destination: path of the tab-separated label table.
    :param mfcc_type: ``'cnn'`` or ``'rnn'``.
    """
    labels_df = pd.DataFrame(columns=['file', 'label'])
    files_num = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, encoded_labels)):
        wave_file_name = wave_file.split('/')[-1]
        # output name is the part of the file name before its first '.'
        mfcc_file_path = files_destination + wave_file_name.split('.')[0] + '.npy'
        print('{}/{}\t{}'.format(i + 1, files_num, wave_file_name))
        wave_data, sample_rate = sf.read(wave_file)
        # save mfcc
        # NOTE(review): the 'cnn' branch only computes MFCCs; `deltas` is
        # defined solely in the 'rnn' branch, so the normalise-and-save code
        # cannot work for 'cnn'.  The branch layout below is reconstructed
        # from a flattened source -- confirm against the real file.
        if mfcc_type == 'cnn':
            mfcc = librosa.feature.mfcc(wave_data, sr=sample_rate)
        elif mfcc_type == 'rnn':
            mfcc = base.mfcc(wave_data, samplerate=sample_rate, numcep=13, winstep=0.01, winfunc=np.hamming)
            deltas = base.delta(mfcc, 2)
            # normalize mfcc over all frames
            mfcc_mean = np.mean(mfcc, axis=0)
            mfcc_std = np.std(mfcc, axis=0)
            mfcc = (mfcc - mfcc_mean)/mfcc_std
            # normalize deltas over all frames
            delta_mean = np.mean(deltas, axis=0)
            delta_std = np.std(deltas, axis=0)
            deltas = (deltas - delta_mean)/delta_std
            np.save(mfcc_file_path, np.concatenate((mfcc, deltas), axis=1), allow_pickle=False)
        labels_df.loc[i] = [wave_file_name, label]
    labels_df.to_csv(labels_destination, sep='\t', index=False)
def computeDelta():
    '''
    Compute the first derivative of the mel-fbanks representation.

    Reads its input from the module-level ``audio_buffer`` queue and puts the
    results on the ``delta_buffer`` queue. The result for each frame is a
    tuple (fbanks+delta, wave_form).

    The loop has no exit condition and re-checks ``audio_buffer.qsize()`` on
    every iteration, so the function never returns.

    Note: not used in this version of the pipeline
    '''
    # a full delta window is N frames on each side plus the current frame
    num_frame = (fbanks_and_delta_window_size * 2 + 1)
    while True:
        if audio_buffer.qsize() >= num_frame:
            last_N_frames = np.asarray(audio_buffer.get_last_n_frame(num_frame))
            # keep only the feature vector of each (features, wave_form) entry
            last_N_frames = [frame[0] for frame in last_N_frames]
            # entry at index N of the buffer; presumably the centre frame of
            # the window -- TODO confirm against the buffer's indexing
            fbanks = np.squeeze(np.asarray((audio_buffer[fbanks_and_delta_window_size])[0]))
            wave_form = (audio_buffer[fbanks_and_delta_window_size])[1]
            audio_buffer.pull_last_n_frame(1)
            # delta over the whole window, keeping only the row at index N
            frame_delta = delta(last_N_frames, fbanks_and_delta_window_size)[fbanks_and_delta_window_size]
            delta_frame = np.concatenate((fbanks, frame_delta))
            delta_buffer.put((delta_frame, wave_form))
            if delta_buffer.isFirst:
                # enqueue the first result N more times; presumably so the
                # next stage has a full window for its first frame -- TODO
                # confirm
                for _ in range(fbanks_and_delta_window_size):
                    delta_buffer.put((delta_frame, wave_form))
                delta_buffer.isFirst = False
from python_speech_features import mfcc from python_speech_features import base from sklearn.mixture import GaussianMixture as GMM recordings=15 ramkowanie=0.025 components=4 delta_memory=2 path = os.getcwd() allMFCC = numpy.empty([1, recordings], dtype=object) i=0 for file in os.listdir(os.path.join(path, 'miriam')): print("1") fs, samples = wav.read(os.path.join(path, 'miriam', file)) MFCC = mfcc(samples, fs,winlen=ramkowanie, nfft=round(ramkowanie * fs) + 1, numcep=13) delta=base.delta(MFCC, delta_memory) delta_delta=base.delta(delta, delta_memory) MFCC_and_deltas=numpy.c_[MFCC,delta,delta_delta] allMFCC[0, i] = MFCC_and_deltas i = i + 1 # w nazwie pliku jest liczba parametrow mfcc z ktorej zostala funkcja wywołana #with open('mfcc_miriam.bin', 'wb') as f: # pickle.dump(allMFCC, f) # f.close() model= GMM(components, covariance_type="diag") tmp_data = numpy.zeros((0, 39)) for j in range(0, recordings): tmp_data = numpy.r_[tmp_data, allMFCC[0, j]] model.fit(tmp_data) with open('model.bin', 'wb') as f:
def extractMFCC(
        kNumFeatures,
        shape,  # '1D' or '2D'; only 1D is fully supported for now
        wavarr,
        win_len,
        win_overlap,
        nfft,
        cutoff=256,  # for dominant_frequency_features only (currently unused)
        kDelta=False  # frame-deltas (need more than 1 frame)
):
    """Build a feature array of kurtosis plus MFCCs for one recording.

    MFCCs come from ``mymfcc.mfcc_features`` (band 5-1000).  For every MFCC
    frame the matching slice of the raw signal is taken and its kurtosis is
    computed.

    * ``shape == '2D'``: row ``f`` of the result is ``[kurtosis, 0, *mfcc]``
      for frame ``f``; the result has shape ``(nframes, kNumFeatures)``.
    * ``shape == '1D'``: the result is a float32 vector of length
      ``kNumFeatures`` holding ``[kurtosis, 0, mfcc frame 0, mfcc frame 1,
      ...]``, truncated or zero-padded to fit.  The kurtosis slot is
      overwritten on every frame, so it ends up holding the last frame's
      value.

    NOTE(review): for any other ``shape`` neither ``features`` nor
    ``ndfeatures`` is assigned before use.

    :param kNumFeatures: length (1D) or row width (2D) of the output.
    :param shape: ``'1D'`` or ``'2D'``.
    :param wavarr: pair ``(rate, signal)``.
    :param win_len: window length, forwarded to ``mfcc_features``.
    :param win_overlap: window overlap, forwarded to ``mfcc_features``.
    :param nfft: FFT size, forwarded to ``mfcc_features``.
    :param cutoff: unused in the visible code.
    :param kDelta: requests frame deltas; that branch is disabled by an
        ``assert(False)`` whenever there is more than one frame.
    :return: the feature array described above.
    """
    rate = wavarr[0]
    signal = wavarr[1]
    kNumScalars = 2  # see 'extractFeaturesCodegen.m'
    mfcc_feat, _ = mymfcc.mfcc_features(wavarr, win_len=win_len, win_overlap=win_overlap, nfft=nfft, lowfreq=5, highfreq=1000)
    #print(" # mfcc frames %d, mfcc_feat[0].shape: %s" % (mfcc_feat.shape[0], str(mfcc_feat[0].shape)))
    nframes = mfcc_feat.shape[0]
    if shape=='1D':
        # two leading slots for the scalar features; MFCCs are appended later
        features = [None] * kNumScalars  #np.zeros(shape=(kNumFeatures,), dtype=np.float)
    if shape=='2D':
        ndfeatures = np.zeros((nframes, kNumFeatures), dtype=np.float32)
    step_length = win_len - win_overlap
    # NOTE(review): offset is updated in the loop but never read afterwards
    offset = kNumScalars
    for f in range(nframes):  # this can be replaced by one np flattening
        mfcc = mfcc_feat[f]  # a frame
        # NOTE(review): these are used as slice bounds; they are only valid
        # indices if rate, win_len and win_overlap are all integers -- TODO
        # confirm with callers
        current_start_sample = f * rate * step_length
        current_end_sample = current_start_sample + win_len * rate
        current_signal = signal[current_start_sample:current_end_sample]  # current window
        #[kurtosis, dominant_frequency_features]
        kurt = kurtosis(current_signal)
        #dom_nfft = 4096 #Matlab code is using nfft=4096
        #maxfreq, domfreq = dominant_frequency_features(current_signal[0:dom_nfft], rate, cutoff=cutoff, nfft=dom_nfft)
        if shape=='1D':  # only supports 1D features for now, will add 2D support next
            features[0] = kurt
            features[1] = 0  # placeholder for domfreq
            #features[2] = skew(current_signal) #this led to a drop of > 4% accuracy
            features.extend(mfcc)  #TODO: check for overflow here if we use ndarray for features
            offset += len(mfcc)
        if shape=='2D':
            fvec = [kurt, 0]
            fvec.extend(mfcc)
            ndfeatures[f] = fvec
    if kDelta and (nframes > 1):
        # deliberately disabled: fails here unless assertions are turned off
        assert(False)  #TODO: fix this code
        d_mfcc_feat = base.delta(mfcc_feat, 2)  # compute delta features from a feature vector
        ndfeatures = np.append(features, d_mfcc_feat)
    if shape=='1D':  # only supports 1D features for now, will add 2D support next
        # copy as many accumulated values as fit; the rest stays zero
        ndfeatures = np.zeros(kNumFeatures, dtype=np.float32)
        n = min(kNumFeatures, len(features))
        ndfeatures[0:n] = features[:n]
    return ndfeatures
[frames, energy] = raw_frames(prime_data, frame_shift, window_size) freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size) fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs) prime_features = get_mfcc(fbanks) prime_features = np.concatenate([energy[:, None], prime_features], 1) [frames, energy] = raw_frames(target_data, frame_shift, window_size) freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size) fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs) target_features = get_mfcc(fbanks) target_features = np.concatenate([energy[:, None], target_features], 1) single_delta = base.delta(prime_features, params['delta_n']) double_delta = base.delta(single_delta, params['delta_n']) prime_features = np.concatenate( [prime_features, single_delta, double_delta], 1) single_delta = base.delta(target_features, params['delta_n']) double_delta = base.delta(single_delta, params['delta_n']) target_features = np.concatenate( [target_features, single_delta, double_delta], 1) feature_shape = np.shape(prime_features)[1] prime_node = output_file.create_earray(feature_node, 'prime', f_atom, (0, feature_shape), expectedrows=5000) prime_node.append(prime_features)
# Command-line script: compute delta features for every array stored in an
# .npz archive and write them to a new archive under the same keys.
#
#   --source  input .npz file (required)
#   --output  output path; defaults to "delta_<source name>" next to the source
#   --size    window size passed to delta() (default 2)
from argparse import ArgumentParser
from pathlib import Path

import numpy as np
from python_speech_features.base import delta

parser = ArgumentParser()
parser.add_argument("--source", type=Path, required=True)
parser.add_argument("--output", type=Path)
parser.add_argument("--size", type=int, default=2)
args = parser.parse_args()

source = args.source
output = args.output or args.source.with_name(f"delta_{args.source.name}")
output.parent.mkdir(parents=True, exist_ok=True)

# Open the archive as a context manager so its file handle is released as
# soon as every array has been converted; it used to be left open.
with np.load(source) as source_npz:
    output_dict = {
        key: delta(value, args.size) for key, value in source_npz.items()
    }

np.savez(output, **output_dict)
def audio_features(params, img_audio, audio_path, append_name, node_list):
    """Create audio features for every caption of every node and store them.

    For each node a group named ``params['feat']`` is created in
    ``params['output_file']``; each caption's features are stored in that
    group as an extendable float32 array named ``append_name + <caption>``.
    Nodes for which no caption produced features are removed from the file
    and reported at the end.

    Keys read from ``params``: ``output_file``, ``feat`` (one of ``'raw'``,
    ``'freq_spectrum'``, ``'fbanks'``, ``'mfcc'``), ``t_window``, ``t_shift``,
    ``alpha``, ``windowing``, ``nfilters``, ``ncep``, ``use_energy``,
    ``normalise``, ``use_deltas`` and ``delta_n``.

    :param params: configuration dictionary, see above.
    :param img_audio: mapping from a node's base name to a sequence whose
        element ``[1]`` lists that image's caption file names.
    :param audio_path: directory the caption file names are relative to.
    :param append_name: prefix of the node names; also prepended to the names
        of the created arrays.
    :param node_list: nodes of the output file to attach features to.
    :raises ValueError: if ``params['feat']`` is not a supported type.
    """
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    # keep track of the nodes for which no features could be made, places
    # database contains some empty audio files
    invalid = []

    for count, node in enumerate(node_list, start=1):
        print(f'processing file: {count}')
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # remove extension from the caption filename
            base_capt = cap.split('.')[0]
            # remove folder path from file names (Places/coco database)
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            base_capt = base_capt.replace('-', '_')

            cap_path = os.path.join(audio_path, cap)
            # read audio samples
            try:
                input_data, fs = librosa.load(cap_path, sr=None)
            except Exception:
                # try to repair broken files, some files had a wrong header.
                # In Places some could not be fixed, however.
                try:
                    fix_wav(cap_path)
                    # Reload after the repair.  Without this the code went
                    # on with the previous caption's samples (or with
                    # undefined names on the first caption).
                    input_data, fs = librosa.load(cap_path, sr=None)
                except Exception:
                    # stop processing this node's captions; the node is
                    # deleted below if no features were made for it at all
                    break
            # in the places database some of the audiofiles are empty
            if len(input_data) == 0:
                break

            # set the fft size to the power of two equal to or greater than
            # the window size (never smaller than 2)
            window_size = int(fs * params['t_window'])
            fft_size = 2
            while fft_size < window_size:
                fft_size *= 2

            # create audio features
            feat_type = params['feat']
            if feat_type == 'raw':
                # calculate the needed frame shift, pre-emphasise and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                emphasized = sigproc.preemphasis(input_data,
                                                 coeff=params['alpha'])
                # Frame the pre-emphasised signal; the un-emphasised samples
                # were framed here before, discarding the step above.
                features = sigproc.framesig(emphasized,
                                            frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])
            elif feat_type == 'freq_spectrum':
                frame_shift = int(fs * params['t_shift'])
                emphasized = sigproc.preemphasis(input_data,
                                                 coeff=params['alpha'])
                frames = sigproc.framesig(emphasized,
                                          frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)
            elif feat_type == 'fbanks':
                # create mel filterbank features (frame energy is not used)
                features, _energy = base.fbank(input_data,
                                               samplerate=fs,
                                               winlen=params['t_window'],
                                               winstep=params['t_shift'],
                                               nfilt=params['nfilters'],
                                               nfft=fft_size,
                                               lowfreq=0,
                                               highfreq=None,
                                               preemph=params['alpha'],
                                               winfunc=params['windowing'])
            elif feat_type == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data,
                                     samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=params['alpha'],
                                     ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])
            else:
                # An unknown type used to fall through and either crash
                # with a NameError or store the previous caption's features.
                raise ValueError(
                    f'unsupported feature type: {feat_type!r}')

            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:
                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)

            # create new leaf node in the feature node for the current audio
            # file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom,
                                                (0, feature_shape),
                                                expectedrows=5000)
            # append new data to the tables
            f_table.append(features)

        if not audio_node._f_list_nodes():
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no caption
            # features could be created
            output_file.remove_node(node, recursive=True)

    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')