def calculate_acoustic_features(args, waveform):
    """Compute the acoustic feature matrix for one waveform.

    Supported ``args.feature_type`` values:
      * ``'mfe'``  -- log mel filterbank energies (speechpy or librosa backend)
      * ``'mfcc'`` -- MFCCs (speechpy or librosa backend)
      * ``'lyon'`` -- Lyon passive-ear cochleagram via ``lyon_calc``

    Optional extras controlled by ``args``:
      * ``args.energy`` -- append a per-frame energy column
      * ``args.deltas`` -- stack delta / delta-delta features and flatten the
        (frames, coeffs, 3) cube to (frames, coeffs * 3)

    Parameters
    ----------
    args : namespace with fields ``window`` / ``step`` (milliseconds),
        ``feature_type``, ``backend``, ``n_mels``, ``n_mfcc``, ``energy``,
        ``deltas``.
    waveform : 1-D float array sampled at the module-level ``SAMPLE_RATE``.
        NOTE: for the ``'lyon'`` feature type this array is peak-normalized
        IN PLACE (behavior kept from the original implementation).

    Returns
    -------
    np.ndarray of shape (frames, n_features), dtype backend-dependent.

    Raises
    ------
    ValueError
        If ``args.feature_type`` is not one of the supported values.
    """
    # Window/step are given in milliseconds; convert to FFT size / hop samples.
    n_fft = int(args.window * SAMPLE_RATE / 1000.0)
    hop_length = int(args.step * SAMPLE_RATE / 1000.0)

    if args.feature_type == 'mfe':
        if args.backend == 'speechpy':
            log_cut = 1e-8  # floor added before log() to avoid log(0)
            spec, energy = mfe(waveform, SAMPLE_RATE,
                               frame_length=args.window * 1e-3,
                               frame_stride=args.step * 1e-3,
                               num_filters=args.n_mels,
                               fft_length=n_fft)
            if args.energy:
                acoustic_features = np.hstack((spec, energy[:, np.newaxis]))
            else:
                # BUGFIX: previously `acoustic_features` was assigned only when
                # args.energy was set, so the np.log() line below raised a
                # NameError whenever args.energy was False.
                acoustic_features = spec
            acoustic_features = np.log(acoustic_features + log_cut)
        else:
            spec = librosa.feature.melspectrogram(
                y=waveform, sr=SAMPLE_RATE, n_fft=n_fft,
                hop_length=hop_length, n_mels=args.n_mels)
            acoustic_features = librosa.core.amplitude_to_db(spec).transpose()
            if args.energy:
                # NOTE(review): librosa renamed `feature.rmse` to `feature.rms`
                # in newer releases -- keep `rmse` for the pinned version, but
                # confirm against the project's librosa requirement.
                energy = librosa.feature.rmse(
                    y=waveform, frame_length=n_fft,
                    hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif args.feature_type == 'mfcc':
        if args.backend == 'speechpy':
            acoustic_features = mfcc(waveform, SAMPLE_RATE,
                                     frame_length=args.window * 1e-3,
                                     frame_stride=args.step * 1e-3,
                                     num_filters=args.n_mels,
                                     fft_length=n_fft,
                                     num_cepstral=args.n_mfcc)
        else:
            acoustic_features = librosa.feature.mfcc(
                y=waveform, sr=SAMPLE_RATE, n_mfcc=args.n_mfcc,
                n_fft=n_fft, hop_length=hop_length,
                n_mels=args.n_mels).transpose()
        if args.energy:
            energy = librosa.feature.rmse(
                y=waveform, frame_length=n_fft,
                hop_length=hop_length).transpose()
            acoustic_features = np.hstack((acoustic_features, energy))
    elif args.feature_type == 'lyon':
        # Lyon's ear model expects a peak-normalized double-precision column
        # vector.  NOTE: this mutates the caller's `waveform` in place.
        waveform /= np.abs(waveform).max()
        acoustic_features = lyon_calc.lyon_passive_ear(
            waveform[:, np.newaxis].astype(np.double),
            SAMPLE_RATE, hop_length)
        max_val = acoustic_features.max()
        if max_val > 0:
            acoustic_features /= max_val
        acoustic_features = acoustic_features.astype(np.float32)
        if args.energy:
            energy = librosa.feature.rmse(
                y=waveform, frame_length=hop_length,
                hop_length=hop_length).transpose()
            # ROBUSTNESS: guard the normalization like `max_val > 0` above;
            # previously an all-silence input divided by zero here.
            e_max = energy.max()
            if e_max > 0:
                energy /= e_max
            # The two extractors can disagree on frame count: edge-pad or trim
            # the energy track so hstack() gets matching lengths.
            len_delta = acoustic_features.shape[0] - energy.shape[0]
            if len_delta > 0:
                energy = np.pad(energy, [(0, len_delta), (0, 0)], 'edge')
            else:
                energy = energy[:acoustic_features.shape[0], :]
            acoustic_features = np.hstack((acoustic_features, energy))
    else:
        raise ValueError('Unexpected features type.')

    if args.deltas:
        orig_shape = acoustic_features.shape
        if args.backend == 'speechpy':
            # speechpy returns the (frames, coeffs, 3) cube directly.
            acoustic_features = extract_derivative_feature(acoustic_features)
        else:
            delta = librosa.feature.delta(acoustic_features, axis=0)
            ddelta = librosa.feature.delta(acoustic_features, order=2, axis=0)
            acoustic_features = np.stack(
                (acoustic_features[:, :, np.newaxis],
                 delta[:, :, np.newaxis],
                 ddelta[:, :, np.newaxis]), axis=-1)
        # Flatten (frames, coeffs, 3) -> (frames, coeffs * 3).
        acoustic_features = np.reshape(
            acoustic_features, (-1, orig_shape[-1] * 3))
    return acoustic_features
def compute_fbank(file, debug=True):
    """Extract a log mel filterbank feature cube from a wav file.

    Pipeline: pre-emphasis -> lmfe filterbanks -> optional CMVN ->
    derivative stacking, yielding (num_frames, num_filters, 3).
    Framing and filterbank parameters come from the module-level
    ``data_config``.
    """
    sr, samples = wav.read(file)
    if debug:
        print('signal shape: ', samples.shape)

    # Pre-emphasis boosts high frequencies before framing.
    emphasized = processing.preemphasis(samples, cof=data_config.preemphasis)

    # The power spectrum below feeds only the debug printout.
    frames = processing.stack_frames(emphasized,
                                     sampling_frequency=sr,
                                     frame_length=data_config.window_size,
                                     frame_stride=data_config.hop_size,
                                     zero_padding=True)
    power_spectrum = processing.power_spectrum(frames, fft_points=512)
    if debug:
        print('power spectrum shape=', power_spectrum.shape)

    # Log mel filterbank energies: num_frames x num_filters.
    log_fbank = feature.lmfe(emphasized,
                             sampling_frequency=sr,
                             frame_length=data_config.window_size,
                             frame_stride=data_config.hop_size,
                             num_filters=data_config.num_mels,
                             fft_length=512,
                             low_frequency=0,
                             high_frequency=None)

    if data_config.apply_cmvn:
        # Cepstral mean/variance normalization.
        log_fbank_cmvn = processing.cmvn(log_fbank,
                                         variance_normalization=True)
        if debug:
            print('fbank(mean + variance normalized) feature shape=',
                  log_fbank_cmvn.shape)
        log_fbank = log_fbank_cmvn

    # Stack static + delta + delta-delta: num_frames x num_filters x 3.
    log_fbank = feature.extract_derivative_feature(log_fbank)

    # NOTE: frame concatenation / subsampling (LFR) is currently disabled:
    # concat_mat = concat_frame(log_fbank)
    # log_fbank = subsampling(concat_mat)
    # log_fbank = build_LFR_features(log_fbank, data_config.LFR_m, data_config.LFR_n)
    if debug:
        print('concat & subsample shape=', log_fbank.shape)
    return log_fbank
def generate_mfec_features(audio_file_name):
    """Build MFEC-style features from the first 15 frames of a wav file.

    Computes mel filterbank energies (20 ms window, 20 ms stride, 40
    filters), keeps the first 15 frames, stacks derivative features, and
    returns a list (one entry per frame) of lists of (1, 3) arrays -- one
    per filterbank channel.
    """
    fs, samples = wav.read(audio_file_name)
    # speechpy.feature.mfe returns (filterbank_energies, frame_energies);
    # only the energies matrix is used here.
    banks = speechpy.feature.mfe(samples,
                                 sampling_frequency=fs,
                                 frame_length=0.02,
                                 frame_stride=0.02,
                                 num_filters=40,
                                 fft_length=512,
                                 low_frequency=0,
                                 high_frequency=None)
    head = banks[0][:15]
    # (frames, filters) -> (frames, filters, 3) static/delta/delta-delta cube.
    cube = extract_derivative_feature(head)
    # Reshape each per-channel triple into a (1, 3) row, preserving the
    # original nested-list layout.
    return [[np.array(channel).reshape(1, 3) for channel in frame]
            for frame in cube]
############# Extract MFCC features ############# mfcc = feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) # Cepstral mean variance normalization. mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True) print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape) # Extracting derivative features mfcc_feature_cube = feature.extract_derivative_feature(mfcc) print('mfcc feature cube shape=', mfcc_feature_cube.shape) ############# Extract logenergy features ############# logenergy = feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) logenergy_feature_cube = feature.extract_derivative_feature(logenergy) print('logenergy features=', logenergy.shape)
# --- Script fragment: `count`, `start_time`, `subdir`, `files`, `filename`
# and `filename2` are defined earlier, presumably inside an os.walk loop
# over speaker directories -- TODO confirm. ---
if count != 0:
    # Time spent on the previous directory (skip on the very first pass).
    elapsed_time = time.time() - start_time
# Speaker id is carved out of the path by fixed character positions --
# NOTE(review): fragile if the dataset base path length ever changes.
curr_id = subdir[29:38]
# print(curr_id)
count = count + 1
start_time = time.time()
for file in files:
    # Downmix each wav to mono via a temporary file before extraction.
    sound = AudioSegment.from_wav(subdir + "/" + file)
    sound = sound.set_channels(1)
    sound.export("modified.wav", format="wav")
    sample_rate, samples = wavfile.read("modified.wav")
    # Log mel filterbank energies (25 ms window, 10 ms hop, 40 filters),
    # stacked with derivatives -> (frames, 40, 3).
    features = lmfe(samples, sample_rate, 0.025, 0.01, 40)
    features = extract_derivative_feature(features)
    timevar = 100  # frames per exported chunk
    if features.shape[0] >= timevar:
        # Split into as many full 100-frame chunks as fit; the remainder
        # (< 100 frames) is dropped.
        no_cuts = int(features.shape[0] / timevar)
        for i in range(no_cuts):
            cut = features[i * timevar:(i * timevar) + timevar:, :, :]
            # print("cut: ", cut.shape)
            # One label line per chunk in the id file...
            with open(filename2, "a") as myfile:
                myfile.write(curr_id + "\n")
            # ...and the chunk itself appended as CSV rows, frame by frame.
            with open(filename, "a") as myfile:
                for data_slice in cut:
                    np.savetxt(myfile, data_slice, delimiter=',', newline="\n")