def Prediction(self):
    """
    Makes a prediction based on the input audio frames transformed into MFCCs

    Args:
        None

    Returns:
        The model's raw prediction for the buffered audio (also printed
        when self.print_pred is set).
    """
    # np.fromstring was deprecated and removed in modern NumPy; joining the
    # raw byte chunks and reinterpreting them with np.frombuffer is the
    # supported equivalent for fixed-size PCM chunks.
    in_data = np.frombuffer(b''.join(self.frames[:19]), dtype=np.int16)
    # extract MFCCs from the 19 chunks of audio
    audio_sig = np.array([
        mfcc(in_data, self.rate, self.window, self.stride, self.mfcc,
             self.filter_banks, self.fft_num, 0, None, True)
    ])
    # makes predictions
    prediction = self.ww_model.model.predict(audio_sig)
    if (self.print_pred):
        print(prediction)
    return prediction
def get_feature(file_path: str, mfcc_len: int = 39, flatten: bool = False):
    """Extract a fixed-size speechpy MFCC feature matrix for one audio file.

    Some wav files make scipy.io.wavfile raise an "Incomplete wav chunk"
    error (scipy only reads PCM and float encodings), so librosa is used
    to load the audio instead.
    """
    import librosa
    from speechpy.feature import mfcc

    # fs, signal = wav.read(file_path)
    samples, sample_rate = librosa.load(file_path)
    length = len(samples)
    # Center-pad signals shorter than mean_signal_length ...
    if length < mean_signal_length:
        deficit = mean_signal_length - length
        extra = deficit % 2
        half = deficit // 2
        samples = np.pad(samples, (half, half + extra), 'constant',
                         constant_values=0)
    # ... and center-crop longer ones.
    else:
        offset = (length - mean_signal_length) // 2
        samples = samples[offset:offset + mean_signal_length]
    coefficients = mfcc(samples, sample_rate, num_cepstral=mfcc_len)
    # SVM & MLP models need a flattened 1-D feature vector.
    if flatten:
        coefficients = np.ravel(coefficients)
    return coefficients
def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Compute MFCC feature vectors for raw audio, without clipping for length."""
    from speechpy.feature import mfcc

    # Guard clause: speechpy cannot frame an empty signal.
    if not len(audio):
        raise ValueError('Cannot vectorize empty audio!')
    frame_params = (pr.window_t, pr.hop_t, pr.n_mfcc, pr.n_filt, pr.n_fft)
    return mfcc(audio, pr.sample_rate, *frame_params)
def get_feature_vector_from_mfcc(file_path: str, mfcc_len: int = 45):
    """
    Read a wav file and return its min-max scaled, flattened MFCC vector.

    Args:
        file_path (str): path to the audio file to read (via soundfile).
        mfcc_len (int): number of cepstral coefficients per frame.

    Returns:
        The flattened, min-max normalized MFCC feature vector.
    """
    # sf, signal = wav.read(file_path)
    signal, fs = sf.read(file_path)
    s_len = len(signal)
    # NOTE(review): the sample rate from sf.read is deliberately overridden.
    fs = 16000
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        # Bug fix: take the parity of the FULL deficit before halving.
        # The old order (halve first, then % 2) under-pads by one sample
        # whenever the deficit is odd but its half is even, producing
        # inconsistently sized feature vectors across files.
        pad_rem = pad_len % 2
        pad_len //= 2
        signal = np.pad(signal, (pad_len, pad_len + pad_rem), 'constant',
                        constant_values=0)
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len:pad_len + mean_signal_length]
    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    mel_coefficients = np.ravel(mel_coefficients)
    normalize_feature_vector = min_max_scalling(mel_coefficients)
    return normalize_feature_vector
def get_feature_vector_from_mfcc(file_path: str, flatten: bool,
                                 mfcc_len: int = 39) -> np.ndarray:
    """
    Make feature vector from MFCC for the given wav file.

    Args:
        file_path (str): path to the .wav file that needs to be read.
        flatten (bool) : Boolean indicating whether to flatten mfcc obtained.
        mfcc_len (int): Number of cepstral coefficients to consider.

    Returns:
        numpy.ndarray: feature vector of the wav file made from mfcc.
    """
    rate, samples = wav.read(file_path)
    length = len(samples)
    # Center-pad signals shorter than the target length; center-crop
    # longer ones so every file yields an equally sized feature matrix.
    if length < mean_signal_length:
        deficit = mean_signal_length - length
        extra = deficit % 2
        half = deficit // 2
        samples = np.pad(samples, (half, half + extra), 'constant',
                         constant_values=0)
    else:
        start = (length - mean_signal_length) // 2
        samples = samples[start:start + mean_signal_length]
    coefficients = mfcc(samples, rate, num_cepstral=mfcc_len)
    if flatten:
        # Flatten for classifiers that expect 1-D input.
        coefficients = np.ravel(coefficients)
    return coefficients
def calculate_acoustic_features(args, waveform):
    """Compute acoustic features for one waveform.

    Supported args.feature_type values: 'mfe' (log mel filterbank energies),
    'mfcc', and 'lyon' (Lyon passive ear model).  args.backend selects
    between 'speechpy' and librosa for 'mfe'/'mfcc'.  Optional RMS energy
    is appended as an extra column, and optional delta/delta-delta features
    triple the feature dimension.

    NOTE(review): indentation below is reconstructed from a collapsed
    source line — confirm against the original file, in particular which
    statements sit inside each `if args.energy:` block.
    """
    # Window / hop sizes: args.window and args.step are in milliseconds.
    n_fft = int(args.window*SAMPLE_RATE/1000.0)
    hop_length = int(args.step * SAMPLE_RATE / 1000.0)
    if 'mfe' == args.feature_type:
        if args.backend=='speechpy':
            # Floor added before log to avoid log(0).
            log_cut = 1e-8
            spec, energy = mfe(waveform, SAMPLE_RATE,
                               frame_length=args.window*1e-3,
                               frame_stride=args.step*1e-3,
                               num_filters=args.n_mels, fft_length=n_fft)
            if args.energy:
                acoustic_features = np.hstack((spec, energy[:, np.newaxis]))
            # NOTE(review): when args.energy is False, acoustic_features
            # appears to be unassigned here — looks like a latent bug in
            # the speechpy 'mfe' path; confirm with the original author.
            acoustic_features = np.log(acoustic_features + log_cut)
        else:
            spec = librosa.feature.melspectrogram(y=waveform, sr=SAMPLE_RATE,
                                                  n_fft=n_fft,
                                                  hop_length=hop_length,
                                                  n_mels=args.n_mels)
            # librosa returns (n_mels, frames); transpose to frames-major.
            acoustic_features = librosa.core.amplitude_to_db(spec).transpose()
            if args.energy:
                energy = librosa.feature.rmse(y=waveform, frame_length=n_fft,
                                              hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'mfcc' == args.feature_type:
        if args.backend=='speechpy':
            acoustic_features = mfcc(waveform, SAMPLE_RATE,
                                     frame_length=args.window*1e-3,
                                     frame_stride=args.step*1e-3,
                                     num_filters=args.n_mels,
                                     fft_length=n_fft,
                                     num_cepstral = args.n_mfcc)
        else:
            acoustic_features = librosa.feature.mfcc(y=waveform,
                                                     sr=SAMPLE_RATE,
                                                     n_mfcc=args.n_mfcc,
                                                     n_fft=n_fft,
                                                     hop_length=hop_length,
                                                     n_mels=args.n_mels).transpose()
            if args.energy:
                energy = librosa.feature.rmse(y=waveform, frame_length=n_fft,
                                              hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'lyon' == args.feature_type:
        # Lyon model expects a normalized, double-precision column vector.
        waveform /= np.abs(waveform).max()
        acoustic_features = lyon_calc.lyon_passive_ear(
            waveform[:, np.newaxis].astype(np.double), SAMPLE_RATE, hop_length)
        max_val = acoustic_features.max()
        if max_val > 0:
            acoustic_features /= max_val
        acoustic_features = acoustic_features.astype(np.float32)
        if args.energy:
            energy = librosa.feature.rmse(y=waveform, frame_length=hop_length,
                                          hop_length=hop_length).transpose()
            energy /= energy.max()
            # Lyon output and RMS frames may differ in length; pad the
            # energy track with edge values or truncate it to match.
            len_delta = acoustic_features.shape[0] - energy.shape[0]
            if len_delta > 0:
                energy = np.pad(energy, [(0, len_delta), (0, 0)], 'edge')
            else:
                energy = energy[:acoustic_features.shape[0], :]
            acoustic_features = np.hstack((acoustic_features, energy))
    else:
        raise ValueError('Unexpected features type.')
    if args.deltas:
        orig_shape = acoustic_features.shape
        if args.backend=='speechpy':
            # speechpy stacks (features, delta, delta-delta) into a cube.
            acoustic_features = extract_derivative_feature(acoustic_features)
        else:
            delta = librosa.feature.delta(acoustic_features, axis=0)
            ddelta = librosa.feature.delta(acoustic_features, order=2, axis=0)
            acoustic_features = np.stack((acoustic_features[:, :, np.newaxis],
                                          delta[:, :, np.newaxis],
                                          ddelta[:, :, np.newaxis]), axis=-1)
        # Collapse the (frames, feats, 3) cube to (frames, feats * 3).
        acoustic_features = np.reshape(acoustic_features,
                                       (-1, orig_shape[-1] * 3))
    return acoustic_features
def get_data(data_path, flatten=True, mfcc_len=39,
             class_labels=("Neutral", "Angry", "Happy", "Sad")):
    """
    Process the data for training and testing. Perform the following steps
    1. Read the files and get the audio frame.
    2. Perform the test-train split

    :param class_labels: class labels that we care about.
    :param data_path: path to the dataset folder
    :param mfcc_len: Number of mfcc features to take for each frame
    :param flatten: Boolean specifying whether to flatten the data or not
    :return: 4 arrays, x_train x_test y_train y_test
    """
    features = []
    targets = []
    max_fs = 0
    start_dir = os.getcwd()
    sys.stderr.write('curdir: %s\n' % start_dir)
    os.chdir(data_path)
    # One sub-folder per class label; the label index is the class id.
    for label_idx, folder in enumerate(class_labels):
        sys.stderr.write("started reading folder %s\n" % folder)
        os.chdir(folder)
        for wav_name in os.listdir('.'):
            fs, signal = read_wav(wav_name)
            max_fs = max(max_fs, fs)
            n = len(signal)
            # Center-pad short signals / center-crop long ones so every
            # file yields an equally sized MFCC matrix.
            if n < mean_signal_length:
                deficit = mean_signal_length - n
                extra = deficit % 2
                half = deficit // 2
                signal = np.pad(signal, (half, half + extra), 'constant',
                                constant_values=0)
            else:
                start = (n - mean_signal_length) // 2
                signal = signal[start:start + mean_signal_length]
            coeffs = mfcc(signal, fs, num_cepstral=mfcc_len)
            if flatten:
                # Flatten the data for classifiers that need 1-D input
                coeffs = np.ravel(coeffs)
            features.append(coeffs)
            targets.append(label_idx)
        sys.stderr.write("ended reading folder %s\n" % folder)
        os.chdir('..')
    os.chdir(start_dir)
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42)
    return (np.array(x_train), np.array(x_test), np.array(y_train),
            np.array(y_test))
def array_to_features(data: list, config) -> list:
    """Takes a list of normalized audio signal amplitude and returns a list
    of MFCC parameters, with all framing options read from *config*."""
    sampling_rate = int(config['sampling_rate'])
    return mfcc(data, sampling_rate,
                frame_length=float(config['mfcc_frame_duration']),
                frame_stride=float(config['mfcc_frame_stride']),
                num_cepstral=int(config['mfcc_num_cepstral']),
                num_filters=int(config['mfcc_num_filters']),
                fft_length=int(config['mfcc_fft_length']))
def run_interval(self, start, end):
    """Do fpd on interval of appropriate size."""
    # Reinterpret the raw byte window as samples and extract MFCCs.
    window = np.frombuffer(self.raw_bytes[start:end], FORMAT)
    features = np.array(mfcc(window, SAMPLE_RATE, WINDOW, STRIDE, MFCC,
                             FILTER_BANKS, FFT_NUM, 0, None, True))
    score = self.model.predict(np.reshape(features, (1, 46, 13)))
    # If positively predicted, write interval to wav file.
    if score >= PREDICTION_THRESHOLD:
        self.write(start, end)
        self.n_false_pos += 1
def get_mfcc(data, sfs):
    """
    load the wav data

    Args:
        data(np.array): audio files
        sfs(np.array(int)): frequencies of the audio data

    Returns:
        (np.array): mel-frequency cepstrum of the audio data
    """
    # A single scalar rate applies to every signal.
    if isinstance(sfs, (int, np.int64)):
        sfs = [sfs] * len(data)
    cepstra = [mfcc(signal, rate, num_cepstral=39)
               for signal, rate in zip(data, sfs)]
    # Add a leading channel axis per sample.
    return np.expand_dims(np.array(cepstra), axis=1)
def Convert_To_MFCC(self, wf):
    '''
    Converts audio byte streams to MFCCs

    Args:
        wf - wave object

    Return:
        MFCCs - list of float lists
    '''
    samples = self.Read_Audio_Data(wf)
    coefficients = mfcc(samples, RATE, WINDOW, STRIDE, MFCC,
                        FILTER_BANKS, FFT_NUM, 0, None, True)
    return coefficients.tolist()
def get_mfccs(self, signal):
    """Center-pad or center-crop *signal* to self.mean_signal_length and
    return its 39-coefficient MFCC matrix (16 kHz sample rate assumed)."""
    target = self.mean_signal_length
    n = len(signal)
    if n < target:
        deficit = target - n
        extra = deficit % 2
        half = deficit // 2
        signal = np.pad(signal, (half, half + extra), 'constant',
                        constant_values=0)
    else:
        start = (n - target) // 2
        signal = signal[start:start + target]
    return mfcc(signal, 16000, num_cepstral=39)
def test_mfcc(self):
    """mfcc output must have one column per requested cepstral coefficient."""
    # Local renamed from `mfcc` to avoid shadowing feature.mfcc.
    n_cepstral = 13
    coefficients = feature.mfcc(signal, sampling_frequency=fs,
                                frame_length=0.020,
                                num_cepstral=n_cepstral,
                                frame_stride=0.01,
                                num_filters=num_filters,
                                fft_length=512, low_frequency=0,
                                high_frequency=None)
    # Shape matcher
    assert coefficients.shape[1] == n_cepstral
def get_feature_vector_from_mfcc(file_path: str, flatten: bool,
                                 mfcc_len: int = 39) -> np.ndarray:
    """Read a wav file and return its MFCC feature matrix, center-padded
    or center-cropped to mean_signal_length samples; optionally flattened
    to a 1-D vector."""
    rate, samples = wav.read(file_path)
    n = len(samples)
    if n < mean_signal_length:
        deficit = mean_signal_length - n
        extra = deficit % 2
        half = deficit // 2
        samples = np.pad(samples, (half, half + extra), 'constant',
                         constant_values=0)
    else:
        start = (n - mean_signal_length) // 2
        samples = samples[start:start + mean_signal_length]
    coefficients = mfcc(samples, rate, num_cepstral=mfcc_len)
    if flatten:
        coefficients = np.ravel(coefficients)
    return coefficients
def get_featurefrom_mfcc(file_path: str, mfcc_len: int):
    """
    Read a wav file and return a min-max normalized, flattened MFCC
    feature vector of fixed length.

    Args:
        file_path (str): path to the audio file (read via soundfile).
        mfcc_len (int): number of cepstral coefficients per frame.

    Returns:
        The flattened, min-max scaled MFCC feature vector.
    """
    signal, speed = sf.read(file_path)
    signal_len = len(signal)
    # The rate returned by sf.read is deliberately overridden: all files
    # are treated as 48 kHz.
    speed = 48000
    meansignal_length = 176578
    if signal_len < meansignal_length:
        pad_len = meansignal_length - signal_len
        pad_rem = pad_len % 2
        pad_len //= 2
        new_signal = np.pad(signal, (pad_len, pad_len + pad_rem),
                            "constant", constant_values=0)
    else:
        pad_len = signal_len - meansignal_length
        pad_len //= 2
        new_signal = signal[pad_len:pad_len + meansignal_length]
    # Removed leftover debug print of the raw signal array, which spammed
    # stdout with ~176k samples on every call.
    mel_coefficients = mfcc(new_signal, speed, num_cepstral=mfcc_len)
    mel_coefficients = np.ravel(mel_coefficients)
    normalized_feature = min_max_scalling(mel_coefficients)
    return normalized_feature
def get_feature_vector_from_mfcc(file_path: str, mfcc_len: int):
    """
    Read a wav file and return a min-max normalized, flattened MFCC
    feature vector of fixed length.

    Args:
        file_path (str): path to the audio file (read via soundfile).
        mfcc_len (int): number of cepstral coefficients per frame.

    Returns:
        The flattened, min-max scaled MFCC feature vector.
    """
    signal, fs = sf.read(file_path)
    s_len = len(signal)
    fs = 48000  # the sample rate of most fast signal in the data. . .
    # pad the signals to have same size if lesser than required
    if s_len < mean_signal_length:
        pad_len = mean_signal_length - s_len
        # Bug fix: take the parity of the FULL deficit before halving.
        # The old order (halve first, then % 2) under-pads by one sample
        # whenever the deficit is odd but its half is even, producing
        # inconsistently sized feature vectors across files.
        pad_rem = pad_len % 2
        pad_len //= 2
        signal = np.pad(signal, (pad_len, pad_len + pad_rem), 'constant',
                        constant_values=0)
    else:
        pad_len = s_len - mean_signal_length
        pad_len //= 2
        signal = signal[pad_len:pad_len + mean_signal_length]
    mel_coefficients = mfcc(signal, fs, num_cepstral=mfcc_len)
    mel_coefficients = np.ravel(mel_coefficients)
    normalize_feature_vector = min_max_scalling(mel_coefficients)
    return normalize_feature_vector
# Interactive false-activation capture loop: prompts for session metadata,
# then streams microphone chunks through the wake-word model.
# NOTE(review): indentation reconstructed from a collapsed source line, and
# the final `if(false_act == "y"):` body appears truncated — confirm
# against the original script.
noise = 0
location = input("Location: ")
description = input("Type of False Activation: ")
# Keep prompting until a valid noise level code is entered.
while not(noise == 'q' or noise == 'm' or noise == 'l'):
    noise = input("Noise Level - Quiet (Q) Moderate (M) Loud (L): ").lower()
print()
while True:
    data = stream.read(CHUNK)
    frames.append(data)
    # Start predicting once a full window (>19 chunks) is buffered.
    if (len(frames) > 19):
        # NOTE(review): np.fromstring is removed in modern NumPy; this
        # pattern would need np.frombuffer on the joined bytes there.
        in_data = np.fromstring(np.array(frames[:19]), 'Int16')
        audio_sig = np.array([mfcc(in_data, RATE, WINDOW, STRIDE, MFCC,
                                   FILTER_BANKS, FFT_NUM, 0, None, True)])
        prediction = model.predict(audio_sig)
        print(prediction)
        if (prediction > CONFIDENCE):
            act_count += 1
            # Require several consecutive activations before triggering.
            if (act_count >= ACTIVATIONS):
                act_count = 0
                print("NIMBUS", end = " ", flush = True)
                if(false_act == "y"):
                    # Timestamped filename encoding the session metadata.
                    file_name = "notww_" + description + "-false_"+ location + "_" + noise +"_" + datetime.now().strftime("%m%d%Y%H%M%S%f") + "_ewenike.wav"
def Wake_Word(self): ''' Runs the wake word and calls the corresponding required functions for the end-to-end response Args: None Returns: None ''' # reads chunk of audio data = self.istream.read(self.CHUNK, exception_on_overflow=False) # appends chunk to frame list self.frames.append(data) # begins making predictions after the first 2.5 seconds of audio is read if (len(self.frames) > 19): # converts first 19 chunks of audio bytes into 16 bit int values in_data = np.fromstring(np.array(self.frames[:19]), 'Int16') # extract MFCCs from the 19 chunks of audio audio_sig = np.array([ mfcc(in_data, self.RATE, self.WINDOW, self.STRIDE, self.MFCC, self.FILTER_BANKS, self.FFT_NUM, 0, None, True) ]) # makes predictions prediction = self.ww_model.model.predict(audio_sig) # if the predictions is larger than the defined confidence if (prediction > self.CONFIDENCE): # increment the activation counter self.act_count += 1 # if the number of consecutive activations exceeds the activation value if (self.act_count >= self.ACTIVATIONS): # resets the activation count self.act_count = 0 # stops the input stream self.istream.stop_stream() # wake word audio prompt # subprocess.Popen(['mpg123', '-q', os.getcwd() + '%sUtils%sData%sNimbus_Awakened.mp3' % (self.delim, self.delim, self.delim)]) # stalls the program as the audio is played time.sleep(4) print("\nNIMBUS Activated\n\n") # obtains the string from the audio input stt_result = self.Speech_To_Text() print(stt_result) answer = "" best_stt_answer = "" # determines the appropriate input for the NLP engine if list(stt_result) != []: best_stt_answer = stt_result[0].alternatives[ 0].transcript else: answer = "Sorry, I could not hear you. Please ask again." # calls the NLP engine if speech was converted if best_stt_answer != "": answer = "Please ask again later." 
#NLP(best_stt_answer) # converts the NLP answer to audio self.Text_To_Speech(answer) # resets the wake word audio frames self.frames = [] # opens the input stream for wake word detection self.istream.start_stream() else: # print a period for not wake word predictions print('.', flush=True, end="") # reset the activation count self.act_count = 0 # window the data stream self.frames = self.frames[1:]
frames = processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, filter=lambda x: np.ones((x, )), zero_padding=True) # Extracting power spectrum power_spectrum = processing.power_spectrum(frames, fft_points=512) print('power spectrum shape=', power_spectrum.shape) ############# Extract MFCC features ############# mfcc = feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) # Cepstral mean variance normalization. mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True) print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape) # Extracting derivative features mfcc_feature_cube = feature.extract_derivative_feature(mfcc) print('mfcc feature cube shape=', mfcc_feature_cube.shape) ############# Extract logenergy features ############# logenergy = feature.lmfe(signal, sampling_frequency=fs,
def Convert_To_MFCC(wf):
    """Convert the audio in wave object *wf* to an MFCC matrix as a list
    of float lists."""
    samples = Read_Audio_Data(wf)
    coefficients = mfcc(samples, RATE, WINDOW, STRIDE, MFCC,
                        FILTER_BANKS, FFT_NUM, 0, None, True)
    return coefficients.tolist()
# reads chunk of audio data = stream.read(CHUNK) # appends chunk to frame list frames.append(data) # begins making predictions after the first 2.5 seconds of audio is read if (len(frames) > 19): # converts first 19 chunks of audio bytes into 16 bit int values in_data = np.fromstring(np.array(frames[:19]), 'Int16') # extract MFCCs from the 19 chunks of audio audio_sig = np.array([ mfcc(in_data, RATE, WINDOW, STRIDE, MFCC, FILTER_BANKS, FFT_NUM, 0, None, True) ]) # makes predictions prediction = model.predict(audio_sig) # print(prediction) # if the predictions is larger than the defined confidence if (prediction > CONFIDENCE): # increment the activation counter act_count += 1 # if the number of consecutive activations exceeds the activation value if (act_count >= ACTIVATIONS):
def ConvertToMFCC(fileName, path):
    """Read the wav file at path + fileName and return its MFCC matrix as
    a list of float lists."""
    samples = readAudioData(path + fileName)
    coefficients = mfcc(samples, RATE, WINDOW, STRIDE, MFCC,
                        FILTER_BANKS, FFT_NUM, 0, None, True)
    return coefficients.tolist()
def _calc_mfcc(wav):
    """Return 39 cepstral coefficients per frame for *wav* (16 kHz assumed)."""
    return mfcc(wav, 16000, num_cepstral=39)