def _get_mfcc_log_spec_and_log_mel_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    """Compute MFCCs, log-magnitude spectrogram, and log-mel spectrogram.

    Args:
        wav: 1-D waveform array.
        preemphasis_coeff: pre-emphasis filter coefficient.
        n_fft: FFT window size.
        win_length: STFT window length (samples).
        hop_length: STFT hop length (samples).

    Returns:
        Tuple of (mfccs, log_mag, log_mel) with shapes
        (t, n_mfcc), (t, 1 + n_fft // 2), (t, n_mels).
    """
    # Pre-emphasis to boost high frequencies before analysis.
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Linear-magnitude spectrogram.
    D = librosa.stft(y=y_preem, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(D)

    # Mel-scale (amplitude) spectrogram.  (n_mels, 1 + n_fft // 2) x (1 + n_fft // 2, t)
    mel_basis = librosa.filters.mel(hp_default.sr, hp_default.n_fft, hp_default.n_mels)
    mel = np.dot(mel_basis, mag)  # (n_mels, t); `mag ** 1` was a no-op and is dropped

    # FIX: MFCCs are the DCT of the *log* (dB) mel spectrogram, not the linear one,
    # and `mel` is amplitude-scale here, so amplitude_to_db (not power_to_db) applies.
    # This matches the corrected duplicate of this function later in the file.
    db = librosa.amplitude_to_db(mel)
    mfccs = np.dot(librosa.filters.dct(hp_default.n_mfcc, db.shape[0]), db)

    # Natural-log compression; epsilon guards log(0).
    mag = np.log(mag + sys.float_info.epsilon)
    mel = np.log(mel + sys.float_info.epsilon)

    # Normalization (disabled):
    # self.y_log_spec = (y_log_spec - hp.mean_log_spec) / hp.std_log_spec
    # self.y_log_spec = (y_log_spec - hp.min_log_spec) / (hp.max_log_spec - hp.min_log_spec)

    return mfccs.T, mag.T, mel.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
def _get_mfcc_log_spec_and_log_mel_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    """Extract MFCCs plus log-magnitude and log-mel spectrograms from a waveform.

    Args:
        wav: 1-D waveform array.
        preemphasis_coeff: pre-emphasis filter coefficient.
        n_fft: FFT size for the STFT.
        win_length: analysis window length in samples.
        hop_length: hop between frames in samples.

    Returns:
        (mfccs, log_mag, log_mel) transposed to time-major:
        (t, n_mfcc), (t, 1 + n_fft // 2), (t, n_mels).
    """
    eps = sys.float_info.epsilon

    # High-frequency pre-emphasis, then STFT magnitude.
    emphasized = preemphasis(wav, coeff=preemphasis_coeff)
    spectrum = librosa.stft(y=emphasized, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(spectrum)

    # Project onto the mel filterbank: (n_mels, 1+n_fft//2) @ (1+n_fft//2, t).
    mel_basis = librosa.filters.mel(hp_default.sr, hp_default.n_fft, hp_default.n_mels)
    mel = mel_basis @ mag

    # MFCCs: DCT of the dB-scaled mel spectrogram.
    db = librosa.amplitude_to_db(mel)
    dct_basis = librosa.filters.dct(hp_default.n_mfcc, db.shape[0])
    mfccs = dct_basis @ db

    # Natural-log compression; epsilon avoids log(0).
    log_mag = np.log(mag + eps)
    log_mel = np.log(mel + eps)

    # Normalization intentionally left disabled:
    # self.y_log_spec = (y_log_spec - hp.mean_log_spec) / hp.std_log_spec
    # self.y_log_spec = (y_log_spec - hp.min_log_spec) / (hp.max_log_spec - hp.min_log_spec)

    return mfccs.T, log_mag.T, log_mel.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
def generator():
    """Yield (spectrogram.T, int_label) pairs for every (file-key, emotion) in `data`.

    Reads each wav via the `wav_files` lookup, applies pre-emphasis, and maps
    the emotion abbreviation to an integer class id.
    """
    for key, emo in data:
        path = wav_files[key]
        signal, _ = librosa.load(path, sr=hp.sr)
        signal = utils.preemphasis(signal)
        spec = utils.spectrogram(signal)
        # Map abbreviation -> emotion name -> integer label.
        yield (spec.T, utils.emo2int[abbr2emo[emo]])
def listen_for_speech():
    """ Listens to Microphone, extracts phrases from it and sends it to Google's TTS service and returns response. a "phrase" is sound surrounded by silence (according to threshold). num_phrases controls how many phrases to process before finishing the listening process (-1 for infinite). """
    # NOTE(review): the docstring above is stale — the code below does NOT call
    # Google TTS; it runs a local emotion-inference model on each captured phrase.
    # Load the pre-trained inference model from a hard-coded checkpoint.
    infer_model = Model('infer', logdir=None)
    infer_model.load_model('logdir/18-07-02T18-30-25/model.ckpt-444')
    #Open stream
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    print("* Listening mic. ")
    audio2send = []          # chunks of the phrase currently being captured
    rel = RATE//CHUNK        # number of chunks per second of audio
    # Sliding window of recent loudness values; a phrase ends once ALL values
    # in this window (SILENCE_LIMIT seconds) fall below THRESHOLD.
    slid_win = deque(maxlen=int(SILENCE_LIMIT*rel))
    #Prepend audio from 0.5 seconds before noise was detected
    prev_audio = deque(maxlen=int(PREV_AUDIO*rel))
    started = False          # True while inside a phrase
    n = 0                    # phrase counter (used to name saved files)
    response = []
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        # RMS-like loudness of this chunk; abs() guards against a negative mean.
        # NOTE(review): audioop.avg width=4 assumes 32-bit samples — confirm this
        # matches FORMAT (paInt16 would need width 2).
        slid_win.append(math.sqrt(abs(audioop.avg(data, 4))))
        #print slid_win[-1]
        if(sum([x > THRESHOLD for x in slid_win]) > 0):
            # At least one recent chunk was loud: we are in (or entering) a phrase.
            if(not started):
                print()
                print("Starting record of phrase")
                started = True
            audio2send.append(data)
        elif (started is True):
            # The limit was reached, finish capture and deliver.
            # Save the phrase (with the pre-roll audio), reload it, and classify.
            wf = save_speech(list(prev_audio) + audio2send, n, p)
            wav, _ = librosa.load(wf, sr=RATE)
            wav = utils.preemphasis(wav)
            S = utils.spectrogram(wav)
            pred = infer_model.infer(np.array([S.T]))
            print(utils.int2emo[pred[0]])
            # Reset state for the next phrase.
            started = False
            slid_win.clear()
            prev_audio.clear()
            audio2send = []
            n += 1
            print( "Listening ...")
            print()
        else:
            # Silence before any phrase started: keep as pre-roll context.
            prev_audio.append(data)
    # NOTE(review): unreachable — the `while True` loop above has no `break`,
    # so the stream is never closed and `response` (always empty) never returned.
    print( "* Done recording")
    stream.close()
    p.terminate()
    return response