def process_wake_up(self, audio):
    if self.wakeup_recognizer.is_recognized(audio.frame_data, self.metrics):
        SessionManager.touch()
        self.state.sleeping = False
        self.__speak("I'm awake.")  # TODO: Localization
        self.metrics.increment("mycroft.wakeup")
def wake_up(self, audio):
    if self.wakeup_recognizer.is_recognized(audio.frame_data, self.metrics):
        SessionManager.touch()
        self.state.sleeping = False
        self.__speak(mycroft.dialog.get("i am awake", self.stt.lang))
        self.metrics.increment("mycroft.wakeup")
def process_wake_word(self, audio, timer):
    hyp = self.mycroft_recognizer.transcribe(audio.frame_data, self.metrics)
    if self.mycroft_recognizer.contains(hyp):
        extractor = WordExtractor(audio, self.mycroft_recognizer,
                                  self.metrics)
        timer.lap()
        extractor.calculate_range()
        self.metrics.timer("mycroft.recognizer.extractor.time_s",
                           timer.lap())
        audio_before = extractor.get_audio_data_before()
        self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                           self._audio_length(audio_before))
        audio_after = extractor.get_audio_data_after()
        self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                           self._audio_length(audio_after))
        SessionManager.touch()
        payload = {
            'utterance': hyp.hypstr,
            'session': SessionManager.get().session_id,
            'pos_begin': extractor.begin,
            'pos_end': extractor.end
        }
        self.emitter.emit("recognizer_loop:wakeword", payload)
        try:
            self.transcribe([audio_before, audio_after])
        except sr.UnknownValueError:
            self.__speak("Go ahead")
            self.state.skip_wakeword = True
        self.metrics.increment("mycroft.wakeword")
def process(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.wakeword_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)

    if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
        LOG.warning("Audio too short to be processed")
    else:
        stopwatch = Stopwatch()
        with stopwatch:
            transcription = self.transcribe(audio)
        if transcription:
            ident = str(stopwatch.timestamp) + str(hash(transcription))
            # STT succeeded, send the transcribed speech on for processing
            payload = {
                'utterances': [transcription],
                'lang': self.stt.lang,
                'session': SessionManager.get().session_id,
                'ident': ident
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', [transcription])
        else:
            ident = str(stopwatch.timestamp)
        # Report timing metrics
        report_timing(ident, 'stt', stopwatch,
                      {'transcription': transcription,
                       'stt': self.stt.__class__.__name__})
def process(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.wakeword_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)

    if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
        LOG.warning("Audio too short to be processed")
    else:
        stopwatch = Stopwatch()
        with stopwatch:
            transcription = self.transcribe(audio)
        if transcription:
            ident = str(stopwatch.timestamp) + str(hash(transcription))
            # STT succeeded, send the transcribed speech on for processing
            payload = {
                'utterances': [transcription],
                'lang': self.stt.lang,
                'session': SessionManager.get().session_id,
                'ident': ident
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', [transcription])
        else:
            ident = str(stopwatch.timestamp)
        # Report timing metrics
        report_timing(ident, 'stt', stopwatch,
                      {'transcription': transcription,
                       'stt': self.stt.__class__.__name__})
def process_skip_wake_word(self, audio):
    SessionManager.touch()
    try:
        self.transcribe([audio])
    except sr.UnknownValueError:
        logger.warning("Speech Recognition could not understand audio")
        self.__speak("Sorry, I didn't catch that.")
        self.metrics.increment("mycroft.recognizer.error")
    self.state.skip_wakeword = False
def _send_wakeword_info(self, emitter):
    """Send messagebus message indicating that a wakeword was received.

    Arguments:
        emitter: bus emitter to send information on.
    """
    SessionManager.touch()
    payload = {'utterance': self.wake_word_name,
               'session': SessionManager.get().session_id}
    emitter.emit("recognizer_loop:wakeword", payload)
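# For context: a minimal sketch of the consuming side of the
# "recognizer_loop:wakeword" message sent above. The pyee EventEmitter is a
# hypothetical stand-in for the real bus emitter; the payload keys mirror
# the ones built in _send_wakeword_info.
from pyee import EventEmitter

emitter = EventEmitter()

def on_wakeword(payload):
    # payload carries the keys built in _send_wakeword_info
    print("wake word:", payload['utterance'], "session:", payload['session'])

emitter.on("recognizer_loop:wakeword", on_wakeword)

# Simulate what _send_wakeword_info(emitter) would emit (illustrative values)
emitter.emit("recognizer_loop:wakeword",
             {'utterance': 'hey mycroft', 'session': 'session-1234'})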
def wake_up(self, audio):
    if self.wakeup_recognizer.is_recognized(audio.frame_data, self.metrics):
        SessionManager.touch()
        self.state.sleeping = False
        lines = ["I'm awake.", "System rebooted.",
                 "All systems check. I am now online.", "Waking up."]
        self.__speak(choice(lines))
        self.metrics.increment("mycroft.wakeup")
def process_audio(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.mycroft_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)
    try:
        self.transcribe([audio])
    except sr.UnknownValueError:
        # TODO: Localization
        logger.warning("Speech Recognition could not understand audio")
def process_audio(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.mycroft_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)
    try:
        self.transcribe([audio])
    except sr.UnknownValueError:
        # TODO: Localization
        logger.warning("Speech Recognition could not understand audio")
        self.__speak("Sorry, I didn't catch that.")
def process(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.wakeword_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)

    if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
        LOG.warning("Audio too short to be processed")
    else:
        self.transcribe(audio)
def process(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.mycroft_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)

    if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
        LOG.warning("Audio too short to be processed")
    elif connected():
        self.transcribe(audio)
    else:
        self.__speak("Mycroft seems not to be connected to the Internet")
def process(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.mycroft_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)

    if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
        LOG.warning("Audio too short to be processed")
        self.emitter.emit("recognizer_loop:tooshort", {})
    elif connected():
        self.transcribe(audio)
    else:
        self.__speak("Mycroft seems not to be connected to the Internet")
def __speak(self, utterance):
    print("going to speak " + utterance)
    payload = {
        'utterance': utterance,
        'session': SessionManager.get().session_id
    }
    self.emitter.emit("speak", Message("speak", payload))
def transcribe(self, audio):
    text = None
    try:
        print("aud: " + str(audio))
        initial = self.stt.execute(audio)
        print("initial: " + initial)
        text = initial.lower().strip()
        LOG.debug("STT: " + text)
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except HTTPError as e:
        if e.response.status_code == 401:
            text = "pair my device"
            LOG.warning("Access Denied at mycroft.ai")
    except Exception as e:
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
        self.__speak("Sorry, I didn't catch that")
    if text:
        payload = {
            'utterances': [text],
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("recognizer_loop:utterance", payload)
        self.metrics.attr('utterances', [text])
def transcribe(self, audio_segments):
    utterances = []
    threads = []
    if connected():
        for audio in audio_segments:
            if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
                logger.debug("Audio too short to send to STT")
                continue
            target = self._create_remote_stt_runnable(audio, utterances)
            t = threading.Thread(target=target)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()

        if len(utterances) > 0:
            payload = {
                'utterances': utterances,
                'session': SessionManager.get().session_id
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', utterances)
        else:
            raise sr.UnknownValueError
    else:
        # TODO: Localization
        self.__speak("This device is not connected to the Internet")
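# The fan-out pattern used by transcribe above (one worker thread per audio
# segment, results collected in a shared list, then join) in isolation; a
# sketch with a stub STT function, not the actual remote runnable.
import threading

def fan_out_transcribe(segments, transcribe_fn):
    """Run transcribe_fn on each segment in its own thread and collect
    the non-empty results, mirroring transcribe() above."""
    results = []
    lock = threading.Lock()

    def worker(segment):
        text = transcribe_fn(segment)
        if text:
            with lock:  # guard the shared result list
                results.append(text)

    threads = [threading.Thread(target=worker, args=(seg,))
               for seg in segments]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results

# Hypothetical usage with a stub in place of the STT call:
print(fan_out_transcribe(["seg1", "seg2"], lambda s: "text for " + s))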
def transcribe(self, audio):
    text = None
    try:
        # Invoke the STT engine on the audio clip
        text = self.stt.execute(audio).lower().strip()
        LOG.debug("STT: " + text)
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except ConnectionError as e:
        LOG.error("Connection Error: {0}".format(e))
        self.__speak("Intelora seems not to be connected to the Internet.")
    except HTTPError as e:
        if e.response.status_code == 401:
            text = "pair my device"
            LOG.warning("Access Denied at Mycroft API")
    except Exception as e:
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
        lines = ["Sorry, I didn't catch that.",
                 "Sorry, I didn't hear you clearly.",
                 "Can you repeat what you said, please?",
                 "Can you please say that again?"]
        self.__speak(choice(lines))
    if text:
        # STT succeeded, send the transcribed speech on for processing
        payload = {
            'utterances': [text],
            'lang': self.stt.lang,
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("recognizer_loop:utterance", payload)
        self.metrics.attr('utterances', [text])
def transcribe(self, audio):
    text = None
    try:
        # Invoke the STT engine on the audio clip
        text = self.stt.execute(audio).lower().strip()
        LOG.debug("STT: " + text)
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except ConnectionError as e:
        LOG.error("Connection Error: {0}".format(e))
        self.emitter.emit("recognizer_loop:no_internet")
    except HTTPError as e:
        if e.response.status_code == 401:
            text = "pair my device"  # phrase to start the pairing process
            LOG.warning("Access Denied at mycroft.ai")
    except Exception as e:
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
    if text:
        # STT succeeded, send the transcribed speech on for processing
        payload = {
            'utterances': [text],
            'lang': self.stt.lang,
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("recognizer_loop:utterance", payload)
        self.metrics.attr('utterances', [text])
def transcribe(self, audio):
    text = None
    try:
        # Invoke the STT engine on the audio clip
        text = self.stt.execute(audio).lower().strip()
        LOG.debug("STT: " + text)
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except ConnectionError as e:
        LOG.error("Connection Error: {0}".format(e))
        self.emitter.emit("recognizer_loop:no_internet")
    except HTTPError as e:
        if e.response.status_code == 401:
            text = "pair my device"  # phrase to start the pairing process
            LOG.warning("Access Denied at mycroft.ai")
    except Exception as e:
        self.emitter.emit('recognizer_loop:speech.recognition.unknown')
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
    if text:
        # STT succeeded, send the transcribed speech on for processing
        payload = {
            'utterances': [text],
            'lang': self.stt.lang,
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("recognizer_loop:utterance", payload)
        self.metrics.attr('utterances', [text])
def target():
    self.emitter.emit(
        "speak",
        Message("speak",
                metadata={
                    'utterance': utterance,
                    'session': SessionManager.get().session_id
                }))
def publish(self, events):
    if 'session_id' not in events:
        session_id = SessionManager.get().session_id
        events['session_id'] = session_id
    if self.enabled:
        requests.post(self.url,
                      headers={'Content-Type': 'application/json'},
                      data=json.dumps(events), verify=False)
def publish(self, events):
    if 'session_id' not in events:
        session_id = SessionManager.get().session_id
        events['session_id'] = session_id
    if self.enabled:
        requests.post(
            self.url,
            headers={'Content-Type': 'application/json'},
            data=json.dumps(events), verify=False)
def try_consume_audio(self):
    timer = Stopwatch()
    hyp = None
    audio = self.queue.get()
    self.metrics.timer("mycroft.recognizer.audio.length_s",
                       self._audio_length(audio))
    self.queue.task_done()
    timer.start()
    if self.state.sleeping:
        hyp = self.wakeup_recognizer.transcribe(audio.get_wav_data(),
                                                metrics=self.metrics)
        if hyp and hyp.hypstr:
            logger.debug("sleeping recognition: " + hyp.hypstr)
        if hyp and hyp.hypstr.lower().find("wake up") >= 0:
            SessionManager.touch()
            self.state.sleeping = False
            self.__speak("I'm awake.")  # TODO: Localization
            self.metrics.increment("mycroft.wakeup")
    else:
        if not self.state.skip_wakeword:
            hyp = self.ww_recognizer.transcribe(audio.get_wav_data(),
                                                metrics=self.metrics)
        if hyp and hyp.hypstr.lower().find("mycroft") >= 0:
            extractor = WakewordExtractor(audio, self.ww_recognizer,
                                          self.metrics)
            timer.lap()
            extractor.calculate_range()
            self.metrics.timer("mycroft.recognizer.extractor.time_s",
                               timer.lap())
            audio_before = extractor.get_audio_data_before()
            self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                               self._audio_length(audio_before))
            audio_after = extractor.get_audio_data_after()
            self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                               self._audio_length(audio_after))
            SessionManager.touch()
            payload = {
                'utterance': hyp.hypstr,
                'session': SessionManager.get().session_id,
                'pos_begin': int(extractor.range.begin),
                'pos_end': int(extractor.range.end)
            }
            self.emitter.emit("recognizer_loop:wakeword", payload)
            try:
                self.transcribe([audio_before, audio_after])
            except sr.UnknownValueError:
                self.__speak("Go ahead")
                self.state.skip_wakeword = True
            self.metrics.increment("mycroft.wakeword")
        elif self.state.skip_wakeword:
            SessionManager.touch()
            try:
                self.transcribe([audio])
            except sr.UnknownValueError:
                logger.warning("Speech Recognition could not understand "
                               "audio")
                self.__speak("Sorry, I didn't catch that.")
                self.metrics.increment("mycroft.recognizer.error")
            self.state.skip_wakeword = False
        else:
            self.metrics.clear()
    self.metrics.flush()
def process_audio(self, audio):
    SessionManager.touch()
    payload = {
        'utterance': self.mycroft_recognizer.key_phrase,
        'session': SessionManager.get().session_id,
    }
    self.emitter.emit("recognizer_loop:wakeword", payload)
    try:
        self.transcribe([audio])
    except sr.UnknownValueError:
        # TODO: Localization
        logger.warning("Speech Recognition could not understand audio")
        self.__speak("Sorry, I didn't catch that.")
        bus = dbus.SessionBus()
        remote_object = bus.get_object(
            "com.mycroftkde.KDEPlasmoid",
            "/ComMycroftkdeKDEPlasmoidInterface")
        setText = remote_object.setText(
            "Sorry, I didn't catch that.",
            dbus_interface="com.mycroftkde.KDEPlasmoid")
def _compile_metadata(self):
    ww_module = self.wake_word_recognizer.__class__.__name__
    if ww_module == 'PreciseHotword':
        model_path = self.wake_word_recognizer.precise_model
        with open(model_path, 'rb') as f:
            model_hash = md5(f.read()).hexdigest()
    else:
        model_hash = '0'

    return {
        'name': self.wake_word_name.replace(' ', '-'),
        'engine': md5(ww_module.encode('utf-8')).hexdigest(),
        'time': str(int(1000 * get_time())),
        'sessionId': SessionManager.get().session_id,
        'accountId': self.account_id,
        'model': str(model_hash)
    }
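# For reference: the dictionary returned by _compile_metadata has a stable
# key set, which the saving code further down joins in sorted-key order to
# build a filename. Illustrative values only; the real ones come from the
# recognizer, the clock, and the session.
from hashlib import md5

mtd = {
    'name': 'hey-mycroft',
    'engine': md5('PreciseHotword'.encode('utf-8')).hexdigest(),
    'time': '1546300800000',
    'sessionId': 'session-1234',
    'accountId': 'account-5678',
    'model': '0'
}
# Sorted-key join used when a wake word sample is saved to disk:
fn = '_'.join(str(mtd[k]) for k in sorted(mtd)) + '.wav'
print(fn)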
def process(self, audio):
    path = pathlib.Path().absolute()
    settings_path = str(path) + '/mycroft/client/speech/set_config.txt'
    if self._audio_length(audio) >= self.MIN_AUDIO_SIZE:
        stopwatch = Stopwatch()
        with stopwatch:
            transcription = self.transcribe(audio)

        # Derive speaking-rate/volume overrides from keywords in the
        # utterance (guard against a failed transcription being None)
        settings = {"rate": " '1.0' ", "volume": " '80%' "}
        text = transcription or ""
        # speed
        if "quickly" in text:
            settings["rate"] = " '1.6' "
        if "slowly" in text:
            settings["rate"] = " '.6' "
        # volume
        if "loudly" in text:
            settings["volume"] = " '100%' "
        if "softly" in text:
            settings["volume"] = " '50%' "
        with open(settings_path, 'w') as settings_file:
            settings_file.write(str(settings))

        if transcription:
            ident = str(stopwatch.timestamp) + str(hash(transcription))
            # STT succeeded, send the transcribed speech on for processing
            payload = {
                'utterances': [transcription],
                'lang': self.stt.lang,
                'session': SessionManager.get().session_id,
                'ident': ident
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', [transcription])
            # Report timing metrics
            report_timing(ident, 'stt', stopwatch,
                          {'transcription': transcription,
                           'stt': self.stt.__class__.__name__})
        else:
            ident = str(stopwatch.timestamp)
    else:
        LOG.warning("Audio too short to be processed")
def transcribe(self, audio):
    LOG.debug("Transcribing audio")
    text = None
    try:
        # Invoke the STT engine on the audio clip
        text = self.stt.execute(audio).lower().strip()
        LOG.debug("STT: ---------> " + text)
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except ConnectionError as e:
        LOG.error("Connection Error: {0}".format(e))
        self.emitter.emit("recognizer_loop:no_internet")
    except HTTPError as e:
        if e.response.status_code == 401:
            text = "pair my device"  # phrase to start the pairing process
            LOG.warning("Access Denied at mycroft.ai")
    except Exception as e:
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
    if text:
        # STT succeeded, send the transcribed speech on for processing
        LOG.debug("I understood you said: " + text)
        if text == "tell me more":
            LOG.info("found tell me more in listener")
            # Rewrite the utterance using the previously stored hot word
            with open("hotWordFile.txt", "r+") as hotWordTemp:
                prevHotWord = hotWordTemp.read()
                hotWordTemp.truncate(0)
            text = "tell me about " + prevHotWord
            LOG.debug("rewritten utterance: " + text)
        payload = {
            'utterances': [text],
            'lang': self.stt.lang,
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("recognizer_loop:utterance", payload)
        self.metrics.attr('utterances', [text])
def transcribe(self, audio):
    text = None
    try:
        text = self.stt.execute(audio).lower().strip()
        LOG.debug("STT: " + text)
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except HTTPError as e:
        if e.response.status_code == 401:
            text = "pair my device"
            LOG.warning("Access Denied at mycroft.ai")
    except Exception as e:
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
        self.__speak("Sorry, I didn't catch that")
    if text:
        payload = {
            'utterances': [text],
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("recognizer_loop:utterance", payload)
        self.metrics.attr('utterances', [text])
def _upload_wake_word(self, audio):
    ww_module = self.wake_word_recognizer.__class__.__name__
    if ww_module == 'PreciseHotword':
        model_path = self.wake_word_recognizer.precise_model
        with open(model_path, 'rb') as f:
            model_hash = md5(f.read()).hexdigest()
    else:
        model_hash = '0'

    metadata = {
        'name': self.wake_word_name.replace(' ', '-'),
        'engine': md5(ww_module.encode('utf-8')).hexdigest(),
        'time': str(int(1000 * get_time())),
        'sessionId': SessionManager.get().session_id,
        'accountId': self.account_id,
        'model': str(model_hash)
    }
    requests.post(self.upload_url,
                  files={'audio': BytesIO(audio.get_wav_data()),
                         'metadata': StringIO(json.dumps(metadata))})
def _upload_wake_word(self, audio):
    ww_module = self.wake_word_recognizer.__class__.__name__
    if ww_module == 'PreciseHotword':
        model_path = self.wake_word_recognizer.precise_model
        with open(model_path, 'rb') as f:
            model_hash = md5(f.read()).hexdigest()
    else:
        model_hash = '0'

    metadata = {
        'name': self.wake_word_name.replace(' ', '-'),
        'engine': md5(ww_module.encode('utf-8')).hexdigest(),
        'time': str(int(1000 * get_time())),
        'sessionId': SessionManager.get().session_id,
        'accountId': self.account_id,
        'model': str(model_hash)
    }
    requests.post(
        self.upload_url,
        files={
            'audio': BytesIO(audio.get_wav_data()),
            'metadata': StringIO(json.dumps(metadata))
        }
    )
def __speak(self, utterance):
    payload = {
        'utterance': utterance,
        'session': SessionManager.get().session_id
    }
    self.emitter.emit("speak", Message("speak", metadata=payload))
def _wait_until_wake_word(self, source, sec_per_buffer):
    """Listen continuously on source until a wake word is spoken

    Args:
        source (AudioSource): Source producing the audio chunks
        sec_per_buffer (float): Fractional number of seconds in each chunk
    """
    num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                           source.SAMPLE_WIDTH)

    silence = b'\0' * num_silent_bytes

    # bytearray to store audio in
    byte_data = silence

    buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
    buffers_since_check = 0.0

    # Max bytes for byte_data before audio is removed from the front
    max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
    test_size = self.sec_to_bytes(self.TEST_WW_SEC, source)

    said_wake_word = False

    # Rolling buffer to track the audio energy (loudness) heard on
    # the source recently. An average audio energy is maintained
    # based on these levels.
    energies = []
    idx_energy = 0
    avg_energy = 0.0
    energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
    counter = 0

    while not said_wake_word and not self._stop_signaled:
        if self._skip_wake_word():
            break
        chunk = self.record_sound_chunk(source)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        if energy < self.energy_threshold * self.multiplier:
            self._adjust_threshold(energy, sec_per_buffer)

        if len(energies) < energy_avg_samples:
            # build the average
            energies.append(energy)
            avg_energy += float(energy) / energy_avg_samples
        else:
            # maintain the running average and rolling buffer
            avg_energy -= float(energies[idx_energy]) / energy_avg_samples
            avg_energy += float(energy) / energy_avg_samples
            energies[idx_energy] = energy
            idx_energy = (idx_energy + 1) % energy_avg_samples

            # maintain the threshold using average
            if energy < avg_energy * 1.5:
                if energy > self.energy_threshold:
                    # bump the threshold to just above this value
                    self.energy_threshold = energy * 1.2

        # Periodically output energy level stats. This can be used to
        # visualize the microphone input, e.g. a needle on a meter.
        if counter % 3:
            with open(self.mic_level_file, 'w') as f:
                f.write("Energy: cur=" + str(energy) + " thresh=" +
                        str(self.energy_threshold))
        counter += 1

        # At first, the buffer is empty and must fill up. After that
        # just drop the first chunk bytes to keep it the same size.
        needs_to_grow = len(byte_data) < max_size
        if needs_to_grow:
            byte_data += chunk
        else:
            # Remove beginning of audio and add new chunk to end
            byte_data = byte_data[len(chunk):] + chunk

        buffers_since_check += 1.0
        if buffers_since_check > buffers_per_check:
            buffers_since_check -= buffers_per_check
            chopped = byte_data[-test_size:] \
                if test_size < len(byte_data) else byte_data
            audio_data = chopped + silence
            said_wake_word = \
                self.wake_word_recognizer.found_wake_word(audio_data)

            # If a wake word was successfully detected, record the audio
            # to a temp file.
            if self.save_wake_words and said_wake_word:
                audio = self._create_audio_data(byte_data, source)
                stamp = str(int(1000 * get_time()))
                uid = SessionManager.get().session_id

                if not isdir(self.save_wake_words_dir):
                    mkdir(self.save_wake_words_dir)
                dr = self.save_wake_words_dir
                ww = self.wake_word_name.replace(' ', '-')
                filename = join(dr, ww + '.' + stamp + '.' + uid + '.wav')
                with open(filename, 'wb') as f:
                    f.write(audio.get_wav_data())

                if self.upload_config['enable'] or self.config['opt_in']:
                    t = Thread(target=self._upload_file, args=(filename,))
                    t.daemon = True
                    t.start()
def _wait_until_wake_word(self, source, sec_per_buffer, emitter):
    """Listen continuously on source until a wake word is spoken

    Args:
        source (AudioSource): Source producing the audio chunks
        sec_per_buffer (float): Fractional number of seconds in each chunk
    """
    num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                           source.SAMPLE_WIDTH)

    silence = get_silence(num_silent_bytes)

    # bytearray to store audio in
    byte_data = silence

    buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
    buffers_since_check = 0.0

    # Max bytes for byte_data before audio is removed from the front
    max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
    test_size = self.sec_to_bytes(self.TEST_WW_SEC, source)

    said_wake_word = False

    # Rolling buffer to track the audio energy (loudness) heard on
    # the source recently. An average audio energy is maintained
    # based on these levels.
    energies = []
    idx_energy = 0
    avg_energy = 0.0
    energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
    counter = 0

    # These are frames immediately after wake word is detected
    # that we want to keep to send to STT
    ww_frames = deque(maxlen=7)

    while not said_wake_word and not self._stop_signaled:
        if self._skip_wake_word():
            break
        chunk = self.record_sound_chunk(source)
        ww_frames.append(chunk)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        if energy < self.energy_threshold * self.multiplier:
            self._adjust_threshold(energy, sec_per_buffer)

        if len(energies) < energy_avg_samples:
            # build the average
            energies.append(energy)
            avg_energy += float(energy) / energy_avg_samples
        else:
            # maintain the running average and rolling buffer
            avg_energy -= float(energies[idx_energy]) / energy_avg_samples
            avg_energy += float(energy) / energy_avg_samples
            energies[idx_energy] = energy
            idx_energy = (idx_energy + 1) % energy_avg_samples

            # maintain the threshold using average
            if energy < avg_energy * 1.5:
                if energy > self.energy_threshold:
                    # bump the threshold to just above this value
                    self.energy_threshold = energy * 1.2

        # Periodically output energy level stats. This can be used to
        # visualize the microphone input, e.g. a needle on a meter.
        if counter % 3:
            self.write_mic_level(energy, source)
        counter += 1

        # At first, the buffer is empty and must fill up. After that
        # just drop the first chunk bytes to keep it the same size.
        needs_to_grow = len(byte_data) < max_size
        if needs_to_grow:
            byte_data += chunk
        else:
            # Remove beginning of audio and add new chunk to end
            byte_data = byte_data[len(chunk):] + chunk

        buffers_since_check += 1.0
        self.wake_word_recognizer.update(chunk)
        if buffers_since_check > buffers_per_check:
            buffers_since_check -= buffers_per_check
            chopped = byte_data[-test_size:] \
                if test_size < len(byte_data) else byte_data
            audio_data = chopped + silence
            said_wake_word = \
                self.wake_word_recognizer.found_wake_word(audio_data)

    # Save positive wake words as appropriate
    if said_wake_word:
        SessionManager.touch()
        payload = {
            'utterance': self.wake_word_name,
            'session': SessionManager.get().session_id,
        }
        emitter.emit("recognizer_loop:wakeword", payload)

        audio = None
        mtd = None
        if self.save_wake_words:
            # Save wake word locally
            audio = self._create_audio_data(byte_data, source)
            mtd = self._compile_metadata()
            if not isdir(self.saved_wake_words_dir):
                os.mkdir(self.saved_wake_words_dir)
            module = self.wake_word_recognizer.__class__.__name__

            fn = join(
                self.saved_wake_words_dir,
                '_'.join(str(mtd[k]) for k in sorted(mtd)) + '.wav'
            )
            with open(fn, 'wb') as f:
                f.write(audio.get_wav_data())

        if self.config['opt_in'] and not self.upload_disabled:
            # Upload wake word for opt_in people
            Thread(
                target=self._upload_wake_word, daemon=True,
                args=[audio or self._create_audio_data(byte_data, source),
                      mtd or self._compile_metadata()]
            ).start()

    return ww_frames
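# The running-average bookkeeping in the energy loop above (subtract the
# oldest sample's contribution, add the newest, overwrite in place, advance
# the index modulo the window) can be checked in isolation; a standalone
# sketch with no assumptions beyond the arithmetic itself.
def rolling_average_demo(samples, window):
    """Maintain a fixed-window running average the same way the energy
    loop does, without recomputing the sum on every step."""
    buf = []
    idx = 0
    avg = 0.0
    for value in samples:
        if len(buf) < window:
            # build the average
            buf.append(value)
            avg += float(value) / window
        else:
            # swap the oldest sample's contribution for the newest
            avg -= float(buf[idx]) / window
            avg += float(value) / window
            buf[idx] = value
            idx = (idx + 1) % window
        yield avg

# Once the window fills, the value tracks the true mean of the last
# `window` samples: 4.0, 6.0, 8.0, 10.0 for the tail of this sequence.
print(list(rolling_average_demo([2, 4, 6, 8, 10, 12], 3)))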
def wake_up(self, audio):
    if self.wakeup_recognizer.found_wake_word(audio.frame_data):
        SessionManager.touch()
        self.state.sleeping = False
        self.emitter.emit('recognizer_loop:awoken')
        self.metrics.increment("mycroft.wakeup")
def target():
    self.emitter.emit(
        "speak",
        Message("speak",
                metadata={'utterance': utterance,
                          'session': SessionManager.get().session_id}))
def try_consume_audio(self):
    timer = Stopwatch()
    hyp = None
    audio = self.queue.get()
    self.metrics.timer("mycroft.recognizer.audio.length_s",
                       self._audio_length(audio))
    self.queue.task_done()
    timer.start()
    if self.state.sleeping:
        hyp = self.wakeup_recognizer.transcribe(audio.get_wav_data(),
                                                metrics=self.metrics)
        if hyp and hyp.hypstr:
            logger.debug("sleeping recognition: " + hyp.hypstr)
        if hyp and hyp.hypstr.lower().find("wake up") >= 0:
            SessionManager.touch()
            self.state.sleeping = False
            self.__speak("I'm awake.")  # TODO: Localization
            self.metrics.increment("mycroft.wakeup")
    else:
        if not self.state.skip_wakeword:
            hyp = self.ww_recognizer.transcribe(audio.get_wav_data(),
                                                metrics=self.metrics)
        if hyp and hyp.hypstr.lower().find("mycroft") >= 0:
            extractor = WakewordExtractor(audio, self.ww_recognizer,
                                          self.metrics)
            timer.lap()
            extractor.calculate_range()
            self.metrics.timer("mycroft.recognizer.extractor.time_s",
                               timer.lap())
            audio_before = extractor.get_audio_data_before()
            self.metrics.timer(
                "mycroft.recognizer.audio_extracted.length_s",
                self._audio_length(audio_before))
            audio_after = extractor.get_audio_data_after()
            self.metrics.timer(
                "mycroft.recognizer.audio_extracted.length_s",
                self._audio_length(audio_after))
            SessionManager.touch()
            payload = {
                'utterance': hyp.hypstr,
                'session': SessionManager.get().session_id,
                'pos_begin': int(extractor.range.begin),
                'pos_end': int(extractor.range.end)
            }
            self.emitter.emit("recognizer_loop:wakeword", payload)
            try:
                self.transcribe([audio_before, audio_after])
            except sr.UnknownValueError:
                self.__speak("Go ahead")
                self.state.skip_wakeword = True
            self.metrics.increment("mycroft.wakeword")
        elif self.state.skip_wakeword:
            SessionManager.touch()
            try:
                self.transcribe([audio])
            except sr.UnknownValueError:
                logger.warning(
                    "Speech Recognition could not understand audio")
                self.__speak("Sorry, I didn't catch that.")
                self.metrics.increment("mycroft.recognizer.error")
            self.state.skip_wakeword = False
        else:
            self.metrics.clear()
    self.metrics.flush()
def _wait_until_wake_word(self, source, sec_per_buffer):
    """Listen continuously on source until a wake word is spoken

    Args:
        source (AudioSource): Source producing the audio chunks
        sec_per_buffer (float): Fractional number of seconds in each chunk
    """
    num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                           source.SAMPLE_WIDTH)

    silence = b'\0' * num_silent_bytes

    # bytearray to store audio in
    byte_data = silence

    buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
    buffers_since_check = 0.0

    # Max bytes for byte_data before audio is removed from the front
    max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
    test_size = self.sec_to_bytes(self.TEST_WW_SEC, source)

    said_wake_word = False

    # Rolling buffer to track the audio energy (loudness) heard on
    # the source recently. An average audio energy is maintained
    # based on these levels.
    energies = []
    idx_energy = 0
    avg_energy = 0.0
    energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
    counter = 0

    while not said_wake_word and not self._stop_signaled:
        if self._skip_wake_word():
            break
        chunk = self.record_sound_chunk(source)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        if energy < self.energy_threshold * self.multiplier:
            self._adjust_threshold(energy, sec_per_buffer)

        if len(energies) < energy_avg_samples:
            # build the average
            energies.append(energy)
            avg_energy += float(energy) / energy_avg_samples
        else:
            # maintain the running average and rolling buffer
            avg_energy -= float(energies[idx_energy]) / energy_avg_samples
            avg_energy += float(energy) / energy_avg_samples
            energies[idx_energy] = energy
            idx_energy = (idx_energy + 1) % energy_avg_samples

            # maintain the threshold using average
            if energy < avg_energy * 1.5:
                if energy > self.energy_threshold:
                    # bump the threshold to just above this value
                    self.energy_threshold = energy * 1.2

        # Periodically output energy level stats. This can be used to
        # visualize the microphone input, e.g. a needle on a meter.
        if counter % 3:
            with open(self.mic_level_file, 'w') as f:
                f.write("Energy: cur=" + str(energy) + " thresh=" +
                        str(self.energy_threshold))
        counter += 1

        # At first, the buffer is empty and must fill up. After that
        # just drop the first chunk bytes to keep it the same size.
        needs_to_grow = len(byte_data) < max_size
        if needs_to_grow:
            byte_data += chunk
        else:
            # Remove beginning of audio and add new chunk to end
            byte_data = byte_data[len(chunk):] + chunk

        buffers_since_check += 1.0
        if buffers_since_check > buffers_per_check:
            buffers_since_check -= buffers_per_check
            chopped = byte_data[-test_size:] \
                if test_size < len(byte_data) else byte_data
            audio_data = chopped + silence
            said_wake_word = \
                self.wake_word_recognizer.found_wake_word(audio_data)

            # If a wake word was successfully detected, record the audio
            # to a temp file.
            if self.save_wake_words and said_wake_word:
                audio = self._create_audio_data(byte_data, source)
                stamp = str(int(1000 * get_time()))
                uid = SessionManager.get().session_id

                if not isdir(self.save_wake_words_dir):
                    mkdir(self.save_wake_words_dir)
                dr = self.save_wake_words_dir
                ww = self.wake_word_name.replace(' ', '-')
                filename = join(dr, ww + '.' + stamp + '.' + uid + '.wav')
                with open(filename, 'wb') as f:
                    f.write(audio.get_wav_data())

                if self.upload_config['enable'] or self.config['opt_in']:
                    t = Thread(target=self._upload_file, args=(filename,))
                    t.daemon = True
                    t.start()
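# All the buffer sizing above rests on one conversion: a duration in seconds
# times sample rate times sample width gives a raw PCM byte count, which is
# presumably what sec_to_bytes wraps. A quick sanity check with typical
# (assumed) capture settings of 16 kHz, 16-bit mono:
def sec_to_bytes(sec, sample_rate, sample_width):
    """Convert a duration to a raw PCM byte count:
    seconds * samples-per-second * bytes-per-sample."""
    return int(sec * sample_rate * sample_width)

SAMPLE_RATE = 16000   # samples per second (assumed)
SAMPLE_WIDTH = 2      # bytes per sample, i.e. 16-bit audio (assumed)

print(sec_to_bytes(1.0, SAMPLE_RATE, SAMPLE_WIDTH))  # 32000 bytes
print(sec_to_bytes(3.0, SAMPLE_RATE, SAMPLE_WIDTH))  # 96000 bytes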