def _load_config(self):
    """Load configuration parameters from configuration."""
    config = CONFIGURATION
    self.config_core = config
    self.lang = config.get('lang')
    self.config = config.get('listener')
    rate = self.config.get('sample_rate')

    device_index = self.config.get('device_index')
    device_name = self.config.get('device_name')
    if not device_index and device_name:
        device_index = find_input_device(device_name)

    LOG.debug('Using microphone (None = default): ' + str(device_index))

    self.microphone = MutableMicrophone(device_index, rate,
                                        mute=self.mute_calls > 0)
    # TODO - localization
    self.wakeup_recognizer = self.create_wakeup_recognizer()
    self.hotword_engines = {}
    self.create_hotword_engines()
    self.responsive_recognizer = ResponsiveRecognizer(self.hotword_engines)
    self.state = RecognizerLoopState()
def on_preferences_changed(self, event):
    preferences = json.loads(event["data"]["preferences"])
    for pref in preferences:
        user_id = pref["user_id"]
        category = pref["category"]
        value = pref["value"]
        LOG.debug(category + ":" + value)
def execute(self, sentence, ident=None, listen=False):
    """Convert sentence to speech, preprocessing out unsupported ssml

    The method caches results if possible using the hash of the
    sentence.

    Arguments:
        sentence: Sentence to be spoken
        ident: Id reference to current interaction
        listen: True if listen should be triggered at the end of the
                utterance.
    """
    sentence = self.validate_ssml(sentence)

    chunks = self._preprocess_sentence(sentence)
    # Apply the listen flag to the last chunk, set the rest to False
    chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
              for i in range(len(chunks))]

    for sentence, l in chunks:
        key = str(hashlib.md5(
            sentence.encode('utf-8', 'ignore')).hexdigest())
        wav_file = os.path.join(self.cache_dir,
                                key + '.' + self.audio_ext)

        if os.path.exists(wav_file):
            LOG.debug("TTS cache hit")
            phonemes = self.load_phonemes(key)
        else:
            wav_file, phonemes = self.get_tts(sentence, wav_file)
            if phonemes:
                self.save_phonemes(key, phonemes)

        vis = self.viseme(phonemes) if phonemes else None
        self.queue.put((self.audio_ext, wav_file, vis, ident, l))
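# --- A minimal, standalone sketch of the caching scheme used by execute() above.
# The cache key is the md5 hex digest of the UTF-8 encoded chunk, and the cached
# audio lives at <cache_dir>/<key>.<audio_ext>; the cache_dir and audio_ext
# defaults below are assumptions for illustration only, not values from the
# original code.
import hashlib
import os


def example_tts_cache_path(sentence, cache_dir="/tmp/tts_cache", audio_ext="wav"):
    """Return the cache file path execute() would use for this sentence."""
    key = hashlib.md5(sentence.encode('utf-8', 'ignore')).hexdigest()
    return os.path.join(cache_dir, key + '.' + audio_ext)

# Usage: the same sentence always maps to the same cache file, e.g.
#     example_tts_cache_path("hello world")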
def speak(self, utterance):
    if self.debug:
        LOG.debug("[SPEAK] " + utterance)
    if self.color:
        print('\x1b[6;34;40m MYCROFT: ' + utterance + ' \x1b[0m')
    else:
        print('MYCROFT: ' + utterance)
def say(self, utterance):
    if self.debug:
        LOG.debug("[UTTERANCE] " + utterance)
    if self.color:
        print('\x1b[6;33;40m YOU: ' + utterance + ' \x1b[0m')
    else:
        print('YOU: ' + utterance)
def on_gui_message(self, payload):
    try:
        msg = json.loads(payload)
        if self.debug:
            LOG.debug("Msg: " + str(payload))
        msg_type = msg.get("type")
        if msg_type == "mycroft.session.set":
            skill = msg.get("namespace")
            self.skill = self.skill or skill
            data = msg.get("data")
            if skill not in self.vars:
                self.vars[skill] = {}
            for d in data:
                self.vars[skill][d] = data[d]
            self.on_new_gui_data(data)
        elif msg_type == "mycroft.session.list.insert":
            # Insert new namespace
            self.skill = msg['data'][0]['skill_id']
            self.loaded.insert(0, [self.skill, []])
        elif msg_type == "mycroft.gui.list.insert":
            # Insert a page in an existing namespace
            self.page = msg['data'][0]['url']
            pos = msg.get('position')
            # TODO sometimes throws IndexError: list index out of range
            # not invalid json, seems like either pos is out of range or
            # "mycroft.session.list.insert" message was missed
            # NOTE: only happened once with wiki skill, can't replicate
            self.loaded[0][1].insert(pos, self.page)
            # self.skill = self.loaded[0][0]
        elif msg_type == "mycroft.session.list.move":
            # Move the namespace at "pos" to the top of the stack
            pos = msg.get('from')
            self.loaded.insert(0, self.loaded.pop(pos))
        elif msg_type == "mycroft.session.list.remove":
            pos = msg.get('position')
            skill = msg.get("namespace")
            if self.skill == skill:
                self.skill = None
            self.loaded.pop(pos)
        elif msg_type == "mycroft.events.triggered":
            # Switch selected page of namespace
            skill = msg['namespace']
            self.skill = self.skill or skill
            pos = msg['data']['number']
            for n in self.loaded:
                if n[0] == skill:
                    # TODO sometimes pos throws
                    # IndexError: list index out of range
                    # occasionally happens with weather skill
                    # LOGS:
                    # 05:38:29.363 - __main__:on_gui_message:56 - DEBUG - Msg: {"type": "mycroft.events.triggered", "namespace": "mycroft-weather.mycroftai", "event_name": "page_gained_focus", "data": {"number": 1}}
                    # 05:38:29.364 - __main__:on_gui_message:90 - ERROR - list index out of range
                    self.page = n[1][pos]
        self._draw_buffer()
        self.on_message(msg)
    except Exception as e:
        if self.debug:
            LOG.exception(e)
        LOG.error("Invalid JSON: " + str(payload))
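# --- A minimal, standalone sketch of the namespace/page stack manipulated by
# on_gui_message() above. `loaded` is a list of [skill_id, [page_url, ...]]
# entries with the active namespace at index 0; the skill id and page url below
# are made up for illustration, only the list operations mirror the handler.
def example_namespace_stack():
    loaded = []

    # "mycroft.session.list.insert": push a new, empty namespace on top
    loaded.insert(0, ["mycroft-weather.mycroftai", []])

    # "mycroft.gui.list.insert": insert a page into the active namespace
    loaded[0][1].insert(0, "weather.qml")

    # "mycroft.session.list.move": move the namespace at `pos` to the top
    pos = 0
    loaded.insert(0, loaded.pop(pos))

    # "mycroft.session.list.remove": drop the namespace at `pos`
    loaded.pop(pos)
    return loaded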
def _register_object(self, message, object_name, register_func):
    name = message.data['name']
    samples = message.data['samples']

    LOG.debug('Registering ' + self.engine.name + ' ' + object_name +
              ': ' + name)

    register_func(name, samples)

    self.train_time = get_time() + self.train_delay
    self.wait_and_train()
def end_audio(self):
    """Helper function for child classes to call in execute().

    Sends the recognizer_loop:audio_output_end message (indicating
    that speaking is done for the moment) as well as triggering
    listening if it has been requested. It also checks if the cache
    directory needs cleaning to free up disk space.
    """
    LOG.debug("recognizer_loop:audio_output_end")
def load_phonemes(self, key):
    """Load phonemes from cache file.

    Arguments:
        key: Key identifying phoneme cache
    """
    pho_file = os.path.join(self.cache_dir, key + ".pho")
    if os.path.exists(pho_file):
        try:
            with open(pho_file, "r") as cachefile:
                phonemes = cachefile.read().strip()
            return phonemes
        except Exception:
            LOG.debug("Failed to read .PHO from cache")
    return None
def play_ogg(uri, play_cmd="ogg123 -q %1"):
    """ Play an ogg-file.

    Returns: subprocess.Popen object
    """
    play_ogg_cmd = str(play_cmd).split(" ")
    for index, cmd in enumerate(play_ogg_cmd):
        if cmd == "%1":
            play_ogg_cmd[index] = uri
    try:
        return subprocess.Popen(play_ogg_cmd)
    except Exception as e:
        LOG.error("Failed to launch OGG: {}".format(play_ogg_cmd))
        LOG.debug("Error: {}".format(repr(e)), exc_info=True)
        return None
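# --- A hedged usage sketch for play_ogg() above: the play_cmd template is split
# on spaces and the "%1" placeholder is replaced by the file path, so a custom
# player command can be supplied. The file path and the mpv command below are
# assumptions for illustration only.
#
#     proc = play_ogg("/tmp/chime.ogg")                      # runs: ogg123 -q /tmp/chime.ogg
#     proc = play_ogg("/tmp/chime.ogg", play_cmd="mpv %1")   # runs: mpv /tmp/chime.ogg
#     if proc is not None:
#         proc.wait()                                        # block until playback ends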
def handle_fallback(self, message):
    utt = message.data.get('utterance')
    LOG.debug(self.engine.name + " fallback attempt: " + utt)

    if not self.finished_training_event.is_set():
        LOG.debug('Waiting for training to finish...')
        self.finished_training_event.wait()

    data = self.engine.calc_intent(utt)
    if data["conf"] < 0.5:
        return False

    self.make_active()
    self.emitter.emit(message.reply(data["name"], data=data))
    return True
def transcribe(self, audio):
    def send_unknown_intent():
        """ Send message that nothing was transcribed. """
        self.emitter.emit('recognizer_loop:speech.recognition.unknown')

    try:
        # Invoke the STT engine on the audio clip
        text = self.stt.execute(audio)
        if text is not None:
            text = text.lower().strip()
            LOG.debug("STT: " + text)
        else:
            send_unknown_intent()
            LOG.info('no words were transcribed')

        if self.save_utterances:
            mtd = self._compile_metadata(text)

            filename = os.path.join(self.saved_utterances_dir, mtd["name"])
            with open(filename, 'wb') as f:
                f.write(audio.get_wav_data())

            filename = os.path.join(self.saved_utterances_dir,
                                    mtd["name"].replace(".wav", ".json"))
            with open(filename, 'w') as f:
                json.dump(mtd, f, indent=4)

        return text
    except sr.RequestError as e:
        LOG.error("Could not request Speech Recognition {0}".format(e))
    except ConnectionError as e:
        LOG.error("Connection Error: {0}".format(e))
        self.emitter.emit("recognizer_loop:no_internet")
    except RequestException as e:
        LOG.error(e.__class__.__name__ + ': ' + str(e))
    except Exception as e:
        send_unknown_intent()
        LOG.error(e)
        LOG.error("Speech Recognition could not understand audio")
        return None

    dialog_name = 'not connected to the internet'
    self.emitter.emit('speak', {'utterance': dialog_name})
def load_local(self, path):
    """ Load local json file into self.

    Args:
        path (str): file to load
    """
    path = expanduser(path)
    if exists(path) and isfile(path):
        try:
            config = load_commented_json(path)
            for key in config:
                self.__setitem__(key, config[key])
            LOG.debug("Configuration {} loaded".format(path))
        except Exception as e:
            LOG.error("Error loading configuration '{}'".format(path))
            LOG.error(repr(e))
    else:
        LOG.debug("Configuration '{}' not defined, skipping".format(path))
def get_cli_input(self):
    while True:
        if self.waiting:
            sleep(0.3)
            continue
        if self.debug:
            LOG.debug("waiting for input")
        if self.color:
            line = input("\x1b[6;33;40m INPUT: \x1b[0m")
        else:
            line = input("INPUT:")
        self.say(line)
        msg = {"data": {"utterances": [line], "lang": "en-us"},
               "type": "recognizer_loop:utterance",
               "context": {"source": self.client.peer,
                           "destination": "hive_mind",
                           "platform": platform}}
        self.send_to_hivemind_bus(msg)
        self.waiting = True
def _skip_wake_word(self):
    """Check if told programmatically to skip the wake word

    For example when we are in a dialog with the user.
    """
    # TODO: remove startListening signal check in 20.02
    if check_for_signal('startListening') or self._listen_triggered:
        return True

    # Pressing the Mark 1 button can start recording (unless
    # it is being used to mean 'stop' instead)
    if check_for_signal('buttonPress', 1):
        # give other processes time to consume this signal if
        # it was meant to be a 'stop'
        sleep(0.25)
        if check_for_signal('buttonPress'):
            # Signal is still here, assume it was intended to
            # begin recording
            LOG.debug("Button Pressed, wakeword not needed")
            return True

    return False
async def event_handler(self, event):
    event = json.loads(event)
    event_type = event.get("event", "")
    if event_type == "hello":
        self.on_connect(event)
    elif event_type == "status_change":
        self.on_status_change(event)
    elif event_type == "typing":
        self.on_typing(event)
    elif event_type == "posted":
        self.on_message(event)
    elif event_type == "channel_viewed":
        self.on_viewed(event)
    elif event_type == "preferences_changed":
        self.on_preferences_changed(event)
    elif event_type == "post_deleted":
        self.on_post_deleted(event)
    elif event_type == "user_added":
        self.on_user_added(event)
    elif event_type == "user_removed":
        self.on_user_removed(event)
    else:
        LOG.debug(event)
def listen(self, source, bus, stream=None):
    """Listens for chunks of audio that Mycroft should perform STT on.

    This will listen continuously for a wake-up-word, then return the
    audio chunk containing the spoken phrase that comes immediately
    afterwards.

    Args:
        source (AudioSource): Source producing the audio chunks
        bus (EventEmitter): Emitter for notifications of when recording
                            begins and ends.
        stream (AudioStreamHandler): Stream target that will receive
                                     chunks of the utterance audio while
                                     it is being recorded

    Returns:
        AudioData: audio with the user's utterance, minus the wake-up-word
    """
    assert isinstance(source, AudioSource), "Source must be an AudioSource"

    # bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH
    sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE

    # Every time a new 'listen()' request begins, reset the threshold
    # used for silence detection. This is as good of a reset point as
    # any, as we expect the user and Mycroft to not be talking.
    # NOTE: adjust_for_ambient_noise() doc claims it will stop early if
    #       speech is detected, but there is no code to actually do that.
    self.adjust_for_ambient_noise(source, 1.0)

    LOG.debug("Waiting for wake word...")
    self._wait_until_wake_word(source, sec_per_buffer, bus)
    self._listen_triggered = False
    if self._stop_signaled:
        return

    LOG.debug("Recording...")
    bus.emit("recognizer_loop:record_begin")
    frame_data = self._record_phrase(source, sec_per_buffer, stream)
    audio_data = self._create_audio_data(frame_data, source)
    bus.emit("recognizer_loop:record_end")

    LOG.debug("Thinking...")

    return audio_data
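# --- A worked example for the sec_per_buffer value computed in listen() above:
# with a chunk size of 1024 samples at a 16000 Hz sample rate (both values are
# assumptions for illustration, not taken from the original configuration),
# each buffer covers 1024 / 16000 = 0.064 seconds of audio, so roughly 16
# buffers are examined per second of input.
EXAMPLE_CHUNK = 1024          # assumed samples per buffer
EXAMPLE_SAMPLE_RATE = 16000   # assumed sample rate in Hz
EXAMPLE_SEC_PER_BUFFER = float(EXAMPLE_CHUNK) / EXAMPLE_SAMPLE_RATE  # 0.064 s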
def find_input_device(device_name):
    """ Find audio input device by name.

    Arguments:
        device_name: device name or regex pattern to match

    Returns:
        device_index (int) or None if device wasn't found
    """
    LOG.info('Searching for input device: {}'.format(device_name))
    LOG.debug('Devices: ')
    pa = pyaudio.PyAudio()
    pattern = re.compile(device_name)
    for device_index in range(pa.get_device_count()):
        dev = pa.get_device_info_by_index(device_index)
        LOG.debug(' {}'.format(dev['name']))
        if dev['maxInputChannels'] > 0 and pattern.match(dev['name']):
            LOG.debug(' ^-- matched')
            return device_index
    return None
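# --- A hedged usage sketch for find_input_device() above, mirroring how
# _load_config() uses it. The "USB" pattern and the 16000 Hz sample rate are
# assumptions for illustration, not values from the original code.
#
#     device_index = find_input_device("USB")   # regex-matched against device names
#     if device_index is not None:
#         mic = MutableMicrophone(device_index, 16000)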
def begin_audio(self):
    """Helper function for child classes to call in execute()"""
    # Create signals informing start of speech
    LOG.debug("recognizer_loop:audio_output_start")
def speak(self, utterance, channel_id, user_data):
    user = user_data["mattermost_username"]
    utterance = "@{} , ".format(user) + utterance
    LOG.debug("Sending message to channel " + channel_id)
    LOG.debug("Message: " + utterance)
    self.bot.send_message(channel_id, utterance)
def on_handled(self):
    if self.debug:
        LOG.debug("Request handled")
    self.waiting = False
def _wait_until_wake_word(self, source, sec_per_buffer, bus):
    """Listen continuously on source until a wake word is spoken

    Args:
        source (AudioSource): Source producing the audio chunks
        sec_per_buffer (float): Fractional number of seconds in each chunk
    """
    num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                           source.SAMPLE_WIDTH)

    silence = get_silence(num_silent_bytes)

    # bytearray to store audio in
    byte_data = silence

    buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
    buffers_since_check = 0.0

    # Max bytes for byte_data before audio is removed from the front
    max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)
    test_size = self.sec_to_bytes(self.TEST_WW_SEC, source)

    said_wake_word = False

    # Rolling buffer to track the audio energy (loudness) heard on
    # the source recently. An average audio energy is maintained
    # based on these levels.
    energies = []
    idx_energy = 0
    avg_energy = 0.0
    energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
    counter = 0

    # These are frames immediately after wake word is detected
    # that we want to keep to send to STT
    ww_frames = deque(maxlen=7)

    while not said_wake_word and not self._stop_signaled:
        if self._skip_wake_word():
            break
        chunk = self.record_sound_chunk(source)
        ww_frames.append(chunk)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        if energy < self.energy_threshold * self.multiplier:
            self._adjust_threshold(energy, sec_per_buffer)

        if len(energies) < energy_avg_samples:
            # build the average
            energies.append(energy)
            avg_energy += float(energy) / energy_avg_samples
        else:
            # maintain the running average and rolling buffer
            avg_energy -= float(energies[idx_energy]) / energy_avg_samples
            avg_energy += float(energy) / energy_avg_samples
            energies[idx_energy] = energy
            idx_energy = (idx_energy + 1) % energy_avg_samples

            # maintain the threshold using average
            if energy < avg_energy * 1.5:
                if energy > self.energy_threshold:
                    # bump the threshold to just above this value
                    self.energy_threshold = energy * 1.2

        counter += 1

        # At first, the buffer is empty and must fill up. After that
        # just drop the first chunk bytes to keep it the same size.
        needs_to_grow = len(byte_data) < max_size
        if needs_to_grow:
            byte_data += chunk
        else:
            # Remove beginning of audio and add new chunk to end
            byte_data = byte_data[len(chunk):] + chunk

        buffers_since_check += 1.0
        self.feed_hotwords(chunk)
        if buffers_since_check > buffers_per_check:
            buffers_since_check -= buffers_per_check
            chopped = byte_data[-test_size:] \
                if test_size < len(byte_data) else byte_data
            audio_data = chopped + silence
            said_hot_word = False
            for hotword in self.check_for_hotwords(audio_data, bus):
                said_hot_word = True
                engine = self.hotword_engines[hotword]["engine"]
                sound = self.hotword_engines[hotword]["sound"]
                utterance = self.hotword_engines[hotword]["utterance"]
                listen = self.hotword_engines[hotword]["listen"]

                LOG.debug("Hot Word: " + hotword)
                # If enabled, play a wave file with a short sound to audibly
                # indicate hotword was detected.
                if sound:
                    try:
                        audio_file = resolve_resource_file(sound)
                        source.mute()
                        if audio_file.endswith(".wav"):
                            play_wav(audio_file).wait()
                        elif audio_file.endswith(".mp3"):
                            play_mp3(audio_file).wait()
                        elif audio_file.endswith(".ogg"):
                            play_ogg(audio_file).wait()
                        else:
                            play_audio(audio_file).wait()
                        source.unmute()
                    except Exception as e:
                        LOG.warning(e)

                # Hot Word succeeded
                payload = {
                    'hotword': hotword,
                    'start_listening': listen,
                    'sound': sound,
                    "engine": engine.__class__.__name__
                }
                bus.emit("recognizer_loop:hotword", payload)

                if utterance:
                    # send the transcribed word on for processing
                    payload = {'utterances': [utterance]}
                    bus.emit("recognizer_loop:utterance", payload)

                audio = None
                mtd = self._compile_metadata(hotword)
                if self.save_wake_words:
                    # Save wake word locally
                    audio = self._create_audio_data(byte_data, source)

                    if not isdir(self.saved_wake_words_dir):
                        os.mkdir(self.saved_wake_words_dir)

                    fn = join(
                        self.saved_wake_words_dir,
                        '_'.join(str(mtd[k]) for k in sorted(mtd)) + '.wav')
                    with open(fn, 'wb') as f:
                        f.write(audio.get_wav_data())

                    fn = join(
                        self.saved_wake_words_dir,
                        '_'.join(str(mtd[k]) for k in sorted(mtd)) + '.json')
                    with open(fn, 'w') as f:
                        json.dump(mtd, f, indent=4)

                if listen:
                    said_wake_word = True

            if said_hot_word:
                # reset bytearray to store wake word audio in, else many
                # serial detections
                byte_data = silence
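# --- A standalone sketch of the rolling energy average maintained in
# _wait_until_wake_word() above: a fixed-size window of recent chunk energies is
# kept, and once the window is full the average is updated by subtracting the
# value being overwritten and adding the new one. The window size and the sample
# energies in the usage note are made-up illustration values, not from the
# original code.
def example_rolling_average(energy_values, window_size=5):
    """Yield the running average after each new energy value."""
    energies = []
    idx_energy = 0
    avg_energy = 0.0
    for energy in energy_values:
        if len(energies) < window_size:
            # build the average
            energies.append(energy)
            avg_energy += float(energy) / window_size
        else:
            # maintain the running average and rolling buffer
            avg_energy -= float(energies[idx_energy]) / window_size
            avg_energy += float(energy) / window_size
            energies[idx_energy] = energy
            idx_energy = (idx_energy + 1) % window_size
        yield avg_energy

# Usage: list(example_rolling_average([1, 2, 3, 4, 5, 6], window_size=5))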
def trigger_listen(self):
    """Externally trigger listening."""
    LOG.debug('Listen triggered from external source.')
    self._listen_triggered = True
def connect(self):
    LOG.debug("Announcing GUI")
    self.bus.on('mycroft.gui.port', self._connect_to_gui)
    self.bus.emit(Message("mycroft.gui.connected",
                          {"gui_id": self.gui_id}))
    self.connected = True
def on_open(self, message):
    LOG.debug("Gui connection open")