import wave

from pocketsphinx import Decoder


def audio2phoneme(audio_file):
    # Measure the clip length so frame indices can be mapped to seconds.
    wave_read = wave.open(audio_file, 'rb')
    length = wave_read.getnframes() / wave_read.getframerate()
    wave_read.close()

    # Decode streaming data (`config` is not defined in this snippet; a
    # phoneme-decoding config is assumed -- see the sketch below). Note that
    # reading the file raw also feeds the WAV header bytes to the decoder.
    decoder = Decoder(config)
    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        while f.readinto(buf):
            decoder.process_raw(buf, False, False)
        decoder.end_utt()

    # Map each segment's frame range onto the clip's duration in seconds.
    nframes = decoder.n_frames()
    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        phonemes.append((
            seg.word,
            start_frame / nframes * length,
            end_frame / nframes * length))
    return phonemes
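# Usage sketch for audio2phoneme above. The `config` it references is an
# assumption: an allphone (phoneme-level) search, under which seg.word holds
# phoneme labels. `en-us-phone.lm.bin` ships with the full PocketSphinx
# distribution, not necessarily the pip package, so the paths are illustrative.
import os
from pocketsphinx import Decoder, get_model_path

config = Decoder.default_config()
config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
config.set_string('-allphone', os.path.join(get_model_path(), 'en-us-phone.lm.bin'))
config.set_float('-lw', 2.0)
config.set_string('-logfn', '/dev/null')

# for phone, start_s, end_s in audio2phoneme('utterance.wav'):
#     print('%-4s %.2f %.2f' % (phone, start_s, end_s))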
# Excerpt; assumes: import os, time; from pocketsphinx import Decoder; BASEDIR defined.
class LocalRecognizer(object):
    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict', os.path.join(BASEDIR, 'model', self.lang,
                                                'mycroft-en-us.dict'))
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float('1e-45'))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)

    def transcribe(self, byte_data, metrics=None):
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, hypothesis):
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
# Excerpt; assumes: import logging; from pocketsphinx import Decoder; MODELDIR
# defined; self.stream_in is a context-managed audio stream with a .generator().
def speech_recog(self, model):
    # Create a decoder with a certain model
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    # config.set_string('-kws', MODELDIR + model + '.txt')
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)

    decoder.start_utt()
    recog_text = ''
    with self.stream_in as stream:
        audio_generator = stream.generator()
        for content in audio_generator:
            decoder.process_raw(content, False, False)
            if decoder.hyp() and decoder.hyp().hypstr != '':
                recog_text += decoder.hyp().hypstr
            if len(recog_text) > 1:
                decoder.end_utt()
                logging.info("recog text: %s", recog_text)
                return recog_text
    return recog_text
# Excerpt; assumes: import os, time, tempfile; from pocketsphinx import Decoder;
# BASEDIR defined.
class LocalRecognizer(object):
    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        dict_name = self.create_dict(key_phrase, phonemes)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        # Write a temporary pronunciation dictionary: one line per word,
        # with '.' separating the phoneme group for each word.
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, hypothesis):
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
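# Usage sketch for the keyphrase recognizer above (assumptions: BASEDIR points
# at a directory containing model/en-us/hmm, the raw file holds 16 kHz 16-bit
# mono PCM, and the phoneme string uses CMUdict symbols with '.' separating
# the per-word groups).
recognizer = LocalRecognizer('hey mycroft', 'HH EY . M AY K R AO F T', 1e-90)
with open('sample.raw', 'rb') as f:
    hypothesis = recognizer.transcribe(f.read())
if recognizer.found_wake_word(hypothesis):
    print('wake word heard')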
# Excerpt; assumes: import os, sys, pyaudio; from pocketsphinx import Decoder;
# HMM, LM, DIC and BUFFER defined as module constants.
def main():
    abspath = os.path.dirname(os.path.abspath(__file__))
    abspath = os.path.join(abspath, '..')
    model_dir = os.path.join(abspath, 'model')
    hmm = os.path.join(model_dir, HMM)
    lm = os.path.join(model_dir, LM)
    dic = os.path.join(model_dir, DIC)

    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dic)
    config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=BUFFER)
    stream.start_stream()

    in_speech_bf = True
    decoder.start_utt()
    while True:
        buf = stream.read(BUFFER)
        if buf:
            decoder.process_raw(buf, False, False)
            if decoder.get_in_speech():
                sys.stdout.write('.')
                sys.stdout.flush()
            if decoder.get_in_speech() == in_speech_bf:
                continue
            # Speech/silence state changed.
            in_speech_bf = decoder.get_in_speech()
            if in_speech_bf:
                continue
            # Transition from speech to silence: finish the utterance.
            decoder.end_utt()
            try:
                if decoder.hyp().hypstr != '':
                    print('You said:', decoder.hyp().hypstr)
            except AttributeError:
                # hyp() returns None when nothing was recognized.
                pass
            decoder.start_utt()
        else:
            break
    decoder.end_utt()
    # Note: hyp() may be None here as well.
    print('An error occurred:', decoder.hyp().hypstr)
# Excerpt, ported from Python 2 (print function, bytes buffer, exception args);
# assumes: import time, logging, pyaudio; from pocketsphinx import Decoder;
# MODELDIR and CHUNK_SIZE defined; self.stream_in is a PyAudio input stream.
def speech_recog(self, model):
    # Create a decoder with a certain model
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    # config.set_string('-kws', MODELDIR + model + '.txt')
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)

    decoder.start_utt()
    tstamp = time.time()
    recog_text = ''
    while len(recog_text) < 1:
        try:
            buf = self.stream_in.read(CHUNK_SIZE)
            logging.info("actual voice")
            decoder.process_raw(buf, False, False)
            if decoder.hyp().hypstr != '':
                recog_text += decoder.hyp().hypstr
                print("text: " + decoder.hyp().hypstr)
                tstamp = time.time()
        except IOError as ex:
            if ex.args[1] != pyaudio.paInputOverflowed:
                raise
            buf = b'\x00' * CHUNK_SIZE  # white noise
            logging.info("white noise")
        except AttributeError:
            # decoder.hyp() is None until something is recognized.
            pass
    decoder.end_utt()
    logging.info("recog text: " + recog_text)
    return recog_text
# Excerpt from a keyword-spotting loop; assumes: import os, sys, pyaudio;
# `decoder` already configured with a -keyphrase search, start_utt() already
# called, and script_dir defined. The opening `if` is reconstructed from the
# dangling `else:`: read from a file when one is given, else from the mic.
if len(sys.argv) > 1:
    stream = open(sys.argv[1], "rb")
else:
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)
    stream.start_stream()

print('start...')
while True:
    buf = stream.read(1024)
    if buf:
        decoder.process_raw(buf, False, False)
    else:
        break
    hypothesis = decoder.hyp()
    if hypothesis:
        print('\nhypothesis: %s, score: %d' % (hypothesis.hypstr,
                                               hypothesis.best_score))
        print([(seg.word, seg.prob, seg.start_frame, seg.end_frame)
               for seg in decoder.seg()])
        print("Detected keyword, restarting search")
        os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3'))
        print('restart...')
        decoder.end_utt()
        decoder.start_utt()
        print('ok')
        # break
stream.close()
# Excerpt; assumes: import os, alsaaudio; from pocketsphinx import Decoder,
# get_model_path; `events` is a project-local event bus. The Python 2
# `.iteritems()` has been replaced with `.items()`.
def detect(self):
    # create decoders on the fly
    if not self.decoders:
        self.decoders = []
        for id, phrase in self.config['triggers'].items():
            config = Decoder.default_config()
            # set recognition model to US English
            config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
            config.set_string('-dict', os.path.join(get_model_path(),
                                                    'cmudict-en-us.dict'))
            # specify recognition key phrase
            config.set_string('-keyphrase', phrase)
            config.set_float('-kws_threshold', 1e-5)
            # hide the VERY verbose logging information
            # if not self.config['debug']:
            config.set_string('-logfn', '/dev/null')
            decoder = Decoder(config)
            decoder.id = id
            self.decoders.append(decoder)

    events.fire('detection_started')

    # start decoding
    for decoder in self.decoders:
        decoder.start_utt()

    pcm = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                        self.config['device'])
    pcm.setchannels(1)
    pcm.setrate(16000)
    pcm.setformat(alsaaudio.PCM_FORMAT_S16_LE)
    pcm.setperiodsize(1024)

    phrase = None
    triggered = False
    while not triggered:
        _, buffer = pcm.read()
        for decoder in self.decoders:
            decoder.process_raw(buffer, False, False)
            triggered = decoder.hyp() is not None
            if triggered:
                phrase = decoder.id
                break

    pcm.close()
    pcm = None

    for decoder in self.decoders:
        decoder.end_utt()

    events.fire('detection_fullfilled', id=phrase)
# Excerpt; assumes: import os, logging, platform, threading;
# from pocketsphinx import Decoder, get_model_path; project-local `triggers`
# and BaseTrigger.
class PocketsphinxTrigger(BaseTrigger):

    type = triggers.TYPES.VOICE

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    _capture = None

    def __init__(self, config, trigger_callback, capture):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')
        self._capture = capture
        self._enabled_lock = threading.Event()
        # self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        # PocketSphinx configuration
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
        ps_config.set_string('-dict', os.path.join(get_model_path(),
                                                   'cmudict-en-us.dict'))

        # Specify recognition key phrase
        ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            null_path = '/dev/null'
            if platform.system() == 'Windows':
                null_path = 'nul'
            ps_config.set_string('-logfn', null_path)

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        thread = threading.Thread(target=self.thread, args=())
        thread.setDaemon(True)
        thread.start()

    def thread(self):
        while True:
            self._enabled_lock.wait()

            self._capture.handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)

            self._decoder.start_utt()

            triggered = False
            while not triggered:
                if not self._enabled_lock.isSet():
                    break

                # Read from microphone
                data = self._capture.handle_read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(data, False, False)
                triggered = self._decoder.hyp() is not None

            self._capture.handle_release()
            self._decoder.end_utt()

            if triggered:
                self._trigger_callback(self)

    def enable(self):
        self._enabled_lock.set()

    def disable(self):
        self._enabled_lock.clear()
# Excerpt; assumes: import os, logging, threading, alsaaudio;
# from pocketsphinx import Decoder, get_model_path; project-local `triggers`
# and BaseTrigger.
class PocketsphinxTrigger(BaseTrigger):

    type = triggers.TYPES.VOICE

    def __init__(self, config, trigger_callback):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')
        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        # PocketSphinx configuration
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path(),
                                                  self._tconfig['language']))
        ps_config.set_string('-dict', os.path.join(get_model_path(),
                                                   self._tconfig['dictionary']))

        # Specify recognition key phrase
        # ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        # ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))

        ### Multiple Hotwords
        # ps_config.set_string('-inmic', 'yes')
        ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        thread = threading.Thread(target=self.thread, args=())
        thread.setDaemon(True)
        thread.start()

    def thread(self):
        while True:
            self._enabled_lock.wait()

            # Enable reading microphone raw data
            inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                self._config['sound']['input_device'])
            inp.setchannels(1)
            inp.setrate(16000)
            inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
            inp.setperiodsize(1024)

            self._decoder.start_utt()

            triggered = False
            # assistantTriggered = False
            voice_command = ""
            while not triggered:
                if not self._enabled_lock.isSet():
                    break

                # Read from microphone
                _, buf = inp.read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(buf, False, False)
                triggered = self._decoder.hyp() is not None

            # To avoid overflows close the microphone connection
            inp.close()

            self._decoder.end_utt()
            self._disabled_sync_lock.set()

            if triggered:
                ### Assistant Starts Here
                voice_command = self._decoder.hyp().hypstr
                self._trigger_callback(self, voice_command)
                ###

    def enable(self):
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()
# Excerpt; assumes: import os, logging, pyaudio; from pocketsphinx import Decoder.
class PocketGrammar(object):

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    HMM = 'cmusphinx-5prealpha-en-us-ptm-2.0/'
    DIC = 'dictionary.dic'
    GRAMMAR = 'grammar.jsgf'

    def __init__(self, device_index=0, model_path=None):
        self._decoder = None
        self._pa = None
        self._device_no = device_index
        self._model_path = model_path

        # PocketSphinx configuration
        logging.info('Grammar file:' + os.path.join(model_path, self.GRAMMAR))
        ps_config = Decoder.default_config()

        # Set recognition model to ...
        ps_config.set_string('-hmm', os.path.join(model_path, self.HMM))
        ps_config.set_string('-dict', os.path.join(model_path, self.DIC))
        ps_config.set_string('-jsgf', os.path.join(model_path, self.GRAMMAR))
        ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):
        # init microphone
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        self._decoder.start_utt()

        # from speech to silence or from silence to speech?
        utteranceStarted = False
        triggered = False

        while not triggered:
            # Read from microphone and process
            data = self._handle_read(self.AUDIO_CHUNK_SIZE)
            self._decoder.process_raw(data, False, False)

            # checks for transition from silence to speech
            inSpeech = self._decoder.get_in_speech()
            if inSpeech and not utteranceStarted:
                utteranceStarted = True
                # The original logged "Silence" here, which was misleading.
                logging.debug("Speech started")

            # checks for the transition from speech to silence
            if not inSpeech and utteranceStarted:
                hypothesis = self._decoder.hyp()
                triggered = hypothesis is not None

        # close microphone
        self._handle_release()
        self._decoder.end_utt()

        if triggered:
            return hypothesis.hypstr
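# Usage sketch for PocketGrammar above (assumptions: the model directory
# contains the HMM folder, dictionary.dic and grammar.jsgf named in the class;
# the path is illustrative).
grammar = PocketGrammar(device_index=0, model_path='models')
phrase = grammar.getHypothesys()  # blocks until speech matching the grammar ends
if phrase:
    print('Heard:', phrase)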
# Excerpt; assumes: import os, logging, pyaudio; from pocketsphinx import Decoder;
# get_model_path_keyword() defined in the project.
class PocketKeyword(object):

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    def __init__(self, phrase, threshold, device_index=0):
        self._decoder = None
        self._pa = None
        self._device_no = device_index
        self._phrase = phrase
        self._threshold = float(threshold)

        # PocketSphinx configuration
        logging.info('Phrase: ' + phrase + ' Threshold: ' + str(threshold))
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path_keyword(), 'en-us'))
        ps_config.set_string('-dict', os.path.join(get_model_path_keyword(),
                                                   'cmudict-en-us.dict'))

        # Specify recognition key phrase
        ps_config.set_string('-keyphrase', self._phrase)
        ps_config.set_float('-kws_threshold', self._threshold)
        ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):
        # init microphone
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        self._decoder.start_utt()

        triggered = False
        while not triggered:
            # Read from microphone and process
            data = self._handle_read(self.AUDIO_CHUNK_SIZE)
            self._decoder.process_raw(data, False, False)

            # best guess from CMU Sphinx STT
            hypothesis = self._decoder.hyp()
            triggered = hypothesis is not None

        # close microphone
        self._handle_release()
        self._decoder.end_utt()

        if triggered:
            return hypothesis.hypstr
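# Usage sketch for PocketKeyword above: block until the key phrase is spotted,
# then hand off to command recognition (phrase and threshold are illustrative).
keyword = PocketKeyword('hey computer', 1e-20)
if keyword.getHypothesys():
    print('wake word detected, start command recognition')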
# Excerpt; assumes: import os; from typing import Any, Generator, List;
# from google.cloud import speech; from google.oauth2 import service_account;
# from pocketsphinx import Decoder; project-local Interpreter and Identifier.
class SpeechRecognizer(Interpreter):

    def __init__(self, name: str, sr: str = "pocketsphinx"):
        super().__init__(name, True)
        self.logger = self.get_logger()
        self.sr = sr
        self.current_data = []
        self.setup()

    def setup(self) -> None:
        self.RATE = int(os.getenv("RATE"))
        self.CHUNK = int(os.getenv("CHUNK"))
        self.setup_pocketsphinx()
        if self.sr == "googlespeech":
            self.setup_googlespeech()

    def setup_pocketsphinx(self) -> None:
        self.logger.info("Setting up PocketSphinx.")
        self.MODELDIR = "resources/model"
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'es-es'))
        config.set_string('-lm', os.path.join(self.MODELDIR, 'es-es.lm'))
        config.set_string('-dict', os.path.join(self.MODELDIR, 'es.dict'))
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)
        self.prev_buf_is_speech = False
        self.decoder.start_utt()
        self.logger.info("Done setting up PocketSphinx.")

    def setup_googlespeech(self) -> None:
        self.logger.info("Setting up Google Speech.")
        credentials = service_account.Credentials.from_service_account_file(
            'resources/keys/credentials.json')
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            language_code='es-PE',
            sample_rate_hertz=self.RATE,
        )
        self.client = speech.SpeechClient(credentials=credentials)
        self.streaming_config = speech.types.StreamingRecognitionConfig(
            config=config)
        self.logger.info("Done setting up Google Speech.")

    def get_destinations_ID(self, raw_data) -> List[Identifier]:
        return [self.destinations_ID[0]]

    def preprocess(self, raw_data):
        """Filtering"""
        return raw_data

    def query_gs(self):
        requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in self.current_data)
        responses = self.client.streaming_recognize(
            config=self.streaming_config, requests=requests)
        try:
            response = next(responses)
            data = response.results[0].alternatives[0].transcript
            conf = response.results[0].alternatives[0].confidence
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        self.current_data.clear()
        return data, conf

    def query_ps(self):
        try:
            data = self.decoder.hyp().hypstr
            conf = self.decoder.hyp().best_score
            if data == "":
                data = None
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        return data, conf

    def process(self, raw_data) -> Generator:
        self.decoder.process_raw(raw_data, False, False)
        cur_buf_is_speech = self.decoder.get_in_speech()
        data = None
        self.logger.info(
            f"prev: {self.prev_buf_is_speech}, current: {cur_buf_is_speech}")

        force_speech = False
        if raw_data == bytes([0] * self.CHUNK * 16):
            force_speech = True
            self.logger.info("RECEIVED FORCE STOP")

        if force_speech or (self.prev_buf_is_speech and not cur_buf_is_speech):
            # No longer in speech -> stop listening and process
            self.logger.info("No longer in speech, yielding True.")
            yield True
            self.decoder.end_utt()
            if self.sr == "googlespeech":
                data, conf = self.query_gs()
            elif self.sr == "pocketsphinx":
                data, conf = self.query_ps()
            self.logger.info(
                f"{self.name}>> Heard DATA: '{data}' with confidence: {conf}.")
            self.decoder.start_utt()
            self.prev_buf_is_speech = cur_buf_is_speech
        elif not self.prev_buf_is_speech and cur_buf_is_speech:
            # Now in speech -> Start listening
            self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False
        elif self.prev_buf_is_speech and cur_buf_is_speech:
            # Still in speech -> Keep on listening
            self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False
        else:
            # Still in silence -> keep waiting
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False
        yield data
        return

    def pass_msg(self, msg: str) -> None:
        if msg == "RESUME":
            self.e.set()

    def dump_history(self, filename: str, data: List[Any]) -> None:
        pass
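# Driver sketch for SpeechRecognizer.process() above (assumptions: the RATE and
# CHUNK environment variables are set, and mic_chunks() is a hypothetical source
# of 16-bit mono PCM chunks). Per the generator, the first value yielded says
# whether the utterance just ended; the second carries the transcript or None.
recog = SpeechRecognizer('stt', sr='pocketsphinx')
for chunk in mic_chunks():
    results = recog.process(chunk)
    if next(results):                  # True -> speech just ended
        print('Heard:', next(results))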
# Excerpt; assumes: import os, logging, threading, alsaaudio;
# from pocketsphinx import Decoder, get_model_path; project-local `triggers`
# and BaseTrigger. The bare `except` around hypstr has been narrowed.
class PocketsphinxTrigger(BaseTrigger):

    type = triggers.TYPES.VOICE

    def __init__(self, config, trigger_callback):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')
        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        # PocketSphinx configuration
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path(),
                                                  self._tconfig['language']))
        ps_config.set_string('-dict', os.path.join(get_model_path(),
                                                   self._tconfig['dictionary']))

        # Specify recognition key phrase
        # ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        # ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))

        ### Multiple Hotwords
        # ps_config.set_string('-inmic', 'yes')
        ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        thread = threading.Thread(target=self.thread, args=())
        thread.setDaemon(True)
        thread.start()

    def thread(self):
        while True:
            self._enabled_lock.wait()

            # Enable reading microphone raw data
            inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                self._config['sound']['input_device'])
            inp.setchannels(1)
            inp.setrate(16000)
            inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
            inp.setperiodsize(1024)

            self._decoder.start_utt()

            triggered = False
            # assistantTriggered = False
            voice_command = ""
            while not triggered:
                if not self._enabled_lock.isSet():
                    break

                # Read from microphone
                _, buf = inp.read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(buf, False, False)
                triggered = self._decoder.hyp() is not None

            # To avoid overflows close the microphone connection
            inp.close()

            self._decoder.end_utt()
            self._disabled_sync_lock.set()

            if triggered:
                ### Assistant Starts Here
                try:
                    voice_command = self._decoder.hyp().hypstr
                except AttributeError:  # hyp() can be None if disabled mid-loop
                    voice_command = ""
                self._trigger_callback(self, voice_command)
                ###

    def enable(self):
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()
# Assumes: import os, time, logging, subprocess, yaml, pyaudio, sdnotify;
# from collections import defaultdict; from pocketsphinx import Decoder;
# calc_similarity() defined elsewhere in the project.
def main():
    """A main method that does a simple matching of sentences and executes scripts"""
    notifier = sdnotify.SystemdNotifier()

    # Load config first
    config_file = open(os.path.join(os.getcwd(), 'config.yaml'), 'r')
    config = yaml.safe_load(config_file)  # safe_load avoids arbitrary object construction
    interaction_timeout = int(config['interaction_timeout'])

    # Create Decoder config
    pocketsphinx_config = Decoder.default_config()
    pocketsphinx_config.set_string('-hmm', os.path.join(os.getcwd(), config['hmm_path']))
    pocketsphinx_config.set_string('-dict', os.path.join(os.getcwd(), config['dict_path']))
    pocketsphinx_config.set_string('-featparams', os.path.join(os.getcwd(), config['feat_params_path']))
    pocketsphinx_config.set_boolean("-allphone_ci", True)
    # Using decoder.set_kws & decoder.set_lm_file
    # pocketsphinx_config.set_string('-lm', os.path.join(os.getcwd(), config['lm_path']))
    # pocketsphinx_config.set_string('-kws', os.path.join(os.getcwd(), config['keyphrase_path']))

    # Initialize audio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)
    stream.start_stream()

    # Load invocations and commands
    invocations = config['invocations']

    # Process audio chunk by chunk. On keyword detected perform action and restart search
    decoder = Decoder(pocketsphinx_config)
    logmath = decoder.get_logmath()
    decoder.set_kws('keyword', os.path.join(os.getcwd(), config['invocation_path']))
    decoder.set_lm_file('lm', os.path.join(os.getcwd(), config['lm_path']))

    invocation_ctx = None
    in_speech_bf = False

    # Run some initialization scripts for terminal displays
    subprocess.Popen([os.path.join(os.getcwd(), config['init_exec'])]).communicate()

    decoder.set_search('keyword')
    decoder.start_utt()
    notifier.notify("READY=1")
    interaction_time = None

    while True:
        notifier.notify("WATCHDOG=1")
        buf = stream.read(1024, exception_on_overflow=False)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            logging.error("Unable to get audio, exiting")
            break

        hyp = decoder.hyp()
        # seg = decoder.seg()
        hyp_str = hyp.hypstr.lower().strip() if hyp else None

        now_in_speech = decoder.get_in_speech()
        if now_in_speech != in_speech_bf:
            in_speech_bf = now_in_speech
            if not in_speech_bf:
                # Utterance finished: match it against invocations or commands.
                decoder.end_utt()
                if hyp_str:
                    logging.info("Heard: '%s' while being in '%s' context (score: %d, confidence: %d -> in log scale %d)"
                                 % (hyp_str, invocation_ctx, hyp.best_score, logmath.exp(hyp.prob), hyp.prob))
                    if not invocation_ctx:
                        if hyp_str in invocations:
                            logging.info("Matched invocation: '%s'" % hyp_str)
                            invocation_ctx = hyp_str
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['enter']),
                                              invocations[invocation_ctx]['voice_params'],
                                              invocation_ctx, hyp_str]).communicate()
                            interaction_time = time.time()
                            decoder.set_search('lm')
                        else:
                            logging.debug('Unknown invocation or wrongly heard, silently ignoring')
                    else:
                        matched = False
                        score_dict = defaultdict(list)
                        commands = invocations[invocation_ctx]['commands']
                        for command in commands:
                            logging.info("- command: '%s':" % command['name'])
                            for sentence in command['sentence']:
                                score = calc_similarity(command, sentence.lower(), hyp_str)
                                score_dict[score].append(command)
                                logging.debug("  - similarity: %d for sentence: %s" % (score, sentence))
                                if score == 1000:
                                    logging.debug("... seems like found perfect match, ignoring the rest")
                                    break

                        for best in sorted(score_dict.items(), reverse=True):
                            if best[0] > 90:
                                command = best[1][0]  # here might be some randomness
                                logging.info("The best matching command is '%s', executing: %s"
                                             % (command['name'], command['exec']))
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['ack']),
                                                  invocations[invocation_ctx]['voice_params'],
                                                  invocation_ctx, hyp_str]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), command['exec']),
                                                  invocations[invocation_ctx]['voice_params'],
                                                  invocation_ctx, command['name']]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                                                  invocations[invocation_ctx]['voice_params'],
                                                  invocation_ctx, hyp_str])
                                invocation_ctx = None
                                decoder.set_search('keyword')
                                matched = True
                                break  # take only the first which should be the best

                        if not matched:
                            logging.info("... not matched, ignoring")
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['noop']),
                                              invocations[invocation_ctx]['voice_params'],
                                              invocation_ctx, hyp_str]).communicate()

                decoder.start_utt()

        if invocation_ctx and interaction_time and time.time() > interaction_time + interaction_timeout:
            logging.info("The invocation context has just timed out, returning to listen for invocation word.")
            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                              invocations[invocation_ctx]['voice_params'], invocation_ctx])
            invocation_ctx = None
            interaction_time = None
            decoder.end_utt()
            decoder.set_search('keyword')
            decoder.start_utt()
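# Sketch of the config.yaml shape the loop above reads (key names inferred from
# the code; every value below is illustrative, not taken from the source).
example_config = {
    'interaction_timeout': 30,
    'hmm_path': 'model/en-us',
    'dict_path': 'model/cmudict-en-us.dict',
    'feat_params_path': 'model/en-us/feat.params',
    'lm_path': 'model/en-us.lm.bin',
    'invocation_path': 'invocation.list',
    'init_exec': 'scripts/init.sh',
    'invocations': {
        'computer': {
            'enter': 'scripts/enter.sh', 'exit': 'scripts/exit.sh',
            'ack': 'scripts/ack.sh', 'noop': 'scripts/noop.sh',
            'voice_params': 'en',
            'commands': [
                {'name': 'lights on', 'exec': 'scripts/lights_on.sh',
                 'sentence': ['turn on the lights']},
            ],
        },
    },
}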
# Excerpt; assumes: import os, wave, pyaudio; from pocketsphinx import Decoder;
# project-local NLUBase, HotWordError, nlu_audio, gtt.
class NLUAudio(NLUBase):
    """Define NLUAudio component

    For now the hotword uses pocketsphinx with speech_recognition,
    and the Nuance service does the NLU.
    """
    def __init__(self, settings, action_queue, tts_queue, logger):
        NLUBase.__init__(self, settings, action_queue, None, tts_queue, logger)
        # Init private attributes
        self._rerun = True
        self._answer_sound_path = "sounds/answer.wav"
        self._config = Decoder.default_config()
        if not self._prepare_decoder():
            self._must_run = False

    def _prepare_decoder(self):
        """Set decoder config"""
        # prepare config
        self._hotword = self._settings['speech']['hotword']
        # self._answer = self._settings['hotword']['answer']
        if not os.path.isdir("pocketsphinx-data"):
            raise HotWordError("Missing pocketsphinx-data folder. Please run `make hotword`")

        acoustic_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'acoustic-model',
                                      )
        language_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'language-model.lm.bin',
                                      )
        pocket_dict = os.path.join("pocketsphinx-data",
                                   self._settings['speech']['language'],
                                   'pronounciation-dictionary.dict',
                                   )

        self._config.set_string('-logfn', "/dev/null")
        self._config.set_string('-hmm', acoustic_model)
        self._config.set_string('-lm', language_model)
        self._config.set_string('-dict', pocket_dict)
        try:
            self._decoder = Decoder(self._config)
        except RuntimeError:
            self.logger.critical("Error getting audio decoder. Hotword not started")
            return False
        self._decoder.set_keyphrase('wakeup', self._hotword)
        self._decoder.set_search('wakeup')
        # The original returned None here, which made __init__ always
        # disable the component even on success.
        return True

    def stop(self):
        """Stop process"""
        self._rerun = False
        NLUBase.stop(self)

    def _answering(self):
        """Play the hotword confirmation sound"""
        f_ans = wave.open(self._answer_sound_path, "rb")
        stream = self._paudio.open(
            format=self._paudio.get_format_from_width(f_ans.getsampwidth()),
            channels=f_ans.getnchannels(),
            rate=f_ans.getframerate(),
            output=True)
        data = f_ans.readframes(1024)
        while len(data) > 0:
            stream.write(data)
            data = f_ans.readframes(1024)
        f_ans.close()

    def run(self):
        """Listen for NLU"""
        self._rerun = True
        self._must_run = True
        self.logger.debug("starting listening hotword %s", self._hotword)
        while self._rerun:
            self._rerun = False
            try:
                self._paudio = pyaudio.PyAudio()
                stream = self._paudio.open(format=pyaudio.paInt16, channels=1,
                                           rate=16000, input=True,
                                           frames_per_buffer=1024)
            except OSError:
                self.logger.warning("No audio device found, can not listen for NLU")
                self.logger.warning("Disabling NLU audio")
                self._must_run = False
                self._rerun = False
                return
            stream.start_stream()
            self._paudio.get_default_input_device_info()

            self._decoder.start_utt()
            while self._must_run:
                buf = stream.read(1024)
                self._decoder.process_raw(buf, False, False)
                if not self.tts_queue.empty():
                    # If tts_queue is not empty, the Droid is currently
                    # speaking, so we don't want it to listen to itself.
                    # TODO replace this stuff by speaker annulation
                    continue
                if self._decoder.hyp() and self._decoder.hyp().hypstr == self._hotword:
                    self.logger.debug("Hotword detected")
                    # self.tts_queue.put(gtt(self._answer))
                    # self.tts_queue.put(gtt("mmm"))
                    self._answering()
                    ret = nlu_audio(self._settings, self.logger)

                    # GOT ACTIONS
                    interpretations = ret.get("nlu_interpretation_results", {}).\
                        get("payload", {}).get("interpretations", {})
                    # TODO: what about if len(interpretations) > 1 ??
                    for interpretation in interpretations:
                        intent = interpretation.get("action", {}).get("intent", {})
                        self.logger.info("Intent: {}".format(intent.get("value")))
                        self.logger.info("Confidence: {}".format(intent.get("confidence")))
                        # TODO log arguments
                        if intent.get("value") == "NO_MATCH":
                            # I don't understand :/
                            self._misunderstand(0, True, True)
                        elif intent.get("confidence") < 0.8:
                            # I'm not sure I understand :/
                            self._misunderstand(intent.get("confidence"), True, True)
                        else:
                            # Check intent name
                            if len(intent.get("value").split("__")) != 2:
                                self.logger.critical("BAD Intent name: "
                                                     "{}".format(intent.get("value")))
                                self._misunderstand(0, True, True)
                                # The original fell through here and would
                                # crash unpacking the split below.
                                continue
                            # Run function with parameters
                            action, method = intent.get("value").split("__")
                            # Run action
                            # TODO add parameters from NLU response
                            self._run_action(action, method, {}, False, True, True)
                    # TODO run nlu audio detection
                    self._rerun = True
                    break
            self._decoder.end_utt()
# Excerpt; assumes: from os import path; from PyQt5.QtCore import QThread;
# from pyaudio import PyAudio, paInt16; from pocketsphinx import Decoder,
# get_model_path; project-local settings, signals, SR_log, interpret_string,
# and the audio_* / message_duration_limit constants.
class InstructionRecogniser(QThread):
    '''
    You should only use keyIn/keyOut, and shutdown after use.
    The thread starts itself when appropriate.
    Signals are emitted with any recognised instructions.
    '''
    def __init__(self, gui):
        QThread.__init__(self, gui)
        if settings.sphinx_acoustic_model_dir == '':  # use default acoustic model
            acoustic_model_directory = path.join(get_model_path(), 'en-us')
        else:  # use custom acoustic model
            acoustic_model_directory = settings.sphinx_acoustic_model_dir
        config = Decoder.default_config()
        config.set_string('-hmm', acoustic_model_directory)  # acoustic model
        config.set_string('-dict', settings.prepared_lexicon_file)  # lexicon pronunciation
        config.set_string('-jsgf', settings.prepared_grammar_file)  # language model from grammar
        config.set_string('-logfn', settings.outputFileName(
            sphinx_decoder_log_file_base_name, ext='log'))
        self.listen = False
        self.decoder = Decoder(config)
        self.audio = None
        self.device = None

    def startup(self):
        self.audio = PyAudio()
        if 0 <= settings.audio_input_device_index < self.audio.get_device_count():
            self.device = settings.audio_input_device_index
        else:  # out of range or -1 for default
            self.device = None

    def shutdown(self):
        self.listen = False
        self.wait()
        self.audio.terminate()
        self.audio = None

    def keyIn(self):
        if not self.isRunning():
            self.listen = True
            self.start()

    def keyOut(self):
        self.listen = False

    def run(self):
        audio_stream = self.audio.open(input_device_index=self.device,
                                       channels=1, format=paInt16,
                                       rate=audio_sample_rate,
                                       frames_per_buffer=audio_chunk_size,
                                       input=True)
        chunks = []
        msg_duration = 0
        buff = audio_stream.read(audio_chunk_size)
        while self.listen and len(buff) > 0 and msg_duration < message_duration_limit:
            chunks.append(buff)
            buff = audio_stream.read(audio_chunk_size)
            msg_duration += audio_chunk_size / audio_sample_rate
        audio_stream.close()
        audio_message = b''.join(chunks)

        self.decoder.start_utt()  # STYLE catch failures here (e.g. grammar/lex files not found)
        self.decoder.process_raw(audio_message, False, True)
        self.decoder.end_utt()
        hyp = self.decoder.hyp()
        if hyp:
            SR_log('VOICE: "%s"' % hyp.hypstr)
            if settings.show_recognised_voice_strings:
                signals.statusBarMsg.emit('VOICE: "%s"' % hyp.hypstr)
            callsign_tokens, instr_lst = interpret_string(hyp.hypstr)
            signals.voiceMsgRecognised.emit(callsign_tokens, instr_lst)
        else:
            SR_log('VOICE: no hypothesis, message duration was %g s' % msg_duration)
            signals.voiceMsgNotRecognised.emit()
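# Push-to-talk usage sketch for InstructionRecogniser above (assumption: `gui`
# is the parent QWidget and the settings/signals modules are wired up).
recognizer = InstructionRecogniser(gui)
recognizer.startup()
recognizer.keyIn()    # PTT pressed: thread starts capturing
# ... user speaks ...
recognizer.keyOut()   # PTT released: run() decodes and emits a signal
recognizer.shutdown()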