def setup(self):
    """Configure PocketSphinx keyword spotting for the trigger phrase.

    Builds the decoder config (en-us acoustic model + dictionary, single
    keyphrase search) and stores the resulting Decoder in self._detector.
    """
    ps_config = Decoder.default_config()
    # US English acoustic model and pronunciation dictionary.
    ps_config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
    ps_config.set_string(
        '-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
    # Keyword spotting: phrase and detection threshold from trigger config.
    ps_config.set_string('-keyphrase', self._tconfig['phrase'])
    ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))
    # Hide the VERY verbose logging information when not in debug.
    # os.devnull is '/dev/null' on POSIX and 'nul' on Windows, so the
    # previous hand-rolled platform.system() check was redundant.
    if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
        ps_config.set_string('-logfn', os.devnull)
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._detector = Decoder(ps_config)
def build_decoder(self):
    """Build a PocketSphinx Decoder wired to the en-us model under MODEL_DIR."""
    config = Decoder.default_config()
    # Every model resource is addressed relative to MODEL_DIR; keep the
    # option -> relative-path mapping in one table.
    model_files = {
        "-dict": "cmudict-en-us.dict",
        "-fdict": "en-us/noisedict",
        "-featparams": "en-us/feat.params",
        "-tmat": "en-us/transition_matrices",
        "-hmm": "en-us",
        "-lm": "en-us.lm.bin",
        "-mdef": "en-us/mdef",
        "-mean": "en-us/means",
        "-sendump": "en-us/sendump",
        "-var": "en-us/variances",
    }
    for option, rel_path in model_files.items():
        config.set_string(option, os.path.join(self.MODEL_DIR, rel_path))
    # Silence decoder logging by pointing it at the platform null device.
    null_path = "/dev/null"
    if sys.platform == "win32":
        null_path = "NUL"
    config.set_string("-logfn", null_path)
    return Decoder(config)
def setup(self):
    """Configure PocketSphinx for multi-hotword spotting via a -kws list."""
    ps_config = Decoder.default_config()
    model_path = get_model_path()
    # Acoustic model and dictionary are selected through the trigger config.
    ps_config.set_string(
        '-hmm', os.path.join(model_path, self._tconfig['language']))
    ps_config.set_string(
        '-dict', os.path.join(model_path, self._tconfig['dictionary']))
    # Multiple hotwords: a keyphrase list file instead of a single -keyphrase.
    ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')
    # Hide the VERY verbose logging information when not in debug
    if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
        ps_config.set_string('-logfn', '/dev/null')
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._decoder = Decoder(ps_config)
def __init__(self, phrase, threshold, device_index=0):
    """Single-keyphrase spotter backed by PocketSphinx and PyAudio."""
    self._decoder = None
    self._pa = None
    self._device_no = device_index
    self._phrase = phrase
    self._threshold = float(threshold)
    logging.info('Phrase: ' + phrase + ' Threshold: ' + str(threshold))
    # PocketSphinx configuration: en-us model + single keyphrase search.
    ps_config = Decoder.default_config()
    model_dir = get_model_path_keyword()
    ps_config.set_string('-hmm', os.path.join(model_dir, 'en-us'))
    ps_config.set_string('-dict', os.path.join(model_dir, 'cmudict-en-us.dict'))
    ps_config.set_string('-keyphrase', self._phrase)
    ps_config.set_float('-kws_threshold', self._threshold)
    ps_config.set_string('-logfn', '/dev/null')
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._decoder = Decoder(ps_config)
    self._pa = pyaudio.PyAudio()
def create_decoder():
    """Create a Russian 8 kHz PocketSphinx decoder (zero_ru_cont_8k_v3 model)."""
    base = os.path.join(root(), 'pocketsphinx', 'zero_ru_cont_8k_v3')
    hmm = os.path.join(base, 'zero_ru.cd_semi_4000')
    # Renamed locals: `dict` shadowed the builtin; removed the unused `kws`
    # path (its config line was commented out anyway).
    dict_path = os.path.join(base, 'ru.dic.orig')
    lm_path = os.path.join(base, 'ru.lm.orig')
    decoder_config = Decoder.default_config()
    decoder_config.set_string('-hmm', hmm)
    decoder_config.set_string("-lm", lm_path)
    decoder_config.set_string('-dict', dict_path)
    decoder_config.set_boolean('-remove_noise', False)
    decoder_config.set_float('-samprate', 8000)
    decoder_config.set_string('-logfn', os.devnull)
    return Decoder(decoder_config)
def create_decoder():
    """Create a decoder whose model paths are overridable via POCKETSPHINX_* env vars."""
    base_path = os.path.dirname(os.path.realpath(__file__))
    pocketsphinx_data = os.getenv('POCKETSPHINX_DATA',
                                  os.path.join(base_path, 'pocketsphinx'))
    hmm = os.getenv('POCKETSPHINX_HMM', os.path.join(pocketsphinx_data, 'tdt_sc_8k'))
    # Renamed from `dict`, which shadowed the builtin; the unused
    # POCKETSPHINX_KWS read was dropped (its config line was commented out).
    dict_path = os.getenv('POCKETSPHINX_DIC',
                          os.path.join(pocketsphinx_data, 'keywords.dic'))
    lm = os.getenv('POCKETSPHINX_LM', os.path.join(pocketsphinx_data, 'keywords.lm'))
    log = os.getenv('POCKETSPHINX_LOG', os.path.join(pocketsphinx_data, 'log'))
    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dict_path)
    # config.set_int('-samprate', SAMPLE_RATE)  # uncomment if rate is not 16000. use config.set_float() on ubuntu
    config.set_int('-nfft', 512)
    config.set_string('-logfn', log)
    return Decoder(config)
def audio2phoneme(audio_file):
    """Decode audio_file and return (phoneme, start_sec, end_sec) tuples.

    Relies on the module-level `config` for the decoder settings.
    Timestamps are scaled from decoder frames to seconds using the
    wav file's duration.
    """
    wave_read = wave.open(audio_file, 'rb')
    length = wave_read.getnframes() / wave_read.getframerate()
    wave_read.close()
    # Decode streaming data.
    decoder = Decoder(config)
    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        while True:
            n = f.readinto(buf)
            if not n:
                break
            # BUG FIX: only feed the bytes actually read; the final chunk is
            # usually shorter than the buffer and previously the stale tail
            # of the buffer was fed to the decoder as audio.
            decoder.process_raw(buf[:n], False, False)
        decoder.end_utt()
    nframes = decoder.n_frames()
    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        phonemes.append((seg.word,
                         start_frame / nframes * length,
                         end_frame / nframes * length))
    return phonemes
def process_file(self, audiofile): """ processes audio file and returns the text """ with open(audiofile, 'rb') as audiofile: decoder = Decoder(self.config) decoder.start_utt() while True: buf = audiofile.read(1024) if buf: decoder.process_raw(buf, False, False) else: break decoder.end_utt() hyp = decoder.hyp() print "Hyp:", hyp if hyp != None: print "Hyp Score", (hyp.prob, hyp.best_score) average_score = 0 seg_count = 0 for seg in decoder.seg(): if seg.word != "<sil>": seg_count += 1 average_score += seg.ascore print(seg.word, seg.ascore, seg.lscore) print "hyp:", hyp.hypstr print average_score / seg_count return hyp.hypstr return None
def __init__(self, keyword, sensitivity):
    """Set up a PocketSphinx keyword spotter and open the first utterance."""
    config = Decoder.default_config()
    config.set_string('-logfn', '/dev/null')
    model_path = get_model_path()
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    # 'snowboy' is not in the dictionary; spot it as two in-vocabulary words.
    phrase = 'snow boy' if keyword == 'snowboy' else keyword
    config.set_string('-keyphrase', phrase)
    config.set_float('-kws_threshold', 10 ** -sensitivity)
    self._decoder = Decoder(config)
    self._decoder.start_utt()
def audio2phoneme(audio_file):
    """Decode audio_file and return (phoneme, start_sec, end_sec) tuples.

    Uses the module-level `config`; timestamps are scaled from decoder
    frames to seconds via the wav duration.
    """
    wave_read = wave.open(audio_file, 'rb')
    length = wave_read.getnframes()/wave_read.getframerate()
    wave_read.close()
    # Decode streaming data.
    decoder = Decoder(config)
    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        while True:
            n = f.readinto(buf)
            if not n:
                break
            # BUG FIX: feed only the bytes actually read; previously the
            # stale tail of the buffer was decoded on the last short read.
            decoder.process_raw(buf[:n], False, False)
        decoder.end_utt()
    nframes = decoder.n_frames()
    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        phonemes.append((
            seg.word,
            start_frame/nframes*length,
            end_frame/nframes*length))
    return phonemes
def __init__(self):
    """Build a full language-model decoder from the default en-us model."""
    # https://github.com/cmusphinx/pocketsphinx-python/blob/master/example.py
    model_path = get_model_path()
    config = Decoder.default_config()
    config.set_string('-logfn', '/dev/null')
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
    config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    self._decoder = Decoder(config)
def configure(self):
    """Create the keyword-spotting decoder for the configured language."""
    model_root = os.path.join(BASEDIR, 'model', self.lang)
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_root, 'hmm'))
    config.set_string('-dict', os.path.join(model_root, 'mycroft-en-us.dict'))
    config.set_string('-keyphrase', self.key_phrase)
    config.set_float('-kws_threshold', float('1e-45'))
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', '/dev/null')
    self.decoder = Decoder(config)
def setup_pocketsphinx(self) -> None:
    """Initialise the Spanish PocketSphinx decoder and start listening."""
    self.logger.info("Setting up PocketSphinx.")
    self.MODELDIR = "resources/model"
    ps_config = Decoder.default_config()
    # Acoustic model, language model and dictionary all live in MODELDIR.
    for option, filename in (('-hmm', 'es-es'),
                             ('-lm', 'es-es.lm'),
                             ('-dict', 'es.dict')):
        ps_config.set_string(option, os.path.join(self.MODELDIR, filename))
    ps_config.set_string('-logfn', '/dev/null')
    self.decoder = Decoder(ps_config)
    # Tracks speech/silence transitions between consecutive buffers.
    self.prev_buf_is_speech = False
    self.decoder.start_utt()
    self.logger.info("Done setting up PocketSphinx.")
def _prepare_decoder(self):
    """Set decoder config.

    Returns True on success and False when the decoder could not be
    created; callers rely on this truth value to decide whether to run.
    """
    # prepare config
    self._hotword = self._settings['speech']['hotword']
    # self._answer = self._settings['hotword']['answer']
    if not os.path.isdir("pocketsphinx-data"):
        raise HotWordError("Missing pocketsphinx-data folder. Please run `make hotword`")
    language = self._settings['speech']['language']
    acoustic_model = os.path.join("pocketsphinx-data", language,
                                  'acoustic-model')
    language_model = os.path.join("pocketsphinx-data", language,
                                  'language-model.lm.bin')
    pocket_dict = os.path.join("pocketsphinx-data", language,
                               'pronounciation-dictionary.dict')
    self._config.set_string('-logfn', "/dev/null")
    self._config.set_string('-hmm', acoustic_model)
    self._config.set_string('-lm', language_model)
    self._config.set_string('-dict', pocket_dict)
    try:
        self._decoder = Decoder(self._config)
    except RuntimeError:
        self.logger.critical("Error get audio decoder. Hotword not started")
        return False
    self._decoder.set_keyphrase('wakeup', self._hotword)
    self._decoder.set_search('wakeup')
    # BUG FIX: previously fell off the end returning None (falsy), so
    # callers doing `if not self._prepare_decoder()` treated a successful
    # setup as a failure.
    return True
class LocalRecognizer(object):
    """Keyword spotting on raw audio using a PocketSphinx keyphrase search."""

    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        """Build the decoder from the bundled model for self.lang."""
        model_root = os.path.join(BASEDIR, 'model', self.lang)
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(model_root, 'hmm'))
        config.set_string('-dict', os.path.join(model_root, 'mycroft-en-us.dict'))
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float('1e-45'))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)

    def transcribe(self, byte_data, metrics=None):
        """Run one utterance over byte_data and return the hypothesis."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase is heard in byte_data."""
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def contains(self, hypothesis):
        """True when the key phrase appears in an existing hypothesis."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
class LocalRecognizer(object):
    """Spot a wake word in raw audio via a PocketSphinx keyphrase search."""

    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        """(Re)build self.decoder from the bundled model for self.lang."""
        config = Decoder.default_config()
        config.set_string('-hmm',
                          os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict',
                          os.path.join(BASEDIR, 'model', self.lang,
                                       'mycroft-en-us.dict'))
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float('1e-45'))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance and return the PocketSphinx hypothesis."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase is heard in byte_data."""
        hypothesis = self.transcribe(byte_data, metrics)
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()

    def found_wake_word(self, hypothesis):
        """True when the key phrase appears in an existing hypothesis."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
def setup(self):
    """Configure PocketSphinx keyword spotting for the trigger phrase."""
    ps_config = Decoder.default_config()
    model_path = get_model_path()
    # US English acoustic model + pronunciation dictionary.
    ps_config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    ps_config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    # Keyword spotting: phrase and threshold come from the trigger config.
    ps_config.set_string('-keyphrase', self._tconfig['phrase'])
    ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))
    # Hide the VERY verbose logging information when not in debug
    if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
        ps_config.set_string('-logfn', '/dev/null')
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._decoder = Decoder(ps_config)
def __init__(self, settings, action_queue, tts_queue, logger):
    """Initialise the audio NLU component and prepare its decoder."""
    NLUBase.__init__(self, settings, action_queue, None, tts_queue, logger)
    # Private state for the listen loop and the confirmation sound.
    self._rerun = True
    self._answer_sound_path = "sounds/answer.wav"
    self._config = Decoder.default_config()
    # A failed decoder setup disables the component.
    if not self._prepare_decoder():
        self._must_run = False
def start_recognizer(self):
    """Function to handle lm or grammar processing of audio."""
    config = Decoder.default_config()
    rospy.loginfo("Done initializing pocketsphinx")
    # Wire the decoder to the dictionary, class LM and acoustic model
    # supplied through ROS parameters.
    config.set_string('-dict', self.dict)
    config.set_string('-lm', self.class_lm)
    config.set_string('-hmm', self.hmm)
    self.decoder = Decoder(config)
    # Start processing input audio.
    self.decoder.start_utt()
    rospy.loginfo("Decoder started successfully")
    # Process chunks whenever the audio topic signals readiness.
    rospy.Subscriber("recognizer/audio_ready", Bool, self.process_audio)
    rospy.spin()
def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
             lang="en-us"):
    """Build a recognizer whose dictionary is generated from `phonemes`."""
    self.lang = lang
    self.key_phrase = key_phrase
    self.sample_rate = sample_rate
    self.threshold = threshold
    self.phonemes = phonemes
    # Generate a temporary pronunciation dictionary, then configure the
    # decoder with it.
    self.decoder = Decoder(
        self.create_config(self.create_dict(key_phrase, phonemes)))
def create_decoder():
    """Create a keyword-spotting decoder; paths overridable via env vars."""
    from pocketsphinx.pocketsphinx import Decoder
    script_dir = os.path.dirname(os.path.realpath(__file__))
    pocketsphinx_data = os.getenv('POCKETSPHINX_DATA',
                                  os.path.join(script_dir, 'pocketsphinx-data'))
    hmm = os.getenv('POCKETSPHINX_HMM', os.path.join(pocketsphinx_data, 'hmm'))
    # Renamed from `dict`, which shadowed the builtin.
    dict_path = os.getenv('POCKETSPHINX_DIC',
                          os.path.join(pocketsphinx_data, 'dictionary.txt'))
    kws = os.getenv('POCKETSPHINX_KWS',
                    os.path.join(pocketsphinx_data, 'keywords.txt'))
    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-dict', dict_path)
    config.set_string('-kws', kws)
    # config.set_int('-samprate', SAMPLE_RATE)  # uncomment if rate is not 16000. use config.set_float() on ubuntu
    config.set_int('-nfft', 512)
    config.set_float('-vad_threshold', 2.7)
    config.set_string('-logfn', os.devnull)
    return Decoder(config)
def __init__(self, keyword, sensitivity):
    """
    Constructor.

    :param keyword: keyword to be detected.
    :param sensitivity: detection sensitivity.
    """
    config = Decoder.default_config()
    config.set_string('-logfn', '/dev/null')
    # US English acoustic model and pronunciation dictionary.
    model_path = get_model_path()
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    config.set_string('-keyphrase', keyword)
    config.set_float('-kws_threshold', sensitivity)
    self._decoder = Decoder(config)
    self._decoder.start_utt()
def __init__(self, kws_threshold=1e-40):
    """Prepare a keyword-spotting config pointing at the bundled en-us model."""
    base_dir = os.path.dirname(__file__)
    modeldir = "../../../pocketsphinx/model/en-us"
    ps_config = _Decoder.default_config()
    ps_config.set_string('-hmm', os.path.join(base_dir, modeldir, 'en-us'))
    ps_config.set_string('-dict',
                         os.path.join(base_dir, modeldir, 'cmudict-en-us.dict'))
    ps_config.set_float('-kws_threshold', kws_threshold)
    self.config = ps_config
    # The decoder itself is created lazily elsewhere.
    self.decoder = None
def __init__(self, device_index=0, model_path=None):
    """JSGF-grammar recognizer over the model files under model_path."""
    self._decoder = None
    self._pa = None
    self._device_no = device_index
    self._model_path = model_path
    logging.info('Grammar file:' + os.path.join(model_path, self.GRAMMAR))
    # Acoustic model, dictionary and JSGF grammar all live under model_path.
    ps_config = Decoder.default_config()
    ps_config.set_string('-hmm', os.path.join(model_path, self.HMM))
    ps_config.set_string('-dict', os.path.join(model_path, self.DIC))
    ps_config.set_string('-jsgf', os.path.join(model_path, self.GRAMMAR))
    ps_config.set_string('-logfn', '/dev/null')
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._decoder = Decoder(ps_config)
    self._pa = pyaudio.PyAudio()
def create_config(self, dict_name):
    """Return a Decoder config using dict_name as the pronunciation dict."""
    config = Decoder.default_config()
    # String-valued options first, then the numeric tuning parameters.
    string_opts = {
        '-hmm': os.path.join(BASEDIR, 'model', self.lang, 'hmm'),
        '-dict': dict_name,
        '-keyphrase': self.key_phrase,
        '-logfn': '/dev/null',
    }
    for option, value in string_opts.items():
        config.set_string(option, value)
    config.set_float('-kws_threshold', float(self.threshold))
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    return config
def __init__(self, engine_type, keyword, sensitivity):
    """Initializer.

    :param engine_type: type of the engine.
    :param keyword: keyword being used for detection.
    :param sensitivity: sensitivity passed to the engine.
    """
    super().__init__(engine_type, keyword, sensitivity)
    model_path = get_model_path()
    config = Decoder.default_config()
    config.set_string('-logfn', '/dev/null')
    # US English acoustic model and pronunciation dictionary.
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    config.set_string('-keyphrase', keyword)
    config.set_float('-kws_threshold', sensitivity)
    self._decoder = Decoder(config)
    self._decoder.start_utt()
def __init__(self):
    """Russian grammar-based recognizer: compile a JSGF rule to an FSG search."""
    self.MODELDIR = 'speech/'
    self.wav_name = 'media/temp.wav'
    self.raw_name = 'media/temp.raw'
    config = Decoder.default_config()
    config.set_string('-hmm', self.MODELDIR + 'ru_ru/')
    config.set_string('-dict', self.MODELDIR + 'ru.dic')
    self.decoder = Decoder(config)
    # Compile the JSGF rule into an FSG and make it the active search.
    grammar = Jsgf(self.MODELDIR + 'gr.gram')
    rule = grammar.get_rule('gr.rule')
    fsg = grammar.build_fsg(rule, self.decoder.get_logmath(), 7.5)
    fsg.writefile('gr.fsg')
    self.decoder.set_fsg('gr', fsg)
    self.decoder.set_search('gr')
    self.rec = Recognizer()
    self.mic = Microphone()
def recognize_phonemes(segments_path, phonemes_result_path):
    """Decode raw audio from segments_path into per-utterance phoneme
    hypotheses and write the schema-dumped result to phonemes_result_path.
    """
    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-hmm', join(model_dir, decoder_hmm))
    config.set_string('-allphone', join(model_dir, decoder_allphone))
    config.set_string('-dict', join(model_dir, decoder_dict))
    config.set_float('-lw', decoder_lw)
    config.set_float('-pip', decoder_pip)
    config.set_float('-beam', decoder_beam)
    config.set_float('-pbeam', decoder_pbeam)
    config.set_boolean('-mmap', decoder_mmap)
    hyps = []
    segs = []
    # BUG FIX: this is a plain function (no `self` parameter), but the
    # decoder was stored as `self.decoder`, raising NameError at runtime.
    # It is now a local. NOTE(review): `_get_decoder_results()` is defined
    # elsewhere and may need the same treatment — confirm.
    decoder = Decoder(config)
    with open(segments_path, 'rb') as stream:
        in_speech_buffer = False
        decoder.start_utt()
        while True:
            buf = stream.read(decoder_stream_buf_size)
            if buf:
                decoder.process_raw(buf, False, False)
                # On a speech -> silence transition, close out the utterance.
                if decoder.get_in_speech() != in_speech_buffer:
                    in_speech_buffer = decoder.get_in_speech()
                    if not in_speech_buffer:
                        hyp_result, segment = _get_decoder_results()
                        segs += segment
                        hyps.append(hyp_result)
                        decoder.start_utt()
            else:
                # End of stream: flush any utterance still in progress.
                if in_speech_buffer:
                    hyp_result, segment = _get_decoder_results()
                    segs += segment
                    hyps.append(hyp_result)
                break
    phonemes_dict = dict(hypotheses=hyps, segment_info=segs)
    phonemes_result = DecoderOutputSchema().dumps(phonemes_dict)
    with open(phonemes_result_path, 'w') as f:
        f.write(phonemes_result)
def __init__(self, gui):
    """Decoder thread: JSGF-grammar recognition with configurable models."""
    QThread.__init__(self, gui)
    if settings.sphinx_acoustic_model_dir == '':
        # No custom model configured: fall back to the default en-us one.
        acoustic_model_directory = path.join(get_model_path(), 'en-us')
    else:
        # use custom acoustic model
        acoustic_model_directory = settings.sphinx_acoustic_model_dir
    config = Decoder.default_config()
    config.set_string('-hmm', acoustic_model_directory)  # acoustic model
    config.set_string('-dict',
                      settings.prepared_lexicon_file)  # lexicon pronunciation
    config.set_string('-jsgf',
                      settings.prepared_grammar_file)  # language model from grammar
    config.set_string('-logfn',
                      settings.outputFileName(sphinx_decoder_log_file_base_name,
                                              ext='log'))
    self.listen = False
    self.decoder = Decoder(config)
    self.audio = None
    self.device = None
def _create_decoder(config) -> Decoder:
    """Build a Decoder for full-LM decoding or hotword spotting.

    Priority: explicit language model, then a single -keyphrase hotword,
    then a generated -kws keyword file for multiple hotwords.
    """
    decoder_config = Decoder.default_config()
    decoder_config.set_string('-hmm', config.hmm)
    decoder_config.set_string('-dict', config.dict)
    decoder_config.set_boolean('-remove_noise', config.remove_noise)
    decoder_config.set_float('-samprate', config.sample_rate)
    decoder_config.set_string('-logfn', devnull)
    if config.lm is not None:
        decoder_config.set_string("-lm", config.lm)
    elif len(config.hotwords) == 1:
        decoder_config.set_string('-keyphrase', config.hotwords[0])
        decoder_config.set_float('-kws_threshold', config.threshold)
    else:
        import os
        from tempfile import gettempdir
        kws_path = os.path.join(gettempdir(), 'keywords.mini')
        # BUG FIX: write via a context manager so the file handle is closed;
        # the old code only flushed and leaked the descriptor.
        with open(kws_path, 'w') as kws_file:
            kws_file.writelines('{} /{}/\n'.format(w, config.threshold)
                                for w in config.hotwords)
        decoder_config.set_string('-kws', kws_path)
    return Decoder(decoder_config)
def get_decoder():
    """Create a keyword-spotting decoder for the bundled respeaker model."""
    from pocketsphinx.pocketsphinx import Decoder
    script_dir = os.path.dirname(os.path.realpath(__file__))
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(script_dir, 'model/hmm/en'))
    config.set_string('-dict', os.path.join(script_dir, 'model/respeaker.dic'))
    config.set_string('-kws', os.path.join(script_dir, 'model/keywords.txt'))
    config.set_int('-samprate', SAMPLE_RATE)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', os.devnull)
    try:
        decoder = Decoder(config)
    except Exception as e:
        # Some platforms expect -samprate as a float instead of an int.
        print(
            "Maybe replace config.set_int('-samprate', SAMPLE_RATE) with config.set_float('-samprate', SAMPLE_RATE)"
        )
        raise e
    return decoder
class LocalRecognizer(object):
    """Wake-word recognizer whose dictionary is generated from phonemes."""

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        # Generate the pronunciation dictionary, then build the decoder.
        self.decoder = Decoder(
            self.create_config(self.create_dict(key_phrase, phonemes)))

    def create_dict(self, key_phrase, phonemes):
        """Write a temp dictionary mapping each word to its phoneme group."""
        (fd, file_name) = tempfile.mkstemp()
        with os.fdopen(fd, 'w') as dict_file:
            for word, phoneme in zip(key_phrase.split(), phonemes.split('.')):
                dict_file.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        """Return a Decoder config using dict_name as the pronunciation dict."""
        config = Decoder.default_config()
        config.set_string('-hmm',
                          os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Run one utterance over byte_data and return the hypothesis."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase is heard in byte_data."""
        hypothesis = self.transcribe(byte_data, metrics)
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()

    def found_wake_word(self, hypothesis):
        """True when the key phrase appears in an existing hypothesis."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
def init():
    """Create the global wake-word decoder, PyAudio handle and recognizer."""
    global decoder, p, r
    # Create a decoder from the paths configured in settings.
    config = Decoder.default_config()
    config.set_string('-logfn', settings.POCKETSPHINX_LOG)
    config.set_string('-hmm', settings.ACOUSTIC_MODEL)
    config.set_string('-lm', settings.LANGUAGE_MODEL)
    config.set_string('-dict', settings.POCKET_DICT)
    decoder = Decoder(config)
    # Register the wake-up phrase as a dedicated keyword search.
    decoder.set_keyphrase('wakeup', settings.WAKE_UP_WORD)
    decoder.set_search('wakeup')
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
def load_models(pipe, config, models):
    """Internal worker method to load the language model

    Note:
        Some lanaguages take a long time to load. English is by far the
        fastest language to be loaded as a model.

    Arguments:
        pipe (:obj: socket): The response pipe to send to the parent process
        models (dict): The language and nltk models developed by the parent process

    Returns:
        (Decoder) The STT decoder object and the nltk model
    """
    language_model = models["language_model"]
    nltk_model = models["nltk_model"]
    # Both models must pass their own validity check before loading.
    validity = [language_model.is_valid_model(), nltk_model.is_valid_model()]
    if False in validity:
        l_log.error("The language model %s is invalid!" %
                    str(language_model.name))
        send_error(pipe, "Failed loading language model!")
        return
    # Load the model configurations into pocketsphinx
    config.set_string('-hmm', str(language_model.hmm))
    config.set_string('-lm', str(language_model.lm))
    config.set_string('-dict', str(language_model.dict))
    decoder = Decoder(config)
    send_json(pipe, {"success": True})  # Send a success message to the client
    l_log.debug("Set the language model to %s" % str(language_model.name))
    return decoder, nltk_model  # Return the new decoder and nltk model
def init():
    """Create the passive-listen decoder, PyAudio handle and recognizer."""
    global decoder, p, r
    ps_config = Decoder.default_config()
    ps_config.set_string('-logfn',
                         os.path.join(settings.LOGS_DIR, 'passive-listen.log'))
    # All en-US model resources live under MODEL_DIR.
    for option, rel_path in (('-hmm', 'en-US/acoustic-model'),
                             ('-lm', 'en-US/language-model.lm.bin'),
                             ('-dict', 'en-US/pronounciation-dictionary.dict')):
        ps_config.set_string(option, os.path.join(settings.MODEL_DIR, rel_path))
    decoder = Decoder(ps_config)
    # Keyword search for the configured wake-up word.
    decoder.set_keyphrase('wakeup', settings.WAKE_UP_WORD)
    decoder.set_search('wakeup')
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
class PocketSphinxASREngine(ASREngine):
    """https://pypi.org/project/pocketsphinx/"""

    def __init__(self):
        # https://github.com/cmusphinx/pocketsphinx-python/blob/master/example.py
        model_path = get_model_path()
        config = Decoder.default_config()
        config.set_string('-logfn', '/dev/null')
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))
        config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
        config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
        self._decoder = Decoder(config)

    def transcribe(self, path):
        """Decode a 16 kHz audio file and return the recognized words."""
        pcm, sample_rate = soundfile.read(path)
        assert sample_rate == 16000
        # Scale float samples to int16 PCM bytes for the decoder.
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16).tobytes()
        self._decoder.start_utt()
        self._decoder.process_raw(pcm, no_search=False, full_utt=True)
        self._decoder.end_utt()
        words = []
        for seg in self._decoder.seg():
            word = seg.word
            # Remove special tokens.
            if word in ('<sil>', '<s>', '</s>'):
                continue
            words.append(''.join(ch for ch in word if ch.isalpha()))
        return ' '.join(words)

    def __str__(self):
        return 'PocketSphinx'
def init():
    """Create the passive-listen decoder and PyAudio handle (globals)."""
    # Be wary of an OSError due to a race condition
    if not os.path.exists(LOGS_DIR):
        os.makedirs(LOGS_DIR)
    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-logfn', path.join(LOGS_DIR, 'passive-listen.log'))
    # BUG FIX: the model paths used literal backslashes ('en-us\en-us'),
    # which only resolve on Windows; path.join builds them portably.
    config.set_string('-hmm', path.join(MODEL_DIR, 'en-us', 'en-us'))
    config.set_string('-lm', path.join(MODEL_DIR, 'en-us', 'en-us.lm.dmp'))
    config.set_string('-dict', path.join(MODEL_DIR, 'en-us', 'cmudict-en-us.dict'))
    # Decode streaming data
    global decoder, p
    decoder = Decoder(config)
    decoder.set_keyphrase("wakeup", WAKE_UP_WORD)
    decoder.set_search("wakeup")
    p = pyaudio.PyAudio()
def init():
    """Initialise the global decoder, audio interface and recognizer."""
    global decoder, p, r
    # Resolve every configured path up front.
    log_path = os.path.join(settings.LOGS_DIR, 'passive-listen.log')
    hmm_path = os.path.join(settings.MODEL_DIR, 'en-US/acoustic-model')
    lm_path = os.path.join(settings.MODEL_DIR, 'en-US/language-model.lm.bin')
    dict_path = os.path.join(settings.MODEL_DIR,
                             'en-US/pronounciation-dictionary.dict')
    config = Decoder.default_config()
    config.set_string('-logfn', log_path)
    config.set_string('-hmm', hmm_path)
    config.set_string('-lm', lm_path)
    config.set_string('-dict', dict_path)
    decoder = Decoder(config)
    # Keyword search for the configured wake-up word.
    decoder.set_keyphrase('wakeup', settings.WAKE_UP_WORD)
    decoder.set_search('wakeup')
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
class PocketSphinxEngine(Engine):
    """Wake-word engine backed by PocketSphinx keyword spotting."""

    def __init__(self, keyword, sensitivity):
        config = Decoder.default_config()
        config.set_string('-logfn', '/dev/null')
        model_path = get_model_path()
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))
        config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
        # 'snowboy' is not in the dictionary; spot it as two known words.
        phrase = 'snow boy' if keyword == 'snowboy' else keyword
        config.set_string('-keyphrase', phrase)
        config.set_float('-kws_threshold', 10 ** -sensitivity)
        self._decoder = Decoder(config)
        self._decoder.start_utt()

    def process(self, pcm):
        """Feed one frame of int16 samples; truthy return means detection."""
        assert pcm.dtype == np.int16
        self._decoder.process_raw(pcm.tobytes(), False, False)
        detected = self._decoder.hyp()
        if detected:
            # Restart the utterance so the next detection starts clean.
            self._decoder.end_utt()
            self._decoder.start_utt()
        return detected

    def release(self):
        self._decoder.end_utt()

    def __str__(self):
        return 'PocketSphinx'
#!/usr/bin/python import sys, os from pocketsphinx.pocketsphinx import Decoder import pyaudio script_dir = os.path.dirname(os.path.realpath(__file__)) # Create a decoder with certain model config = Decoder.default_config() config.set_string("-logfn", os.devnull) config.set_string('-hmm', os.path.join(script_dir, 'model/hmm/en')) config.set_string('-dict', os.path.join(script_dir, 'model/keywords_en.dic')) if True: config.set_string('-kws', os.path.join(script_dir, 'model/keywords_en.txt')) else: config.set_string('-keyphrase', 'miss j') config.set_float('-kws_threshold', 1e-15) # Process audio chunk by chunk. On keyword detected perform action and restart search decoder = Decoder(config) decoder.start_utt() stream = None if len(sys.argv) > 1: stream = open(sys.argv[1], "rb") else: p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1,
from __future__ import division import os import sys import wave sys.path.insert(0, '/opt/hansonrobotics/lib/python2.7/site-packages/') from pocketsphinx.pocketsphinx import Decoder MODELDIR = '/opt/hansonrobotics/share/pocketsphinx/model' config = Decoder.default_config() config.set_string('-hmm', os.path.join(MODELDIR, 'en-us/en-us')) config.set_string('-allphone', os.path.join(MODELDIR, 'en-us/en-us-phone.lm.dmp')) config.set_float('-lw', 2.0) config.set_float('-beam', 1e-10) config.set_float('-pbeam', 1e-10) def audio2phoneme(audio_file): wave_read = wave.open(audio_file, 'rb') length = wave_read.getnframes()/wave_read.getframerate() wave_read.close() # Decode streaming data. decoder = Decoder(config) buf = bytearray(1024) with open(audio_file, 'rb') as f: decoder.start_utt() while f.readinto(buf): decoder.process_raw(buf, False, False) decoder.end_utt()
def __init__(self, key_phrase, threshold, sample_rate=16000, lang="en-us"):
    """Store recognizer settings and build the decoder from them."""
    self.lang = lang
    self.key_phrase = key_phrase
    self.sample_rate = sample_rate
    self.threshold = threshold
    config = self.create_config()
    self.decoder = Decoder(config)
def speech_recog(self, model):
    """Blockingly recognize speech with the given model; return the text.

    Loops over the input stream until the decoder yields at least one
    word; an input overflow is papered over by feeding white noise.
    """
    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    #config.set_string('-kws', MODELDIR + model + '.txt')
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)
    decoder.start_utt()
    tstamp = time.time()
    recog_text = ''
    # Keep pulling audio until something has been recognized.
    while len(recog_text) < 1:
        try:
            buf = self.stream_in.read(CHUNK_SIZE)
            logging.info("actual voice")
            decoder.process_raw(buf, False, False)
            # Accumulate any partial hypothesis produced so far.
            if decoder.hyp().hypstr != '':
                recog_text += decoder.hyp().hypstr
                print "text: " + decoder.hyp().hypstr
                tstamp = time.time()
        except IOError as ex:
            # Only swallow input overflows; re-raise anything else.
            # NOTE(review): indexing the exception (ex[1]) is Python-2-only.
            if ex[1] != pyaudio.paInputOverflowed:
                raise
            buf = '\x00' * CHUNK_SIZE  #white noise
            logging.info("white noise")
        except AttributeError:
            # decoder.hyp() returned None -- no hypothesis yet; keep looping.
            pass
    decoder.end_utt()
    logging.info("recog text: " + recog_text)
    return recog_text
class NLUAudio(NLUBase):
    """Define NLUAudio component

    For now hotword uses pocketsphinx with speech_recognition
    and Nuance services has NLU
    """
    def __init__(self, settings, action_queue, tts_queue, logger):
        NLUBase.__init__(self, settings, action_queue, None, tts_queue, logger)
        # Init private attributes
        self._rerun = True
        self._answer_sound_path = "sounds/answer.wav"
        self._config = Decoder.default_config()
        # Disable the component when the decoder cannot be prepared.
        if not self._prepare_decoder():
            self._must_run = False

    def _prepare_decoder(self):
        """Set decoder config.

        Returns True on success, False when the pocketsphinx decoder could
        not be created.  (FIX: the success path previously fell through and
        returned None, which is falsy, so __init__ always disabled the
        component even when setup succeeded.)
        """
        # prepare config
        self._hotword = self._settings['speech']['hotword']
        # self._answer = self._settings['hotword']['answer']
        if not os.path.isdir("pocketsphinx-data"):
            raise HotWordError("Missing pocketsphinx-data folder. Please run `make hotword`")

        acoustic_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'acoustic-model',
                                      )
        language_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'language-model.lm.bin',
                                      )
        pocket_dict = os.path.join("pocketsphinx-data",
                                   self._settings['speech']['language'],
                                   'pronounciation-dictionary.dict',
                                   )

        # Silence pocketsphinx's verbose logging.
        self._config.set_string('-logfn', "/dev/null")
        self._config.set_string('-hmm', acoustic_model)
        self._config.set_string('-lm', language_model)
        self._config.set_string('-dict', pocket_dict)
        try:
            self._decoder = Decoder(self._config)
        except RuntimeError:
            self.logger.critical("Error get audio decoder. Hotword not started")
            return False
        # Dedicated keyword-spotting search for the configured hotword.
        self._decoder.set_keyphrase('wakeup', self._hotword)
        self._decoder.set_search('wakeup')
        return True

    def stop(self):
        """Stop process"""
        self._rerun = False
        NLUBase.stop(self)

    def _answering(self):
        """Play the hotwoard confirmation sound"""
        f_ans = wave.open(self._answer_sound_path, "rb")
        stream = self._paudio.open(format=self._paudio.get_format_from_width(f_ans.getsampwidth()),
                                   channels=f_ans.getnchannels(),
                                   rate=f_ans.getframerate(),
                                   output=True)
        data = f_ans.readframes(1024)
        while len(data) > 0:
            stream.write(data)
            data = f_ans.readframes(1024)
        f_ans.close()

    def run(self):
        """Listen for NLU"""
        self._rerun = True
        self._must_run = True
        self.logger.debug("starting listening hotword %s", self._hotword)
        while self._rerun:
            self._rerun = False
            try:
                self._paudio = pyaudio.PyAudio()
                stream = self._paudio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                                           input=True, frames_per_buffer=1024)
            except OSError:
                self.logger.warning("No audio device found can not listen for NLU")
                self.logger.warning("Disabling NLU audio")
                self._must_run = False
                self._rerun = False
                return
            stream.start_stream()
            self._paudio.get_default_input_device_info()
            self._decoder.start_utt()
            while self._must_run:
                buf = stream.read(1024)
                self._decoder.process_raw(buf, False, False)
                if not self.tts_queue.empty():
                    # If tts_queue is not empty, this means the Droid
                    # is currently speaking. So we don't want to it listen itself
                    # TODO replace this stuff by speaker annulation
                    continue
                if self._decoder.hyp() and self._decoder.hyp().hypstr == self._hotword:
                    self.logger.debug("Hotword detected")
                    # self.tts_queue.put(gtt(self._answer))
                    # self.tts_queue.put(gtt("mmm"))
                    self._answering()
                    ret = nlu_audio(self._settings, self.logger)

                    # GOT ACTIONS
                    interpretations = ret.get("nlu_interpretation_results", {}).\
                        get("payload", {}).get("interpretations", {})
                    # TODO: what about if len(interpretations) > 1 ??
                    for interpretation in interpretations:
                        intent = interpretation.get("action", {}).get("intent", {})
                        self.logger.info("Intent: {}".format(intent.get("value")))
                        self.logger.info("Confidence: {}".format(intent.get("confidence")))
                        # TODO log arguments
                        if intent.get("value") == "NO_MATCH":
                            # I don't understand :/
                            self._misunderstand(0, True, True)
                        elif intent.get("confidence") < 0.8:
                            # I'm not sure to understand :/
                            self._misunderstand(intent.get("confidence"), True, True)
                        else:
                            # Check intent name
                            if len(intent.get("value").split("__")) != 2:
                                self.logger.critical("BAD Intent name: "
                                                     "{}".format(intent.get("value")))
                                self._misunderstand(0, True, True)
                            # Run function with parameters
                            action, method = intent.get("value").split("__")
                            # Run action
                            # TODO add parameters from NLU response
                            self._run_action(action, method, {}, False, True, True)
                    # TODO run nlu audio detection
                    self._rerun = True
                    break
            self._decoder.end_utt()
# NOTE(review): fragment — the matching `try:` for the `except ImportError`
# below opens before this chunk (platform-specific device import).
    cl = getattr(im, config['platform']['device'].capitalize() + 'Platform')
    platform = cl(config)
except ImportError:
    # Fall back to the generic desktop platform when the configured device
    # module is unavailable.
    from alexapi.device_platforms.desktop import DesktopPlatform
    platform = DesktopPlatform(config)

# Setup
recorded = False
# Local memcached instance used as shared state.
servers = ["127.0.0.1:11211"]
mc = Client(servers, debug=1)

# Directory of this script, plus resource and per-run temp directories
# (trailing '' keeps an os.sep suffix on each path).
path = os.path.realpath(__file__).rstrip(os.path.basename(__file__))
resources_path = os.path.join(path, 'resources', '')
tmp_path = os.path.join(tempfile.mkdtemp(prefix='AlexaPi-runtime-'), '')

# PocketSphinx configuration
ps_config = Decoder.default_config()
# Set recognition model to US
ps_config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
ps_config.set_string('-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
# Specify recognition key phrase
ps_config.set_string('-keyphrase', config['sphinx']['trigger_phrase'])
ps_config.set_float('-kws_threshold', 1e-5)
# Hide the VERY verbose logging information
if not debug:
    ps_config.set_string('-logfn', '/dev/null')
# Process audio chunk by chunk. On keyword detected perform action and restart search
decoder = Decoder(ps_config)
def main():
    """Capture microphone audio and print each decoded utterance.

    Uses pocketsphinx's in-speech flag to segment utterances: when the
    decoder transitions from speech to silence, the current utterance is
    closed, its hypothesis printed, and a new utterance started.
    """
    # Model files live in ../model relative to this script; HMM, LM and DIC
    # are module-level constants defined outside this chunk.
    abspath = os.path.dirname(os.path.abspath(__file__))
    abspath = os.path.join(abspath, '..')
    model_dir = os.path.join(abspath, 'model')
    hmm = os.path.join(model_dir, HMM)
    lm = os.path.join(model_dir, LM)
    dic = os.path.join(model_dir, DIC)
    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dic)
    # Silence pocketsphinx's verbose logging.
    config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=BUFFER)
    stream.start_stream()
    # Previous chunk's in-speech flag, used to detect transitions.  Starting
    # at True means the first silent chunk triggers one (empty) utterance
    # close/reopen cycle.
    in_speech_bf = True
    decoder.start_utt()
    while True:
        buf = stream.read(BUFFER)
        if buf:
            decoder.process_raw(buf, False, False)
            # Progress dots while the decoder hears speech.
            if decoder.get_in_speech():
                sys.stdout.write('.')
                sys.stdout.flush()
            # No state change — keep accumulating audio.
            if decoder.get_in_speech() == in_speech_bf:
                continue
            in_speech_bf = decoder.get_in_speech()
            # Transition into speech: keep listening.
            if in_speech_bf:
                continue
            # Transition out of speech: close the utterance and report it.
            decoder.end_utt()
            try:
                # hyp() may return None when nothing was recognized.
                if decoder.hyp().hypstr != '':
                    print('You said:', decoder.hyp().hypstr)
            except AttributeError:
                pass
            decoder.start_utt()
        else:
            # Empty read — treat as end of input.
            break
    decoder.end_utt()
    # NOTE(review): reached only when the stream stops delivering data;
    # hyp() may be None here, which would raise AttributeError.
    print('An Error occured:', decoder.hyp().hypstr)
def speech_recog(self, model):
    """Decode audio from self.stream_in and return the recognized text.

    *model* selects the ``.lm``/``.dict`` pair under MODELDIR.  Returns as
    soon as more than one character has been recognized; if the stream's
    generator is exhausted first, returns whatever was accumulated.
    """
    # Build a decoder for the requested model.
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    # Downsampling / pruning knobs — trade accuracy for speed.
    cfg.set_int('-ds', 2)
    cfg.set_int('-topn', 3)
    cfg.set_int('-maxwpf', 5)
    #config.set_string('-kws', MODELDIR + model + '.txt')
    cfg.set_string('-lm', MODELDIR + model + '.lm')
    cfg.set_string('-dict', MODELDIR + model + '.dict')

    recognizer = Decoder(cfg)
    recognizer.start_utt()
    transcript = ''
    with self.stream_in as stream:
        for chunk in stream.generator():
            recognizer.process_raw(chunk, False, False)
            hyp = recognizer.hyp()
            if hyp and hyp.hypstr != '':
                transcript += hyp.hypstr
            if len(transcript) > 1:
                recognizer.end_utt()
                logging.info("recog text: %s", transcript)
                return transcript
    return transcript
#!/usr/bin/python import sys, os from pocketsphinx.pocketsphinx import Decoder import pyaudio script_dir = os.path.dirname(os.path.realpath(__file__)) # Create a decoder with certain model config = Decoder.default_config() config.set_string("-logfn", os.devnull) config.set_string('-hmm', os.path.join(script_dir, 'model/hmm/en')) config.set_string('-dict', os.path.join(script_dir, 'model/keywords_en.dic')) if True: config.set_string('-kws', os.path.join(script_dir, 'model/keywords_en.txt')) else: config.set_string('-keyphrase', 'miss j') config.set_float('-kws_threshold', 1e-15) # Process audio chunk by chunk. On keyword detected perform action and restart search decoder = Decoder(config) decoder.start_utt() stream = None if len(sys.argv) > 1: stream = open(sys.argv[1], "rb") else: p = pyaudio.PyAudio()
def main():
    """Listen on the microphone and publish each transcription to Redis.

    Reads environment-specific config, opens the configured input device,
    decodes speech with pocketsphinx, and publishes every completed
    utterance on the ``subsystem.listener.recording`` channel.
    """
    environment: str = os.getenv("ENVIRONMENT", "dev")
    config: Dict = load_config(environment)
    initialize_logger(level=config["logging"]["level"],
                      filename=config["logging"]["filename"])

    redis_host = config["redis"]["host"]
    redis_port = config["redis"]["port"]
    logger.debug(f"Connecting to redis at {redis_host}:{redis_port}")
    redis_client: Redis = Redis(host=redis_host, port=redis_port, db=0)

    logger.debug("Initializing PyAudio interface")
    audio = pyaudio.PyAudio()
    microphone_index = get_microphone_index(audio, config["microphone"]["name"])
    logger.debug(
        f"Using microphone device '{config['microphone']['name']}' (card index {microphone_index})"
    )

    logger.debug(
        f"Intializing pocketsphinx Decoder using model dir {MODELDIR}")
    decoder_config: DecoderConfig = Decoder.default_config()
    decoder_config.set_string("-hmm", os.path.join(MODELDIR, "en-us/en-us"))
    decoder_config.set_string("-lm", os.path.join(MODELDIR, "en-us/en-us.lm.bin"))
    decoder_config.set_string(
        "-dict", os.path.join(MODELDIR, "en-us/cmudict-en-us.dict"))
    decoder = Decoder(decoder_config)

    logger.debug("Opening audio stream")
    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=44100,
                        input=True, frames_per_buffer=2048,
                        input_device_index=microphone_index)
    stream.start_stream()
    # Previous chunk's in-speech flag, used to detect speech→silence
    # transitions that close an utterance.
    in_speech_bf = False
    decoder.start_utt()
    try:
        logger.debug("Starting decoder loop")
        # FIX: was `while cycle([True]):` — an itertools.cycle object is
        # always truthy, so that was just an obfuscated infinite loop.
        while True:
            buf = stream.read(2048)
            if buf:
                logger.debug("Decoding raw audio")
                decoder.process_raw(buf, False, False)
                if decoder.get_in_speech() != in_speech_bf:
                    logger.debug("GOT HERE")
                    in_speech_bf = decoder.get_in_speech()
                    if not in_speech_bf:
                        # Speech just ended: close the utterance and publish
                        # the hypothesis, then start a new one.
                        decoder.end_utt()
                        transcription = decoder.hyp().hypstr
                        logger.debug(f"Result: {transcription}")
                        redis_client.publish("subsystem.listener.recording",
                                             transcription)
                        decoder.start_utt()
            else:
                logger.debug("Buffer closed. Ending")
                break
        decoder.end_utt()
    except Exception:
        logger.exception("Something bad happened")
    finally:
        # FIX: release the audio resources as well as the Redis connection.
        stream.stop_stream()
        stream.close()
        audio.terminate()
        redis_client.close()
class PocketsphinxTrigger(BaseTrigger):
    """Voice trigger that keyword-spots with pocketsphinx over ALSA input."""

    type = triggers.TYPES.VOICE

    def __init__(self, config, trigger_callback):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback, 'pocketsphinx')

        # _enabled_lock gates listening; _disabled_sync_lock lets disable()
        # wait until the capture loop has actually released the microphone.
        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        # PocketSphinx configuration
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path(), self._tconfig['language']))
        ps_config.set_string('-dict', os.path.join(get_model_path(), self._tconfig['dictionary']))

        # Specify recognition key phrase
        #ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        #ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))
        ### Multiple Hotwords
        #ps_config.set_string('-inmic', 'yes')
        ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        thread = threading.Thread(target=self.thread, args=())
        # FIX: Thread.setDaemon() is deprecated; assign the attribute instead.
        thread.daemon = True
        thread.start()

    def thread(self):
        """Capture loop: wait until enabled, then listen for a hotword."""
        while True:
            self._enabled_lock.wait()

            # Enable reading microphone raw data
            inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                self._config['sound']['input_device'])
            inp.setchannels(1)
            inp.setrate(16000)
            inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
            inp.setperiodsize(1024)

            self._decoder.start_utt()

            triggered = False
            #assistantTriggered = False
            voice_command = ""
            while not triggered:
                # FIX: Event.isSet() is a deprecated alias of is_set().
                if not self._enabled_lock.is_set():
                    break

                # Read from microphone
                _, buf = inp.read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(buf, False, False)
                triggered = self._decoder.hyp() is not None

            # To avoid overflows close the microphone connection
            inp.close()
            self._decoder.end_utt()
            self._disabled_sync_lock.set()

            if triggered:
                ### Assistant Starts Here
                try:
                    voice_command = self._decoder.hyp().hypstr
                # FIX: narrowed from a bare `except:` — only hyp() returning
                # None (AttributeError on .hypstr) is an expected failure.
                except AttributeError:
                    voice_command = ""
                self._trigger_callback(self, voice_command)
                ###

    def enable(self):
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()