def recog_wav(MODELDIR, wavfile):
    """Decode a WAV file with PocketSphinx and print the recognized words.

    Args:
        MODELDIR: directory containing the 'en-us' acoustic model,
            'en-us.lm.bin' language model and 'cmudict-en-us.dict'.
        wavfile: path of the audio file to decode (raw 16 kHz mono PCM).
    """
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(MODELDIR, 'en-us'))
    config.set_string('-lm', os.path.join(MODELDIR, 'en-us.lm.bin'))
    config.set_string('-dict', os.path.join(MODELDIR, 'cmudict-en-us.dict'))

    # Decode streaming data.
    decoder = Decoder(config)
    start = time.time()
    decoder.start_utt()
    # Fix: use a context manager so the file handle is closed even on error
    # (the original never closed wav_stream).
    with open(wavfile, "rb") as wav_stream:
        while True:
            buffer = wav_stream.read(1024)
            if buffer:
                decoder.process_raw(buffer, False, False)
            else:
                break
    decoder.end_utt()
    duration = time.time() - start
    print("Duration: " + str(duration))  # Benchmarking
    for seg in decoder.seg():
        print(seg.word)
def pocket():
    """Decode a sample WAV file both with the raw Decoder API and with the
    high-level Pocketsphinx wrapper, printing each hypothesis.

    NOTE(review): relies on module-level names `s_dir`, `r` (recognizer) and
    `sr` (speech_recognition) defined elsewhere in the file — confirm.
    """
    ps = Pocketsphinx()
    language_directory = os.path.dirname(os.path.realpath(__file__))
    # Fix: Python 3 print calls (originals were Python 2 print statements).
    print(language_directory)
    acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
    language_model_file = os.path.join(language_directory, "language-model.lm.bin")
    phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
    config = Decoder.default_config()
    # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    decoder = Decoder(config)
    with sr.AudioFile(s_dir + "/a bad situation could become dramatically worse. /a bad situation could become dramatically worse. .wav") as source:
        audio_data = r.record(source)
    decoder.start_utt()
    decoder.process_raw(audio_data, False, True)
    decoder.end_utt()
    print(decoder.hyp())
    ps.decode(
        audio_file=os.path.join(s_dir, 'a bad situation could become dramatically worse. /a bad situation could become dramatically worse. .wav'),
        buffer_size=2048,
        no_search=False,
        full_utt=False)
    print(ps.hypothesis())  # => ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']

#pocket()
def get_text_from_audio(audio_input_name: str, working_directory: str = WORKING_DIRECTORY):
    """Transcribe an audio file using the pocketsphinx-python library.

    Args:
        audio_input_name: name of the audio file inside working_directory.
        working_directory: directory holding the audio file.

    Return:
        list: recognized words, or a message string if nothing was decoded.
    """
    # Configure a decoder for the bundled US-English model.
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', os.path.join(SPEECH_MODEL_PATH, 'en-us'))
    cfg.set_string('-lm', os.path.join(SPEECH_MODEL_PATH, 'en-us.lm.bin'))
    cfg.set_string('-dict', os.path.join(SPEECH_MODEL_PATH, 'cmudict-en-us.dict'))
    recognizer = Decoder(cfg)

    # Stream the file through the decoder in 1 KiB chunks.
    recognizer.start_utt()
    with open(os.path.join(working_directory, audio_input_name), 'rb') as audio_stream:
        for chunk in iter(lambda: audio_stream.read(1024), b''):
            recognizer.process_raw(chunk, False, False)
    recognizer.end_utt()

    words = [segment.word for segment in recognizer.seg()]
    return words if words else 'Audio file doesn\'t contain words'
class PocketsphinxHotWord(HotWordEngine):
    """Wake word engine using PocketSphinx.

    PocketSphinx is very general purpose but has a somewhat high error rate.
    The key advantage is to be able to specify the wake word with phonemes.
    """
    def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
        super().__init__(key_phrase, config, lang)
        # Hotword module imports
        from pocketsphinx import Decoder
        # Hotword module params
        self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
        self.num_phonemes = len(self.phonemes.split())
        self.threshold = self.config.get("threshold", 1e-90)
        # NOTE(review): default of 1600 Hz looks like a typo for 16000 —
        # confirm against the listener defaults before changing.
        self.sample_rate = self.listener_config.get("sample_rate", 1600)
        dict_name = self.create_dict(self.key_phrase, self.phonemes)
        config = self.create_config(dict_name, Decoder.default_config())
        self.decoder = Decoder(config)

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary dictionary mapping each word of key_phrase to
        its '.'-separated phoneme group; return the file path."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name, config):
        """If language config doesn't exist then we use default language
        (english) config as a fallback.
        """
        model_file = join(RECOGNIZER_DIR, 'model', self.lang, 'hmm')
        if not exists(model_file):
            # Fix: reassembled the log message that was broken across lines.
            LOG.error('PocketSphinx model not found at "{}". '.format(model_file) +
                      'Falling back to en-us model')
            model_file = join(RECOGNIZER_DIR, 'model', 'en-us', 'hmm')
        config.set_string('-hmm', model_file)
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; return the hypothesis (or None), reporting
        timing through `metrics` when provided."""
        start = time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded hypothesis."""
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class LocalRecognizer(object):
    """Keyphrase spotter built on PocketSphinx.

    Builds a one-off pronunciation dictionary for the key phrase and
    configures the decoder for keyword search.
    """

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        dict_name = self.create_dict(key_phrase, phonemes)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary dictionary mapping each word of key_phrase to
        its '.'-separated phoneme group; return the file path."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang,
                                               'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        # NOTE(review): hard-coded user-specific log path — consider making
        # this configurable.
        config.set_string('-logfn',
                          '/home/sg/mycroft-core/scripts/logs/pocket.log')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; return the hypothesis or None."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, frame_data):
        # Fix: the original transcribed the same frame twice in a row,
        # doubling the decoding work for no benefit.
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class stt:
    """PocketSphinx speech-to-text wrapper.

    Configures either a keyphrase search or a corpus language-model search,
    depending on the constructor arguments.
    """

    def __init__(self, profile, hmm=None, dict=None, lm=None,
                 kws_threshold=None, keyphrase=None):
        self.profile = profile
        # Pick default model files: keyphrase files when spotting a
        # keyphrase, corpus files otherwise.  NOTE: `dict` shadows the
        # builtin but is kept for caller compatibility.
        if keyphrase:
            if not dict:
                dict = fullpath('config/keyphrase.dic')
            if not lm:
                lm = fullpath('config/keyphrase.lm')
        else:
            if not dict:
                dict = fullpath('config/corpus.dic')
            if not lm:
                lm = fullpath('config/corpus.lm')
        if not hmm:
            hmm = 'share/pocketsphinx/model/en-us/en-us'
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(SPHINX_ROOT, hmm))
        config.set_string('-dict', dict)
        config.set_string('-lm', lm)
        config.set_string('-logfn', fullpath('config/sphinx.log'))
        if keyphrase:
            config.set_string('-keyphrase', keyphrase)
        if kws_threshold:
            config.set_float('-kws_threshold', kws_threshold)
        self.decoder = Decoder(config)
        self.transcribe = self.transcribe_darwin
        self.hyp = None

    def transcribe_darwin(self, wav):
        """Decode raw audio; return the hypothesis string or None."""
        self.decoder.start_utt()
        self.decoder.process_raw(wav, False, False)
        self.decoder.end_utt()
        self.hyp = self.decoder.hyp()
        if self.hyp:
            return self.hyp.hypstr

    def get_prob(self):
        """Return the probability of the last hypothesis (None if absent)."""
        if self.hyp:
            # Fix: Python 3 print call (was a Python 2 print statement).
            print(self.hyp.best_score)
            return self.hyp.prob

    def transcribe_linux(self, wav):
        """Decode raw audio via the older get_hyp() API; return text or None."""
        self.decoder.start_utt()
        self.decoder.process_raw(wav, False, False)
        self.decoder.end_utt()
        result = self.decoder.get_hyp()
        if result:
            return result[0]
def listen(MODE):
    """Listen on the microphone until a known drink word is heard.

    Args:
        MODE: 0 -> match drink requests (checkRequest),
              1 -> match confirmations (checkConfirm).

    Returns:
        The matched value from checkRequest/checkConfirm.
    """
    CORPUS = 6278
    model_path = get_model_path()
    home_path = "/home/the0s/Desktop/HCR_Python"
    print(model_path)
    print(home_path)
    # Fix: dropped the unused DATADIR local.
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_path, 'hub4wsj_sc_8k'))
    config.set_string('-lm', os.path.join(home_path, str(CORPUS) + '.lm.bin'))
    config.set_string('-dict', os.path.join(home_path, str(CORPUS) + '.dic'))
    config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)
    stream.start_stream()

    in_speech_bf = False
    decoder.start_utt()
    while True:
        buf = stream.read(1024)
        if not buf:
            break
        decoder.process_raw(buf, False, False)
        if decoder.get_in_speech() != in_speech_bf:
            in_speech_bf = decoder.get_in_speech()
            if not in_speech_bf:
                decoder.end_utt()
                if decoder.hyp() is not None:
                    words = decoder.hyp().hypstr.split()
                    print(words)
                    if words:
                        # Fix: call the matcher once per word instead of
                        # twice (the original re-ran it to get the value).
                        matcher = {0: checkRequest, 1: checkConfirm}.get(MODE)
                        if matcher is not None:
                            for item in words:
                                output = matcher(item)
                                if output != "NONE":
                                    stream.stop_stream()
                                    stream.close()
                                    return output
                decoder.start_utt()
    decoder.end_utt()
def __init__(self):
    """Open a microphone stream and run keyword spotting indefinitely."""
    MODELDIR = get_model_path()
    CURR_DIR = os.path.dirname(os.path.realpath(__file__))
    KEYPHRASE_THRESH_DIR = CURR_DIR + '/keyphrases.thresh'

    # Configure a keyword-spotting decoder (acoustic model + dictionary +
    # keyphrase/threshold list file).
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(MODELDIR, 'en-us'))
    config.set_string('-dict', os.path.join(MODELDIR, 'cmudict-en-us.dict'))
    config.set_string('-kws', KEYPHRASE_THRESH_DIR)
    #config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    audio = pyaudio.PyAudio()
    host_info = audio.get_host_api_info_by_index(0)
    # Prefer the first USB capture device; fall back to device index 3.
    device_index = 3
    for idx in range(host_info.get('deviceCount')):
        info = audio.get_device_info_by_host_api_device_index(0, idx)
        if 'USB' in info.get('name'):
            device_index = idx
            break

    # Example keyphrase-file entry: "fire /1e18/"
    stream = audio.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=44100,
                        input=True,
                        frames_per_buffer=1024,
                        input_device_index=device_index)
    stream.start_stream()

    in_speech_bf = True
    decoder.start_utt()
    print("Starting to listen")
    while True:
        chunk = stream.read(1024, exception_on_overflow=False)
        decoder.process_raw(chunk, False, False)
        if decoder.hyp() is not None:
            print("\nDetected: " + decoder.hyp().hypstr + "\n")
            decoder.end_utt()
            decoder.start_utt()

    # NOTE(review): unreachable — the loop above never breaks.
    print("Am not listening any more")
    stream.stop_stream()
    stream.close()
    audio.terminate()
class PocketsphinxRecognizer(LocalRecognizer):
    """LocalRecognizer variant with debug prints around dictionary setup."""

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = str(lang)
        self.key_phrase = str(key_phrase)
        print("####key_phrase-->", key_phrase)
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        print("####phonemes -->", phonemes)
        dict_name = self.create_dict(key_phrase, phonemes)
        print("####dict_name --->", dict_name)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary word -> phoneme dictionary; return its path."""
        fd, dict_path = tempfile.mkstemp()
        with os.fdopen(fd, 'w') as out:
            for word, group in zip(key_phrase.split(), phonemes.split('.')):
                out.write('{} {}\n'.format(word, group))
        return dict_path

    def create_config(self, dict_name):
        """Build a keyword-search decoder configuration."""
        config = Decoder.default_config()
        config.set_string('-hmm', join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn',
                          '/home/sg/mycroft-core/scripts/logs/pocket.log')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance and return the hypothesis (or None)."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase occurs in the decoded hypothesis."""
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class PocketsphinxHotWord(HotWordEngine):
    """Wake word engine using PocketSphinx keyword search."""

    def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
        super(PocketsphinxHotWord, self).__init__(key_phrase, config, lang)
        # Hotword module imports
        from pocketsphinx import Decoder
        # Hotword module config
        module = self.config.get("module")
        if module != "pocketsphinx":
            LOG.warning(
                str(module) + " module does not match with "
                              "Hotword class pocketsphinx")
        # Hotword module params
        self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
        self.num_phonemes = len(self.phonemes.split())
        self.threshold = self.config.get("threshold", 1e-90)
        # NOTE(review): default of 1600 Hz looks like a typo for 16000 —
        # confirm against the listener defaults before changing.
        self.sample_rate = self.listener_config.get("sample_rate", 1600)
        # Fix: use the base-class-normalized self.key_phrase (matching the
        # other PocketsphinxHotWord variants) instead of the raw argument,
        # so the dictionary agrees with the phrase found_wake_word checks.
        dict_name = self.create_dict(self.key_phrase, self.phonemes)
        config = self.create_config(dict_name, Decoder.default_config())
        self.decoder = Decoder(config)

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary dictionary mapping each word of key_phrase to
        its '.'-separated phoneme group; return the file path."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name, config):
        """Populate `config` for keyword search; fall back silently to the
        configured model path (only an error is logged when it's missing)."""
        model_file = join(RECOGNIZER_DIR, 'model', self.lang, 'hmm')
        if not exists(model_file):
            LOG.error('PocketSphinx model not found at ' + str(model_file))
        config.set_string('-hmm', model_file)
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; return the hypothesis (or None)."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded hypothesis."""
        # Fix: reassembled the return statement that was broken across lines.
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class PocketsphinxHotWord(HotWordEngine):
    """Wake word engine using PocketSphinx keyword search (variant that
    warns when the configured module name does not match)."""

    def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
        super(PocketsphinxHotWord, self).__init__(key_phrase, config, lang)
        # Hotword module imports
        from pocketsphinx import Decoder
        # Hotword module config
        module = self.config.get("module")
        if module != "pocketsphinx":
            LOG.warning(
                str(module) + " module does not match with "
                              "Hotword class pocketsphinx")
        # Hotword module params
        self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
        self.num_phonemes = len(self.phonemes.split())
        self.threshold = self.config.get("threshold", 1e-90)
        # NOTE(review): default of 1600 Hz looks like a typo for 16000 —
        # confirm against the listener defaults before changing.
        self.sample_rate = self.listener_config.get("sample_rate", 1600)
        dict_name = self.create_dict(self.key_phrase, self.phonemes)
        config = self.create_config(dict_name, Decoder.default_config())
        self.decoder = Decoder(config)

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary dictionary mapping each word of key_phrase to
        its '.'-separated phoneme group; return the file path."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name, config):
        """Populate `config` for keyword search; logs an error when the
        language model directory is missing."""
        model_file = join(RECOGNIZER_DIR, 'model', self.lang, 'hmm')
        if not exists(model_file):
            LOG.error('PocketSphinx model not found at ' + str(model_file))
        config.set_string('-hmm', model_file)
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; return the hypothesis (or None)."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded hypothesis."""
        # Fix: reassembled the return statement that was broken across lines.
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class PocketsphinxListener:
    """Pocketsphinx listener implementation used for comparison with Precise"""

    def __init__(self, key_phrase, dict_file, hmm_folder, threshold=1e-90,
                 chunk_size=-1):
        from pocketsphinx import Decoder
        config = Decoder.default_config()
        for option, value in (('-hmm', hmm_folder),
                              ('-dict', dict_file),
                              ('-keyphrase', key_phrase),
                              ('-logfn', '/dev/null')):
            config.set_string(option, value)
        config.set_float('-kws_threshold', float(threshold))
        config.set_float('-samprate', 16000)
        config.set_int('-nfft', 2048)

        self.key_phrase = key_phrase
        # Rolling audio window, pre-filled with silence.
        self.buffer = b'\0' * pr.sample_depth * pr.buffer_samples
        self.pr = pr
        self.read_size = -1 if chunk_size == -1 else pr.sample_depth * chunk_size
        try:
            self.decoder = Decoder(config)
        except RuntimeError:
            options = dict(key_phrase=key_phrase, dict_file=dict_file,
                           hmm_folder=hmm_folder, threshold=threshold)
            raise RuntimeError('Invalid Pocketsphinx options: ' + str(options))

    def _transcribe(self, byte_data):
        """Decode a single utterance and return the hypothesis (or None)."""
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase occurs in the decoded audio."""
        padded = frame_data + b'\0' * int(2 * 16000 * 0.01)
        hyp = self._transcribe(padded)
        return bool(hyp and self.key_phrase in hyp.hypstr.lower())

    def update(self, stream: Union[BinaryIO, np.ndarray, bytes]) -> float:
        """Slide the next chunk of `stream` into the rolling buffer and
        return 1.0/0.0 for wake-word presence."""
        if isinstance(stream, np.ndarray):
            chunk = audio_to_buffer(stream)
        elif isinstance(stream, (bytes, bytearray)):
            chunk = stream
        else:
            chunk = stream.read(self.read_size)
            if len(chunk) == 0:
                raise EOFError
        self.buffer = self.buffer[len(chunk):] + chunk
        return float(self.found_wake_word(self.buffer))
def retrieve_scores(word):
    """Force-align `word`.wav against its JSGF alignment grammar and return
    the per-segment scores collected by retrieve_segments().

    Args:
        word: base name; reads `word`.wav and `word`-align.jsgf.

    Returns:
        list: scores from the last decoded utterance (empty if none).
    """
    filename = word + '.wav'
    grammarname = word + '-align.jsgf'
    model_path = get_model_path()

    # Initialize the config values
    config = DefaultConfig()
    config.set_boolean('-verbose', False)
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_boolean('-lm', False)
    config.set_string('-dict', 'phonemes.dict.txt')
    config.set_boolean('-backtrace', True)
    config.set_boolean('-bestpath', False)
    config.set_boolean('-fsgusefiller', False)
    decoder = Decoder(config)

    # Set the search to JSGF Grammar
    jsgf = Jsgf(grammarname)
    jsgf.get_rule('forcing.' + word)  # validates the rule exists
    decoder.set_jsgf_file('grammar', grammarname)
    decoder.set_search('grammar')

    utt_started = False
    scores = []
    decoder.start_utt()
    # Fix: context manager closes the audio stream (the original leaked it).
    with open(filename, 'rb') as stream:
        while True:
            buf = stream.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)
            in_speech = decoder.get_in_speech()
            if in_speech and not utt_started:
                utt_started = True
            if not in_speech and utt_started:
                # End of speech: harvest the hypothesis, then restart the
                # utterance in case more speech follows.
                decoder.end_utt()
                hyp = decoder.hyp()
                if hyp is not None:
                    print('hyp: %s' % (hyp.best_score))
                    print_segments(decoder)
                    scores = retrieve_segments(decoder)
                decoder.start_utt()
                utt_started = False
    decoder.end_utt()
    print('scores:', scores)
    return scores
def get_phonemes(file):
    """Decode `file` with the module-level `config` and return the list of
    recognized segment words.

    Args:
        file: path of the audio file to decode.

    Returns:
        list: one word per decoded segment.
    """
    # Decode streaming data
    decoder = Decoder(config)
    decoder.start_utt()
    # Fix: context manager closes the stream (the original leaked the file
    # handle); also dropped the unused loop counter and unused hypothesis
    # variable.
    with open(file, 'rb') as stream:
        while True:
            buf = stream.read(1024)
            if buf:
                decoder.process_raw(buf, False, False)
            else:
                break
    decoder.end_utt()
    return [seg.word for seg in decoder.seg()]
class LocalRecognizer(object):
    """PocketSphinx keyword spotter: builds a throwaway pronunciation
    dictionary for the key phrase and runs keyword search over raw audio."""

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        self.decoder = Decoder(
            self.create_config(self.create_dict(key_phrase, phonemes)))

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary word -> phoneme-group dictionary; return its
        path."""
        fd, dict_path = tempfile.mkstemp()
        with os.fdopen(fd, 'w') as out:
            for word, group in zip(key_phrase.split(), phonemes.split('.')):
                out.write(word + ' ' + group + '\n')
        return dict_path

    def create_config(self, dict_name):
        """Build the keyword-search decoder configuration."""
        cfg = Decoder.default_config()
        cfg.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
        cfg.set_string('-dict', dict_name)
        cfg.set_string('-keyphrase', self.key_phrase)
        cfg.set_float('-kws_threshold', float(self.threshold))
        cfg.set_float('-samprate', self.sample_rate)
        cfg.set_int('-nfft', 2048)
        cfg.set_string('-logfn', '/dev/null')
        return cfg

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; return the hypothesis (or None)."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """Transcribe and check for the key phrase in one call."""
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, hypothesis):
        """True when a previously obtained hypothesis contains the phrase."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
def run(self):
    """Thread body: stream microphone audio into PocketSphinx and hand every
    non-empty hypothesis to self.decode() until self._terminate is set."""
    conf = Decoder.default_config()
    conf.set_string('-hmm', self.config.hmmPS)
    conf.set_string('-lm', self.config.lmPS)
    conf.set_string('-dict', self.config.dictPS)
    # The MLLR speaker-adaptation matrix is optional.
    if os.path.isfile(self.config.mllrPS):
        conf.set_string('-mllr', self.config.mllrPS)
    decoder = Decoder(conf)

    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
                     input=True, frames_per_buffer=1024)
    stream.start_stream()
    self.samplewith = pa.get_sample_size(pyaudio.paInt16)

    in_speech_bf = True
    # NOTE(review): start_utt('') matches the old pocketsphinx API; newer
    # bindings take no argument — confirm the installed version.
    decoder.start_utt('')
    while not self._terminate:
        buf = stream.read(1024)
        if not buf:
            break
        if self.save:
            self.liSave.append(buf)
            self.numSave += 1
            if self.numSave > self.maxSave:
                # Guard against leaving the microphone recording forever.
                self.activeSave(self.fichWAV)
        decoder.process_raw(buf, False, False)
        if decoder.get_in_speech() != in_speech_bf:
            in_speech_bf = decoder.get_in_speech()
            if not in_speech_bf:
                # Speech -> silence: close the utterance and report it.
                decoder.end_utt()
                try:
                    if decoder.hyp().hypstr != '':
                        self.decode(decoder.hyp().hypstr)
                except AttributeError:
                    # hyp() returned None: nothing was recognized.
                    pass
                decoder.start_utt('')
    decoder.end_utt()
def run( self ):
    """Microphone capture thread: forwards each recognized utterance to
    self.decode() until self._terminate becomes True."""
    conf = Decoder.default_config()
    for opt, val in (('-hmm', self.config.hmmPS),
                     ('-lm', self.config.lmPS),
                     ('-dict', self.config.dictPS)):
        conf.set_string(opt, val)
    if os.path.isfile(self.config.mllrPS):
        # Optional speaker-adaptation (MLLR) matrix.
        conf.set_string('-mllr', self.config.mllrPS)
    decoder = Decoder(conf)

    pa = pyaudio.PyAudio()
    mic = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
                  input=True, frames_per_buffer=1024)
    mic.start_stream()
    self.samplewith = pa.get_sample_size(pyaudio.paInt16)

    speaking = True
    # NOTE(review): start_utt('') matches the old pocketsphinx API; newer
    # bindings take no argument.
    decoder.start_utt('')
    while not self._terminate:
        audio_chunk = mic.read(1024)
        if not audio_chunk:
            break
        if self.save:
            # Accumulate audio for saving, guarding against leaving the
            # microphone recording forever.
            self.liSave.append(audio_chunk)
            self.numSave += 1
            if self.numSave > self.maxSave:
                self.activeSave(self.fichWAV)
        decoder.process_raw(audio_chunk, False, False)
        now_speaking = decoder.get_in_speech()
        if now_speaking != speaking:
            speaking = now_speaking
            if not speaking:
                # Speech ended: close the utterance and forward the text.
                decoder.end_utt()
                try:
                    hypstr = decoder.hyp().hypstr
                    if hypstr != '':
                        self.decode(hypstr)
                except AttributeError:
                    # No hypothesis was produced for this utterance.
                    pass
                decoder.start_utt('')
    decoder.end_utt()
def begin_passive_listening(self):
    """Uses PocketSphinx to listen for the wakeword and call the active
    listening function
    """
    model_dir = get_model_path()
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_dir, 'en-us'))
    config.set_string('-dict', os.path.join(model_dir, 'cmudict-en-us.dict'))
    config.set_string('-keyphrase',
                      self.config.get("general", "wake_word"))
    # NOTE(review): 'nul' is the Windows null device — confirm this is the
    # target platform ('/dev/null' on POSIX).
    config.set_string('-logfn', 'nul')
    config.set_float('-kws_threshold', 1e-10)

    audio = pyaudio.PyAudio()
    mic_stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                            input=True, frames_per_buffer=1024)
    mic_stream.start_stream()

    decoder = Decoder(config)
    decoder.start_utt()
    while True:
        decoder.process_raw(mic_stream.read(1024), False, False)
        if decoder.hyp() is None:
            continue
        logging.debug("Wake word recognized")
        speech_input = self.active_listen()
        # active_listen() uses negative sentinels for failures.
        if speech_input not in (-1, -2, -3):
            for name, command in self.commands.items():
                if speech_input in name:
                    command()
        elif speech_input == -1:
            self.speak("Sorry, I didn't catch that.")
        decoder.end_utt()
        decoder.start_utt()
        logging.debug("Listening for wakeword again")
def transcribe(decoder: pocketsphinx.Decoder, audio_data: bytes,
               nbest: int = 0) -> Dict[str, Any]:
    """Transcribes audio data to text.

    Args:
        decoder: initialized pocketsphinx decoder.
        audio_data: raw audio bytes for a single utterance.
        nbest: number of alternative transcriptions to include (0 = none).

    Returns:
        dict with "text", "transcribe_seconds", "likelihood" and, when
        nbest > 0, an "nbest" mapping of hypothesis -> score.
    """
    # Process data as an entire utterance
    start_time = time.time()
    decoder.start_utt()
    decoder.process_raw(audio_data, False, True)
    decoder.end_utt()
    end_time = time.time()

    logger.debug(f"Decoded audio in {end_time - start_time} second(s)")

    transcription = ""
    decode_seconds = end_time - start_time
    likelihood = 0.0
    # Fix: dropped the unused `score` local.

    hyp = decoder.hyp()
    if hyp is not None:
        likelihood = decoder.get_logmath().exp(hyp.prob)
        transcription = hyp.hypstr

    result = {
        "text": transcription,
        "transcribe_seconds": decode_seconds,
        "likelihood": likelihood,
    }

    if nbest > 0:
        # Include alternative transcriptions
        result["nbest"] = {nb.hypstr: nb.score for nb in decoder.nbest()[:nbest]}

    return result
class VoiceIOHandler(JarvisIOHandler):
    """Voice I/O: PocketSphinx for low-power local decoding, Google STT for
    full recognition, and Pyvona for speech output."""

    def __init__(self):
        JarvisIOHandler.__init__(self)
        hmm = '/usr/local/share/pocketsphinx/model/en-us/en-us'
        dic = '/usr/local/share/pocketsphinx/model/en-us/cmudict-en-us.dict'
        lm = '/usr/local/share/pocketsphinx/model/en-us/en-us.lm.bin'
        config = Decoder.default_config()
        config.set_string('-hmm', hmm)
        config.set_string('-lm', lm)
        config.set_string('-dict', dic)
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)
        self.microphone = pyaudio.PyAudio()

        # Pyvona (TTS) credentials and voice settings.
        pyvona_config = open('configs/pyvona.txt')
        pvcfg = pyvona_config.readlines()
        pyvona_config.close()
        self.voice = pyvona.create_voice(pvcfg[0].strip(), pvcfg[1].strip())
        self.voice.region = 'us-west'
        self.voice.voice_name = 'Brian'
        self.voice.sentence_break = 200

        # Google STT API key.
        googleSTT_config = open('configs/GoogleSTT.txt')
        self.key = googleSTT_config.readlines()[0].strip()
        googleSTT_config.close()
        self.recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            self.recognizer.adjust_for_ambient_noise(source)

    def waitForInput(self):
        """Block until an utterance is heard; return it lower-cased.

        In low-power mode PocketSphinx decodes locally; otherwise audio is
        sent to Google Speech Recognition ('CNU'/'CNC' mark failures).
        """
        if self._isLowPower:
            utt = ''
            stream = self.microphone.open(format=pyaudio.paInt16, channels=1,
                                          rate=16000, input=True,
                                          frames_per_buffer=1024)
            stream.start_stream()
            in_speech_bf = True
            self.decoder.start_utt()
            while True:
                buf = stream.read(1024)
                if buf:
                    self.decoder.process_raw(buf, False, False)
                    if self.decoder.get_in_speech() != in_speech_bf:
                        in_speech_bf = self.decoder.get_in_speech()
                        if not in_speech_bf:
                            self.decoder.end_utt()
                            try:
                                if self.decoder.hyp().hypstr != '':
                                    utt = self.decoder.hyp().hypstr
                                    break
                            except AttributeError:
                                pass
                            self.decoder.start_utt()
            stream.stop_stream()
            stream.close()
            # Fix: Python 3 print calls throughout (originals were Python 2
            # print statements).
            print(utt)
            return utt.lower().strip()
        else:
            with sr.Microphone() as source:
                print('Listening')
                audio = self.recognizer.listen(source)
            print('Recognizing...')
            try:
                rec = self.recognizer.recognize_google(audio, key=self.key).lower().strip()
                print(rec)
                return rec
            except sr.UnknownValueError:
                print("Google Speech Recognition could not understand audio")
                return 'CNU'
            except sr.RequestError as e:
                print("Could not request results from Google Speech Recognition service; {0}".format(e))
                return 'CNC'

    def output(self, text_to_output):
        self.voice.speak(text_to_output)
class ContinuousPocketsphinx(object):
    '''
    Continuous speech recognizer: streams microphone audio into a
    PocketSphinx decoder driven by FSG grammars and routes recognized text
    to an Artificialintelligence dialogue context.
    '''
    CHUNK = 4096
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    #MODELDIR = "../models"
    MODELDIR = "/home/mgreibus/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    decoder = None
    stream = None
    config = None
    ai = None

    def __init__(self):
        '''
        Constructor
        '''
        print("[__init__]+++")
        # Create a decoder with certain model
        self.ai = Artificialintelligence()
        self.config = self.createConfig("code")
        self.decoder = Decoder(self.config)
        print("[__init__] created decoder")
        #self.updateGrammar(self.decoder, "confirmation")
        print("[__init__]---")
        p = pyaudio.PyAudio()
        self.stream = p.open(format=self.FORMAT,
                             channels=self.CHANNELS,
                             rate=self.RATE,
                             input=True,
                             frames_per_buffer=self.CHUNK)
        # Indicate listening for next utterance
        print("READY....")

    def updateGrammar(self, pDecoder, pGramma):
        '''
        Update decoder language model from fsg file
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = pDecoder.get_logmath()
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma + '.fsg'),
                                  logmath, 7.5)
        pDecoder.set_fsg("default", fsg)
        pDecoder.set_search("default")
        print("[updateGrammar]---")

    def createConfig(self, pGramma):
        '''Build a decoder config for the given FSG grammar name.'''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/liepa.cd_semi_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def speak(self, text):
        '''Speak `text` through the external TTS binary.'''
        print("Speak: ", text)
        if text is not None:
            aProcess = subprocess.Popen(['/home/mgreibus/bin/tark-win-lt', text],
                                        stderr=subprocess.STDOUT)
            out = aProcess.communicate()[0]
            time.sleep(0.100)
            print("ended Speak: ", out)

    def said(self, aiContext, text):
        '''Feed `text` to the AI and speak its response; return the updated
        context.'''
        print("[said]+++", text)
        # Fix: reassembled the assignment that was broken across lines in
        # the source — the AI's reply becomes the new dialogue context.
        aiContext = self.ai.said(text, aiContext)
        print('AI response: ', aiContext.state, aiContext.response)
        self.speak(aiContext.response)
        if aiContext.interactiveStep is False:
            # NOTE(review): recursive re-prompt with the same `text`; could
            # loop if interactiveStep never becomes True — confirm intent.
            self.said(aiContext, text)
        print("[said]---")
        return aiContext

    def recognized(self, pStream, pDecoder, aiContext):
        '''Handle the end of an utterance: decode, react, restart listening.'''
        print("[recognized]+++")
        pStream.stop_stream()
        pDecoder.end_utt()
        # Retrieve hypothesis.
        hypothesis = pDecoder.hyp()
        if hypothesis is not None:
            print('Best hypothesis: ', hypothesis.uttid, hypothesis.best_score,
                  hypothesis.hypstr)
            self.said(aiContext, hypothesis.hypstr.decode('utf-8'))
            if aiContext.state in aiContext.GRAM:
                self.updateGrammar(pDecoder, aiContext.GRAM[aiContext.state])
        elif (time.time() - aiContext.stateStarted) > 10:
            # No hypothesis for 10 s: repeat the last prompt.
            self.speak(aiContext.response)
            aiContext.stateStarted = time.time()
        print("Time: ", (time.time() - aiContext.stateStarted))
        print("AI response ", aiContext.response)
        time.sleep(0.100)
        # Indicate listening for next utterance
        pStream.start_stream()
        # NOTE(review): start_utt(None) matches the old pocketsphinx API;
        # newer bindings take no argument.
        pDecoder.start_utt(None)
        print("READY....")
        print("[recognized]---")
        return aiContext

    def run(self):
        '''
        Executor
        '''
        print("* start recording")
        self.decoder.start_utt(None)
        cur_vad_state = 0
        aiContext = self.ai.createContext()
        self.said(aiContext, None)
        while True:
            data = self.stream.read(self.CHUNK)
            time.sleep(0.100)
            self.decoder.process_raw(data, False, False)
            vad_state = self.decoder.get_vad_state()
            if vad_state and not cur_vad_state:
                # Silence -> speech transition: let the user know we heard.
                print("Listening...\n")
            if not vad_state and cur_vad_state:
                # Speech -> silence transition: decode the utterance.
                aiContext = self.recognized(self.stream, self.decoder, aiContext)
                if aiContext.state == aiContext.STATE_THANKS:
                    break
            cur_vad_state = vad_state
class KeywordSpotting(threading.Thread):
    """Daemon thread that watches the processed-input pool for the keyphrase
    'alexa'; on detection the segment is resampled, zero-padded and pushed
    into the keyword pool for downstream consumers."""

    def __init__(self, in_fs, out_fs, mute_period_length, kws_frame_length):
        threading.Thread.__init__(self)
        # Initialise configuration
        self.daemon = True
        self.exit_flag = False
        self.in_fs = in_fs    # input sampling rate (Hz)
        self.out_fs = out_fs  # output sampling rate (Hz)
        # Convert the second-based lengths into frame counts at the input rate.
        self.mute_period_frames_count = int(in_fs * mute_period_length)
        self.kws_frames_count = int(in_fs * kws_frame_length)
        model_path = get_model_path()
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))  # acoustic model path
        # config.set_string('-lm',"./tests/7567.lm")
        config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))  # dictionary path
        config.set_string('-keyphrase', 'alexa')
        config.set_float('-kws_threshold', 1e-20)
        config.set_string('-logfn', './logs/tmp')  # redirect INFO output elsewhere
        self.decoder = Decoder(config)
        self.decoder.start_utt()
        # Start the thread immediately on construction.
        self.start()

    def run(self):
        while not self.exit_flag:
            # 1. Read a fixed-length chunk from the input pool.  This call may
            #    block until enough data has accumulated in the pool.
            processed_input_frames = global_var.processed_input_pool.get(
                self.kws_frames_count)
            # 2. If keyword spotting detects the keyphrase in this segment,
            #    resample it, pad it, and store it into the keyword pool.
            if self._kws(processed_input_frames):
                global_var.keyword_pool.put(
                    self._padding(
                        Resampler.resampling(processed_input_frames,
                                             self.in_fs, self.out_fs), 0,
                        self.mute_period_frames_count))

    def stop(self):
        # Signal the loop to finish and wait for the thread to exit.
        self.exit_flag = True
        self.join()

    def _kws(self, frames):
        """Return True when the keyphrase is detected in `frames` (numpy array)."""
        buf = frames.tobytes()
        if buf:
            self.decoder.process_raw(buf, False, False)
            if self.decoder.hyp() != None:
                print([(seg.word, seg.prob, seg.start_frame, seg.end_frame)
                       for seg in self.decoder.seg()])
                print("Detected keyphrase, restarting search")
                # Reset the utterance so the next detection starts cleanly.
                self.decoder.end_utt()
                self.decoder.start_utt()
                return True
        return False

    def _padding(self, frames, padding_value, padding_num):
        """Append `padding_num` samples of `padding_value` to `frames`."""
        res = np.pad(frames, (0, padding_num),
                     'constant',
                     constant_values=(padding_value, padding_value))
        return res
decoder = Decoder(config) p = pyaudio.PyAudio() stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) #Indicate listening for next utterance print("READY....") frames = [] utt_started = False decoder.start_utt(None) while True: data = stream.read(CHUNK) time.sleep(0.100) #frames.append(data) decoder.process_raw(data, False, False) in_speech = decoder.get_in_speech() if in_speech and not utt_started: #silence -> speech transition, #let user know that he is heard print("Started...\n") utt_started = True if not in_speech and utt_started: #speech -> silence transition,
class TestVoice(Voice):
    """Voice front-end: records and plays audio with PyAudio, recognizes
    speech with pocketsphinx (Spanish model) and speaks replies with pyttsx3.
    """

    # playback
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024
    FILE_NAME = 'aux.wav'
    # recognition
    MODELDIR = "es-ES"
    GRAMMARDIR = "gram"
    # text to speech
    # NOTE(review): this rebinds RATE (previously 44100), so every later use
    # of self.RATE — including the recording sample rate in listen() — sees
    # 150.  Renaming would change the class interface, so it is only flagged.
    RATE = 150
    VOLUME = 0.9

    def __init__(self, file_name='aux.wav', raspi=False):
        self.FILE_NAME = file_name
        self.audio = pyaudio.PyAudio()
        self.raspi = raspi
        # Configure the pocketsphinx decoder with the Spanish model files.
        self.config = Decoder.default_config()
        self.config.set_string('-hmm',
                               os.path.join(self.MODELDIR, 'acoustic-model'))
        self.config.set_string(
            '-dict',
            os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
        self.config.set_string('-logfn', os.devnull)  # silence sphinx logging
        self.decoder = Decoder(self.config)
        self.r = sr.Recognizer()
        print("adjunting...")
        with sr.Microphone() as source:
            self.r.adjust_for_ambient_noise(source)
        # tts
        self.tts = pyttsx3.init()
        self.tts.setProperty('rate', self.RATE)
        self.tts.setProperty('volume', self.VOLUME)
        self.tts.setProperty('voice', 'spanish-latin-am')

    def speak(self, phrase):
        """Say `phrase` through the local TTS engine (blocking)."""
        self.tts.say(phrase)
        self.tts.runAndWait()

    def play(self, filename):
        """Play a .wav file through PyAudio or an .mp3 via playsound."""
        # BUG FIX: take the LAST dot-separated component as the extension;
        # split('.')[1] broke for names containing more than one dot.
        extension = filename.split('.')[-1]
        if extension == 'wav':
            wf = wave.open(filename, 'rb')
            stream = self.audio.open(
                format=self.audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)
            data = wf.readframes(self.CHUNK)
            # play
            while len(data) > 0:
                stream.write(data)
                data = wf.readframes(self.CHUNK)
            stream.stop_stream()
            stream.close()
            wf.close()  # BUG FIX: release the wave file handle
        elif extension == 'mp3':
            playsound(filename)

    def listen(self, duration=3):
        """Record `duration` seconds from the microphone, save to FILE_NAME,
        and return the raw PCM resampled to 16 kHz / 16-bit for the decoder."""
        # start recording; device index differs between raspi and desktop
        if self.raspi:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=1,
                                     rate=self.RATE,
                                     input_device_index=2,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        else:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=self.CHANNELS,
                                     rate=self.RATE,
                                     input_device_index=7,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        frames = []
        for i in range(0, int(self.RATE / self.CHUNK * duration)):
            data = stream.read(self.CHUNK, exception_on_overflow=False)
            frames.append(data)
        stream.stop_stream()
        stream.close()
        # Persist the capture so it can be replayed with echo().
        wave_file = wave.open(self.FILE_NAME, 'wb')
        if self.raspi:
            wave_file.setnchannels(1)
        else:
            wave_file.setnchannels(self.CHANNELS)
        wave_file.setsampwidth(self.audio.get_sample_size(self.FORMAT))
        wave_file.setframerate(self.RATE)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()
        with sr.AudioFile(self.FILE_NAME) as source:
            audio = self.r.record(source)
        raw_data = audio.get_raw_data(convert_rate=16000, convert_width=2)
        return raw_data

    def echo(self):
        """Play back the last recorded file."""
        self.play(self.FILE_NAME)

    def recognize(self):
        """Listen on the microphone and return the decoded text, or None on
        any decoding failure."""
        with sr.Microphone() as source:
            audio = self.r.listen(source)
        # raw_out = self.listen()
        try:
            self.decoder.start_utt()
            self.decoder.process_raw(audio.frame_data, False, True)
            self.decoder.end_utt()
            hyp = self.decoder.hyp()
            return hyp.hypstr
        except Exception:
            return None

    def loadGrammar(self, grammar):
        """Switch the decoder to the JSGF grammar <GRAMMARDIR>/<grammar>.gram."""
        # delete(self.decoder)
        grammar_file = grammar + '.gram'
        c_string = os.path.join(self.GRAMMARDIR, grammar_file)  #.encode('ascii')
        print(c_string)
        self.config.set_string('-jsgf', c_string)
        self.decoder.reinit(self.config)

    def close(self):
        """Release the PyAudio handle."""
        self.audio.terminate()
class SphinxWrapper(object):
    '''
    Thin wrapper around a pocketsphinx Decoder with simple VAD bookkeeping.

    Feed audio via `process_raw(...)`, which also refreshes the VAD state.
    Before feeding a new utterance call `startListening()`.  When
    `isVoiceEnded()` reports a speech -> silence transition call
    `stopListening()`; only then may the hypothesis be requested with
    `calculateHypothesis()`.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None
    config = None
    previousVadState = 0  # VAD state after the previous audio chunk
    currentVadState = 0   # VAD state after the most recent audio chunk

    def __init__(self):
        ''' Constructor '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where sphinx decoder is initialized or grammar updated
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma);
            self.decoder = Decoder(self.config);
        else:
            # BUG FIX: updateGrammar() takes only the grammar name; the old
            # call passed self.decoder as an extra positional argument and
            # raised TypeError on every grammar switch.
            self.updateGrammar(pGramma);

    def createConfig(self, pGramma):
        '''
        Create configuration with acoustic model path, grammar and dictionary
        '''
        print ("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma+'.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print ("[createConfig]---")
        return config;

    def updateGrammar(self, pGramma):
        '''
        Update decoder language model from fsg file
        '''
        print ("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath();
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma+'.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg);
        self.decoder.set_search("default");
        print ("[updateGrammar]---")

    def startListening(self):
        """ Instruct decoder that a new utterance should be expected """
        self.decoder.start_utt(None)

    def stopListening(self):
        """ Instruct decoder that no further data belongs to this utterance """
        self.decoder.end_utt()

    def process_raw(self, data):
        """ Feed decoder with raw audio data, then refresh the VAD state pair """
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state();

    def calculateHypothesis(self):
        """ Return the decoder hypothesis (valid only after stopListening()) """
        return self.decoder.hyp();

    def calculateVadState(self):
        # BUG FIX: previously returned the bound method object itself (always
        # truthy) instead of calling it and returning the actual VAD state.
        return self.decoder.get_vad_state();

    def isVoiceStarted(self):
        ''' silence -> speech transition '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        ''' speech -> silence transition '''
        return not self.currentVadState and self.previousVadState
class SphinxWrapper(object):
    '''
    Thin wrapper around a pocketsphinx Decoder with simple VAD bookkeeping.

    Feed audio via `process_raw(...)`, which also refreshes the VAD state.
    Before feeding a new utterance call `startListening()`.  When
    `isVoiceEnded()` reports a speech -> silence transition call
    `stopListening()`; only then may the hypothesis be requested with
    `calculateHypothesis()`.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None
    config = None
    previousVadState = 0  # VAD state after the previous audio chunk
    currentVadState = 0   # VAD state after the most recent audio chunk

    def __init__(self):
        ''' Constructor '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where sphinx decoder is initialized or grammar updated
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            # BUG FIX: updateGrammar() takes only the grammar name; the old
            # call passed self.decoder as an extra positional argument and
            # raised TypeError on every grammar switch.
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create configuration with acoustic model path, grammar and dictionary
        '''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm',
                          os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg',
                          os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update decoder language model from fsg file
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(
            os.path.join("../resource/", pGramma + '.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print("[updateGrammar]---")

    def startListening(self):
        """ Instruct decoder that a new utterance should be expected """
        self.decoder.start_utt(None)

    def stopListening(self):
        """ Instruct decoder that no further data belongs to this utterance """
        self.decoder.end_utt()

    def process_raw(self, data):
        """ Feed decoder with raw audio data, then refresh the VAD state pair """
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()

    def calculateHypothesis(self):
        """ Return the decoder hypothesis (valid only after stopListening()) """
        return self.decoder.hyp()

    def calculateVadState(self):
        # BUG FIX: previously returned the bound method object itself (always
        # truthy) instead of calling it and returning the actual VAD state.
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        ''' silence -> speech transition '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        ''' speech -> silence transition '''
        return not self.currentVadState and self.previousVadState
class PocketSphinxASR(ASR):
    """Offline ASR module backed by pocketsphinx.

    Downloads the language pack for the active language on first start and
    decodes microphone audio until the decoder reports end of speech.
    """

    NAME = 'Pocketsphinx ASR'
    DEPENDENCIES = {
        'system': [
            'swig',
            'libpulse-dev'
        ],
        'pip'   : [
            'pocketsphinx==0.1.15'
        ]
    }

    # Per-language model artifacts: acoustic model tarball, LM, dictionary.
    LANGUAGE_PACKS = {
        'en': [
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/en-us/en-us.tar',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/en-us/en-us.lm.bin',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/en-us/cmudict-en-us.dict'
        ],
        'fr': [
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/fr-fr/fr-fr.tar',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/fr-fr/fr-fr.lm.bin',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/fr-fr/cmudict-fr-fr.dict'
        ],
        'de': [
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/de-de/de-de.tar',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/de-de/de-de.lm.bin',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/de-de/cmudict-de-de.dict'
        ]
    }

    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = False  # fully offline engine
        self._decoder: Optional[Decoder] = None
        self._config = None

    def onStart(self):
        """Download the model if missing, then build the decoder."""
        super().onStart()
        if not self.checkLanguage():
            self.downloadLanguage()
        # NOTE(review): the python3.7 site-packages path is hard-coded here
        # and below — breaks on any other interpreter version.
        self._config = Decoder.default_config()
        self._config.set_string('-hmm', f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}')
        self._config.set_string('-lm', f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}.lm.bin')
        self._config.set_string('-dict', f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/cmudict-{self.LanguageManager.activeLanguageAndCountryCode.lower()}.dict')
        self._decoder = Decoder(self._config)

    def checkLanguage(self) -> bool:
        """Return True when the active language's model directory exists."""
        if not Path(self.Commons.rootDir(), f'venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}').exists():
            self.logInfo('Missing language model')
            return False
        return True

    def timeout(self):
        """Close the current utterance when the capture timed out."""
        super().timeout()
        try:
            self._decoder.end_utt()
        except:
            # If this fails we don't care, at least we tried to close the utterance
            pass

    def downloadLanguage(self) -> bool:
        """Fetch and install the language pack for the active language."""
        self.logInfo(f'Downloading language model for "{self.LanguageManager.activeLanguage}"')
        venv = Path(self.Commons.rootDir(), 'venv/lib/python3.7/site-packages/pocketsphinx/')
        for url in self.LANGUAGE_PACKS[self.LanguageManager.activeLanguage]:
            filename = Path(url).name
            download = Path(venv, 'model', filename)
            self.Commons.downloadFile(url=f'{url}?raw=true', dest=str(download))
            if download.suffix == '.tar':
                # Acoustic model ships as a tarball: unpack and drop the archive.
                dest = Path(venv, 'model', self.LanguageManager.activeLanguageAndCountryCode.lower())
                if dest.exists():
                    shutil.rmtree(dest)
                tar = tarfile.open(str(download))
                tar.extractall(str(dest))
                download.unlink()
        self.logInfo('Downloaded and installed')
        return True

    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Capture audio for `session` and decode until end of speech.

        Returns an ASRResult with the hypothesis, or None when nothing was
        recognized before the timeout.
        """
        super().decodeStream(session)
        result = None
        with Stopwatch() as processingTime:
            with Recorder(self._timeout) as recorder:
                self.ASRManager.addRecorder(session.siteId, recorder)
                self._decoder.start_utt()
                inSpeech = False
                for chunk in recorder:
                    if self._timeout.isSet():
                        break
                    self._decoder.process_raw(chunk, False, False)
                    # Stop on the speech -> silence transition.
                    if self._decoder.get_in_speech() != inSpeech:
                        inSpeech = self._decoder.get_in_speech()
                        if not inSpeech:
                            self._decoder.end_utt()
                            result = self._decoder.hyp() if self._decoder.hyp() else None
                            break
                self.end(recorder, session)
        return ASRResult(
            text=result.hypstr.strip(),
            session=session,
            likelihood=self._decoder.hyp().prob,
            processingTime=processingTime.time
        ) if result else None
class Wrapper():
    """ROS node wrapper: feeds esiaf audio chunks into pocketsphinx and
    publishes SpeechInfo messages with the recognized text and timestamps."""

    def __init__(self, **kwargs):
        # Let SIGINT (Ctrl+C) stop the node via our stop() handler.
        signal.signal(signal.SIGINT, self.stop)
        model_path = get_model_path()
        # Expand environment variables in every string-valued option.
        kwargs = {
            x: os.path.expandvars(kwargs[x]) if type(kwargs[x]) is str else kwargs[x]
            for x in kwargs
        }
        nodename = kwargs.pop('nodename')
        grammar_file = kwargs.pop('grammar_file', None)
        grammar_rule = kwargs.pop('grammar_rule', None)
        grammar_name = kwargs.pop('grammar_name', None)
        kwargs.pop('esiaf_input_topic')
        # Accept 'dic' as an alias for 'dict'.
        if kwargs.get('dic') is not None and kwargs.get('dict') is None:
            kwargs['dict'] = kwargs.pop('dic')
        # Fall back to the bundled US-English model files where unset.
        if kwargs.get('hmm') is None:
            kwargs['hmm'] = os.path.join(model_path, 'en-us')
        if kwargs.get('lm') is None:
            kwargs['lm'] = os.path.join(model_path, 'en-us.lm.bin')
        if kwargs.get('dict') is None and kwargs.get('dic') is None:
            kwargs['dict'] = os.path.join(model_path, 'cmudict-en-us.dict')
        # Unless verbose was requested, discard pocketsphinx logging.
        if kwargs.pop('verbose', False) is False:
            if sys.platform.startswith('win'):
                kwargs['logfn'] = 'nul'
            else:
                kwargs['logfn'] = '/dev/null'
        config = Decoder.default_config()
        print(kwargs)
        # Copy the remaining options into the decoder config, typed by value.
        for key, value in kwargs.items():
            if isinstance(value, bool):
                config.set_boolean('-{}'.format(key), value)
            elif isinstance(value, int):
                config.set_int('-{}'.format(key), value)
            elif isinstance(value, float):
                config.set_float('-{}'.format(key), value)
            elif isinstance(value, str):
                config.set_string('-{}'.format(key), value)
        self.decoder = Decoder(config)
        # Optionally restrict the search to a JSGF grammar rule.
        if grammar_file and grammar_rule and grammar_name:
            jsgf = Jsgf(grammar_file)
            rule = jsgf.get_rule(grammar_name + '.' + grammar_rule)
            fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
            self.decoder.set_fsg(grammar_name, fsg)
            self.decoder.set_search(grammar_name)
        self.start = None   # timestamp of the first chunk of current utterance
        self.finish = None  # timestamp of the most recent chunk
        self.speech_publisher = rospy.Publisher(nodename + '/' + 'SpeechRec',
                                                SpeechInfo,
                                                queue_size=10)

    def stop(self, *args, **kwargs):
        # Raised from the signal handler to break out of the node's main loop.
        raise StopIteration

    def hypothesis(self):
        """Return the current hypothesis string, or '' when there is none."""
        hyp = self.decoder.hyp()
        if hyp:
            return hyp.hypstr
        else:
            return ''

    def vad_finished_callback(self):
        """End the utterance and publish the recognized text with timestamps."""
        self.decoder.end_utt()
        result = ''
        if self.decoder.hyp():
            result = self.hypothesis()
        rospy.loginfo('understood: \'' + str(result) + '\'')
        hypo = SpeechHypothesis()
        hypo.recognizedSpeech = result
        hypo.probability = 1.0
        time = RecordingTimeStamps()
        time.start = self.start
        time.finish = self.finish
        speechInfo = SpeechInfo()
        speechInfo.hypotheses = [hypo]
        speechInfo.duration = time
        self.speech_publisher.publish(speechInfo)
        # Reset so the next chunk opens a fresh utterance.
        self.start = None
        self.finish = None

    def add_audio_data(self, audio_data, recording_timestamps):
        """Feed one audio chunk plus its serialized RecordingTimeStamps."""
        _recording_timestamps = RecordingTimeStamps()
        msg_from_string(_recording_timestamps, recording_timestamps)
        rospy.loginfo('got audio!')
        # First chunk of a new utterance: remember its start, open the utterance.
        if not self.start:
            self.start = _recording_timestamps.start
            self.decoder.start_utt()
        self.finish = _recording_timestamps.finish
        # NOTE(review): this local shadows the builtin `bytearray`.
        bytearray = audio_data.tobytes()
        self.decoder.process_raw(bytearray, False, False)
class VoiceService(object):
    """Keyword-prompt service: plays a prompt (TTS or audio URL), then listens
    with pocketsphinx for a specific search word until detection or timeout.
    Prompts are queued; an idle default prompt listens for "paradrop"."""

    audio_device = None     # None selects the default capture device
    buffer_size = 2048      # bytes read from the device per loop iteration
    sampling_rate = 16000   # Hz

    def __init__(self):
        config = get_decoder_config()
        self.decoder = Decoder(config)
        self.speech = pyttsx3.init()
        self.audio = sphinxbase.Ad(self.audio_device, self.sampling_rate)
        self.buffer = bytearray(self.buffer_size)
        self.default_search = self.decoder.get_search()
        self.in_speech = False     # decoder speech state from the last loop pass
        self.max_history = 100     # cap on stored recognized phrases
        self.phrases = []
        self.prompts = {}          # prompt id (str) -> prompt dict
        self.next_prompt_id = 1
        self.current_prompt = None
        self.prompt_queue = queue.Queue()

    def create_prompt(self, message=None, message_url=None, search="enable", timeout=15):
        """
        Create a new prompt and add it to the queue.

        Currently, only one type of prompt is supported. We play a message,
        then wait for someone to say a specific word (the search word) within
        the alloted amount of time.

        The status of the prompt can be retrieved by calling get_prompt with
        the appropriate id.

        timeout: prompt timeout in seconds, expected to be either None or numeric.
        """
        if timeout is not None:
            # Be forgiving of caller who may have passed timeout as a string.
            timeout = float(timeout)
        prompt = {
            "created_time": time.time(),
            "detected": False,
            "detected_time": None,
            "id": self.get_next_prompt_id(),
            "message": message,
            "message_url": message_url,
            "search": search,
            "search_started": False,
            "search_started_time": None,
            "played": False,
            "played_time": None,
            "timeout": timeout,
            "timed_out": False
        }
        self.prompts[str(prompt['id'])] = prompt
        self.prompt_queue.put(prompt)
        return prompt

    def get_next_prompt_id(self):
        """
        Get a unique ID for a prompt.
        """
        tmp = self.next_prompt_id
        self.next_prompt_id += 1
        return tmp

    def get_phrases(self):
        """
        Get the history of detected phrases.
        """
        return self.phrases

    def get_prompt(self, prompt_id):
        """
        Get information about a prompt.
        """
        return self.prompts[str(prompt_id)]

    def get_status(self):
        """
        Get the system status.
        """
        status = {
            "current_prompt": self.current_prompt,
            "in_speech": self.decoder.get_in_speech(),
            "queue_length": self.prompt_queue.qsize(),
            "search": self.decoder.get_search()
        }
        return status

    def play_prompt(self, prompt):
        """Play the prompt's audio URL (via mplayer) or speak its message."""
        prompt['played_time'] = time.time()
        if prompt.get("message_url", None) is not None:
            cmd = ["mplayer", "-ao", "pulse", prompt['message_url']]
            subprocess.call(cmd)
        elif prompt.get("message", None) is not None:
            self.speech.say(prompt['message'])
            self.speech.runAndWait()
        prompt['played'] = True

    def process_hypothesis(self, hypothesis):
        """Record a recognized phrase, trimming history to max_history."""
        print("SPEECH {}".format(hypothesis.hypstr))
        phrase = {
            "search": self.decoder.get_search(),
            "time": time.time(),
            "text": hypothesis.hypstr
        }
        self.phrases.append(phrase)
        del self.phrases[:-self.max_history]

    def run_next_prompt(self):
        """Dequeue (or synthesize the idle) prompt, play it, start its search."""
        if self.prompt_queue.empty():
            # Idle prompt: silently listen for "paradrop" with no timeout.
            self.create_prompt(None, search="paradrop", timeout=None)
        self.current_prompt = self.prompt_queue.get_nowait()
        self.decoder.set_search(self.current_prompt['search'])
        # Pause capture while the prompt audio plays to avoid hearing ourselves.
        self.audio.stop_recording()
        self.play_prompt(self.current_prompt)
        self.audio.start_recording()
        self.current_prompt['search_started_time'] = time.time()
        self.current_prompt['search_started'] = True

    def detect_timeout(self):
        """
        Check if the current prompt has timed out.
        """
        if self.current_prompt is None:
            # No active prompt to timeout.
            return False
        if self.decoder.get_in_speech():
            # Defer timeout if decoder reports that speech is in progress. A
            # person may be speaking the target phrase currently.
            return False
        if self.current_prompt['timeout'] is None:
            # If timeout is None, then only timeout when there is another item
            # in the queue.
            return not self.prompt_queue.empty()
        else:
            diff = time.time() - self.current_prompt['search_started_time']
            return diff >= self.current_prompt['timeout']

    def run(self):
        """Main loop: register keyphrases and alternate prompt/listen forever."""
        self.decoder.set_keyphrase("activate", "activate")
        self.decoder.set_keyphrase("allow", "allow")
        self.decoder.set_keyphrase("enable", "enable")
        self.decoder.set_keyphrase("paradrop", "para drop")
        self.audio.start_recording()
        while True:
            if self.current_prompt is None:
                self.run_next_prompt()
                self.decoder.start_utt()
            self.audio.readinto(self.buffer)
            self.decoder.process_raw(self.buffer, False, False)
            # speech -> silence transition: evaluate the finished utterance.
            if self.in_speech and not self.decoder.get_in_speech():
                self.decoder.end_utt()
                hypothesis = self.decoder.hyp()
                if hypothesis is not None:
                    self.process_hypothesis(hypothesis)
                    self.current_prompt['detected'] = True
                    self.current_prompt['detected_time'] = time.time()
                    self.current_prompt = None
                else:
                    # Nothing recognized; keep listening in a fresh utterance.
                    self.decoder.start_utt()
            if self.detect_timeout():
                self.decoder.end_utt()
                self.current_prompt['timed_out'] = True
                self.current_prompt = None
            self.in_speech = self.decoder.get_in_speech()
from os import environ, path

from sphinxbase import Config
from pocketsphinx import Decoder

MODELDIR = "pocketsphinx/model"
DATADIR = "pocketsphinx/test/data"

# Create a decoder with the bundled US-English acoustic model, language
# model and pronunciation dictionary.
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
# BUG FIX: the decoder was constructed twice and the first instance thrown
# away; build it exactly once.
decoder = Decoder(config)

# Decode streaming data: feed the raw 16 kHz PCM file in 1 KiB chunks.
# BUG FIX: the file handle was never closed; use a context manager.
decoder.start_utt()
with open(path.join(DATADIR, 'goforward.raw'), 'rb') as stream:
    while True:
        buf = stream.read(1024)
        if not buf:
            break
        decoder.process_raw(buf, False, False)
decoder.end_utt()

print('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])
class PocketSphinxAsr(Asr):
    """Offline ASR module backed by pocketsphinx.

    Downloads the language pack for the active language on first start and
    decodes microphone audio until end of speech, emitting partial results
    along the way.
    """

    NAME = 'Pocketsphinx Asr'
    DEPENDENCIES = {
        'system': ['swig', 'libpulse-dev'],
        'pip': ['pocketsphinx==0.1.15']
    }

    # URL templates; %lang% is substituted with the lowercased language code.
    LANGUAGE_PACK = {
        f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/%lang%/%lang%.tar',
        f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/%lang%/%lang%.lm.bin',
        f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/%lang%/cmudict-%lang%.dict'
    }

    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = False  # fully offline engine
        self._decoder: Optional[Decoder] = None
        self._config = None

    def onStart(self):
        """Download the model if missing, then build the decoder."""
        super().onStart()
        if not self.checkLanguage():
            self.downloadLanguage()
        # NOTE(review): the python3.7 site-packages path is hard-coded here
        # and below — breaks on any other interpreter version.
        self._config = Decoder.default_config()
        self._config.set_string(
            '-hmm',
            f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}'
        )
        self._config.set_string(
            '-lm',
            f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}.lm.bin'
        )
        self._config.set_string(
            '-dict',
            f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/cmudict-{self.LanguageManager.activeLanguageAndCountryCode.lower()}.dict'
        )
        self._decoder = Decoder(self._config)

    def checkLanguage(self) -> bool:
        """Return True when the active language's model directory exists."""
        if not Path(
                self.Commons.rootDir(),
                f'venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}'
        ).exists():
            self.logInfo('Missing language model')
            return False
        return True

    def timeout(self):
        """Close the current utterance when the capture timed out."""
        super().timeout()
        try:
            self._decoder.end_utt()
        except:
            # If this fails we don't care, at least we tried to close the utterance
            pass

    def downloadLanguage(self, forceLang: str = '') -> bool:
        """Fetch and install the language pack; falls back to en-US on failure."""
        lang = forceLang or self.LanguageManager.activeLanguageAndCountryCode
        self.logInfo(f'Downloading language model for "{lang}"')
        venv = Path(self.Commons.rootDir(), 'venv/lib/python3.7/site-packages/pocketsphinx/')
        for url in self.LANGUAGE_PACK:
            url = url.replace('%lang%', lang.lower())
            filename = Path(url).name
            download = Path(venv, 'model', filename)
            result = self.Commons.downloadFile(url=f'{url}?raw=true', dest=str(download))
            if not result:
                if forceLang:
                    # Already on the fallback language; give up.
                    return False
                else:
                    # TODO be universal
                    self.downloadLanguage(forceLang='en-US')
            else:
                if download.suffix == '.tar':
                    # Acoustic model ships as a tarball: unpack, drop archive.
                    dest = Path(venv, 'model', lang.lower())
                    if dest.exists():
                        shutil.rmtree(dest)
                    tar = tarfile.open(str(download))
                    tar.extractall(str(dest))
                    download.unlink()
        self.logInfo('Downloaded and installed')
        return True

    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Capture audio for `session` and decode until end of speech.

        Emits a partial result every 10 chunks with a hypothesis; returns an
        ASRResult with the final hypothesis, or None when nothing was
        recognized before the timeout.
        """
        super().decodeStream(session)
        result = None
        counter = 0
        with Stopwatch() as processingTime:
            with Recorder(self._timeout, session.user, session.deviceUid) as recorder:
                self.ASRManager.addRecorder(session.deviceUid, recorder)
                self._recorder = recorder
                self._decoder.start_utt()
                inSpeech = False
                for chunk in recorder:
                    if self._timeout.isSet():
                        break
                    self._decoder.process_raw(chunk, False, False)
                    hypothesis = self._decoder.hyp()
                    if hypothesis:
                        counter += 1
                        # Throttle partial notifications to every 10th chunk.
                        if counter == 10:
                            self.partialTextCaptured(session, hypothesis.hypstr, hypothesis.prob, processingTime.time)
                            counter = 0
                    # Stop on the speech -> silence transition.
                    if self._decoder.get_in_speech() != inSpeech:
                        inSpeech = self._decoder.get_in_speech()
                        if not inSpeech:
                            self._decoder.end_utt()
                            result = self._decoder.hyp() if self._decoder.hyp() else None
                            break
                self.end()
        return ASRResult(
            text=result.hypstr.strip(),
            session=session,
            likelihood=self._decoder.hyp().prob,
            processingTime=processingTime.time) if result else None
def recognition_worker(audio_file, queue, event, max_no_speech=120, debug=False,
                       hmm='/usr/local/share/pocketsphinx/model/en-us/en-us',
                       lm='/usr/local/share/pocketsphinx/model/en-us/en-us.lm.bin',
                       cmudict='/usr/local/share/pocketsphinx/model/en-us/cmudict-en-us.dict'):
    '''
    Read audio from `audio_file` and feed it to pocketsphinx.  Put recognized
    text in `queue`.  Shut down if `event` is set.  If no speech is detected
    for `max_no_speech` seconds, set `event` and quit.

    Args:
        audio_file: path to a WAV-style file of raw PCM audio after a header.
        queue: queue.Queue-like object receiving recognized phrase strings.
        event: threading.Event-like object used both as a stop signal (by the
            caller) and a "no speech" notification (set by this worker).
        max_no_speech: seconds of continuous silence before giving up.
        debug: when True, print progress to stderr and keep sphinx logging.
        hmm / lm / cmudict: pocketsphinx model file locations.
    '''
    from pocketsphinx import Decoder

    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', cmudict)
    if not debug:
        # Silence pocketsphinx's own logging.
        config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    # Start True so the first silence does not immediately trigger processing.
    in_speech_bf = True
    no_speech_timer = None
    now_in_speech = False

    decoder.start_utt()
    try:
        with open(audio_file, 'rb') as f:
            # NOTE(review): canonical RIFF/WAV headers are 44 bytes; confirm
            # the producer of `audio_file` really emits a 40-byte header.
            f.read(40)  # read RIFF header
            # TODO: Probably should sanity check the audio format...
            while not event.is_set():
                buf = f.read(1024)
                if buf:
                    decoder.process_raw(buf, False, False)
                    now_in_speech = decoder.get_in_speech()
                    if debug and now_in_speech:
                        print('Found speech', file=sys.stderr)
                    # Act only on speech/silence state transitions.
                    if now_in_speech != in_speech_bf:
                        in_speech_bf = now_in_speech
                        if not in_speech_bf:
                            if debug:
                                print('Processing speech', file=sys.stderr)
                            # No speech, but there was speech before, so, process.
                            decoder.end_utt()
                            try:
                                speech = decoder.hyp().hypstr
                                if speech != '':
                                    if debug:
                                        print('Speech: ' + speech, file=sys.stderr)
                                    queue.put_nowait(speech)
                            except AttributeError:
                                # decoder.hyp() was None: nothing recognized.
                                pass
                            decoder.start_utt()
                        else:
                            # Got some speech, reset timer.
                            no_speech_timer = None
                else:
                    if debug:
                        print('No audio', file=sys.stderr)
                    # Wait a bit...
                    event.wait(0.1)
                # Track how long we have gone without hearing any speech.
                if not now_in_speech:
                    if no_speech_timer is None:
                        no_speech_timer = datetime.datetime.now()
                    elif (datetime.datetime.now() - no_speech_timer).total_seconds() > max_no_speech:
                        if debug:
                            print('No speech, timing out', file=sys.stderr)
                        event.set()
    except KeyboardInterrupt:
        pass
def run_decoder(self, stream):
    """Continuously decode `stream` audio, dispatching registered callbacks.

    Two kinds of triggers are handled:
      * time-based: when the current minute matches a key in
        self.callback_time_dict, that callback fires;
      * keyword-based: when the tail of the hypothesis matches a key in
        self.callbacks_dict, that callback fires with the hypothesis text.
    After any callback the utterance is restarted.  The decoder is also reset
    when the hypothesis stalls for 5 s or grows past 25 characters without a
    keyword, to keep the search space small.
    """
    # Process audio chunk by chunk. On keyword detected process/restart
    decoder = Decoder(self.config)
    #decoder.set_search('keywords')
    decoder.start_utt()
    last_decode_str = None          # last hypothesis text we saw
    last_decode_time = perf_counter()  # when the hypothesis last changed
    # https://stackoverflow.com/a/47371315/8903959
    while True:
        buf = stream.read(1024)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            break
        # Time-based trigger: truncate "now" to the minute and look it up.
        _time_check = datetime.now().replace(second=0, microsecond=0)
        if _time_check in self.callback_time_dict:
            stream.stop_stream()
            decoder.end_utt()
            self.callback_time_dict[_time_check]()
            # Wait until the next minute
            time.sleep(60)
            stream.start_stream()
            print("Listening again\r")
            decoder.start_utt()
            just_restarted = True
        if decoder.hyp() is not None:
            if last_decode_str == decoder.hyp().hypstr:
                # Hypothesis unchanged: reset after 5 s of no new words.
                reset_max = 5
                if perf_counter() - last_decode_time > reset_max:
                    print(
                        f"No kwrds in the last {reset_max}s, resetting\r")
                    decoder.end_utt()
                    decoder.start_utt()
                    continue
            else:
                last_decode_str = decoder.hyp().hypstr
                last_decode_time = perf_counter()
                print(decoder.hyp().hypstr + "\r")
            just_restarted = False
            # Match every suffix of the hypothesis against the callback keys.
            split_words = decoder.hyp().hypstr.lower().split()
            for i in range(len(split_words)):
                together = " ".join(split_words[i:])
                if together in self.callbacks_dict:
                    stream.stop_stream()
                    decoder.end_utt()
                    callback = self.callbacks_dict[together]
                    #print([(seg.word, seg.prob) for seg in decoder.seg()])
                    print(f"\n{callback.__name__}")
                    try:
                        callback(decoder.hyp().hypstr)
                    except Exception as e:
                        # A failing callback must not kill the listen loop.
                        print(e)
                    stream.start_stream()
                    print("Listening again\r")
                    decoder.start_utt()
                    just_restarted = True
                    break
            # Long hypothesis with no keyword: restart to bound the search.
            if not just_restarted and len(decoder.hyp().hypstr) > 25:
                print("No keyword, restarting search\r")
                decoder.end_utt()
                decoder.start_utt()
# Python 2 script: calibrate the microphone, capture one phrase and map the
# pocketsphinx hypothesis to a robot command via getCommand().
# Tune the recognizer's VAD thresholds for short command phrases.
r = sr.Recognizer()
r.energy_threshold = 1000  # minimum audio energy to consider for recording
r.pause_threshold = 0.25  # seconds of non-speaking audio before a phrase is considered complete
r.phrase_threshold = 0.15  # minimum seconds of speaking audio before we consider it a phrase
r.non_speaking_duration = 0.25  # seconds of non-speaking audio to keep on both sides
with sr.Microphone() as source:
    print("Please wait. Calibrating microphone...")
    # listen for 5 seconds and create the ambient noise energy level
    r.adjust_for_ambient_noise(source, duration=5)
    print "ABLE is listening..."
    audio = r.listen(source)
    try:
        print "ABLE is recognizing..."
        # Resample to the 16 kHz / 16-bit format pocketsphinx expects.
        raw_data = audio.get_raw_data(convert_rate=16000, convert_width=2)
        decoder.start_utt()
        decoder.process_raw(raw_data, False, True)
        decoder.end_utt()
        hypothesis = decoder.hyp()
        HYPOTESIS = hypothesis.hypstr.split()
        RESULT = getCommand(hypothesis.hypstr)
        print HYPOTESIS, RESULT
        if (RESULT == "GO"):
            print("ABLE: MOTORS ARE STARTING")
            sleep(2)  # time in seconds
            print("ABLE: MOTORS ARE READY")
        else:
            print("DOESNT EXIST ACTION REQUIERED")
    except:
        # NOTE(review): bare except hides real errors (e.g. hypothesis is
        # None); consider catching AttributeError explicitly.
        print "ABLE SAYS BYE..."
class TestVoice(Voice):
    """Voice front-end: records and plays audio with PyAudio, recognizes
    Spanish speech with pocketsphinx (optionally grammar-constrained), and
    speaks either locally via pyttsx3 or through Google Cloud Text-to-Speech.
    """

    # playback / recording
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100  # audio sample rate, Hz
    CHUNK = 1024
    FILE_NAME = 'aux.wav'

    # recognition
    MODELDIR = "es-ES"
    GRAMMARDIR = "gram"

    # text to speech
    # BUG FIX: this attribute was previously also named RATE, silently
    # overwriting the 44100 Hz audio sample rate above with 150 (the pyttsx3
    # speech rate in words per minute), so microphone streams and WAV files
    # were opened at 150 Hz.
    TTS_RATE = 150
    VOLUME = 0.9

    def __init__(self, file_name='aux.wav', raspi=False, local=True):
        """Load the acoustic model, calibrate the microphone and set up TTS.

        Args:
            file_name: WAV file used as the recording scratch buffer.
            raspi: True when running on a Raspberry Pi (mono input,
                input_device_index=2 instead of 7).
            local: True -> pyttsx3 offline TTS; False -> Google Cloud TTS.
        """
        ## load environment
        self.FILE_NAME = file_name
        self.audio = pyaudio.PyAudio()
        self.raspi = raspi
        self.local = local

        # pocketsphinx decoder configured with the Spanish model; -logfn to
        # os.devnull silences the decoder's verbose startup logging.
        self.config = Decoder.default_config()
        self.config.set_string('-hmm',
                               os.path.join(self.MODELDIR, 'acoustic-model'))
        self.config.set_string(
            '-dict',
            os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
        self.config.set_string('-logfn', os.devnull)
        self.decoder = Decoder(self.config)

        self.r = sr.Recognizer()
        print("adjunting...")
        with sr.Microphone() as source:
            self.r.adjust_for_ambient_noise(source)

        # tts
        if self.local:
            self.tts = pyttsx3.init()
            self.tts.setProperty('rate', self.TTS_RATE)
            self.tts.setProperty('volume', self.VOLUME)
            self.tts.setProperty('voice', 'spanish-latin-am')
        else:
            # Instantiates a client
            self.tts_client = texttospeech.TextToSpeechClient()
            # Build the voice request, select the language code ("en-US") and
            # the ssml voice gender ("neutral")
            self.tts_voice = texttospeech.types.VoiceSelectionParams(
                language_code='es-ES',
                ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
            # Select the type of audio file you want returned
            self.tts_audio_config = texttospeech.types.AudioConfig(
                audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    def speak(self, phrase):
        """Speak `phrase` aloud using the configured TTS backend."""
        print('decir: ' + phrase)
        if self.local:
            self.tts.say(phrase)
            self.tts.runAndWait()
        else:
            # Set the text input to be synthesized
            synthesis_input = texttospeech.types.SynthesisInput(text=phrase)
            # Perform the text-to-speech request on the text input with the
            # selected voice parameters and audio file type
            response = self.tts_client.synthesize_speech(
                synthesis_input, self.tts_voice, self.tts_audio_config)
            audio_file = 'tts.mp3'
            # The response's audio_content is binary.
            with open(audio_file, 'wb') as out:
                out.write(response.audio_content)
            print('reproducir voz sintetica')
            command = '/usr/bin/mpg321 ' + audio_file
            print(command)
            os.system(command)

    def play(self, filename):
        """Play a .wav (via PyAudio) or .mp3 (via mpg321) file."""
        print('reproduciendo archivo: ' + filename)
        extension = filename.split('.')[-1]
        if extension == 'wav':
            wf = wave.open(filename, 'rb')
            stream = self.audio.open(
                format=self.audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)
            data = wf.readframes(self.CHUNK)
            # play
            while len(data) > 0:
                stream.write(data)
                data = wf.readframes(self.CHUNK)
            stream.stop_stream()
            stream.close()
        elif extension == 'mp3':
            command = '/usr/bin/mpg321 ' + filename
            print(command)
            os.system(command)

    def listen(self, duration=3):
        """Record `duration` seconds of audio to FILE_NAME and return it as
        raw 16 kHz / 16-bit PCM bytes suitable for pocketsphinx."""
        # start recording
        # NOTE(review): input_device_index is hard-coded (2 on raspi, 7
        # otherwise) — verify against the target machine's device list.
        if self.raspi:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=1,
                                     rate=self.RATE,
                                     input_device_index=2,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        else:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=self.CHANNELS,
                                     rate=self.RATE,
                                     input_device_index=7,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        frames = []
        for i in range(0, int(self.RATE / self.CHUNK * duration)):
            data = stream.read(self.CHUNK, exception_on_overflow=False)
            frames.append(data)
        stream.stop_stream()
        stream.close()

        wave_file = wave.open(self.FILE_NAME, 'wb')
        if self.raspi:
            wave_file.setnchannels(1)
        else:
            wave_file.setnchannels(self.CHANNELS)
        wave_file.setsampwidth(self.audio.get_sample_size(self.FORMAT))
        wave_file.setframerate(self.RATE)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()

        with sr.AudioFile(self.FILE_NAME) as source:
            audio = self.r.record(source)
        raw_data = audio.get_raw_data(convert_rate=16000, convert_width=2)
        return raw_data

    def echo(self):
        """Play back the last recording."""
        self.play(self.FILE_NAME)

    def recognize(self):
        """Listen on the microphone and return the recognized text, or None
        when nothing was recognized (decoder.hyp() returns None and the
        resulting AttributeError is caught)."""
        with sr.Microphone() as source:
            audio = self.r.listen(source)
        # raw_out = self.listen()
        try:
            self.decoder.start_utt()
            self.decoder.process_raw(audio.frame_data, False, True)
            self.decoder.end_utt()
            hyp = self.decoder.hyp()
            return hyp.hypstr
        except Exception:
            return None

    def loadGrammar(self, grammar):
        """Reinitialize the decoder constrained by GRAMMARDIR/<grammar>.gram."""
        # delete(self.decoder)
        grammar_file = grammar + '.gram'
        c_string = os.path.join(self.GRAMMARDIR, grammar_file)  #.encode('ascii')
        print(c_string)
        self.config.set_string('-jsgf', c_string)
        self.decoder.reinit(self.config)

    def close(self):
        """Release the PyAudio handle."""
        self.audio.terminate()
p = pyaudio.PyAudio() stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) #Indicate listening for next utterance print ("READY....") frames = [] utt_started = False decoder.start_utt(None) while True: data = stream.read(CHUNK) time.sleep (0.100) #frames.append(data) decoder.process_raw(data, False, False) in_speech = decoder.get_in_speech() if in_speech and not utt_started: #silence -> speech transition, #let user know that he is heard print("Started...\n") utt_started = True if not in_speech and utt_started: #speech -> silence transition,
class PocketsphinxEngine(WakeWordEnginePlugin):
    """Wake-word engine backed by CMU Pocketsphinx keyphrase spotting.

    Incoming audio is kept in a fixed-length rolling byte buffer sized to
    ``wake_word_length`` seconds; every chunk is appended, the oldest bytes
    are dropped, and the whole window (plus a little trailing silence) is
    re-decoded.  ``on_activation`` fires when the wake word appears in the
    hypothesis.
    """

    _config = {
        'phonemes': 'HH EY . M AY K R AO F T',
        'threshold': '1e-90',
        'wake_word_length': 1.2
    }
    # Padding of silence when feeding to pocketsphinx
    SILENCE_SEC = 0.01
    url = 'https://github.com/MatthewScholefield/pocketsphinx-models/raw/master/{lang}.tar.gz'

    def __init__(self, rt, on_activation: Callable):
        super().__init__(rt, on_activation)
        lang = rt.config['lang']
        self.hmm_folder = join(rt.paths.user_config, 'models', lang)
        self.rate = self.rec_config['sample_rate']
        self.width = self.rec_config['sample_width']
        self.padding = b'\0' * int(self.rate * self.width * self.SILENCE_SEC)
        self.buffer = b''
        # Fetch the language-specific acoustic model on first use.
        download_extract_tar(self.url.format(lang=lang), self.hmm_folder)

        cfg = Decoder.default_config()
        cfg.set_string('-hmm', self.hmm_folder)
        cfg.set_string('-dict',
                       self._create_dict(self.wake_word,
                                         self.config['phonemes']))
        cfg.set_string('-keyphrase', self.wake_word)
        cfg.set_float('-kws_threshold', float(self.config['threshold']))
        cfg.set_float('-samprate', self.rate)
        cfg.set_int('-nfft', 2048)
        cfg.set_string('-logfn', '/dev/null')  # mute decoder logging
        self.ps = Decoder(cfg)

    @staticmethod
    def _create_dict(key_phrase, phonemes):
        """Write a one-entry pronunciation dictionary and return its path.

        Syllable separators (' . ') are stripped from the phoneme string
        before writing.
        """
        handle, dict_path = tempfile.mkstemp()
        entry = key_phrase + ' ' + phonemes.replace(' . ', ' ')
        with os.fdopen(handle, 'w') as dict_file:
            dict_file.write(entry)
        return dict_path

    def _transcribe(self, raw_audio):
        """Decode one utterance worth of raw audio; return the hypothesis."""
        decoder = self.ps
        decoder.start_utt()
        decoder.process_raw(raw_audio, False, False)
        decoder.end_utt()
        return decoder.hyp()

    def startup(self):
        # Pre-fill the rolling window with wake_word_length seconds of
        # silence so the first decode already has a full-size buffer.
        window_bytes = int(self.width * self.rate *
                           self.config['wake_word_length'])
        self.buffer = b'\0' * window_bytes

    def shutdown(self):
        self.buffer = b''

    def pause_listening(self):
        pass

    def continue_listening(self):
        pass

    def update(self, raw_audio: bytes):
        # Slide the window: drop the oldest bytes, append the newest chunk.
        self.buffer = self.buffer[len(raw_audio):] + raw_audio
        hyp = self._transcribe(self.buffer + self.padding)
        if not hyp:
            return
        if self.wake_word in hyp.hypstr.lower():
            self.on_activation()