def retrieve_scores(word):
    """Force-align a recorded word against its JSGF grammar and return
    per-segment acoustic scores.

    Expects ``<word>.wav`` and ``<word>-align.jsgf`` in the working
    directory.  Returns the list produced by ``retrieve_segments`` for the
    last complete speech segment in the file (empty list if no speech was
    detected).
    """
    filename = word + '.wav'
    grammarname = word + '-align.jsgf'
    model_path = get_model_path()

    # Decoder configuration: acoustic model only -- the JSGF grammar
    # supplies the search space, so the language model is disabled.
    config = DefaultConfig()
    config.set_boolean('-verbose', False)
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_boolean('-lm', False)
    config.set_string('-dict', 'phonemes.dict.txt')
    config.set_boolean('-backtrace', True)
    config.set_boolean('-bestpath', False)
    config.set_boolean('-fsgusefiller', False)
    decoder = Decoder(config)

    # Point the search at the word-specific JSGF grammar.  get_rule() is
    # kept for its side effect only: it fails fast if the expected
    # 'forcing.<word>' rule is missing from the grammar file.
    jsgf = Jsgf(grammarname)
    jsgf.get_rule('forcing.' + word)
    decoder.set_jsgf_file('grammar', grammarname)
    decoder.set_search('grammar')

    utt_started = False
    scores = []
    # Fix: the original never closed the file; the with-block guarantees it.
    with open(filename, 'rb') as stream:
        decoder.start_utt()
        while True:
            buf = stream.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)
            in_speech = decoder.get_in_speech()
            if in_speech and not utt_started:
                utt_started = True
            if not in_speech and utt_started:
                # Speech segment ended: harvest the hypothesis, then
                # restart the utterance for any further segments.
                decoder.end_utt()
                hyp = decoder.hyp()
                if hyp is not None:
                    print('hyp: %s' % (hyp.best_score))
                    print_segments(decoder)
                    scores = retrieve_segments(decoder)
                decoder.start_utt()
                utt_started = False
        decoder.end_utt()
    print('scores:', scores)
    return scores
class VoiceService(object):
    """Voice prompt/response service built on a pocketsphinx keyphrase decoder.

    Prompts are queued via create_prompt(); run() plays each prompt
    (TTS or audio URL) and then listens for the prompt's keyphrase until
    it is detected or the prompt times out.
    """

    # Default audio capture settings (sphinxbase.Ad device parameters).
    audio_device = None
    buffer_size = 2048
    sampling_rate = 16000

    def __init__(self):
        config = get_decoder_config()
        self.decoder = Decoder(config)
        # Text-to-speech engine used for spoken prompt messages.
        self.speech = pyttsx3.init()
        self.audio = sphinxbase.Ad(self.audio_device, self.sampling_rate)
        self.buffer = bytearray(self.buffer_size)
        # Remember the decoder's initial search so it could be restored.
        self.default_search = self.decoder.get_search()
        # Tracks the decoder's in-speech flag from the PREVIOUS loop
        # iteration; run() uses it to detect speech->silence transitions.
        self.in_speech = False
        self.max_history = 100
        self.phrases = []
        self.prompts = {}
        self.next_prompt_id = 1
        self.current_prompt = None
        self.prompt_queue = queue.Queue()

    def create_prompt(self, message=None, message_url=None, search="enable", timeout=15):
        """ Create a new prompt and add it to the queue.

        Currently, only one type of prompt is supported. We play a message,
        then wait for someone to say a specific word (the search word)
        within the alloted amount of time.

        The status of the prompt can be retrieved by calling get_prompt
        with the appropriate id.

        timeout: prompt timeout in seconds, expected to be either None or
        numeric.
        """
        if timeout is not None:
            # Be forgiving of caller who may have passed timeout as a string.
            timeout = float(timeout)

        prompt = {
            "created_time": time.time(),
            "detected": False,
            "detected_time": None,
            "id": self.get_next_prompt_id(),
            "message": message,
            "message_url": message_url,
            "search": search,
            "search_started": False,
            "search_started_time": None,
            "played": False,
            "played_time": None,
            "timeout": timeout,
            "timed_out": False
        }

        # Indexed by stringified id so get_prompt() accepts either form.
        self.prompts[str(prompt['id'])] = prompt
        self.prompt_queue.put(prompt)
        return prompt

    def get_next_prompt_id(self):
        """ Get a unique ID for a prompt. """
        tmp = self.next_prompt_id
        self.next_prompt_id += 1
        return tmp

    def get_phrases(self):
        """ Get the history of detected phrases. """
        return self.phrases

    def get_prompt(self, prompt_id):
        """ Get information about a prompt. """
        return self.prompts[str(prompt_id)]

    def get_status(self):
        """ Get the system status. """
        status = {
            "current_prompt": self.current_prompt,
            "in_speech": self.decoder.get_in_speech(),
            "queue_length": self.prompt_queue.qsize(),
            "search": self.decoder.get_search()
        }
        return status

    def play_prompt(self, prompt):
        # Play the prompt's audio URL (preferred) or speak its message.
        # NOTE(review): played_time is stamped before playback begins.
        prompt['played_time'] = time.time()
        if prompt.get("message_url", None) is not None:
            # Delegate URL playback to mplayer via the pulse audio output.
            cmd = ["mplayer", "-ao", "pulse", prompt['message_url']]
            subprocess.call(cmd)
        elif prompt.get("message", None) is not None:
            self.speech.say(prompt['message'])
            self.speech.runAndWait()
        prompt['played'] = True

    def process_hypothesis(self, hypothesis):
        # Record a recognized phrase in the bounded history list.
        print("SPEECH {}".format(hypothesis.hypstr))
        phrase = {
            "search": self.decoder.get_search(),
            "time": time.time(),
            "text": hypothesis.hypstr
        }
        self.phrases.append(phrase)
        # Keep only the most recent max_history entries.
        del self.phrases[:-self.max_history]

    def run_next_prompt(self):
        # If nothing is queued, listen for the wake word indefinitely.
        if self.prompt_queue.empty():
            self.create_prompt(None, search="paradrop", timeout=None)

        self.current_prompt = self.prompt_queue.get_nowait()
        self.decoder.set_search(self.current_prompt['search'])

        # Pause capture while the prompt plays so we do not decode our
        # own playback audio.
        self.audio.stop_recording()
        self.play_prompt(self.current_prompt)
        self.audio.start_recording()

        self.current_prompt['search_started_time'] = time.time()
        self.current_prompt['search_started'] = True

    def detect_timeout(self):
        """ Check if the current prompt has timed out. """
        if self.current_prompt is None:
            # No active prompt to timeout.
            return False

        if self.decoder.get_in_speech():
            # Defer timeout if decoder reports that speech is in progress. A
            # person may be speaking the target phrase currently.
            return False

        if self.current_prompt['timeout'] is None:
            # If timeout is None, then only timeout when there is another item
            # in the queue.
            return not self.prompt_queue.empty()
        else:
            diff = time.time() - self.current_prompt['search_started_time']
            return diff >= self.current_prompt['timeout']

    def run(self):
        # Register the keyphrases that prompts may search for.
        self.decoder.set_keyphrase("activate", "activate")
        self.decoder.set_keyphrase("allow", "allow")
        self.decoder.set_keyphrase("enable", "enable")
        self.decoder.set_keyphrase("paradrop", "para drop")

        self.audio.start_recording()
        while True:
            if self.current_prompt is None:
                self.run_next_prompt()
                self.decoder.start_utt()

            self.audio.readinto(self.buffer)
            self.decoder.process_raw(self.buffer, False, False)

            # Speech -> silence transition: close the utterance and check
            # whether the keyphrase was heard.
            if self.in_speech and not self.decoder.get_in_speech():
                self.decoder.end_utt()
                hypothesis = self.decoder.hyp()
                if hypothesis is not None:
                    self.process_hypothesis(hypothesis)
                    self.current_prompt['detected'] = True
                    self.current_prompt['detected_time'] = time.time()
                    # Clearing current_prompt makes the next iteration
                    # start the next queued prompt.
                    self.current_prompt = None
                else:
                    # Nothing recognized; keep listening on a fresh utterance.
                    self.decoder.start_utt()

            if self.detect_timeout():
                self.decoder.end_utt()
                self.current_prompt['timed_out'] = True
                self.current_prompt = None

            # Remember the in-speech flag for next iteration's edge detect.
            self.in_speech = self.decoder.get_in_speech()
class SphinxWrapper(object):
    '''
    Thin wrapper around a pocketsphinx Decoder using FSG grammars.

    For audio stream feeding, use `process_raw(...)`; it also refreshes the
    VAD state so `isVoiceStarted()` / `isVoiceEnded()` can report
    silence/speech transitions.  Before signal is fed to the decoder it
    should be instructed that a new utterance is expected
    (`startListening()`).  When VAD says the speech segment ended, call
    `stopListening()`; only then may the hypothesis be requested via
    `calculateHypothesis()`.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None
    config = None
    # Previous/current VAD flags used for edge detection.
    previousVadState = 0
    currentVadState = 0

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where the sphinx decoder is initialized or its
        grammar updated.
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            # Fix: original called self.updateGrammar(self.decoder, pGramma),
            # a TypeError -- the method takes only the grammar name.
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create configuration with acoustic model path, grammar and dictionary.
        '''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update decoder language model from an fsg file.
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(
            os.path.join("../resource/", pGramma + '.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print("[updateGrammar]---")

    def startListening(self):
        """Instruct the decoder that a new utterance is expected."""
        self.decoder.start_utt(None)

    def stopListening(self):
        """Instruct the decoder that no more utterance data is expected."""
        self.decoder.end_utt()

    def process_raw(self, data):
        """Feed the decoder raw audio data, then refresh the VAD state."""
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()

    def calculateHypothesis(self):
        '''Return the decoder hypothesis (None if nothing recognized).'''
        return self.decoder.hyp()

    def calculateVadState(self):
        '''Return the current VAD state.

        Fix: the original returned the bound method object
        (`self.decoder.get_vad_state`) instead of calling it.
        '''
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        '''
        silence -> speech transition.
        '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        '''
        speech -> silence transition.
        '''
        return not self.currentVadState and self.previousVadState
class Wrapper():
    """ROS node wrapper around a pocketsphinx Decoder fed by an esiaf
    audio stream; publishes recognition results as SpeechInfo messages.
    """

    def __init__(self, **kwargs):
        # Allow Ctrl-C to break out of the surrounding loop via stop().
        signal.signal(signal.SIGINT, self.stop)
        model_path = get_model_path()

        # Expand environment variables in every string-valued option.
        kwargs = {
            key: os.path.expandvars(value) if isinstance(value, str) else value
            for key, value in kwargs.items()
        }

        nodename = kwargs.pop('nodename')
        grammar_file = kwargs.pop('grammar_file', None)
        grammar_rule = kwargs.pop('grammar_rule', None)
        grammar_name = kwargs.pop('grammar_name', None)
        # Consumed by the surrounding node, not by the decoder config.
        kwargs.pop('esiaf_input_topic')

        # Accept 'dic' as an alias for 'dict'.
        if kwargs.get('dic') is not None and kwargs.get('dict') is None:
            kwargs['dict'] = kwargs.pop('dic')

        # Fall back to the bundled en-us model files.
        if kwargs.get('hmm') is None:
            kwargs['hmm'] = os.path.join(model_path, 'en-us')
        if kwargs.get('lm') is None:
            kwargs['lm'] = os.path.join(model_path, 'en-us.lm.bin')
        if kwargs.get('dict') is None and kwargs.get('dic') is None:
            kwargs['dict'] = os.path.join(model_path, 'cmudict-en-us.dict')

        # Silence decoder logging unless verbose was requested.
        if kwargs.pop('verbose', False) is False:
            kwargs['logfn'] = 'nul' if sys.platform.startswith('win') else '/dev/null'

        config = Decoder.default_config()
        print(kwargs)
        for key, value in kwargs.items():
            # bool must be tested before int: bool is a subclass of int.
            if isinstance(value, bool):
                config.set_boolean('-{}'.format(key), value)
            elif isinstance(value, int):
                config.set_int('-{}'.format(key), value)
            elif isinstance(value, float):
                config.set_float('-{}'.format(key), value)
            elif isinstance(value, str):
                config.set_string('-{}'.format(key), value)
        self.decoder = Decoder(config)

        # Optional JSGF grammar: restrict the search to one grammar rule.
        if grammar_file and grammar_rule and grammar_name:
            jsgf = Jsgf(grammar_file)
            rule = jsgf.get_rule(grammar_name + '.' + grammar_rule)
            fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
            self.decoder.set_fsg(grammar_name, fsg)
            self.decoder.set_search(grammar_name)

        # Timestamps of the utterance currently being accumulated.
        self.start = None
        self.finish = None
        self.speech_publisher = rospy.Publisher(
            nodename + '/' + 'SpeechRec', SpeechInfo, queue_size=10)

    def stop(self, *args, **kwargs):
        """SIGINT handler: abort the enclosing loop.

        NOTE(review): raising StopIteration from a signal handler is unusual
        (PEP 479 makes it hostile inside generators); kept because callers
        are expected to catch this exact type.
        """
        raise StopIteration

    def hypothesis(self):
        """Return the decoder's best hypothesis string, or '' if none."""
        hyp = self.decoder.hyp()
        return hyp.hypstr if hyp else ''

    def vad_finished_callback(self):
        """Close the current utterance and publish the recognition result."""
        self.decoder.end_utt()
        # hypothesis() already yields '' when the decoder has no hypothesis,
        # so the original's extra hyp() pre-check was redundant.
        result = self.hypothesis()
        rospy.loginfo('understood: \'' + str(result) + '\'')
        hypo = SpeechHypothesis()
        hypo.recognizedSpeech = result
        hypo.probability = 1.0
        # Fix: local was named 'time', shadowing the time module.
        timestamps = RecordingTimeStamps()
        timestamps.start = self.start
        timestamps.finish = self.finish
        speechInfo = SpeechInfo()
        speechInfo.hypotheses = [hypo]
        speechInfo.duration = timestamps
        self.speech_publisher.publish(speechInfo)
        self.start = None
        self.finish = None

    def add_audio_data(self, audio_data, recording_timestamps):
        """Feed one audio chunk (plus its timestamps) into the decoder."""
        _recording_timestamps = RecordingTimeStamps()
        msg_from_string(_recording_timestamps, recording_timestamps)
        rospy.loginfo('got audio!')
        if not self.start:
            # First chunk of a new utterance: remember when it began.
            self.start = _recording_timestamps.start
            self.decoder.start_utt()
        self.finish = _recording_timestamps.finish
        # Fix: local was named 'bytearray', shadowing the builtin.
        raw = audio_data.tobytes()
        self.decoder.process_raw(raw, False, False)
class SphinxWrapper(object):
    '''
    Thin wrapper around a pocketsphinx Decoder using FSG grammars.

    For audio stream feeding, use `process_raw(...)`; it also refreshes the
    VAD state so `isVoiceStarted()` / `isVoiceEnded()` can report
    silence/speech transitions.  Before signal is fed to the decoder it
    should be instructed that a new utterance is expected
    (`startListening()`).  When VAD says the speech segment ended, call
    `stopListening()`; only then may the hypothesis be requested via
    `calculateHypothesis()`.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None
    config = None
    # Previous/current VAD flags used for edge detection.
    previousVadState = 0
    currentVadState = 0

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where the sphinx decoder is initialized or its
        grammar updated.
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            # Fix: original called self.updateGrammar(self.decoder, pGramma),
            # a TypeError -- the method takes only the grammar name.
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create configuration with acoustic model path, grammar and dictionary.
        '''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update decoder language model from an fsg file.
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(
            os.path.join("../resource/", pGramma + '.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print("[updateGrammar]---")

    def startListening(self):
        """Instruct the decoder that a new utterance is expected."""
        self.decoder.start_utt(None)

    def stopListening(self):
        """Instruct the decoder that no more utterance data is expected."""
        self.decoder.end_utt()

    def process_raw(self, data):
        """Feed the decoder raw audio data, then refresh the VAD state."""
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()

    def calculateHypothesis(self):
        '''Return the decoder hypothesis (None if nothing recognized).'''
        return self.decoder.hyp()

    def calculateVadState(self):
        '''Return the current VAD state.

        Fix: the original returned the bound method object
        (`self.decoder.get_vad_state`) instead of calling it.
        '''
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        '''
        silence -> speech transition.
        '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        '''
        speech -> silence transition.
        '''
        return not self.currentVadState and self.previousVadState