class SphinxWrapper(object):
    '''
    Thin wrapper around a pocketsphinx ``Decoder`` that also tracks
    voice-activity (VAD) state transitions.

    Protocol: call ``prepareDecoder(...)`` once (or again to switch
    grammar), then ``startListening()`` before feeding audio chunks with
    ``process_raw(...)``.  When ``isVoiceEnded()`` reports a
    speech -> silence transition, call ``stopListening()`` and fetch the
    result with ``calculateHypothesis()``.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None          # pocketsphinx Decoder, created lazily in prepareDecoder
    config = None           # Decoder configuration built by createConfig
    previousVadState = 0    # VAD state observed for the previous audio chunk
    currentVadState = 0     # VAD state observed for the latest audio chunk

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where the sphinx decoder is initialized or its
        grammar updated.

        :param pGramma: grammar name (basename of an .fsg file under ../resource/)
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            # BUG FIX: updateGrammar(self, pGramma) takes only the grammar
            # name; the original passed self.decoder as an extra positional
            # argument, which raised TypeError on every grammar switch.
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create a decoder configuration with acoustic model path, grammar
        and dictionary.

        :param pGramma: grammar name used to locate ../resource/<name>.fsg
        :return: a pocketsphinx config object
        '''
        print ("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma+'.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print ("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update the decoder language model from an fsg file.

        :param pGramma: grammar name used to locate ../resource/<name>.fsg
        '''
        print ("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma+'.fsg'),
                                  logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print ("[updateGrammar]---")

    def startListening(self):
        """ Instruct the decoder that a new utterance should be expected """
        self.decoder.start_utt(None)

    def stopListening(self):
        """ Instruct the decoder that no further audio for this utterance
        is expected; only after this may a hypothesis be requested """
        self.decoder.end_utt()

    def process_raw(self, data):
        """ Feed the decoder raw audio data, then refresh the VAD state
        pair used by isVoiceStarted()/isVoiceEnded() """
        #print("process_raw...\n")
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()
        #print("process_raw", self.currentVadState and True, self.previousVadState and True)

    def calculateHypothesis(self):
        """ Return the decoder's hypothesis for the last utterance (or None) """
        return self.decoder.hyp()

    def calculateVadState(self):
        """ Return the decoder's current VAD state """
        # BUG FIX: the original returned the bound method object
        # (missing call parentheses) instead of the VAD value.
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        ''' silence -> speech transition '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        ''' speech -> silence transition '''
        return not self.currentVadState and self.previousVadState
class ContinuousPocketsphinx(object):
    '''
    Continuously records microphone audio, feeds it to a pocketsphinx
    ``Decoder``, forwards recognized utterances to the
    ``Artificialintelligence`` dialogue engine and speaks its responses
    through an external TTS binary.
    '''

    CHUNK = 4096            # audio frames per stream.read() call
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000            # Hz; must match the acoustic model

    #MODELDIR = "../models"
    MODELDIR = "/home/mgreibus/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"

    decoder = None          # pocketsphinx Decoder
    stream = None           # pyaudio input stream
    config = None           # decoder configuration
    ai = None               # Artificialintelligence dialogue engine

    def __init__(self):
        '''
        Constructor: create the AI engine, the decoder with the initial
        "code" grammar, and open the microphone input stream.
        '''
        print ("[__init__]+++")
        # Create a decoder with certain model
        self.ai = Artificialintelligence()
        self.config = self.createConfig("code")
        self.decoder = Decoder(self.config)
        print ("[__init__] created decoder")
        #self.updateGrammar(self.decoder, "confirmation");
        print ("[__init__]---")
        # BUG FIX: keep the PyAudio instance on self so run() can
        # terminate it; the original discarded it, leaking the audio
        # device handle.
        self.audio = pyaudio.PyAudio()
        self.stream = self.audio.open(format=self.FORMAT,
                                      channels=self.CHANNELS,
                                      rate=self.RATE,
                                      input=True,
                                      frames_per_buffer=self.CHUNK)
        #Indicate listening for next utterance
        print ("READY....")

    def updateGrammar(self, pDecoder, pGramma):
        '''
        Update the decoder language model from an fsg file.

        :param pDecoder: decoder instance to update
        :param pGramma: grammar name used to locate ../resource/<name>.fsg
        '''
        print ("[updateGrammar]+++" + pGramma)
        logmath = pDecoder.get_logmath()
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma+'.fsg'),
                                  logmath, 7.5)
        #pDecoder.readfile(os.path.join("../resource/", pGramma+'.fsg'), logmath)
        pDecoder.set_fsg("default", fsg)
        pDecoder.set_search("default")
        print ("[updateGrammar]---")

    def createConfig(self, pGramma):
        '''
        Create a decoder configuration with acoustic model path, grammar
        and dictionary.
        '''
        print ("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/liepa.cd_semi_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma+'.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print ("[createConfig]---")
        return config

    def speak(self, text):
        '''
        Synthesize *text* with the external TTS binary; blocks until
        playback finishes.  A None text is silently skipped.
        '''
        print("Speak: ", text)
        if text is not None:
            aProcess = subprocess.Popen(['/home/mgreibus/bin/tark-win-lt', text],
                                        stderr=subprocess.STDOUT)
            out = aProcess.communicate()[0]
            time.sleep (0.100)
            print("ended Speak: ", out)

    def said(self, aiContext, text):
        '''
        Forward recognized *text* to the AI engine, speak its response
        and, while the current dialogue step is non-interactive, keep
        advancing the dialogue.
        '''
        print ("[said]+++", text)
        aiContext = self.ai.said(text, aiContext)
        print ('AI response: ', aiContext.state, aiContext.response)
        self.speak(aiContext.response)
        if aiContext.interactiveStep is False :
            # NOTE(review): the recursive call's return value is discarded,
            # matching the original; presumably ai.said mutates the context
            # in place — confirm before changing.
            self.said(aiContext, text)
        print ("[said]---")
        return aiContext

    def recognized(self, pStream, pDecoder, aiContext):
        '''
        Handle one finished utterance: pause the stream, fetch the
        hypothesis, drive the AI dialogue, then re-arm decoder and stream.
        '''
        print ("[recognized]+++")
        pStream.stop_stream()
        pDecoder.end_utt()
        # Retrieve hypothesis.
        hypothesis = pDecoder.hyp()
        if hypothesis is not None:
            print ('Best hypothesis: ', hypothesis.uttid, hypothesis.best_score,
                   hypothesis.hypstr)
            self.said(aiContext, hypothesis.hypstr.decode('utf-8'))
            # Switch grammar when the new dialogue state has one registered.
            if aiContext.state in aiContext.GRAM:
                self.updateGrammar(pDecoder, aiContext.GRAM[aiContext.state])
        elif (time.time() - aiContext.stateStarted) > 10:
            # Nothing recognized for 10 s: re-prompt the user.
            self.speak(aiContext.response)
            aiContext.stateStarted = time.time()
        print ("Time: ", (time.time() - aiContext.stateStarted))
        print("AI response ", aiContext.response)
        time.sleep (0.100)
        #Indicate listening for next utterance
        pStream.start_stream()
        pDecoder.start_utt(None)
        print ("READY....")
        print ("[recognized]---")
        return aiContext

    def run(self):
        '''
        Executor: main record/recognize loop.  Exits when the dialogue
        reaches STATE_THANKS, releasing the audio resources.
        '''
        print("* start recording")
        self.decoder.start_utt(None)
        cur_vad_state = 0
        aiContext = self.ai.createContext()
        self.said(aiContext, None)
        try:
            while True:
                data = self.stream.read(self.CHUNK)
                time.sleep (0.100)
                #frames.append(data)
                self.decoder.process_raw(data, False, False)
                vad_state = self.decoder.get_vad_state()
                if vad_state and not cur_vad_state:
                    #silence -> speech transition,
                    #let user know that we heard
                    print("Listening...\n")
                if not vad_state and cur_vad_state:
                    #speech -> silence transition,
                    #time to start new utterance
                    aiContext = self.recognized(self.stream, self.decoder, aiContext)
                    if aiContext.state == aiContext.STATE_THANKS:
                        break
                cur_vad_state = vad_state
        finally:
            # BUG FIX: the original leaked the stream and PyAudio handle
            # on exit; release the audio device deterministically.
            self.stream.stop_stream()
            self.stream.close()
            self.audio.terminate()
class SphinxWrapper(object):
    '''
    Wrapper around a pocketsphinx ``Decoder`` tracking voice-activity
    (VAD) state transitions.

    Protocol: ``prepareDecoder(...)``, then ``startListening()`` before
    feeding audio via ``process_raw(...)``.  Once ``isVoiceEnded()``
    signals a speech -> silence transition, call ``stopListening()`` and
    read the result from ``calculateHypothesis()``.

    NOTE(review): this class duplicates an earlier ``SphinxWrapper``
    definition; if both live in the same module, this later one shadows
    the first — confirm and remove one of them.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None          # pocketsphinx Decoder, created lazily in prepareDecoder
    config = None           # Decoder configuration built by createConfig
    previousVadState = 0    # VAD state for the previous audio chunk
    currentVadState = 0     # VAD state for the latest audio chunk

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where the sphinx decoder is initialized or its
        grammar updated.

        :param pGramma: grammar name (basename of an .fsg file under ../resource/)
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            # BUG FIX: updateGrammar(self, pGramma) takes only the grammar
            # name; passing self.decoder as an extra positional argument
            # raised TypeError on every grammar switch.
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create a decoder configuration with acoustic model path, grammar
        and dictionary.

        :param pGramma: grammar name used to locate ../resource/<name>.fsg
        :return: a pocketsphinx config object
        '''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update the decoder language model from an fsg file.

        :param pGramma: grammar name used to locate ../resource/<name>.fsg
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(
            os.path.join("../resource/", pGramma + '.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print("[updateGrammar]---")

    def startListening(self):
        """ Instruct the decoder that a new utterance should be expected """
        self.decoder.start_utt(None)

    def stopListening(self):
        """ Instruct the decoder that no further audio for this utterance
        is expected; only after this may a hypothesis be requested """
        self.decoder.end_utt()

    def process_raw(self, data):
        """ Feed the decoder raw audio data, then refresh the VAD state
        pair used by isVoiceStarted()/isVoiceEnded() """
        #print("process_raw...\n")
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()
        #print("process_raw", self.currentVadState and True, self.previousVadState and True)

    def calculateHypothesis(self):
        """ Return the decoder's hypothesis for the last utterance (or None) """
        return self.decoder.hyp()

    def calculateVadState(self):
        """ Return the decoder's current VAD state """
        # BUG FIX: the original returned the bound method object
        # (missing call parentheses) instead of the VAD value.
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        ''' silence -> speech transition '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        ''' speech -> silence transition '''
        return not self.currentVadState and self.previousVadState