class SphinxDecoder:
    """Record a phrase from the microphone and decode it with PocketSphinx.

    The decoder is restricted to the JSGF grammar stored in
    ``<MODELDIR>/gr.gram`` (rule ``gr.rule``).  Audio is captured to
    ``media/temp.wav`` and converted to ``media/temp.raw`` before being fed
    to the decoder.
    """

    def __init__(self):
        """Build the decoder, compile the grammar and open the microphone."""
        self.MODELDIR = 'speech/'
        self.wav_name = 'media/temp.wav'
        self.raw_name = 'media/temp.raw'

        # Acoustic model and pronunciation dictionary
        # (presumably Russian, judging by the file names -- verify).
        config = Decoder.default_config()
        config.set_string('-hmm', self.MODELDIR + 'ru_ru/')
        config.set_string('-dict', self.MODELDIR + 'ru.dic')
        self.decoder = Decoder(config)

        # Compile the JSGF rule into a finite state grammar and make it the
        # active search.  NOTE(review): 7.5 is passed as the third argument of
        # build_fsg, presumably the language weight -- confirm.
        jsgf = Jsgf(self.MODELDIR + 'gr.gram')
        rule = jsgf.get_rule('gr.rule')
        fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
        fsg.writefile('gr.fsg')  # also dumps the compiled grammar into the CWD
        self.decoder.set_fsg('gr', fsg)
        self.decoder.set_search('gr')

        self.rec = Recognizer()
        self.mic = Microphone()

    def wav_to_raw(self):
        """Resample the recorded WAV to 16 kHz and export it as raw audio."""
        audio_file = AudioSegment.from_wav(self.wav_name)
        audio_file = audio_file.set_frame_rate(16000)
        audio_file.export(self.raw_name, format='raw')

    def record_audio(self):
        """Capture one phrase from the microphone into the WAV and RAW files.

        Plays ``media/beep.wav`` through ``aplay`` after the ambient noise
        calibration, right before listening starts.
        """
        with self.mic as source:
            self.rec.adjust_for_ambient_noise(source)
            system('aplay media/beep.wav')
            audio = self.rec.listen(source)
        with open(self.wav_name, 'wb') as new_audio:
            new_audio.write(audio.get_wav_data())
        self.wav_to_raw()

    def get_from_audio(self):
        """Record a phrase and return the recognized text.

        Returns:
            The hypothesis string, or ``None`` when the decoder produced no
            hypothesis.
        """
        self.record_audio()
        self.decoder.start_utt()
        try:
            # The context manager closes the file even when decoding fails.
            with open(self.raw_name, 'rb') as stream:
                for buf in iter(lambda: stream.read(1024), b''):
                    self.decoder.process_raw(buf, False, False)
        finally:
            # Always close the utterance so the decoder is not left in the
            # middle of one and stays usable for the next call.
            self.decoder.end_utt()

        # hyp() yields None when nothing was recognized; check that explicitly
        # instead of hiding every possible error behind a bare ``except``.
        hypothesis = self.decoder.hyp()
        if hypothesis is None:
            return None
        return hypothesis.hypstr
def _run_hook(script, args=(), wait=True):
    """Start a configured script (path relative to the CWD) with ``args``.

    When ``wait`` is true the call blocks until the script has finished;
    otherwise the script is left running in the background.
    """
    process = subprocess.Popen([os.path.join(os.getcwd(), script)] + list(args))
    if wait:
        process.communicate()
    return process


def _best_command(commands, hyp_str):
    """Score every command sentence against ``hyp_str``.

    Returns a ``(score, command)`` tuple for the highest score seen, or
    ``None`` when there was nothing to score.
    """
    score_dict = defaultdict(list)
    for command in commands:
        logging.info("- command: '%s':", command['name'])
        for sentence in command['sentence']:
            score = calc_similarity(command, sentence.lower(), hyp_str)
            score_dict[score].append(command)
            logging.debug(" - similarity: %d for sentence: %s", score, sentence)
            if score == 1000:
                # Only skips the remaining sentences of this command; the
                # following commands are still scored.
                logging.debug("... seems like found perfect match, ignoring the rest")
                break
    if not score_dict:
        return None
    best_score = max(score_dict)
    # Several commands may share the best score; the first one recorded wins.
    return best_score, score_dict[best_score][0]


def main():
    """
    Listen on the microphone, match the heard sentences against the
    configured invocations/commands and execute the associated scripts.

    The decoder starts in keyword search waiting for an invocation phrase.
    Once one is heard, it switches to the language model search and tries to
    match a command of that invocation, until a command is executed or
    ``interaction_timeout`` seconds have passed since the invocation.
    """
    notifier = sdnotify.SystemdNotifier()

    # Load config first
    with open(os.path.join(os.getcwd(), 'config.yaml'), 'r') as config_file:
        # safe_load: plain yaml.load() without a Loader can construct
        # arbitrary objects and is rejected by recent PyYAML releases.
        config = yaml.safe_load(config_file)
    interaction_timeout = int(config['interaction_timeout'])

    # Create Decoder config
    pocketsphinx_config = Decoder.default_config()
    pocketsphinx_config.set_string('-hmm', os.path.join(os.getcwd(), config['hmm_path']))
    pocketsphinx_config.set_string('-dict', os.path.join(os.getcwd(), config['dict_path']))
    pocketsphinx_config.set_string('-featparams', os.path.join(os.getcwd(), config['feat_params_path']))
    pocketsphinx_config.set_boolean("-allphone_ci", True)
    # The keyword list and the language model are attached to the decoder
    # below through decoder.set_kws & decoder.set_lm_file.

    # Load invocations and commands
    invocations = config['invocations']

    # Initialize audio
    audio = pyaudio.PyAudio()
    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                        input=True, frames_per_buffer=1024)
    try:
        stream.start_stream()

        # Process audio chunk by chunk. On keyword detected perform action
        # and restart search
        decoder = Decoder(pocketsphinx_config)
        logmath = decoder.get_logmath()
        decoder.set_kws('keyword', os.path.join(os.getcwd(), config['invocation_path']))
        decoder.set_lm_file('lm', os.path.join(os.getcwd(), config['lm_path']))

        invocation_ctx = None
        in_speech_bf = False

        # Run some initialization scripts for terminal displays
        _run_hook(config['init_exec'])

        decoder.set_search('keyword')
        decoder.start_utt()
        notifier.notify("READY=1")

        interaction_time = None

        while True:
            notifier.notify("WATCHDOG=1")
            buf = stream.read(1024, exception_on_overflow=False)
            if not buf:
                logging.error("Unable to get audio, exiting")
                break
            decoder.process_raw(buf, False, False)

            hyp = decoder.hyp()
            hyp_str = hyp.hypstr.lower().strip() if hyp else None
            now_in_speech = decoder.get_in_speech()

            if now_in_speech != in_speech_bf:
                in_speech_bf = now_in_speech
                if not in_speech_bf:
                    # Speech -> silence transition: the utterance is over.
                    decoder.end_utt()
                    if hyp_str:
                        # Confidence is a float, so it must not be printed
                        # with %d (which would truncate it to an integer).
                        logging.info(
                            "Heard: '%s' while being in '%s' context "
                            "(score: %d, confidence: %.4f -> in log scale %d)",
                            hyp_str, invocation_ctx, hyp.best_score,
                            logmath.exp(hyp.prob), hyp.prob)

                        if not invocation_ctx:
                            if hyp_str in invocations:
                                logging.info("Matched invocation: '%s'", hyp_str)
                                invocation_ctx = hyp_str
                                context = invocations[invocation_ctx]
                                _run_hook(context['enter'],
                                          [context['voice_params'], invocation_ctx, hyp_str])
                                interaction_time = time.time()
                                decoder.set_search('lm')
                            else:
                                logging.debug('Unknown invocation or wrongly heard, silently ignoring')
                        else:
                            context = invocations[invocation_ctx]
                            hook_args = [context['voice_params'], invocation_ctx, hyp_str]
                            best = _best_command(context['commands'], hyp_str)
                            if best is not None and best[0] > 90:
                                command = best[1]
                                logging.info("The best matching command is '%s', executing: %s",
                                             command['name'], command['exec'])
                                _run_hook(context['ack'], hook_args)
                                _run_hook(command['exec'],
                                          [context['voice_params'], invocation_ctx, command['name']])
                                # The exit script is not waited for.
                                _run_hook(context['exit'], hook_args, wait=False)
                                invocation_ctx = None
                                decoder.set_search('keyword')
                            else:
                                logging.info("... not matched, ignoring")
                                _run_hook(context['noop'], hook_args)
                    # Start listening for the next utterance.
                    decoder.start_utt()

            if invocation_ctx and interaction_time and time.time() > interaction_time + interaction_timeout:
                logging.info("The invocation context has just timed out, returning to listen for invocation word.")
                context = invocations[invocation_ctx]
                # The exit script is not waited for.
                _run_hook(context['exit'], [context['voice_params'], invocation_ctx], wait=False)
                invocation_ctx = None
                interaction_time = None
                decoder.end_utt()
                decoder.set_search('keyword')
                decoder.start_utt()
    finally:
        # Release the audio device however the loop ends.
        stream.stop_stream()
        stream.close()
        audio.terminate()