def start_listening(self):
    """Stream microphone audio through PocketSphinx and collect phrases.

    Builds a decoder from ``self.model_dir``, ``self.hmm``, ``self.lm``,
    ``self.dictionary`` and ``self.logfn``, opens a 16 kHz mono input stream
    on ``self.input_source_index`` and acquires ``self.wait_to_resume_lock``
    before entering the decode loop (presumably this is what makes the
    method wait for ``self.resume`` -- confirm against the lock's owner).

    The loop runs while ``self.is_running`` is true and idles while
    ``self.paused`` is true.  Each non-empty hypothesis available at the end
    of an utterance is appended to ``self.all_speech_data``.
    """
    import time  # local import: only needed for the pause back-off below

    config = Decoder.default_config()
    config.set_string('-hmm', path.join(self.model_dir, self.hmm))
    config.set_string('-lm', path.join(self.model_dir, self.lm))
    config.set_string('-dict', path.join(self.model_dir, self.dictionary))
    config.set_string('-logfn', self.logfn)

    # This takes a while
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    print(self.input_source_index)
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    input_device_index=self.input_source_index,
                    frames_per_buffer=1024)
    stream.start_stream()

    try:
        in_speech_bf = False
        decoder.start_utt()
        self.wait_to_resume_lock.acquire()

        while self.is_running:
            # Sleep instead of spinning so a paused listener does not burn a
            # CPU core, and keep honouring a stop request while paused.
            while self.paused and self.is_running:
                time.sleep(0.01)
            if not self.is_running:
                break

            buf = stream.read(1024, exception_on_overflow=False)
            if not buf:
                break

            decoder.process_raw(buf, False, False)
            in_speech = decoder.get_in_speech()
            if in_speech == in_speech_bf:
                continue
            in_speech_bf = in_speech
            if in_speech:
                continue

            # Speech -> silence: close the utterance and harvest the result.
            decoder.end_utt()
            hypothesis = decoder.hyp()
            # hyp() yields None when nothing was recognised.
            if hypothesis is not None and hypothesis.hypstr != "":
                self.all_speech_data.append(hypothesis.hypstr)
            decoder.start_utt()

        decoder.end_utt()
    finally:
        # Always give the audio device back, even if decoding raised.
        stream.stop_stream()
        stream.close()
        p.terminate()
def main():
    """Continuously transcribe microphone audio and print each utterance.

    Models are loaded from ``../model`` relative to this file using the
    module-level ``HMM``, ``LM`` and ``DIC`` names.  A ``.`` is written to
    stdout for every buffer the decoder classifies as speech, and the
    hypothesis is printed whenever speech turns into silence.  The loop
    ends when the stream returns an empty buffer.
    """
    base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
    model_dir = os.path.join(base_dir, 'model')

    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_dir, HMM))
    config.set_string('-lm', os.path.join(model_dir, LM))
    config.set_string('-dict', os.path.join(model_dir, DIC))
    # os.devnull is '/dev/null' on POSIX and the equivalent sink elsewhere.
    config.set_string('-logfn', os.devnull)
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=BUFFER)
    stream.start_stream()

    try:
        in_speech_bf = True
        decoder.start_utt()
        while True:
            buf = stream.read(BUFFER)
            if not buf:
                break

            decoder.process_raw(buf, False, False)
            in_speech = decoder.get_in_speech()
            if in_speech:
                sys.stdout.write('.')
                sys.stdout.flush()
            if in_speech == in_speech_bf:
                continue
            in_speech_bf = in_speech
            if in_speech_bf:
                continue

            # Speech -> silence: finish the utterance and report it.
            decoder.end_utt()
            hypothesis = decoder.hyp()
            # hyp() yields None when nothing was recognised.
            if hypothesis is not None and hypothesis.hypstr != '':
                print('You said:', hypothesis.hypstr)
            decoder.start_utt()

        decoder.end_utt()
        hypothesis = decoder.hyp()
        print('An Error occured:',
              hypothesis.hypstr if hypothesis is not None else '')
    finally:
        # Release the audio device on every exit path.
        stream.stop_stream()
        stream.close()
        p.terminate()
def main():
    """Transcribe microphone audio and publish each result to Redis.

    Configuration is loaded via ``load_config`` for the environment named
    by the ``ENVIRONMENT`` variable (default ``"dev"``).  Every hypothesis
    produced when speech turns into silence is published on the
    ``subsystem.listener.recording`` channel.  Any exception is logged, and
    the audio stream and the Redis client are always released.
    """
    environment: str = os.getenv("ENVIRONMENT", "dev")
    config: Dict = load_config(environment)
    initialize_logger(level=config["logging"]["level"],
                      filename=config["logging"]["filename"])

    redis_host = config["redis"]["host"]
    redis_port = config["redis"]["port"]
    logger.debug(f"Connecting to redis at {redis_host}:{redis_port}")
    redis_client: Redis = Redis(host=redis_host, port=redis_port, db=0)

    logger.debug("Initializing PyAudio interface")
    audio = pyaudio.PyAudio()
    microphone_index = get_microphone_index(audio, config["microphone"]["name"])
    logger.debug(
        f"Using microphone device '{config['microphone']['name']}' (card index {microphone_index})"
    )

    logger.debug(
        f"Intializing pocketsphinx Decoder using model dir {MODELDIR}")
    decoder_config: DecoderConfig = Decoder.default_config()
    decoder_config.set_string("-hmm", os.path.join(MODELDIR, "en-us/en-us"))
    decoder_config.set_string("-lm",
                              os.path.join(MODELDIR, "en-us/en-us.lm.bin"))
    decoder_config.set_string(
        "-dict", os.path.join(MODELDIR, "en-us/cmudict-en-us.dict"))
    decoder = Decoder(decoder_config)

    logger.debug("Opening audio stream")
    # NOTE(review): the stream is opened at 44100 Hz but no '-samprate' is
    # set on the decoder config -- confirm the model expects this rate.
    stream = audio.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=44100,
                        input=True,
                        frames_per_buffer=2048,
                        input_device_index=microphone_index)
    stream.start_stream()

    in_speech_bf = False
    decoder.start_utt()
    try:
        logger.debug("Starting decoder loop")
        while True:
            buf = stream.read(2048)
            if not buf:
                logger.debug("Buffer closed. Ending")
                break

            logger.debug("Decoding raw audio")
            decoder.process_raw(buf, False, False)
            in_speech = decoder.get_in_speech()
            if in_speech == in_speech_bf:
                continue

            logger.debug("GOT HERE")
            in_speech_bf = in_speech
            if in_speech_bf:
                continue

            # Speech -> silence: finish the utterance and publish it.
            decoder.end_utt()
            hypothesis = decoder.hyp()
            # hyp() yields None when nothing was recognised; skipping it
            # keeps a silent utterance from terminating the listener.
            if hypothesis is not None:
                transcription = hypothesis.hypstr
                logger.debug(f"Result: {transcription}")
                redis_client.publish("subsystem.listener.recording",
                                     transcription)
            decoder.start_utt()
        decoder.end_utt()
    except Exception:
        logger.exception("Something bad happened")
    finally:
        try:
            stream.stop_stream()
            stream.close()
            audio.terminate()
        finally:
            redis_client.close()
class AvaRecognizer(object):
    """Class to add ASR recognition functionality using language model + dictionary

    Publishes recognition output to recognizer/asr_output.
    """

    def __init__(self):
        """Read the ``~dict``/``~lm``/``~hmm`` params and start recognising.

        Returns early (after logging an error) when a required parameter is
        missing.  Otherwise starts the audio capture thread and then calls
        :meth:`start_recognizer`, which ends in ``rospy.spin()``.
        """
        # Initializing publisher with buffer size of 10 messages
        self.pub_ = rospy.Publisher("recognizer/asr_output", String,
                                    queue_size=10)
        # initialize node
        rospy.init_node("ava_recognizer")
        # Call custom function on node shutdown
        rospy.on_shutdown(self.shutdown)

        # Params
        _lm_param = "~lm"      # File containing language model
        _dict_param = "~dict"  # Dictionary
        _hmm_param = "~hmm"    # HMM Model

        # used in process_audio for piecing full utterances
        self.in_speech_bf = False

        # Setting param values
        if rospy.has_param(
                _dict_param) and rospy.get_param(_dict_param) != ":default":
            self.dict = rospy.get_param(_dict_param)
        else:
            rospy.logerr(
                "No dictionary found. Please add an appropriate dictionary argument."
            )
            return

        if rospy.has_param(
                _lm_param) and rospy.get_param(_lm_param) != ':default':
            self._use_lm = 1
            self.class_lm = rospy.get_param(_lm_param)
        else:
            rospy.logerr("No lm found. Please add an appropriate lm argument.")
            return

        if rospy.has_param(_hmm_param):
            self.hmm = rospy.get_param(_hmm_param)
            if self.hmm == ":default":
                default_model_dir = (
                    "/home/team5/.local/lib/python2.7/site-packages/pocketsphinx/model"
                )
                if os.path.isdir(default_model_dir):
                    rospy.loginfo("Loading the default acoustic model")
                    self.hmm = default_model_dir + "/en-us"
                    rospy.loginfo("Done loading the default acoustic model")
                else:
                    rospy.logerr("Failed to find default model.")
                    return
        else:
            # ~hmm is the acoustic model, not the language model.
            rospy.logerr(
                "No acoustic model specified. Couldn't find default model.")
            return

        # All params satisfied. Starting recognizer and audio thread
        self._audio_queue = Queue.Queue()
        self._kill_audio = False
        # NOTE(review): capture starts before the subscriber in
        # start_recognizer exists, so early chunks queue up without a
        # matching callback -- confirm whether that backlog is acceptable.
        audio_thread = threading.Thread(target=self.get_audio)
        # Daemon thread: nothing ever sets _kill_audio, so a regular thread
        # would keep the process alive after the node shuts down.
        audio_thread.daemon = True
        audio_thread.start()
        self.start_recognizer()

    def start_recognizer(self):
        """Function to handle lm or grammar processing of audio."""
        config = Decoder.default_config()
        rospy.loginfo("Done initializing pocketsphinx")

        # Setting configuration of decoder using provided params
        config.set_string('-dict', self.dict)
        config.set_string('-lm', self.class_lm)
        config.set_string('-hmm', self.hmm)
        self.decoder = Decoder(config)

        # Start processing input audio
        self.decoder.start_utt()
        rospy.loginfo("Decoder started successfully")

        # Subscribe to audio topic
        rospy.Subscriber("recognizer/audio_ready", Bool, self.process_audio)
        rospy.spin()

    def process_audio(self, isready):
        """Decode one queued audio chunk; publish when an utterance ends.

        ``isready`` is the message received on ``recognizer/audio_ready``;
        each message corresponds to one chunk placed on the audio queue by
        :meth:`get_audio`.
        """
        # Check if input audio has ended
        assert (isready)
        data = self._audio_queue.get()
        self.decoder.process_raw(data, False, False)

        in_speech = self.decoder.get_in_speech()
        if in_speech == self.in_speech_bf:
            return
        self.in_speech_bf = in_speech
        if in_speech:
            return

        # Speech -> silence: close the utterance and publish the result.
        self.decoder.end_utt()
        hypothesis = self.decoder.hyp()
        if hypothesis is not None:
            rospy.loginfo('OUTPUT: \"' + hypothesis.hypstr + '\"')
            self.pub_.publish(hypothesis.hypstr)
        self.decoder.start_utt()

    @staticmethod
    def shutdown():
        """This function is executed on node shutdown."""
        # command executed after Ctrl+C is pressed
        rospy.loginfo("Stop AvaRecognizer")
        rospy.sleep(1)

    def get_audio(self):
        """Used for audio parsing thread.

        Reads 16 kHz mono S16_LE periods from the ALSA capture device, puts
        each one on the audio queue and publishes ``True`` on
        ``recognizer/audio_ready``.  Runs until ``self._kill_audio`` is set.
        """
        # parameters for PCM. view PCMs with 'pactl list sources short'.
        # don't modify me plz.
        device = 'sysdefault:CARD=Audio'
        inp = alsaaudio.PCM(type=alsaaudio.PCM_CAPTURE,
                            mode=alsaaudio.PCM_NORMAL,
                            card=device)
        inp.setchannels(1)
        inp.setrate(16000)
        inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
        inp.setperiodsize(1024)

        pub = rospy.Publisher('recognizer/audio_ready', Bool, queue_size=10)
        while not self._kill_audio:
            length, data = inp.read()
            # A non-positive length signals an empty read or an overrun;
            # there is no audio to hand to the decoder in that case.
            if length <= 0:
                continue
            self._audio_queue.put(data)
            pub.publish(True)
        return
class Words(Chain):
    """Chain step that runs PocketSphinx over a sample's segments file.

    For every sample the recognised hypotheses and per-word segment timings
    are written to a ``*_words_result.json`` file next to the sample output.
    """

    allow_sample_layer_concurrency = True
    abstract_class = False
    requirements = [Preprocess]

    def __init__(self):
        super(Words, self).__init__()
        self._subject_words = {}
        self.decoder = None

    def dataset_preprocess(self, dataset):
        """Forget the per-subject word lists of a previous dataset."""
        self._subject_words.clear()

    def subject_preprocess(self, subject, samples, common_subject_settings):
        """Start an empty word list for *subject*."""
        self._subject_words[subject] = []

    @staticmethod
    def sample_result_filename(out_sample_path):
        """Return the result path: last five characters replaced by the suffix."""
        return f'{out_sample_path[:-5]}_words_result.json'

    def _compute_words(self, segments_path, words_result_path):
        """Recognise words in *segments_path* and store them as JSON.

        :param segments_path: path of the raw audio file fed to the decoder
        :param words_result_path: path of the JSON result file to write
        :return: None; the result is written to *words_result_path* and
            logged at debug level
        """
        settings = self.process_settings
        model_dir = settings.get('model_dir', MODEL_DIR)
        decoder_hmm = settings.get('decoder_hmm', 'en-us/en-us')
        decoder_lm = settings.get('decoder_lm', 'en-us/en-us.lm.bin')
        decoder_dict = settings.get('decoder_dict', 'en-us/cmudict-en-us.dict')
        decoder_lw = settings.get('decoder_lw', 2.0)
        decoder_pip = settings.get('decoder_pip', 0.3)
        decoder_beam = settings.get('decoder_beam', 1e-200)
        decoder_pbeam = settings.get('decoder_pbeam', 1e-20)
        decoder_mmap = settings.get('decoder_mmap', False)
        decoder_stream_buf_size = settings.get('decoder_stream_buf_size', 8192)
        pprint_indent = settings.get('pprint_indent', 4)

        hypothesis_schema = PocketsphinxHypothesisSchema()
        segment_schema = PocketsphinxSegmentSchema()

        def _finish_utterance(decoder):
            """End the utterance; return (hypothesis or None, segments)."""
            decoder.end_utt()
            segments = [
                segment_schema.dump(dict(word=seg.word,
                                         start=seg.start_frame / 100,
                                         end=seg.end_frame / 100,
                                         prob=seg.prob))
                for seg in decoder.seg()
            ]
            hyp = decoder.hyp()
            # hyp() yields None when nothing was recognised.
            if hyp is None:
                return None, segments
            hyp_result = hypothesis_schema.dump(
                dict(best_score=hyp.best_score,
                     hypstr=hyp.hypstr,
                     prob=hyp.prob))
            return hyp_result, segments

        @check_if_already_done(words_result_path)
        def recognize_words(segments_path, words_result_path):
            # Create a decoder with certain model
            config = Decoder.default_config()
            config.set_string('-hmm', join(model_dir, decoder_hmm))
            config.set_string('-lm', join(model_dir, decoder_lm))
            config.set_string('-dict', join(model_dir, decoder_dict))
            config.set_float('-lw', decoder_lw)
            config.set_float('-pip', decoder_pip)
            config.set_float('-beam', decoder_beam)
            config.set_float('-pbeam', decoder_pbeam)
            config.set_boolean('-mmap', decoder_mmap)

            hyps = []
            segs = []

            # Work on a local decoder: the class allows concurrent
            # sample-layer calls, and a decoder reached only through
            # self.decoder could be replaced mid-utterance by another call
            # (NOTE(review): confirm how that concurrency is implemented).
            decoder = Decoder(config)
            self.decoder = decoder  # attribute kept for existing readers

            def _collect():
                hyp_result, segment = _finish_utterance(decoder)
                segs.extend(segment)
                if hyp_result is not None:
                    hyps.append(hyp_result)

            with open(segments_path, 'rb') as stream:
                in_speech_buffer = False
                decoder.start_utt()
                while True:
                    buf = stream.read(decoder_stream_buf_size)
                    if not buf:
                        # End of file while still in speech: flush it.
                        if in_speech_buffer:
                            _collect()
                        break
                    decoder.process_raw(buf, False, False)
                    in_speech = decoder.get_in_speech()
                    if in_speech != in_speech_buffer:
                        in_speech_buffer = in_speech
                        if not in_speech_buffer:
                            _collect()
                            decoder.start_utt()

            words_dict = dict(hypotheses=hyps, segment_info=segs)
            words_result = DecoderOutputSchema().dumps(words_dict)
            with open(words_result_path, 'w') as f:
                f.write(words_result)

        recognize_words(segments_path, words_result_path)

        with open(words_result_path, 'r') as f:
            logger.debug(f'words_result_path: {words_result_path}')
            json_file = json.load(f)
            result = DecoderOutputSchema().load(json_file)
            logger.debug(json.dumps(result, indent=pprint_indent))

    def sample_layer(self, subject, sample_json_filename, sample_settings):
        """Resolve the sample's segments file and run word recognition on it."""
        url = sample_settings.get('url')
        datatype = sample_settings.get('datatype')

        output_path_pattern = join(self.results_dir, subject,
                                   sample_json_filename)
        words_result_file = self.sample_result_filename(output_path_pattern)
        logger.info(f'words result file: {words_result_file}')

        audio_path = resolve_audio_path(url, datatype, output_path_pattern)
        _, segments_path = audio_and_segment_paths(audio_path, False)
        self._compute_words(segments_path, words_result_file)
def _cwd_path(relative):
    """Resolve a configured relative path against the current working directory."""
    return os.path.join(os.getcwd(), relative)


def _run_script(script, args=(), wait=True):
    """Launch *script* (relative to the CWD) with *args*; optionally wait for it."""
    process = subprocess.Popen([_cwd_path(script)] + list(args))
    if wait:
        process.communicate()


def _best_command(commands, hyp_str):
    """Return the command whose sentences score above 90 for *hyp_str*, else None."""
    score_dict = defaultdict(list)
    for command in commands:
        logging.info("- command: '%s':" % command['name'])
        for sentence in command['sentence']:
            score = calc_similarity(command, sentence.lower(), hyp_str)
            score_dict[score].append(command)
            logging.debug(" - similarity: %d for sentence: %s" % (score, sentence))
            if score == 1000:
                logging.debug("... seems like found perfect match, ignoring the rest")
                break
    if not score_dict:
        return None
    best_score = max(score_dict)
    if best_score > 90:
        return score_dict[best_score][0]  # here might be some randomness
    return None


def main():
    """
    A main method to that does a simple matching of sentences and executes scripts

    Reads ``config.yaml`` from the current working directory, listens on the
    default microphone and alternates between two decoder searches:
    ``keyword`` (waiting for an invocation) and ``lm`` (matching a command
    of the active invocation).  Scripts named in the configuration are run
    on enter/ack/exec/exit/noop, and the invocation context is dropped after
    ``interaction_timeout`` seconds.  systemd is notified with ``READY=1``
    once and ``WATCHDOG=1`` on every audio chunk.
    """
    notifier = sdnotify.SystemdNotifier()

    # Load config first.  safe_load: yaml.load() without an explicit Loader
    # is unsafe and is rejected by current PyYAML releases.
    with open(_cwd_path('config.yaml'), 'r') as config_file:
        config = yaml.safe_load(config_file)
    interaction_timeout = int(config['interaction_timeout'])

    # Create Decoder config
    pocketsphinx_config = Decoder.default_config()
    pocketsphinx_config.set_string('-hmm', _cwd_path(config['hmm_path']))
    pocketsphinx_config.set_string('-dict', _cwd_path(config['dict_path']))
    pocketsphinx_config.set_string('-featparams', _cwd_path(config['feat_params_path']))
    pocketsphinx_config.set_boolean("-allphone_ci", True)
    # The searches are registered below with decoder.set_kws and
    # decoder.set_lm_file rather than through '-lm' / '-kws' here.

    # Initialize audio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)
    stream.start_stream()

    try:
        # Load invocations and commands
        invocations = config['invocations']

        # Process audio chunk by chunk. On keyword detected perform action
        # and restart search
        decoder = Decoder(pocketsphinx_config)
        logmath = decoder.get_logmath()
        decoder.set_kws('keyword', _cwd_path(config['invocation_path']))
        decoder.set_lm_file('lm', _cwd_path(config['lm_path']))
        invocation_ctx = None
        in_speech_bf = False

        # Run some initialization scripts for terminal displays
        _run_script(config['init_exec'])

        decoder.set_search('keyword')
        decoder.start_utt()
        notifier.notify("READY=1")

        interaction_time = None

        while True:
            notifier.notify("WATCHDOG=1")
            buf = stream.read(1024, exception_on_overflow=False)
            if not buf:
                logging.error("Unable to get audio, exiting")
                break
            decoder.process_raw(buf, False, False)

            now_in_speech = decoder.get_in_speech()
            if now_in_speech != in_speech_bf:
                in_speech_bf = now_in_speech
                if not in_speech_bf:
                    # The hypothesis is only needed at the end of an
                    # utterance; read it before end_utt(), as before.
                    hyp = decoder.hyp()
                    hyp_str = hyp.hypstr.lower().strip() if hyp else None
                    decoder.end_utt()

                    if hyp_str:
                        # Confidence is a fraction, so format it as a float.
                        logging.info("Heard: '%s' while being in '%s' context (score: %d, confidence: %.4f -> in log scale %d)" %
                                     (hyp_str, invocation_ctx, hyp.best_score, logmath.exp(hyp.prob), hyp.prob))

                        if not invocation_ctx:
                            if hyp_str in invocations:
                                logging.info("Matched invocation: '%s'" % hyp_str)
                                invocation_ctx = hyp_str
                                ctx = invocations[invocation_ctx]
                                _run_script(ctx['enter'],
                                            [ctx['voice_params'], invocation_ctx, hyp_str])
                                interaction_time = time.time()
                                decoder.set_search('lm')
                            else:
                                logging.debug('Unknown invocation or wrongly heard, silently ignoring')
                        else:
                            ctx = invocations[invocation_ctx]
                            command = _best_command(ctx['commands'], hyp_str)
                            if command is not None:
                                logging.info("The best matching command is '%s', executing: %s" %
                                             (command['name'], command['exec']))
                                _run_script(ctx['ack'],
                                            [ctx['voice_params'], invocation_ctx, hyp_str])
                                _run_script(command['exec'],
                                            [ctx['voice_params'], invocation_ctx, command['name']])
                                # The exit script is started without waiting.
                                _run_script(ctx['exit'],
                                            [ctx['voice_params'], invocation_ctx, hyp_str],
                                            wait=False)
                                invocation_ctx = None
                                decoder.set_search('keyword')
                            else:
                                logging.info("... not matched, ignoring")
                                _run_script(ctx['noop'],
                                            [ctx['voice_params'], invocation_ctx, hyp_str])

                    decoder.start_utt()

            if invocation_ctx and interaction_time and time.time() > interaction_time + interaction_timeout:
                logging.info("The invocation context has just timed out, returning to listen for invocation word.")
                ctx = invocations[invocation_ctx]
                _run_script(ctx['exit'], [ctx['voice_params'], invocation_ctx],
                            wait=False)
                invocation_ctx = None
                interaction_time = None
                decoder.end_utt()
                decoder.set_search('keyword')
                decoder.start_utt()
    finally:
        # Release the audio device on every exit path.
        stream.stop_stream()
        stream.close()
        p.terminate()
class PocketGrammar(object):
    """Microphone recogniser using PocketSphinx with a JSGF grammar.

    The acoustic model, dictionary and grammar are looked up under
    ``model_path`` using the ``HMM``, ``DIC`` and ``GRAMMAR`` class names.
    """

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000
    HMM = 'cmusphinx-5prealpha-en-us-ptm-2.0/'
    DIC = 'dictionary.dic'
    GRAMMAR = 'grammar.jsgf'

    def __init__(self, device_index=0, model_path=None):
        """Create the decoder and the PyAudio instance.

        :param device_index: input device index passed to PyAudio
        :param model_path: directory holding the model files (required)
        :raises TypeError: if *model_path* is None
        """
        if model_path is None:
            # os.path.join(None, ...) would raise the same exception type,
            # only with a far less helpful message.
            raise TypeError('model_path must be a directory path, not None')

        self._decoder = None
        self._pa = None
        self._handle = None
        self._device_no = device_index
        self._model_path = model_path

        # PocketSphinx configuration
        logging.info('Grammar file:' + os.path.join(model_path, self.GRAMMAR))
        ps_config = Decoder.default_config()

        # Set recognition model to ...
        ps_config.set_string('-hmm', os.path.join(model_path, self.HMM))
        ps_config.set_string('-dict', os.path.join(model_path, self.DIC))
        ps_config.set_string('-jsgf', os.path.join(model_path, self.GRAMMAR))
        # os.devnull is '/dev/null' on POSIX and the equivalent sink elsewhere.
        ps_config.set_string('-logfn', os.devnull)

        # Process audio chunk by chunk. On keyword detected perform action
        # and restart search
        self._decoder = Decoder(ps_config)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        """Open the mono 16-bit input stream on the configured device."""
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        """Stop and close the input stream."""
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        """Read *chunk_size* frames, ignoring input overflows."""
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):
        """Listen until one utterance yields a hypothesis; return its text.

        Opens the microphone, feeds chunks to the decoder and, once speech
        has started and then stopped, polls the decoder until it reports a
        hypothesis.  The microphone is released and the utterance ended on
        every exit path.

        :return: the hypothesis string
        """
        # init microphone
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        try:
            self._decoder.start_utt()
            try:
                utterance_started = False
                while True:
                    # Read from microphone and process
                    data = self._handle_read(self.AUDIO_CHUNK_SIZE)
                    self._decoder.process_raw(data, False, False)

                    in_speech = self._decoder.get_in_speech()
                    # checks for transition from silence to speech.
                    if in_speech and not utterance_started:
                        utterance_started = True
                        logging.debug("Silence")

                    # checks for the transition from speech to silence
                    if not in_speech and utterance_started:
                        hypothesis = self._decoder.hyp()
                        if hypothesis is not None:
                            return hypothesis.hypstr
            finally:
                self._decoder.end_utt()
        finally:
            # close microphone
            self._handle_release()
class SpeechRecognizer(Interpreter):
    """Interpreter that segments audio with PocketSphinx and transcribes it.

    PocketSphinx is always used to detect the end of speech.  The
    transcription comes from PocketSphinx (``sr="pocketsphinx"``) or from
    Google Speech (``sr="googlespeech"``), which is sent the chunks buffered
    in ``self.current_data``.
    """

    def __init__(self, name: str, sr: str = "pocketsphinx"):
        super().__init__(name, True)
        self.logger = self.get_logger()
        self.sr = sr
        self.current_data = []
        self.setup()

    def setup(self) -> None:
        """Read ``RATE``/``CHUNK`` from the environment and build the back ends."""
        self.RATE = int(os.getenv("RATE"))
        self.CHUNK = int(os.getenv("CHUNK"))
        self.setup_pocketsphinx()
        if self.sr == "googlespeech":
            self.setup_googlespeech()

    def setup_pocketsphinx(self) -> None:
        """Create the PocketSphinx decoder and open the first utterance."""
        self.logger.info("Setting up PocketSphinx.")
        self.MODELDIR = "resources/model"

        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'es-es'))
        config.set_string('-lm', os.path.join(self.MODELDIR, 'es-es.lm'))
        config.set_string('-dict', os.path.join(self.MODELDIR, 'es.dict'))
        # os.devnull is '/dev/null' on POSIX and the equivalent sink elsewhere.
        config.set_string('-logfn', os.devnull)

        self.decoder = Decoder(config)
        self.prev_buf_is_speech = False
        self.decoder.start_utt()
        self.logger.info("Done setting up PocketSphinx.")

    def setup_googlespeech(self) -> None:
        """Create the Google Speech client and its streaming configuration."""
        self.logger.info("Setting up Google Speech.")
        credentials = service_account.Credentials.from_service_account_file(
            'resources/keys/credentials.json')
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            language_code='es-PE',
            sample_rate_hertz=self.RATE,
        )
        self.client = speech.SpeechClient(credentials=credentials)
        self.streaming_config = speech.types.StreamingRecognitionConfig(
            config=config)
        self.logger.info("Done setting up Google Speech.")

    def get_destinations_ID(self, raw_data) -> List[Identifier]:
        """Return the first configured destination, whatever the data."""
        return [self.destinations_ID[0]]

    def preprocess(self, raw_data):
        """Filtering"""
        return raw_data

    def query_gs(self):
        """Send the buffered chunks to Google Speech.

        :return: ``(transcript, confidence)`` of the first alternative of
            the first response, or ``(None, None)`` on any failure.  The
            chunk buffer is emptied either way.
        """
        requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in self.current_data)
        responses = self.client.streaming_recognize(
            config=self.streaming_config, requests=requests)
        try:
            response = next(responses)
            best = response.results[0].alternatives[0]
            data = best.transcript
            conf = best.confidence
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        self.current_data.clear()
        return data, conf

    def query_ps(self):
        """Return ``(text, best_score)`` from PocketSphinx.

        ``text`` is None when the hypothesis string is empty; both values
        are None when there is no hypothesis or the decoder raises.
        """
        try:
            hypothesis = self.decoder.hyp()
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            return None, None
        # hyp() yields None when nothing was recognised; test for it
        # directly rather than relying on an AttributeError.
        if hypothesis is None:
            return None, None
        return (hypothesis.hypstr or None), hypothesis.best_score

    def process(self, raw_data) -> Generator:
        """Feed one chunk to the decoder and yield the listening state.

        Yields ``True`` when speech has just ended (or an all-zero chunk of
        ``CHUNK * 16`` bytes forces a stop) and ``False`` otherwise, then
        yields the transcription (``None`` unless speech just ended).
        """
        self.decoder.process_raw(raw_data, False, False)
        cur_buf_is_speech = self.decoder.get_in_speech()
        data = None
        self.logger.info(
            f"prev: {self.prev_buf_is_speech}, current: {cur_buf_is_speech}")

        # bytes(n) is n zero bytes -- same sentinel, no intermediate list.
        force_speech = raw_data == bytes(self.CHUNK * 16)
        if force_speech:
            self.logger.info("RECEIVED FORCE STOP")

        if force_speech or (self.prev_buf_is_speech and not cur_buf_is_speech):
            # No longer in speech -> stop listening and process
            self.logger.info("No longer in speech, yielding True.")
            yield True
            self.decoder.end_utt()
            conf = None  # stays None when self.sr names no known back end
            if self.sr == "googlespeech":
                data, conf = self.query_gs()
            elif self.sr == "pocketsphinx":
                data, conf = self.query_ps()
            # Only query_gs empties the buffer; clear it here as well so it
            # cannot grow without bound in pocketsphinx mode.
            self.current_data.clear()
            self.logger.info(
                f"{self.name}>> Heard DATA: '{data}' with confidence: {conf}.")
            self.decoder.start_utt()
            self.prev_buf_is_speech = cur_buf_is_speech
        else:
            if cur_buf_is_speech:
                # Now in speech, or still in speech -> keep the chunk
                self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False
        yield data
        return

    def pass_msg(self, msg: str) -> None:
        """Set ``self.e`` when the ``"RESUME"`` message arrives."""
        if msg == "RESUME":
            self.e.set()

    def dump_history(self, filename: str, data: List[Any]) -> None:
        """No history is kept for this interpreter."""
        pass