예제 #1
0
class AudioConsumer(threading.Thread):
    """
    AudioConsumer
    Consumes AudioData chunks off the queue
    """

    # In seconds, the minimum audio size to be sent to remote STT
    MIN_AUDIO_SIZE = 0.5

    def __init__(self, state, queue, emitter, wakeup_recognizer,
                 mycroft_recognizer, remote_recognizer):
        threading.Thread.__init__(self)
        self.daemon = True  # do not block interpreter shutdown
        self.queue = queue
        self.state = state
        self.emitter = emitter
        self.wakeup_recognizer = wakeup_recognizer
        self.mycroft_recognizer = mycroft_recognizer
        self.remote_recognizer = remote_recognizer
        self.metrics = MetricsAggregator()

    def run(self):
        """Consume queued audio chunks until the loop state stops running."""
        while self.state.running:
            self.read_audio()

    @staticmethod
    def _audio_length(audio):
        """Return the duration of *audio* in seconds.

        frame_data length in bytes divided by bytes-per-second
        (sample_rate * sample_width).
        """
        return float(len(
            audio.frame_data)) / (audio.sample_rate * audio.sample_width)

    def read_audio(self):
        """Pull one AudioData chunk off the queue and dispatch it.

        Dispatch depends on the loop state: wake-up handling while
        sleeping, direct transcription when the wake word was already
        heard (skip_wakeword), otherwise wake-word detection.
        """
        timer = Stopwatch()
        audio = self.queue.get()
        self.metrics.timer("mycroft.recognizer.audio.length_s",
                           self._audio_length(audio))
        self.queue.task_done()
        timer.start()

        if self.state.sleeping:
            self.process_wake_up(audio)
        elif self.state.skip_wakeword:
            self.process_skip_wake_word(audio)
        else:
            self.process_wake_word(audio, timer)

        self.metrics.flush()

    def process_wake_up(self, audio):
        """Leave the sleeping state if the wake-up phrase is recognized."""
        if self.wakeup_recognizer.is_recognized(audio.frame_data,
                                                self.metrics):
            SessionManager.touch()
            self.state.sleeping = False
            self.__speak("I'm awake.")  # TODO: Localization
            self.metrics.increment("mycroft.wakeup")

    def process_wake_word(self, audio, timer):
        """Detect the wake word in *audio*; on a hit, transcribe the audio
        surrounding it and emit the wakeword/utterance events.
        """
        hyp = self.mycroft_recognizer.transcribe(audio.frame_data,
                                                 self.metrics)

        if self.mycroft_recognizer.contains(hyp):
            # Isolate the audio before and after the wake word itself
            extractor = WordExtractor(audio, self.mycroft_recognizer,
                                      self.metrics)
            timer.lap()
            extractor.calculate_range()
            self.metrics.timer("mycroft.recognizer.extractor.time_s",
                               timer.lap())
            audio_before = extractor.get_audio_data_before()
            self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                               self._audio_length(audio_before))
            audio_after = extractor.get_audio_data_after()
            self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                               self._audio_length(audio_after))

            SessionManager.touch()
            payload = {
                'utterance': hyp.hypstr,
                'session': SessionManager.get().session_id,
                'pos_begin': extractor.begin,
                'pos_end': extractor.end
            }
            self.emitter.emit("recognizer_loop:wakeword", payload)

            try:
                self.transcribe([audio_before, audio_after])
            except sr.UnknownValueError:
                # Nothing usable around the wake word; prompt the user and
                # accept the next chunk without requiring the wake word.
                self.__speak("Go ahead")
                self.state.skip_wakeword = True
                self.metrics.increment("mycroft.wakeword")

    def process_skip_wake_word(self, audio):
        """Transcribe *audio* directly (wake word was already heard)."""
        SessionManager.touch()
        try:
            self.transcribe([audio])
        except sr.UnknownValueError:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning("Speech Recognition could not understand audio")
            self.__speak("Sorry, I didn't catch that.")
            self.metrics.increment("mycroft.recognizer.error")
        self.state.skip_wakeword = False

    def __speak(self, utterance):
        """Emit a "speak" message for *utterance* on the current session."""
        payload = {
            'utterance': utterance,
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("speak", Message("speak", metadata=payload))

    def _create_remote_stt_runnable(self, audio, utterances):
        """Return a thread target that transcribes *audio* via the remote
        recognizer and appends any recognized text to *utterances*.
        """
        def runnable():
            try:
                text = self.remote_recognizer.transcribe(
                    audio, metrics=self.metrics).lower()
            except sr.UnknownValueError:
                pass  # no speech detected; nothing to append
            except sr.RequestError as e:
                logger.error(
                    "Could not request results from Speech Recognition "
                    "service; {0}".format(e))
            except CerberusAccessDenied:
                logger.error("AccessDenied from Cerberus proxy.")
                self.__speak(
                    "Your device is not registered yet. To start pairing, "
                    "login at cerberus dot mycroft dot A.I")
                utterances.append("pair my device")
            except Exception as e:
                logger.error("Unexpected exception: {0}".format(e))
            else:
                logger.debug("STT: " + text)
                if text.strip() != '':
                    utterances.append(text)

        return runnable

    def transcribe(self, audio_segments):
        """Transcribe each segment concurrently via remote STT.

        Segments shorter than MIN_AUDIO_SIZE seconds are skipped. Emits a
        "recognizer_loop:utterance" event with every recognized utterance.

        Raises:
            sr.UnknownValueError: if nothing was recognized.
        """
        utterances = []
        threads = []
        for audio in audio_segments:
            if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
                logger.debug("Audio too short to send to STT")
                continue

            target = self._create_remote_stt_runnable(audio, utterances)
            t = threading.Thread(target=target)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()
        if len(utterances) > 0:
            payload = {
                'utterances': utterances,
                'session': SessionManager.get().session_id
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', utterances)
        else:
            raise sr.UnknownValueError
예제 #2
0
class AudioConsumer(threading.Thread):
    """
    AudioConsumer
    Consumes AudioData chunks off the queue
    """

    # In seconds, the minimum audio size to be sent to remote STT
    MIN_AUDIO_SIZE = 1.0

    def __init__(self, state, queue, emitter, wakeup_recognizer,
                 wakeword_recognizer, wrapped_remote_recognizer,
                 wakeup_prefixes, wakeup_words):
        threading.Thread.__init__(self)
        self.daemon = True  # do not block interpreter shutdown
        self.queue = queue
        self.state = state
        self.emitter = emitter
        self.wakeup_recognizer = wakeup_recognizer
        self.ww_recognizer = wakeword_recognizer
        self.wrapped_remote_recognizer = wrapped_remote_recognizer
        self.wakeup_prefixes = wakeup_prefixes
        self.wakeup_words = wakeup_words
        self.metrics = MetricsAggregator()

    def run(self):
        """Consume queued audio chunks until the loop state stops running."""
        while self.state.running:
            self.try_consume_audio()

    @staticmethod
    def _audio_length(audio):
        """Return the duration of *audio* in seconds.

        frame_data length in bytes divided by bytes-per-second
        (sample_rate * sample_width).
        """
        return float(len(
            audio.frame_data)) / (audio.sample_rate * audio.sample_width)

    def try_consume_audio(self):
        """Pull one AudioData chunk off the queue and process it.

        While sleeping, only the "wake up" phrase is acted on. Otherwise
        the chunk is checked for the "mycroft" wake word (unless
        skip_wakeword is set, in which case it is transcribed directly).
        """
        timer = Stopwatch()
        hyp = None
        audio = self.queue.get()
        self.metrics.timer("mycroft.recognizer.audio.length_s",
                           self._audio_length(audio))
        self.queue.task_done()
        timer.start()
        if self.state.sleeping:
            hyp = self.wakeup_recognizer.transcribe(audio.get_wav_data(),
                                                    metrics=self.metrics)
            if hyp and hyp.hypstr:
                logger.debug("sleeping recognition: " + hyp.hypstr)
            if hyp and hyp.hypstr.lower().find("wake up") >= 0:
                SessionManager.touch()
                self.state.sleeping = False
                self.__speak("I'm awake.")  # TODO: Localization
                self.metrics.increment("mycroft.wakeup")
        else:
            if not self.state.skip_wakeword:
                hyp = self.ww_recognizer.transcribe(audio.get_wav_data(),
                                                    metrics=self.metrics)

            if hyp and hyp.hypstr.lower().find("mycroft") >= 0:
                # Isolate the audio before and after the wake word itself
                extractor = WakewordExtractor(audio, self.ww_recognizer,
                                              self.metrics)
                timer.lap()
                extractor.calculate_range()
                self.metrics.timer("mycroft.recognizer.extractor.time_s",
                                   timer.lap())
                audio_before = extractor.get_audio_data_before()
                self.metrics.timer(
                    "mycroft.recognizer.audio_extracted.length_s",
                    self._audio_length(audio_before))
                audio_after = extractor.get_audio_data_after()
                self.metrics.timer(
                    "mycroft.recognizer.audio_extracted.length_s",
                    self._audio_length(audio_after))

                SessionManager.touch()
                payload = {
                    'utterance': hyp.hypstr,
                    'session': SessionManager.get().session_id,
                    'pos_begin': int(extractor.range.begin),
                    'pos_end': int(extractor.range.end)
                }
                self.emitter.emit("recognizer_loop:wakeword", payload)

                try:
                    self.transcribe([audio_before, audio_after])
                except sr.UnknownValueError:
                    # Nothing usable around the wake word; prompt the user
                    # and accept the next chunk without the wake word.
                    self.__speak("Go ahead")
                    self.state.skip_wakeword = True
                    self.metrics.increment("mycroft.wakeword")

            elif self.state.skip_wakeword:
                SessionManager.touch()
                try:
                    self.transcribe([audio])
                except sr.UnknownValueError:
                    # logger.warn is a deprecated alias of logger.warning
                    logger.warning(
                        "Speech Recognition could not understand audio")
                    self.__speak("Sorry, I didn't catch that.")
                    self.metrics.increment("mycroft.recognizer.error")
                self.state.skip_wakeword = False
            else:
                self.metrics.clear()
        self.metrics.flush()

    def __speak(self, utterance):
        """
        Speak commands should be asynchronous to avoid filling up the
        portaudio buffer.
        :param utterance:
        :return:
        """
        def target():
            self.emitter.emit(
                "speak",
                Message("speak",
                        metadata={
                            'utterance': utterance,
                            'session': SessionManager.get().session_id
                        }))

        threading.Thread(target=target).start()

    def _create_remote_stt_runnable(self, audio, utterances):
        """Return a thread target that transcribes *audio* via the remote
        recognizer and appends any recognized text to *utterances*.
        """
        def runnable():
            try:
                text = self.wrapped_remote_recognizer.transcribe(
                    audio, metrics=self.metrics).lower()
            except sr.UnknownValueError:
                pass  # no speech detected; nothing to append
            except sr.RequestError as e:
                logger.error(
                    "Could not request results from Speech Recognition "
                    "service; {0}".format(e))
            except CerberusAccessDenied:
                logger.error("AccessDenied from Cerberus proxy.")
                self.__speak(
                    "Your device is not registered yet. To start pairing, "
                    "login at cerberus.mycroft.ai")
                utterances.append("pair my device")
            except Exception as e:
                # Without this, an unexpected error would kill the worker
                # thread with only a stderr traceback and silently drop
                # the utterance.
                logger.error("Unexpected exception: {0}".format(e))
            else:
                logger.debug("STT: " + text)
                if text.strip() != '':
                    utterances.append(text)

        return runnable

    def transcribe(self, audio_segments):
        """Transcribe each segment concurrently via remote STT.

        Segments shorter than MIN_AUDIO_SIZE seconds are skipped. Emits a
        "recognizer_loop:utterance" event with every recognized utterance.

        Raises:
            sr.UnknownValueError: if nothing was recognized.
        """
        utterances = []
        threads = []
        for audio in audio_segments:
            if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
                logger.debug("Audio too short to send to STT")
                continue

            target = self._create_remote_stt_runnable(audio, utterances)
            t = threading.Thread(target=target)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()
        if len(utterances) > 0:
            payload = {
                'utterances': utterances,
                'session': SessionManager.get().session_id
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', utterances)
        else:
            raise sr.UnknownValueError
예제 #3
0
class AudioConsumer(threading.Thread):
    """
    AudioConsumer
    Consumes AudioData chunks off the queue
    """

    # In seconds, the minimum audio size to be sent to remote STT
    MIN_AUDIO_SIZE = 0.5

    def __init__(self, state, queue, emitter, wakeup_recognizer,
                 mycroft_recognizer, remote_recognizer):
        threading.Thread.__init__(self)
        self.daemon = True  # do not block interpreter shutdown
        self.queue = queue
        self.state = state
        self.emitter = emitter
        self.wakeup_recognizer = wakeup_recognizer
        self.mycroft_recognizer = mycroft_recognizer
        self.remote_recognizer = remote_recognizer
        self.metrics = MetricsAggregator()

    def run(self):
        """Consume queued audio chunks until the loop state stops running."""
        while self.state.running:
            self.read_audio()

    @staticmethod
    def _audio_length(audio):
        """Return the duration of *audio* in seconds.

        frame_data length in bytes divided by bytes-per-second
        (sample_rate * sample_width).
        """
        return float(len(audio.frame_data)) / (
            audio.sample_rate * audio.sample_width)

    def read_audio(self):
        """Pull one AudioData chunk off the queue and dispatch it.

        Dispatch depends on the loop state: wake-up handling while
        sleeping, direct transcription when the wake word was already
        heard (skip_wakeword), otherwise wake-word detection.
        """
        timer = Stopwatch()
        audio = self.queue.get()
        self.metrics.timer("mycroft.recognizer.audio.length_s",
                           self._audio_length(audio))
        self.queue.task_done()
        timer.start()

        if self.state.sleeping:
            self.process_wake_up(audio)
        elif self.state.skip_wakeword:
            self.process_skip_wake_word(audio)
        else:
            self.process_wake_word(audio, timer)

        self.metrics.flush()

    def process_wake_up(self, audio):
        """Leave the sleeping state if the wake-up phrase is recognized."""
        if self.wakeup_recognizer.is_recognized(audio.frame_data,
                                                self.metrics):
            SessionManager.touch()
            self.state.sleeping = False
            self.__speak("I'm awake.")  # TODO: Localization
            self.metrics.increment("mycroft.wakeup")

    def process_wake_word(self, audio, timer):
        """Detect the wake word in *audio*; on a hit, transcribe the audio
        surrounding it and emit the wakeword/utterance events.
        """
        hyp = self.mycroft_recognizer.transcribe(audio.frame_data,
                                                 self.metrics)

        if self.mycroft_recognizer.contains(hyp):
            # Isolate the audio before and after the wake word itself
            extractor = WordExtractor(audio, self.mycroft_recognizer,
                                      self.metrics)
            timer.lap()
            extractor.calculate_range()
            self.metrics.timer("mycroft.recognizer.extractor.time_s",
                               timer.lap())
            audio_before = extractor.get_audio_data_before()
            self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                               self._audio_length(audio_before))
            audio_after = extractor.get_audio_data_after()
            self.metrics.timer("mycroft.recognizer.audio_extracted.length_s",
                               self._audio_length(audio_after))

            SessionManager.touch()
            payload = {
                'utterance': hyp.hypstr,
                'session': SessionManager.get().session_id,
                'pos_begin': extractor.begin,
                'pos_end': extractor.end
            }
            self.emitter.emit("recognizer_loop:wakeword", payload)

            try:
                self.transcribe([audio_before, audio_after])
            except sr.UnknownValueError:
                # Nothing usable around the wake word; prompt the user and
                # accept the next chunk without requiring the wake word.
                self.__speak("Go ahead")
                self.state.skip_wakeword = True
                self.metrics.increment("mycroft.wakeword")

    def process_skip_wake_word(self, audio):
        """Transcribe *audio* directly (wake word was already heard)."""
        SessionManager.touch()
        try:
            self.transcribe([audio])
        except sr.UnknownValueError:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning("Speech Recognition could not understand audio")
            self.__speak("Sorry, I didn't catch that.")
            self.metrics.increment("mycroft.recognizer.error")
        self.state.skip_wakeword = False

    def __speak(self, utterance):
        """Emit a "speak" message for *utterance* on the current session."""
        payload = {
            'utterance': utterance,
            'session': SessionManager.get().session_id
        }
        self.emitter.emit("speak", Message("speak", metadata=payload))

    def _create_remote_stt_runnable(self, audio, utterances):
        """Return a thread target that transcribes *audio* via the remote
        recognizer and appends any recognized text to *utterances*.
        """
        def runnable():
            try:
                text = self.remote_recognizer.transcribe(
                        audio, metrics=self.metrics).lower()
            except sr.UnknownValueError:
                pass  # no speech detected; nothing to append
            except sr.RequestError as e:
                logger.error(
                        "Could not request results from Speech Recognition "
                        "service; {0}".format(e))
            except CerberusAccessDenied:
                logger.error("AccessDenied from Cerberus proxy.")
                self.__speak(
                        "Your device is not registered yet. To start pairing, "
                        "login at cerberus dot mycroft dot A.I")
                utterances.append("pair my device")
            except Exception as e:
                logger.error("Unexpected exception: {0}".format(e))
            else:
                logger.debug("STT: " + text)
                if text.strip() != '':
                    utterances.append(text)

        return runnable

    def transcribe(self, audio_segments):
        """Transcribe each segment concurrently via remote STT.

        Segments shorter than MIN_AUDIO_SIZE seconds are skipped. Emits a
        "recognizer_loop:utterance" event with every recognized utterance.

        Raises:
            sr.UnknownValueError: if nothing was recognized.
        """
        utterances = []
        threads = []
        for audio in audio_segments:
            if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
                logger.debug("Audio too short to send to STT")
                continue

            target = self._create_remote_stt_runnable(audio, utterances)
            t = threading.Thread(target=target)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()
        if len(utterances) > 0:
            payload = {
                'utterances': utterances,
                'session': SessionManager.get().session_id
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', utterances)
        else:
            raise sr.UnknownValueError
예제 #4
0
class AudioConsumer(threading.Thread):
    """
    AudioConsumer
    Consumes AudioData chunks off the queue
    """

    # In seconds, the minimum audio size to be sent to remote STT
    MIN_AUDIO_SIZE = 1.0

    def __init__(
            self, state, queue, emitter, wakeup_recognizer,
            wakeword_recognizer, wrapped_remote_recognizer, wakeup_prefixes,
            wakeup_words):
        threading.Thread.__init__(self)
        self.daemon = True  # do not block interpreter shutdown
        self.queue = queue
        self.state = state
        self.emitter = emitter
        self.wakeup_recognizer = wakeup_recognizer
        self.ww_recognizer = wakeword_recognizer
        self.wrapped_remote_recognizer = wrapped_remote_recognizer
        self.wakeup_prefixes = wakeup_prefixes
        self.wakeup_words = wakeup_words
        self.metrics = MetricsAggregator()

    def run(self):
        """Consume queued audio chunks until the loop state stops running."""
        while self.state.running:
            self.try_consume_audio()

    @staticmethod
    def _audio_length(audio):
        """Return the duration of *audio* in seconds.

        frame_data length in bytes divided by bytes-per-second
        (sample_rate * sample_width).
        """
        return float(
            len(audio.frame_data))/(audio.sample_rate*audio.sample_width)

    def try_consume_audio(self):
        """Pull one AudioData chunk off the queue and process it.

        While sleeping, only the "wake up" phrase is acted on. Otherwise
        the chunk is checked for the "mycroft" wake word (unless
        skip_wakeword is set, in which case it is transcribed directly).
        """
        timer = Stopwatch()
        hyp = None
        audio = self.queue.get()
        self.metrics.timer(
            "mycroft.recognizer.audio.length_s", self._audio_length(audio))
        self.queue.task_done()
        timer.start()
        if self.state.sleeping:
            hyp = self.wakeup_recognizer.transcribe(
                audio.get_wav_data(), metrics=self.metrics)
            if hyp and hyp.hypstr:
                logger.debug("sleeping recognition: " + hyp.hypstr)
            if hyp and hyp.hypstr.lower().find("wake up") >= 0:
                SessionManager.touch()
                self.state.sleeping = False
                self.__speak("I'm awake.")  # TODO: Localization
                self.metrics.increment("mycroft.wakeup")
        else:
            if not self.state.skip_wakeword:
                hyp = self.ww_recognizer.transcribe(
                    audio.get_wav_data(), metrics=self.metrics)

            if hyp and hyp.hypstr.lower().find("mycroft") >= 0:
                # Isolate the audio before and after the wake word itself
                extractor = WakewordExtractor(
                    audio, self.ww_recognizer, self.metrics)
                timer.lap()
                extractor.calculate_range()
                self.metrics.timer(
                    "mycroft.recognizer.extractor.time_s", timer.lap())
                audio_before = extractor.get_audio_data_before()
                self.metrics.timer(
                    "mycroft.recognizer.audio_extracted.length_s",
                    self._audio_length(audio_before))
                audio_after = extractor.get_audio_data_after()
                self.metrics.timer(
                    "mycroft.recognizer.audio_extracted.length_s",
                    self._audio_length(audio_after))

                SessionManager.touch()
                payload = {
                    'utterance': hyp.hypstr,
                    'session': SessionManager.get().session_id,
                    'pos_begin': int(extractor.range.begin),
                    'pos_end': int(extractor.range.end)
                }
                self.emitter.emit("recognizer_loop:wakeword", payload)

                try:
                    self.transcribe([audio_before, audio_after])
                except sr.UnknownValueError:
                    # Nothing usable around the wake word; prompt the user
                    # and accept the next chunk without the wake word.
                    self.__speak("Go ahead")
                    self.state.skip_wakeword = True
                    self.metrics.increment("mycroft.wakeword")

            elif self.state.skip_wakeword:
                SessionManager.touch()
                try:
                    self.transcribe([audio])
                except sr.UnknownValueError:
                    # logger.warn is a deprecated alias of logger.warning
                    logger.warning(
                        "Speech Recognition could not understand audio")
                    self.__speak("Sorry, I didn't catch that.")
                    self.metrics.increment("mycroft.recognizer.error")
                self.state.skip_wakeword = False
            else:
                self.metrics.clear()
        self.metrics.flush()

    def __speak(self, utterance):
        """
        Speak commands should be asynchronous to avoid filling up the
        portaudio buffer.
        :param utterance:
        :return:
        """
        def target():
            self.emitter.emit(
                "speak",
                Message("speak",
                        metadata={'utterance': utterance,
                                  'session': SessionManager.get().session_id}))

        threading.Thread(target=target).start()

    def _create_remote_stt_runnable(self, audio, utterances):
        """Return a thread target that transcribes *audio* via the remote
        recognizer and appends any recognized text to *utterances*.
        """
        def runnable():
            try:
                text = self.wrapped_remote_recognizer.transcribe(
                    audio, metrics=self.metrics).lower()
            except sr.UnknownValueError:
                pass  # no speech detected; nothing to append
            except sr.RequestError as e:
                logger.error(
                    "Could not request results from Speech Recognition "
                    "service; {0}".format(e))
            except CerberusAccessDenied:
                logger.error("AccessDenied from Cerberus proxy.")
                self.__speak(
                    "Your device is not registered yet. To start pairing, "
                    "login at cerberus.mycroft.ai")
                utterances.append("pair my device")
            except Exception as e:
                # Without this, an unexpected error would kill the worker
                # thread with only a stderr traceback and silently drop
                # the utterance.
                logger.error("Unexpected exception: {0}".format(e))
            else:
                logger.debug("STT: " + text)
                if text.strip() != '':
                    utterances.append(text)
        return runnable

    def transcribe(self, audio_segments):
        """Transcribe each segment concurrently via remote STT.

        Segments shorter than MIN_AUDIO_SIZE seconds are skipped. Emits a
        "recognizer_loop:utterance" event with every recognized utterance.

        Raises:
            sr.UnknownValueError: if nothing was recognized.
        """
        utterances = []
        threads = []
        for audio in audio_segments:
            if self._audio_length(audio) < self.MIN_AUDIO_SIZE:
                logger.debug("Audio too short to send to STT")
                continue

            target = self._create_remote_stt_runnable(audio, utterances)
            t = threading.Thread(target=target)
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()
        if len(utterances) > 0:
            payload = {
                'utterances': utterances,
                'session': SessionManager.get().session_id
            }
            self.emitter.emit("recognizer_loop:utterance", payload)
            self.metrics.attr('utterances', utterances)
        else:
            raise sr.UnknownValueError