예제 #1
0
    def speech_recog(self, model):
        """Recognize speech from self.stream_in using the given language model.

        :param model: basename of the ``.lm``/``.dict`` files under MODELDIR.
        :return: recognized text ('' if the stream ends with no hypothesis).
        """
        # Build a decoder configured for the requested model.
        config = Decoder.default_config()
        config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
        config.set_int('-ds', 2)        # downsampling ratio (speed/accuracy trade-off)
        config.set_int('-topn', 3)      # Gaussians evaluated per frame
        config.set_int('-maxwpf', 5)    # max words exiting per frame
        config.set_string('-lm', MODELDIR + model + '.lm')
        config.set_string('-dict', MODELDIR + model + '.dict')
        decoder = Decoder(config)

        decoder.start_utt()
        recog_text = ''

        try:
            with self.stream_in as stream:
                for content in stream.generator():
                    decoder.process_raw(content, False, False)
                    hyp = decoder.hyp()  # cache: original called hyp() 3x per chunk
                    if hyp and hyp.hypstr != '':
                        recog_text += hyp.hypstr
                        if len(recog_text) > 1:
                            logging.info("recog text: %s", recog_text)
                            return recog_text
            return recog_text
        finally:
            # Always close the utterance; the original leaked an open
            # utterance when the audio generator was exhausted.
            decoder.end_utt()
예제 #2
0
class PocketSphinxEngine(Engine):
    """Keyword-spotting engine backed by a PocketSphinx decoder."""

    def __init__(self, keyword, sensitivity):
        """Configure the decoder for a single key phrase.

        :param keyword: keyword to spot ('snowboy' is mapped to 'snow boy').
        :param sensitivity: higher values lower the -kws_threshold.
        """
        cfg = Decoder.default_config()
        cfg.set_string('-logfn', '/dev/null')
        model_path = get_model_path()
        cfg.set_string('-hmm', os.path.join(model_path, 'en-us'))
        cfg.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
        phrase = 'snow boy' if keyword == 'snowboy' else keyword
        cfg.set_string('-keyphrase', phrase)
        cfg.set_float('-kws_threshold', 10**-sensitivity)

        self._decoder = Decoder(cfg)
        self._decoder.start_utt()

    def process(self, pcm):
        """Feed one chunk of int16 PCM; return the hypothesis if detected."""
        assert pcm.dtype == np.int16

        self._decoder.process_raw(pcm.tobytes(), False, False)

        result = self._decoder.hyp()
        if result:
            # Restart the utterance so the next keyword can be spotted.
            self._decoder.end_utt()
            self._decoder.start_utt()
        return result

    def release(self):
        """Close the currently open utterance."""
        self._decoder.end_utt()

    def __str__(self):
        return 'PocketSphinx'
예제 #3
0
def audio2phoneme(audio_file):
    """Return (word, start_sec, end_sec) tuples for each decoded segment.

    NOTE(review): relies on a module-level `config`; the decoder is fed the
    raw file bytes (WAV header included) — confirm the model tolerates this.
    """
    wav = wave.open(audio_file, 'rb')
    duration = wav.getnframes() / wav.getframerate()
    wav.close()

    # Stream the file through the decoder in fixed-size chunks.
    decoder = Decoder(config)

    chunk = bytearray(1024)
    with open(audio_file, 'rb') as source:
        decoder.start_utt()
        while source.readinto(chunk):
            decoder.process_raw(chunk, False, False)
        decoder.end_utt()

    total_frames = decoder.n_frames()

    result = []
    base = None
    for seg in decoder.seg():
        if base is None:
            base = seg.start_frame  # re-zero times on the first segment
        begin = (seg.start_frame - base) / total_frames * duration
        end = (seg.end_frame - base) / total_frames * duration
        result.append((seg.word, begin, end))

    return result
예제 #4
0
class LocalRecognizer(object):
    """Wake-word recognizer backed by a local PocketSphinx decoder."""

    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        """Build the PocketSphinx decoder for the configured key phrase."""
        cfg = Decoder.default_config()
        model_dir = os.path.join(BASEDIR, 'model', self.lang)
        cfg.set_string('-hmm', os.path.join(model_dir, 'hmm'))
        cfg.set_string('-dict', os.path.join(model_dir, 'mycroft-en-us.dict'))
        cfg.set_string('-keyphrase', self.key_phrase)
        cfg.set_float('-kws_threshold', float('1e-45'))
        cfg.set_float('-samprate', self.sample_rate)
        cfg.set_int('-nfft', 2048)
        cfg.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(cfg)

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance of raw audio; return the hypothesis (or None)."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase occurs in the transcription of byte_data."""
        return self.contains(self.transcribe(byte_data, metrics))

    def contains(self, hypothesis):
        """True when the key phrase occurs (case-insensitively) in hypothesis."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
예제 #5
0
class LocalRecognizer(object):
    """Local PocketSphinx wake-word listener (duplicate variant)."""

    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        """Create the decoder from the model files for the selected language."""
        config = Decoder.default_config()
        base = os.path.join(BASEDIR, 'model', self.lang)
        config.set_string('-hmm', os.path.join(base, 'hmm'))
        config.set_string('-dict', os.path.join(base, 'mycroft-en-us.dict'))
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float('1e-45'))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)

    def transcribe(self, byte_data, metrics=None):
        """Run one utterance through the decoder and return its hypothesis."""
        t0 = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - t0)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the decoded audio contains the key phrase."""
        return self.found_wake_word(self.transcribe(byte_data, metrics))

    def found_wake_word(self, hypothesis):
        """Case-insensitive key-phrase membership test on a hypothesis."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
예제 #6
0
    def speech_recog(self, model):
        """Recognize speech from self.stream_in using the given language model.

        :param model: basename of the ``.lm``/``.dict`` files under MODELDIR.
        :return: recognized text ('' if the stream ends with no hypothesis).
        """
        # Build a decoder configured for the requested model.
        config = Decoder.default_config()
        config.set_string('-hmm',
                          '/usr/local/share/pocketsphinx/model/en-us/en-us')
        config.set_int('-ds', 2)        # downsampling ratio
        config.set_int('-topn', 3)      # Gaussians evaluated per frame
        config.set_int('-maxwpf', 5)    # max words exiting per frame
        config.set_string('-lm', MODELDIR + model + '.lm')
        config.set_string('-dict', MODELDIR + model + '.dict')
        decoder = Decoder(config)

        decoder.start_utt()
        recog_text = ''

        try:
            with self.stream_in as stream:
                for content in stream.generator():
                    decoder.process_raw(content, False, False)
                    hyp = decoder.hyp()  # cache: original called hyp() 3x per chunk
                    if hyp and hyp.hypstr != '':
                        recog_text += hyp.hypstr
                        if len(recog_text) > 1:
                            logging.info("recog text: %s", recog_text)
                            return recog_text
            return recog_text
        finally:
            # Always close the utterance; the original leaked an open
            # utterance when the audio generator was exhausted.
            decoder.end_utt()
예제 #7
0
def audio2phoneme(audio_file):
    """Map each decoded segment of audio_file to (word, start_sec, end_sec).

    NOTE(review): uses a module-level `config`; the raw file bytes (including
    any WAV header) are streamed straight into the decoder — confirm intent.
    """
    reader = wave.open(audio_file, 'rb')
    duration = reader.getnframes() / reader.getframerate()
    reader.close()

    decoder = Decoder(config)

    buf = bytearray(1024)
    with open(audio_file, 'rb') as stream:
        decoder.start_utt()
        while stream.readinto(buf):
            decoder.process_raw(buf, False, False)
        decoder.end_utt()

    frame_total = decoder.n_frames()

    segments = []
    first_frame = None
    for seg in decoder.seg():
        if first_frame is None:
            first_frame = seg.start_frame  # re-base times at the first segment
        segments.append((
            seg.word,
            (seg.start_frame - first_frame) / frame_total * duration,
            (seg.end_frame - first_frame) / frame_total * duration,
        ))

    return segments
예제 #8
0
    def process_file(self, audiofile):
        """
        Process an audio file and return the recognized text, or None.

        :param audiofile: path to a raw PCM audio file.
        """
        with open(audiofile, 'rb') as audio:
            decoder = Decoder(self.config)
            decoder.start_utt()

            # Stream the file through the decoder chunk by chunk.
            while True:
                buf = audio.read(1024)
                if not buf:
                    break
                decoder.process_raw(buf, False, False)
            decoder.end_utt()

            hyp = decoder.hyp()
            # Python 3 print() — the original mixed py2 statements and py3 calls.
            print("Hyp:", hyp)

            if hyp is not None:
                print("Hyp Score", (hyp.prob, hyp.best_score))
                average_score = 0
                seg_count = 0
                for seg in decoder.seg():
                    if seg.word != "<sil>":
                        seg_count += 1
                        average_score += seg.ascore
                        print(seg.word, seg.ascore, seg.lscore)

                print("hyp:", hyp.hypstr)
                # Guard: an utterance of pure silence has seg_count == 0 and
                # the original raised ZeroDivisionError here.
                if seg_count:
                    print(average_score / seg_count)
                return hyp.hypstr
        return None
예제 #9
0
class SphinxDecoder():
    """Russian-language PocketSphinx recognizer driven by a JSGF grammar."""

    def __init__(self):
        self.MODELDIR = 'speech/'
        self.wav_name = 'media/temp.wav'
        self.raw_name = 'media/temp.raw'

        config = Decoder.default_config()
        config.set_string('-hmm', self.MODELDIR + 'ru_ru/')
        config.set_string('-dict', self.MODELDIR + 'ru.dic')
        self.decoder = Decoder(config)

        # Compile the JSGF grammar into an FSG and make it the active search.
        jsgf = Jsgf(self.MODELDIR + 'gr.gram')
        rule = jsgf.get_rule('gr.rule')
        fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
        fsg.writefile('gr.fsg')

        self.decoder.set_fsg('gr', fsg)
        self.decoder.set_search('gr')

        self.rec = Recognizer()
        self.mic = Microphone()

    def wav_to_raw(self):
        """Convert the recorded WAV to 16 kHz raw PCM for the decoder."""
        audio_file = AudioSegment.from_wav(self.wav_name)
        audio_file = audio_file.set_frame_rate(16000)
        audio_file.export(self.raw_name, format='raw')

    def record_audio(self):
        """Record one utterance from the microphone into the temp WAV file."""
        with self.mic as source:
            self.rec.adjust_for_ambient_noise(source)

            system('aplay media/beep.wav')  # audible "start talking" cue
            audio = self.rec.listen(source)
            with open(self.wav_name, 'wb') as new_audio:
                new_audio.write(audio.get_wav_data())

        self.wav_to_raw()

    def get_from_audio(self):
        """Record, decode, and return the hypothesis string (None if nothing)."""
        self.record_audio()

        self.decoder.start_utt()
        # Context manager closes the stream even if decoding raises
        # (the original closed it manually and leaked on error).
        with open(self.raw_name, 'rb') as stream:
            while True:
                buf = stream.read(1024)
                if not buf:
                    break
                self.decoder.process_raw(buf, False, False)
        self.decoder.end_utt()

        # hyp() returns None when nothing was recognized; the original's
        # bare `except:` also hid every unrelated error.
        hyp = self.decoder.hyp()
        return hyp.hypstr if hyp is not None else None
예제 #10
0
    def start_listening(self):
        ''' Starts streaming. Pauses until self.resume has been called '''
        config = Decoder.default_config()
        config.set_string('-hmm', path.join(self.model_dir, self.hmm))
        config.set_string('-lm', path.join(self.model_dir, self.lm))
        config.set_string('-dict', path.join(self.model_dir, self.dictionary))
        config.set_string('-logfn', self.logfn)

        # Decoder construction takes a while.
        decoder = Decoder(config)

        p = pyaudio.PyAudio()
        print(self.input_source_index)
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=16000,
                        input=True,
                        input_device_index=self.input_source_index,
                        frames_per_buffer=1024)

        stream.start_stream()

        in_speech_bf = False
        decoder.start_utt()

        self.wait_to_resume_lock.acquire()

        while self.is_running:
            # NOTE(review): busy-wait burns a full CPU core while paused;
            # consider an Event/condition variable instead.
            while self.paused:
                pass
            buf = stream.read(1024, exception_on_overflow=False)
            if not buf:
                break
            decoder.process_raw(buf, False, False)
            if decoder.get_in_speech() != in_speech_bf:
                in_speech_bf = decoder.get_in_speech()
                if not in_speech_bf:
                    # Speech just ended: close the utterance and collect text.
                    decoder.end_utt()
                    # hyp() is None when no hypothesis was produced; the
                    # original crashed with AttributeError on that path.
                    hyp = decoder.hyp()
                    if hyp is not None and hyp.hypstr != "":
                        self.all_speech_data.append(hyp.hypstr)
                    decoder.start_utt()
        decoder.end_utt()
예제 #11
0
파일: main.py 프로젝트: ervitis/speaktome
def main():
    """Stream microphone audio into PocketSphinx and print each utterance."""
    abspath = os.path.dirname(os.path.abspath(__file__))
    abspath = os.path.join(abspath, '..')

    model_dir = os.path.join(abspath, 'model')

    hmm = os.path.join(model_dir, HMM)
    lm = os.path.join(model_dir, LM)
    dic = os.path.join(model_dir, DIC)

    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dic)
    config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=BUFFER)
    stream.start_stream()
    in_speech_bf = True
    decoder.start_utt()
    while True:
        buf = stream.read(BUFFER)
        if not buf:
            break
        decoder.process_raw(buf, False, False)
        if decoder.get_in_speech():
            # Progress indicator while speech is being captured.
            sys.stdout.write('.')
            sys.stdout.flush()
        if decoder.get_in_speech() == in_speech_bf:
            continue

        in_speech_bf = decoder.get_in_speech()
        if in_speech_bf:
            continue

        # Speech just ended: close the utterance and report the hypothesis.
        decoder.end_utt()
        # hyp() may be None; the original abused try/except AttributeError
        # and called hyp() twice per report.
        hyp = decoder.hyp()
        if hyp is not None and hyp.hypstr != '':
            print('You said:', hyp.hypstr)
        decoder.start_utt()
    decoder.end_utt()
    # hyp() can be None here too; the original crashed while reporting.
    hyp = decoder.hyp()
    print('An error occurred:', hyp.hypstr if hyp else '')
예제 #12
0
 def process_stream(self, stream, callback):
     """
     Continuously process an audio stream and invoke the callback
     whenever a hypothesis is available.

     :param stream: object whose read(n) returns raw PCM bytes ('' / b'' at EOF).
     :param callback: callable invoked with each recognized text string.
     """
     decoder = Decoder(self.config)
     decoder.start_utt()
     while True:
         buf = stream.read(1024)
         if not buf:
             # End of stream: the original spun forever feeding empty buffers.
             break
         decoder.process_raw(buf, False, False)
         hyp = decoder.hyp()  # cache: original called hyp() three times
         if hyp is not None:
             decoder.end_utt()
             callback(hyp.hypstr)
             decoder.start_utt()
     decoder.end_utt()
예제 #13
0
파일: main.py 프로젝트: ervitis/speaktome
def main():
    """Stream microphone audio into PocketSphinx and print each utterance."""
    abspath = os.path.dirname(os.path.abspath(__file__))
    abspath = os.path.join(abspath, '..')

    model_dir = os.path.join(abspath, 'model')

    hmm = os.path.join(model_dir, HMM)
    lm = os.path.join(model_dir, LM)
    dic = os.path.join(model_dir, DIC)

    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dic)
    config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=BUFFER)
    stream.start_stream()
    in_speech_bf = True
    decoder.start_utt()
    while True:
        buf = stream.read(BUFFER)
        if not buf:
            break
        decoder.process_raw(buf, False, False)
        if decoder.get_in_speech():
            # Progress indicator while speech is being captured.
            sys.stdout.write('.')
            sys.stdout.flush()
        if decoder.get_in_speech() == in_speech_bf:
            continue

        in_speech_bf = decoder.get_in_speech()
        if in_speech_bf:
            continue

        # Speech just ended: close the utterance and report the hypothesis.
        decoder.end_utt()
        # hyp() may be None; the original relied on catching AttributeError
        # and called hyp() twice per report.
        hyp = decoder.hyp()
        if hyp is not None and hyp.hypstr != '':
            print('You said:', hyp.hypstr)
        decoder.start_utt()
    decoder.end_utt()
    # hyp() can be None here too; the original crashed while reporting.
    hyp = decoder.hyp()
    print('An error occurred:', hyp.hypstr if hyp else '')
예제 #14
0
class LocalRecognizer(object):
    """PocketSphinx wake-word recognizer built from an ad-hoc phoneme dict."""

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        dict_name = self.create_dict(key_phrase, phonemes)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary pronunciation dictionary and return its path.

        Words of key_phrase pair positionally with '.'-separated phoneme groups.
        """
        fd, file_name = tempfile.mkstemp()
        entries = zip(key_phrase.split(), phonemes.split('.'))
        with os.fdopen(fd, 'w') as dict_file:
            for word, phoneme in entries:
                dict_file.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        """Return a decoder config targeting the generated dictionary."""
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang,
                                               'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; optionally record timing in metrics."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase shows up in the decoded audio."""
        return self.found_wake_word(self.transcribe(byte_data, metrics))

    def found_wake_word(self, hypothesis):
        """Case-insensitive key-phrase membership test on a hypothesis."""
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
예제 #15
0
class PocketSphinxEngine(Engine):
    """Pocketsphinx engine."""

    def __init__(self, keyword, sensitivity):
        """
        Constructor.

        :param keyword: keyword to be detected.
        :param sensitivity: detection sensitivity (used as -kws_threshold).
        """

        from pocketsphinx import get_model_path
        from pocketsphinx.pocketsphinx import Decoder

        model_path = get_model_path()

        # Build the decoder configuration.
        config = Decoder.default_config()
        config.set_string('-logfn', '/dev/null')
        # US-English acoustic model and dictionary.
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))
        config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
        config.set_string('-keyphrase', keyword)
        config.set_float('-kws_threshold', sensitivity)
        self._decoder = Decoder(config)
        self._decoder.start_utt()

    def process(self, pcm):
        """Feed one float PCM chunk; return the hypothesis when detected."""
        samples = (np.iinfo(np.int16).max * pcm).astype(np.int16).tobytes()
        self._decoder.process_raw(samples, False, False)

        hypothesis = self._decoder.hyp()
        if hypothesis:
            # Restart the utterance so detection can continue.
            self._decoder.end_utt()
            self._decoder.start_utt()
        return hypothesis

    def release(self):
        """Close the currently open utterance."""
        self._decoder.end_utt()

    def __str__(self):
        return 'PocketSphinx'
예제 #16
0
    def speech_recog(self, model):
        """Blocking recognition: read mic chunks until some text is decoded.

        :param model: basename of the ``.lm``/``.dict`` files under MODELDIR.
        :return: the recognized text.
        """
        # Create a decoder with the requested model.
        config = Decoder.default_config()
        config.set_string('-hmm',
                          '/usr/local/share/pocketsphinx/model/en-us/en-us')
        config.set_int('-ds', 2)
        config.set_int('-topn', 3)
        config.set_int('-maxwpf', 5)
        config.set_string('-lm', MODELDIR + model + '.lm')
        config.set_string('-dict', MODELDIR + model + '.dict')
        decoder = Decoder(config)

        decoder.start_utt()
        recog_text = ''

        while len(recog_text) < 1:
            try:
                buf = self.stream_in.read(CHUNK_SIZE)
                logging.info("actual voice")
                decoder.process_raw(buf, False, False)
                # hyp() is None mid-utterance; the original used a py2 print
                # statement and masked the None with `except AttributeError`.
                hyp = decoder.hyp()
                if hyp is not None and hyp.hypstr != '':
                    recog_text += hyp.hypstr
                    print("text: " + hyp.hypstr)
            except IOError as ex:
                # Python 3: exceptions are not subscriptable; use .args
                # (pyaudio raises IOError(message, code) on overflow).
                if ex.args[1] != pyaudio.paInputOverflowed:
                    raise
                logging.info("white noise")

        decoder.end_utt()

        logging.info("recog text: %s", recog_text)
        return recog_text
예제 #17
0
class PocketSphinxASREngine(ASREngine):
    """https://pypi.org/project/pocketsphinx/"""
    def __init__(self):
        # https://github.com/cmusphinx/pocketsphinx-python/blob/master/example.py
        config = Decoder.default_config()
        config.set_string('-logfn', '/dev/null')
        config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
        config.set_string('-lm', os.path.join(get_model_path(),
                                              'en-us.lm.bin'))
        config.set_string('-dict',
                          os.path.join(get_model_path(), 'cmudict-en-us.dict'))

        self._decoder = Decoder(config)

    def transcribe(self, path):
        pcm, sample_rate = soundfile.read(path)
        assert sample_rate == 16000
        pcm = (np.iinfo(np.int16).max * pcm).astype(np.int16).tobytes()

        self._decoder.start_utt()
        self._decoder.process_raw(pcm, no_search=False, full_utt=True)
        self._decoder.end_utt()

        words = []
        for seg in self._decoder.seg():
            word = seg.word

            # Remove special tokens.
            if word == '<sil>' or word == '<s>' or word == '</s>':
                continue

            word = ''.join([x for x in word if x.isalpha()])

            words.append(word)

        return ' '.join(words)

    def __str__(self):
        return 'PocketSphinx'
예제 #18
0
파일: audio.py 프로젝트: acquadrod/coderbot
  def speech_recog(self, model):
    """Blocking recognition: read mic chunks until some text is decoded.

    :param model: basename of the ``.lm``/``.dict`` files under MODELDIR.
    :return: the recognized text.
    """
    # Create a decoder with the requested model.
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)

    decoder.start_utt()
    recog_text = ''

    while len(recog_text) < 1:
      try:
        buf = self.stream_in.read(CHUNK_SIZE)
        logging.info("actual voice")
        decoder.process_raw(buf, False, False)
        # hyp() is None mid-utterance; the original used a py2 print
        # statement and masked the None with `except AttributeError`.
        hyp = decoder.hyp()
        if hyp is not None and hyp.hypstr != '':
          recog_text += hyp.hypstr
          print("text: " + hyp.hypstr)
      except IOError as ex:
        # Python 3: exceptions are not subscriptable; use .args
        # (pyaudio raises IOError(message, code) on overflow).
        if ex.args[1] != pyaudio.paInputOverflowed:
          raise
        logging.info("white noise")

    decoder.end_utt()

    logging.info("recog text: %s", recog_text)
    return recog_text
예제 #19
0
class PocketSphinxEngine(Engine):
    """Pocketsphinx engine."""

    def __init__(self, engine_type, keyword, sensitivity):
        """Initializer.

        :param engine_type: type of the engine.
        :param keyword: keyword being used for detection.
        :param sensitivity: sensitivity passed to the engine.
        """

        super().__init__(engine_type, keyword, sensitivity)

        # Decoder configuration.
        config = Decoder.default_config()
        config.set_string('-logfn', '/dev/null')
        # US-English recognition model and dictionary.
        config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
        config.set_string('-dict',
                          os.path.join(get_model_path(), 'cmudict-en-us.dict'))
        config.set_string('-keyphrase', keyword)
        config.set_float('-kws_threshold', sensitivity)
        self._decoder = Decoder(config)
        self._decoder.start_utt()

    @prepare_pcm
    def process(self, pcm):
        """Process the PCM data for the keyword."""
        self._decoder.process_raw(pcm, False, False)
        hypothesis = self._decoder.hyp()
        if hypothesis:
            # Keyword found — reset the utterance so spotting can continue.
            self._decoder.end_utt()
            self._decoder.start_utt()
        return hypothesis

    def release(self):
        """Release the resources held by the engine."""
        self._decoder.end_utt()
class PocketsphinxTrigger(BaseTrigger):
    """Voice trigger that spots key phrases with PocketSphinx."""

    type = triggers.TYPES.VOICE

    def __init__(self, config, trigger_callback):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback, 'pocketsphinx')

        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        """Build the PocketSphinx decoder from the trigger configuration."""
        ps_config = Decoder.default_config()

        # Acoustic model and dictionary come from the trigger config.
        ps_config.set_string('-hmm', os.path.join(get_model_path(), self._tconfig['language']))
        ps_config.set_string('-dict', os.path.join(get_model_path(), self._tconfig['dictionary']))

        # Alternative: a single key phrase instead of a keyphrase list:
        # ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        # ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))

        # Multiple hotwords via a keyphrase list file.
        ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        """Start the detection loop in a daemon thread."""
        thread = threading.Thread(target=self.thread, args=())
        thread.setDaemon(True)
        thread.start()

    def thread(self):
        """Detection loop: read the microphone and fire the callback on a hit."""
        while True:
            self._enabled_lock.wait()

            # Enable reading microphone raw data
            inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL, self._config['sound']['input_device'])
            inp.setchannels(1)
            inp.setrate(16000)
            inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
            inp.setperiodsize(1024)

            self._decoder.start_utt()

            triggered = False
            voice_command = ""

            while not triggered:
                if not self._enabled_lock.isSet():
                    break

                # Read from microphone
                _, buf = inp.read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(buf, False, False)
                triggered = self._decoder.hyp() is not None

            # To avoid overflows close the microphone connection
            inp.close()

            self._decoder.end_utt()

            self._disabled_sync_lock.set()

            if triggered:
                # The original's bare `except:` hid every error here; only a
                # missing hypothesis should fall back to the empty command.
                hyp = self._decoder.hyp()
                voice_command = hyp.hypstr if hyp is not None else ""
                self._trigger_callback(self, voice_command)

    def enable(self):
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()
예제 #21
0
# Audio source: a file given on the command line, otherwise the microphone.
stream = None
if len(sys.argv) > 1:
    stream = open(sys.argv[1], "rb")
else:
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
    stream.start_stream()


print('start...')

# Feed audio into the (module-level) decoder until the source runs dry;
# whenever a hypothesis appears, report it and restart the keyword search.
while True:
    buf = stream.read(1024)
    if not buf:
        break
    decoder.process_raw(buf, False, False)

    hypothesis = decoder.hyp()
    if hypothesis:
        print('\nhypothesis: %s, score: %d' % (hypothesis.hypstr, hypothesis.best_score))
        print([(seg.word, seg.prob, seg.start_frame, seg.end_frame) for seg in decoder.seg()])
        print("Detected keyword, restarting search")
        os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3'))

        print('restart...')
        decoder.end_utt()
        decoder.start_utt()
        print('ok')
예제 #22
0
class AvaRecognizer(object):
    """Class to add ASR recognition functionality using language model + dictionary
    Publishes recognition output to recognizer/asr_output."""
    def __init__(self):

        # Initializing publisher with buffer size of 10 messages
        self.pub_ = rospy.Publisher("recognizer/asr_output",
                                    String,
                                    queue_size=10)
        # initialize node
        rospy.init_node("ava_recognizer")
        # Call custom function on node shutdown
        rospy.on_shutdown(self.shutdown)

        # Params
        # File containing language model
        _lm_param = "~lm"
        # Dictionary
        _dict_param = "~dict"
        # HMM Model
        _hmm_param = "~hmm"

        # used in process_audio for piecing full utterances
        self.in_speech_bf = False

        # Setting param values.  Each missing/invalid param logs an error and
        # aborts __init__ early, leaving the node unstarted.
        if rospy.has_param(
                _dict_param) and rospy.get_param(_dict_param) != ":default":
            self.dict = rospy.get_param(_dict_param)
        else:
            rospy.logerr(
                "No dictionary found. Please add an appropriate dictionary argument."
            )
            return

        if rospy.has_param(
                _lm_param) and rospy.get_param(_lm_param) != ':default':
            self._use_lm = 1
            self.class_lm = rospy.get_param(_lm_param)
        else:
            rospy.logerr("No lm found. Please add an appropriate lm argument.")
            return

        if rospy.has_param(_hmm_param):
            self.hmm = rospy.get_param(_hmm_param)
            if rospy.get_param(_hmm_param) == ":default":
                # Hard-coded fallback to this machine's pocketsphinx install.
                if os.path.isdir(
                        "/home/team5/.local/lib/python2.7/site-packages/pocketsphinx/model"
                ):
                    rospy.loginfo("Loading the default acoustic model")
                    self.hmm = "/home/team5/.local/lib/python2.7/site-packages/pocketsphinx/model/en-us"
                    rospy.loginfo("Done loading the default acoustic model")
                else:
                    rospy.logerr("Failed to find default model.")
                    return
        else:
            rospy.logerr(
                "No language model specified. Couldn't find default model.")
            return

        # All params satisfied. Starting recognizer and audio thread
        # NOTE(review): `Queue` is the Python 2 module name (py3: `queue`) —
        # confirm the target interpreter version.
        self._audio_queue = Queue.Queue()
        self._kill_audio = False
        threading.Thread(target=self.get_audio).start()

        self.start_recognizer()

    def start_recognizer(self):
        """Function to handle lm or grammar processing of audio."""
        config = Decoder.default_config()
        rospy.loginfo("Done initializing pocketsphinx")

        # Setting configuration of decoder using provided params
        config.set_string('-dict', self.dict)
        config.set_string('-lm', self.class_lm)
        config.set_string('-hmm', self.hmm)
        self.decoder = Decoder(config)

        # Start processing input audio
        self.decoder.start_utt()
        rospy.loginfo("Decoder started successfully")

        # Subscribe to audio topic; blocks in spin() until shutdown.
        rospy.Subscriber("recognizer/audio_ready", Bool, self.process_audio)
        rospy.spin()

    def process_audio(self, isready):
        """Audio processing based on decoder config."""
        # Check if input audio has ended
        # NOTE(review): assert is stripped under `python -O`; an explicit
        # check would be safer for input validation.
        assert (isready)
        data = self._audio_queue.get()
        self.decoder.process_raw(data, False, False)
        # Publish one utterance each time the decoder transitions
        # from "in speech" back to silence.
        if self.decoder.get_in_speech() != self.in_speech_bf:
            self.in_speech_bf = self.decoder.get_in_speech()
            if not self.in_speech_bf:
                self.decoder.end_utt()
                if self.decoder.hyp() != None:
                    rospy.loginfo('OUTPUT: \"' + self.decoder.hyp().hypstr +
                                  '\"')
                    self.pub_.publish(self.decoder.hyp().hypstr)
                self.decoder.start_utt()

    @staticmethod
    def shutdown():
        """This function is executed on node shutdown."""
        # command executed after Ctrl+C is pressed
        rospy.loginfo("Stop AvaRecognizer")
        rospy.sleep(1)

    def get_audio(self):
        """ Used for audio parsing thread. """

        # parameters for PCM. view PCMs with 'pactl list sources short'.
        # don't modify me plz.
        device = 'sysdefault:CARD=Audio'
        inp = alsaaudio.PCM(type=alsaaudio.PCM_CAPTURE,
                            mode=alsaaudio.PCM_NORMAL,
                            card=device)
        inp.setchannels(1)
        inp.setrate(16000)
        inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
        inp.setperiodsize(1024)

        # Push raw chunks onto the queue and signal the recognizer via the
        # recognizer/audio_ready topic until _kill_audio is set.
        pub = rospy.Publisher('recognizer/audio_ready', Bool, queue_size=10)
        while not (self._kill_audio):
            _, data = inp.read()
            self._audio_queue.put(data)
            pub.publish(True)
        return
class PocketsphinxTrigger(BaseTrigger):
    """Voice trigger that watches the microphone for a key phrase using
    CMU PocketSphinx keyword spotting."""

    type = triggers.TYPES.VOICE

    def __init__(self, config, trigger_callback):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')

        # _enabled_lock gates the detection loop; _disabled_sync_lock lets
        # disable() block until the loop has actually released the mic.
        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        """Build the PocketSphinx decoder from the trigger configuration."""
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
        ps_config.set_string(
            '-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))

        # Specify recognition key phrase and its detection threshold
        ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        ps_config.set_float('-kws_threshold',
                            float(self._tconfig['threshold']))

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        """Start the detection loop on a daemon thread."""
        thread = threading.Thread(target=self.thread)
        # FIX: Thread.setDaemon() is a deprecated alias (since Python 3.10);
        # assigning the attribute is equivalent and works on 2.6+.
        thread.daemon = True
        thread.start()

    def thread(self):
        """Detection loop: stream mic audio and fire the callback on keyword."""
        while True:
            # Park here until enable() is called.
            self._enabled_lock.wait()

            # Enable reading microphone raw data
            inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                self._config['sound']['input_device'])
            inp.setchannels(1)
            inp.setrate(16000)
            inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
            inp.setperiodsize(1024)

            self._decoder.start_utt()

            triggered = False
            while not triggered:

                # FIX: Event.isSet() is a deprecated alias of is_set().
                if not self._enabled_lock.is_set():
                    break

                # Read from microphone
                _, buf = inp.read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(buf, False, False)

                triggered = self._decoder.hyp() is not None

            # To avoid overflows close the microphone connection
            inp.close()

            self._decoder.end_utt()

            # Let disable() know the microphone has been released.
            self._disabled_sync_lock.set()

            if triggered:
                self._trigger_callback(self)

    def enable(self):
        """Allow the detection loop to run."""
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        """Stop detection and wait for the loop to release the mic."""
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()
예제 #24
0
class PocketGrammar(object):
    """JSGF-grammar recognizer: captures microphone audio via PyAudio and
    returns the first PocketSphinx hypothesis for a complete utterance."""

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000
    HMM = 'cmusphinx-5prealpha-en-us-ptm-2.0/'
    DIC = 'dictionary.dic'
    GRAMMAR = 'grammar.jsgf'

    def __init__(self, device_index=0, model_path=None):
        self._device_no = device_index
        self._model_path = model_path

        logging.info('Grammar file:' + os.path.join(model_path, self.GRAMMAR))

        # Build the decoder configuration from the model directory.
        cfg = Decoder.default_config()
        cfg.set_string('-hmm', os.path.join(model_path, self.HMM))
        cfg.set_string('-dict', os.path.join(model_path, self.DIC))
        cfg.set_string('-jsgf', os.path.join(model_path, self.GRAMMAR))
        cfg.set_string('-logfn', '/dev/null')

        self._decoder = Decoder(cfg)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        # Open the capture stream on the configured input device.
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        # Stop and close the capture stream.
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        # One chunk of raw PCM audio; overflows are ignored.
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):
        """Listen until an utterance ends, then return its hypothesis text."""
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        self._decoder.start_utt()

        heard_speech = False
        hyp = None
        while hyp is None:
            chunk = self._handle_read(self.AUDIO_CHUNK_SIZE)
            self._decoder.process_raw(chunk, False, False)

            speaking = self._decoder.get_in_speech()
            # Remember the first silence -> speech transition.
            if speaking and not heard_speech:
                heard_speech = True
                logging.debug("Silence")

            # Once speech stops again, try to fetch a hypothesis.
            if heard_speech and not speaking:
                hyp = self._decoder.hyp()

        self._handle_release()
        self._decoder.end_utt()
        return hyp.hypstr
예제 #25
0
파일: microphone.py 프로젝트: y-nk/alexa-pi
    def detect(self):
        """Block until one of the configured trigger phrases is heard.

        Lazily builds one keyword-spotting decoder per configured trigger
        phrase on first call, then streams microphone audio through all of
        them until one produces a hypothesis, firing the detection events.
        """
        # create decoders on the fly
        if not self.decoders:
            self.decoders = []

            # FIX: dict.iteritems() is Python-2-only and raises
            # AttributeError on Python 3; items() works on both.
            # (Also: 'id' shadowed the builtin — renamed.)
            for trigger_id, phrase in self.config['triggers'].items():
                config = Decoder.default_config()

                # set recognition model to US
                config.set_string('-hmm',
                                  os.path.join(get_model_path(), 'en-us'))
                config.set_string(
                    '-dict',
                    os.path.join(get_model_path(), 'cmudict-en-us.dict'))

                # specify recognition key phrase
                config.set_string('-keyphrase', phrase)
                config.set_float('-kws_threshold', 1e-5)

                # hide the VERY verbose logging information
                config.set_string('-logfn', '/dev/null')

                decoder = Decoder(config)
                # Tag the decoder so we can report which phrase matched.
                decoder.id = trigger_id

                self.decoders.append(decoder)

        events.fire('detection_started')

        # start decoding
        for decoder in self.decoders:
            decoder.start_utt()

        pcm = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                            self.config['device'])
        pcm.setchannels(1)
        pcm.setrate(16000)
        pcm.setformat(alsaaudio.PCM_FORMAT_S16_LE)
        pcm.setperiodsize(1024)

        phrase = None
        triggered = False
        while not triggered:
            _, chunk = pcm.read()

            for decoder in self.decoders:
                decoder.process_raw(chunk, False, False)
                triggered = decoder.hyp() is not None

                if triggered:
                    phrase = decoder.id
                    break

        pcm.close()
        pcm = None

        for decoder in self.decoders:
            decoder.end_utt()

        # NOTE: 'detection_fullfilled' (sic) is the event name external
        # listeners subscribe to — do not "fix" the spelling here.
        events.fire('detection_fullfilled', id=phrase)
예제 #26
0
def main():
    """Match recognized sentences against configured commands and run
    the associated scripts.

    Alternates between two decoder searches: a keyword search for an
    "invocation" word, then a language-model search for the actual
    command.  Returns to the keyword search after a command is handled
    or after `interaction_timeout` seconds of inactivity.
    """

    notifier = sdnotify.SystemdNotifier()

    # Load config first.
    # FIX: yaml.load() without an explicit Loader is unsafe on untrusted
    # input and requires a Loader argument in PyYAML >= 6; safe_load is
    # the standard choice for plain config files.  The file handle is now
    # also closed deterministically.
    with open(os.path.join(os.getcwd(), 'config.yaml'), 'r') as config_file:
        config = yaml.safe_load(config_file)

    interaction_timeout = int(config['interaction_timeout'])

    # Create Decoder config
    pocketsphinx_config = Decoder.default_config()
    pocketsphinx_config.set_string('-hmm', os.path.join(os.getcwd(), config['hmm_path']))
    pocketsphinx_config.set_string('-dict', os.path.join(os.getcwd(), config['dict_path']))
    pocketsphinx_config.set_string('-featparams', os.path.join(os.getcwd(), config['feat_params_path']))
    pocketsphinx_config.set_boolean("-allphone_ci", True)
    # The LM and keyword files are registered below via decoder.set_kws /
    # decoder.set_lm_file so we can switch searches at runtime.

    # Initialize audio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
    stream.start_stream()

    # Load invocations and commands
    invocations = config['invocations']

    # Process audio chunk by chunk. On keyword detected perform action and restart search
    decoder = Decoder(pocketsphinx_config)
    logmath = decoder.get_logmath()
    decoder.set_kws('keyword', os.path.join(os.getcwd(), config['invocation_path']))
    decoder.set_lm_file('lm', os.path.join(os.getcwd(), config['lm_path']))

    invocation_ctx = None   # currently-active invocation name, if any
    in_speech_bf = False    # last observed in-speech state

    # Run some initialization scripts for terminal displays
    subprocess.Popen([os.path.join(os.getcwd(), config['init_exec'])]).communicate()

    decoder.set_search('keyword')
    decoder.start_utt()
    notifier.notify("READY=1")

    interaction_time = None

    while True:
        notifier.notify("WATCHDOG=1")
        buf = stream.read(1024, exception_on_overflow=False)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            logging.error("Unable to get audio, exiting")
            break

        hyp = decoder.hyp()
        hyp_str = hyp.hypstr.lower().strip() if hyp else None
        now_in_speech = decoder.get_in_speech()

        # Act only on the speech -> silence transition (utterance end).
        if now_in_speech != in_speech_bf:
            in_speech_bf = now_in_speech
            if not in_speech_bf:
                decoder.end_utt()
                if hyp_str:
                    logging.info("Heard: '%s' while being in '%s' context (score: %d, confidence: %d -> in log scale %d)" %
                                 (hyp_str, invocation_ctx, hyp.best_score, logmath.exp(hyp.prob), hyp.prob))

                    if not invocation_ctx:
                        # Keyword mode: look for an invocation word.
                        if hyp_str in invocations:
                            logging.info("Matched invocation: '%s'" % hyp_str)
                            invocation_ctx = hyp_str
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['enter']),
                                             invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()
                            interaction_time = time.time()
                            decoder.set_search('lm')
                        else:
                            logging.debug('Unknown invocation or wrongly heard, silently ignoring')
                    else:
                        # Command mode: fuzzy-match against configured sentences.
                        matched = False
                        score_dict = defaultdict(list)

                        commands = invocations[invocation_ctx]['commands']
                        for command in commands:
                            logging.info("- command: '%s':" % command['name'])
                            for sentence in command['sentence']:
                                score = calc_similarity(command, sentence.lower(), hyp_str)
                                score_dict[score].append(command)
                                logging.debug("   - similarity: %d for sentence: %s" % (score, sentence))
                                if score == 1000:
                                    logging.debug("... seems like found perfect match, ignoring the rest")
                                    break

                        for best in sorted(score_dict.items(), reverse=True):
                            if best[0] > 90:
                                command = best[1][0]  # here might be some randomness
                                logging.info("The best matching command is '%s', executing: %s" % (command['name'], command['exec']))
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['ack']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), command['exec']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, command['name']]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str])
                                invocation_ctx = None
                                decoder.set_search('keyword')
                                matched = True
                            break  # take only the first which should be the best

                        if not matched:
                            logging.info("... not matched, ignoring")
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['noop']),
                                              invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()

                decoder.start_utt()

        # Drop back to the keyword search if the interaction went stale.
        if invocation_ctx and interaction_time and time.time() > interaction_time + interaction_timeout:
            logging.info("The invocation context has just timed out, returning to listen for invocation word.")
            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                              invocations[invocation_ctx]['voice_params'], invocation_ctx])
            invocation_ctx = None
            interaction_time = None
            decoder.end_utt()
            decoder.set_search('keyword')
            decoder.start_utt()
예제 #27
0
class PocketsphinxTrigger(BaseTrigger):
    """Voice trigger using a PocketSphinx keyword list; hands the raw
    hypothesis text to the trigger callback on detection."""

    type = triggers.TYPES.VOICE

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    _capture = None

    def __init__(self, config, trigger_callback, capture):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')

        self._capture = capture

        # _enabled_lock gates detection; _disabled_sync_lock lets
        # disable() wait until the loop has released the capture device.
        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        """Build the PocketSphinx decoder from the trigger configuration."""
        ps_config = Decoder.default_config()

        # Acoustic model and dictionary from the trigger config.
        ps_config.set_string(
            '-hmm', os.path.join(get_model_path(), self._tconfig['language']))
        ps_config.set_string(
            '-dict', os.path.join(get_model_path(),
                                  self._tconfig['dictionary']))

        # Multiple hotwords via a keyword list (instead of a single
        # -keyphrase / -kws_threshold pair).
        ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:

            null_path = '/dev/null'
            if platform.system() == 'Windows':
                null_path = 'nul'

            ps_config.set_string('-logfn', null_path)

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        """Start the detection loop on a daemon thread."""
        thread = threading.Thread(target=self.thread)
        # FIX: Thread.setDaemon() is a deprecated alias; set the attribute.
        thread.daemon = True
        thread.start()

    def thread(self):
        """Detection loop: read audio until a keyword hypothesis appears."""
        while True:
            self._enabled_lock.wait()

            self._capture.handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)

            self._decoder.start_utt()

            triggered = False
            voice_command = ""

            while not triggered:

                # FIX: Event.isSet() is a deprecated alias of is_set().
                if not self._enabled_lock.is_set():
                    break

                # Read from microphone
                data = self._capture.handle_read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(data, False, False)

                triggered = self._decoder.hyp() is not None

            self._capture.handle_release()

            self._decoder.end_utt()

            self._disabled_sync_lock.set()

            if triggered:
                # FIX: narrowed the bare except — hyp() returning None is
                # the only expected failure here (AttributeError on .hypstr);
                # a bare except also swallowed KeyboardInterrupt/SystemExit.
                try:
                    voice_command = self._decoder.hyp().hypstr
                except AttributeError:
                    voice_command = ""
                self._trigger_callback(self, voice_command)

    def enable(self):
        """Allow the detection loop to run."""
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        """Stop detection and wait for the loop to stand down."""
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()
예제 #28
0
class Words(Chain):
    """
    Chain that runs a PocketSphinx decoder over preprocessed audio
    segments and summarizes word occurrences per subject and per dataset.

    Results (hypotheses plus per-word timing/probability info) are
    serialized to a JSON file next to each sample.
    """
    # Samples within one subject may be processed concurrently.
    allow_sample_layer_concurrency = True
    abstract_class = False
    # Requires preprocessed audio produced by the Preprocess chain.
    requirements = [Preprocess]

    def __init__(self):
        super(Words, self).__init__()
        # Per-subject accumulator, reset at the start of each dataset.
        self._subject_words = {}
        # Decoder is built lazily inside _compute_words.
        self.decoder = None

    def dataset_preprocess(self, dataset):
        # Start each dataset with a clean accumulator.
        self._subject_words.clear()

    def subject_preprocess(self, subject, samples,
                           common_subject_settings):
        self._subject_words[subject] = []

    @staticmethod
    def sample_result_filename(out_sample_path):
        # Derive the result path from the sample path: strip the '.json'
        # suffix (5 chars) and append '_words_result.json'.
        return f'{out_sample_path[:-5]}_words_result.json'

    def _compute_words(self, segments_path, words_result_path):
        """Decode one segment file and write hypotheses/segments as JSON.

        :param segments_path: path to the raw audio segments file
            (presumably 16-bit PCM at the model's rate — TODO confirm)
        :param words_result_path: output JSON path; the work is skipped
            if this file already exists (see check_if_already_done)
        :return: None — results are written to words_result_path
        """
        # All decoder tuning knobs come from process_settings with defaults.
        model_dir = self.process_settings.get('model_dir', MODEL_DIR)
        decoder_hmm = self.process_settings.get('decoder_hmm', 'en-us/en-us')
        decoder_lm = self.process_settings.get('decoder_lm',
                                               'en-us/en-us.lm.bin')
        decoder_dict = self.process_settings.get('decoder_dict',
                                                 'en-us/cmudict-en-us.dict')
        decoder_lw = self.process_settings.get('decoder_lw', 2.0)
        decoder_pip = self.process_settings.get('decoder_pip', 0.3)
        decoder_beam = self.process_settings.get('decoder_beam', 1e-200)
        decoder_pbeam = self.process_settings.get('decoder_pbeam', 1e-20)
        decoder_mmap = self.process_settings.get('decoder_mmap', False)
        decoder_stream_buf_size = self.process_settings.get('decoder_stream_buf_size',
                                                            8192)
        pprint_indent = self.process_settings.get('pprint_indent', 4)
        hypothesis = PocketsphinxHypothesisSchema()
        ph_info = PocketsphinxSegmentSchema()

        def _get_decoder_results():
            # Close the current utterance and serialize its hypothesis and
            # word segments.  Frame counts are converted to seconds
            # (assumes 100 frames/s, the PocketSphinx default — TODO confirm).
            self.decoder.end_utt()
            segment = [ph_info.dump(dict(word=seg.word,
                                         start=seg.start_frame / 100,
                                         end=seg.end_frame / 100,
                                         prob=seg.prob))
                       for seg in self.decoder.seg()]
            hyp = self.decoder.hyp()
            hyp_dict = dict(best_score=hyp.best_score,
                            hypstr=hyp.hypstr, prob=hyp.prob)
            hyp_result = hypothesis.dump(hyp_dict)
            return hyp_result, segment

        @check_if_already_done(words_result_path)
        def recognize_words(segments_path, words_result_path):

            # Create a decoder with certain model
            config = Decoder.default_config()
            config.set_string('-hmm', join(model_dir, decoder_hmm))
            config.set_string('-lm', join(model_dir, decoder_lm))
            config.set_string('-dict', join(model_dir, decoder_dict))
            config.set_float('-lw', decoder_lw)
            config.set_float('-pip', decoder_pip)
            config.set_float('-beam', decoder_beam)
            config.set_float('-pbeam', decoder_pbeam)
            config.set_boolean('-mmap', decoder_mmap)
            hyps=[]
            segs=[]
            self.decoder = Decoder(config)
            with open(segments_path, 'rb') as stream:
                in_speech_buffer = False
                self.decoder.start_utt()
                while True:
                    buf = stream.read(decoder_stream_buf_size)
                    if buf:
                        self.decoder.process_raw(buf, False, False)
                        # Collect results on each speech -> silence
                        # transition, then re-arm the decoder.
                        if self.decoder.get_in_speech() != in_speech_buffer:
                            in_speech_buffer = self.decoder.get_in_speech()
                            if not in_speech_buffer:
                                hyp_result, segment = _get_decoder_results()
                                segs += segment
                                hyps.append(hyp_result)
                                self.decoder.start_utt()
                    else:
                        # EOF: flush a trailing utterance, if any.
                        if in_speech_buffer:
                            hyp_result, segment = _get_decoder_results()
                            segs += segment
                            hyps.append(hyp_result)
                        break
            words_dict = dict(hypotheses=hyps, segment_info=segs)
            words_result = DecoderOutputSchema().dumps(words_dict)
            with open(words_result_path, 'w') as f:
                f.write(words_result)

        recognize_words(segments_path, words_result_path)

        # Re-read the result for debug logging / schema validation.
        with open(words_result_path, 'r') as f:
            logger.debug(f'words_result_path: {words_result_path}')
            json_file = json.load(f)
            result = DecoderOutputSchema().load(json_file)
            logger.debug(json.dumps(result, indent=pprint_indent))

    def sample_layer(self, subject, sample_json_filename, sample_settings):
        """Process one sample: resolve its audio/segment paths and decode."""
        url = sample_settings.get('url')
        datatype = sample_settings.get('datatype')

        output_path_pattern = join(self.results_dir, subject, sample_json_filename)
        words_result_file = self.sample_result_filename(output_path_pattern)
        logger.info(f'words result file: {words_result_file}')
        audio_path = resolve_audio_path(url, datatype, output_path_pattern)
        _, segments_path = audio_and_segment_paths(audio_path, False)
        self._compute_words(segments_path, words_result_file)
예제 #29
0
class PocketKeyword(object):
    """Keyword spotter: listens on a PyAudio input device until the
    configured key phrase is detected, then returns the hypothesis."""

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    def __init__(self, phrase, threshold, device_index=0):
        self._device_no = device_index
        self._phrase = phrase
        self._threshold = float(threshold)

        logging.info('Phrase: ' + phrase + ' Threshold: ' + str(threshold))

        # Keyword-spotting decoder configuration (US English model).
        cfg = Decoder.default_config()
        cfg.set_string('-hmm',
                       os.path.join(get_model_path_keyword(), 'en-us'))
        cfg.set_string(
            '-dict',
            os.path.join(get_model_path_keyword(), 'cmudict-en-us.dict'))
        cfg.set_string('-keyphrase', self._phrase)
        cfg.set_float('-kws_threshold', self._threshold)
        cfg.set_string('-logfn', '/dev/null')

        self._decoder = Decoder(cfg)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        # Open the capture stream on the configured input device.
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        # Stop and close the capture stream.
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        # One chunk of raw PCM; overflows are ignored.
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):
        """Block until the key phrase is heard; return the hypothesis text."""
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        self._decoder.start_utt()

        hyp = None
        while hyp is None:
            chunk = self._handle_read(self.AUDIO_CHUNK_SIZE)
            self._decoder.process_raw(chunk, False, False)
            # Best guess so far from CMU Sphinx; non-None means detection.
            hyp = self._decoder.hyp()

        self._handle_release()
        self._decoder.end_utt()
        return hyp.hypstr
예제 #30
0
    stream = open(sys.argv[1], "rb")
else:
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=1024)
    stream.start_stream()

print('start...')

# Main loop: feed 1 KiB chunks from the input stream into the decoder and
# react as soon as a keyword hypothesis appears.
while True:
    buf = stream.read(1024)
    if buf:
        decoder.process_raw(buf, False, False)
    else:
        # End of file / stream: stop decoding.
        break

    hypothesis = decoder.hyp()
    if hypothesis:
        print('\nhypothesis: %s, score: %d' %
              (hypothesis.hypstr, hypothesis.best_score))
        print([(seg.word, seg.prob, seg.start_frame, seg.end_frame)
               for seg in decoder.seg()])
        print("Detected keyword, restarting search")
        # Play an acknowledgement sound via mpg123.
        # NOTE(review): os.system builds a shell command from script_dir —
        # fine for a local demo, but confirm script_dir is trusted.
        os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3'))

        print('restart...')
        # Re-arm the decoder for the next keyword.
        decoder.end_utt()
        decoder.start_utt()
예제 #31
0
파일: audio.py 프로젝트: TuxEatPi/tuxeatpi
class NLUAudio(NLUBase):
    """Define NLUAudio component

    Hotword detection uses PocketSphinx; the Nuance service then performs
    the actual natural-language understanding once the hotword fires.
    """
    def __init__(self, settings, action_queue, tts_queue, logger):
        NLUBase.__init__(self, settings, action_queue, None, tts_queue, logger)
        # Init private attributes
        self._rerun = True

        self._answer_sound_path = "sounds/answer.wav"
        self._config = Decoder.default_config()
        # Disable the component if the decoder could not be built.
        if not self._prepare_decoder():
            self._must_run = False

    def _prepare_decoder(self):
        """Configure and build the PocketSphinx hotword decoder.

        :return: True on success, False if the decoder could not be built.
        :raises HotWordError: if the pocketsphinx-data folder is missing.
        """
        # prepare config
        self._hotword = self._settings['speech']['hotword']
        if not os.path.isdir("pocketsphinx-data"):
            raise HotWordError("Missing pocketsphinx-data folder. Please run `make hotword`")

        acoustic_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'acoustic-model',
                                      )
        language_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'language-model.lm.bin',
                                      )
        pocket_dict = os.path.join("pocketsphinx-data",
                                   self._settings['speech']['language'],
                                   'pronounciation-dictionary.dict',
                                   )
        self._config.set_string('-logfn', "/dev/null")
        self._config.set_string('-hmm', acoustic_model)
        self._config.set_string('-lm', language_model)
        self._config.set_string('-dict', pocket_dict)
        try:
            self._decoder = Decoder(self._config)
        except RuntimeError:
            self.logger.critical("Error get audio decoder. Hotword not started")
            return False
        self._decoder.set_keyphrase('wakeup', self._hotword)
        self._decoder.set_search('wakeup')
        # FIX: this method previously fell off the end returning None, so
        # the success check in __init__ ("if not self._prepare_decoder()")
        # disabled the component even when setup succeeded.  Signal
        # success explicitly.
        return True

    def stop(self):
        """Stop process"""
        self._rerun = False
        NLUBase.stop(self)

    def _answering(self):
        """Play the hotword confirmation sound"""
        f_ans = wave.open(self._answer_sound_path, "rb")
        stream = self._paudio.open(format=self._paudio.get_format_from_width(f_ans.getsampwidth()),
                                   channels=f_ans.getnchannels(),
                                   rate=f_ans.getframerate(),
                                   output=True)
        # Stream the WAV file to the output device in 1024-frame chunks.
        data = f_ans.readframes(1024)
        while len(data) > 0:
            stream.write(data)
            data = f_ans.readframes(1024)
        f_ans.close()

    def run(self):
        """Listen for the hotword, then run NLU on the following speech."""
        self._rerun = True
        self._must_run = True
        self.logger.debug("starting listening hotword %s", self._hotword)
        while self._rerun:
            self._rerun = False
            try:
                self._paudio = pyaudio.PyAudio()
                stream = self._paudio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                                           input=True, frames_per_buffer=1024)
            except OSError:
                self.logger.warning("No audio device found can not listen for NLU")
                self.logger.warning("Disabling NLU audio")
                self._must_run = False
                self._rerun = False
                return
            stream.start_stream()
            self._paudio.get_default_input_device_info()

            self._decoder.start_utt()
            while self._must_run:
                buf = stream.read(1024)
                self._decoder.process_raw(buf, False, False)
                if not self.tts_queue.empty():
                    # If tts_queue is not empty, this means the Droid
                    # is currently speaking. So we don't want it to listen
                    # to itself.
                    continue
                if self._decoder.hyp() and self._decoder.hyp().hypstr == self._hotword:
                    self.logger.debug("Hotword detected")
                    # Acknowledge the hotword, then hand off to Nuance NLU.
                    self._answering()
                    ret = nlu_audio(self._settings, self.logger)

                    # GOT ACTIONS
                    interpretations = ret.get("nlu_interpretation_results", {}).\
                        get("payload", {}).get("interpretations", {})
                    # TODO: what about if len(interpretations) > 1 ??
                    for interpretation in interpretations:
                        intent = interpretation.get("action", {}).get("intent", {})
                        self.logger.info("Intent: {}".format(intent.get("value")))
                        self.logger.info("Confidence: {}".format(intent.get("confidence")))
                        # TODO log arguments
                        if intent.get("value") == "NO_MATCH":
                            # The NLU service could not match any intent.
                            self._misunderstand(0, True, True)
                        elif intent.get("confidence") < 0.8:
                            # Low-confidence interpretation: ask again.
                            self._misunderstand(intent.get("confidence"), True, True)
                        else:
                            # Intent names are expected as "<action>__<method>".
                            if len(intent.get("value").split("__")) != 2:
                                self.logger.critical("BAD Intent name: "
                                                     "{}".format(intent.get("value")))
                                self._misunderstand(0, True, True)
                            # Run function with parameters
                            action, method = intent.get("value").split("__")
                            # Run action
                            # TODO add parameters from NLU response
                            self._run_action(action, method, {}, False, True, True)
                    # Loop back around and resume hotword listening.
                    self._rerun = True
                    break
            self._decoder.end_utt()
예제 #32
0
class PocketsphinxTrigger(VoiceTrigger):
    """Wake-word trigger backed by PocketSphinx keyword spotting.

    Reads audio chunks from a shared capture object and invokes the
    trigger callback once the configured key phrase is detected.
    """

    name = 'pocketsphinx'

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    _capture = None

    def __init__(self, config, trigger_callback, capture):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback)

        self._capture = capture

        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        # BUG FIX: the rest of the class uses self._detector, but only
        # self._decoder was initialised here — initialise the attribute
        # thread() actually reads so it always exists before setup() runs.
        self._detector = None
        self._decoder = None  # kept for backward compatibility

    def setup(self):
        """Build the PocketSphinx decoder configured for keyword spotting."""
        ps_config = Decoder.default_config()

        # Set recognition model to US English
        ps_config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
        ps_config.set_string(
            '-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))

        # Specify recognition key phrase and its detection threshold
        ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        ps_config.set_float('-kws_threshold',
                            float(self._tconfig['threshold']))

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:

            null_path = '/dev/null'
            if platform.system() == 'Windows':
                null_path = 'nul'

            ps_config.set_string('-logfn', null_path)

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._detector = Decoder(ps_config)

    def thread(self):
        """Worker loop: wait until enabled, then spot the keyword in audio."""
        while True:
            self._enabled_lock.wait()

            self._capture.handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)

            self._detector.start_utt()

            triggered = False
            while not triggered:

                # BUG FIX: Event.isSet() is a deprecated alias; use is_set().
                if not self._enabled_lock.is_set():
                    break

                # Read from microphone
                data = self._capture.handle_read()

                # Detect if keyword/trigger word was said
                self._detector.process_raw(data, False, False)

                # A non-None hypothesis means the key phrase was spotted.
                triggered = self._detector.hyp() is not None

            self._capture.handle_release()

            self._detector.end_utt()

            # Signal any waiter that capture has been fully released.
            self._disabled_sync_lock.set()

            if triggered:
                self._trigger_callback(self)
Example #33
0
class SpeechRecognizer(Interpreter):
    """Interpreter that converts raw audio chunks into text.

    PocketSphinx is always used for speech/silence segmentation and is the
    default transcription backend; Google Cloud Speech can be selected via
    ``sr="googlespeech"``, in which case the buffered audio of an utterance
    is streamed to the cloud API instead.
    """

    def __init__(self, name: str, sr: str = "pocketsphinx"):
        super().__init__(name, True)
        self.logger = self.get_logger()
        # Selected backend: "pocketsphinx" (default) or "googlespeech".
        self.sr = sr
        # Buffered raw-audio chunks of the utterance currently being heard.
        self.current_data = []
        self.setup()

    def setup(self) -> None:
        """Read audio parameters from the environment and build backends."""
        # NOTE(review): int(os.getenv(...)) raises TypeError when RATE/CHUNK
        # are unset — presumably the deployment always defines them; verify.
        self.RATE = int(os.getenv("RATE"))
        self.CHUNK = int(os.getenv("CHUNK"))
        self.setup_pocketsphinx()

        if (self.sr == "googlespeech"):
            self.setup_googlespeech()

    def setup_pocketsphinx(self) -> None:
        """Create the PocketSphinx decoder (Spanish model) and open the
        first utterance."""
        self.logger.info("Setting up PocketSphinx.")
        self.MODELDIR = "resources/model"

        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'es-es'))
        config.set_string('-lm', os.path.join(self.MODELDIR, 'es-es.lm'))
        config.set_string('-dict', os.path.join(self.MODELDIR, 'es.dict'))
        config.set_string('-logfn', '/dev/null')

        self.decoder = Decoder(config)

        # Tracks the in-speech flag of the previous chunk so process() can
        # detect speech-start / speech-end transitions.
        self.prev_buf_is_speech = False
        self.decoder.start_utt()
        self.logger.info("Done setting up PocketSphinx.")

    def setup_googlespeech(self) -> None:
        """Create the Google Cloud Speech streaming client (es-PE)."""
        self.logger.info("Setting up Google Speech.")
        credentials = service_account.Credentials.from_service_account_file(
            'resources/keys/credentials.json')
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            language_code='es-PE',
            sample_rate_hertz=self.RATE,
        )
        self.client = speech.SpeechClient(credentials=credentials)
        self.streaming_config = speech.types.StreamingRecognitionConfig(
            config=config)
        self.logger.info("Done setting up Google Speech.")

    def get_destinations_ID(self, raw_data) -> List[Identifier]:
        # Always routes to the first configured destination.
        return [self.destinations_ID[0]]

    def preprocess(self, raw_data):
        """Filtering"""
        # Currently a pass-through; hook for future audio filtering.
        return raw_data

    def query_gs(self):
        """Send the buffered utterance to Google Speech.

        Returns a ``(transcript, confidence)`` tuple, or ``(None, None)``
        on any failure. Clears the chunk buffer either way.
        """
        requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in self.current_data)
        responses = self.client.streaming_recognize(
            config=self.streaming_config, requests=requests)
        try:
            # Only the first response/alternative is used.
            response = next(responses)
            data = response.results[0].alternatives[0].transcript
            conf = response.results[0].alternatives[0].confidence
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        self.current_data.clear()
        return data, conf

    def query_ps(self):
        """Fetch the PocketSphinx hypothesis for the closed utterance.

        Returns ``(hypstr, best_score)``; ``(None, conf)`` when the decoder
        heard nothing, ``(None, None)`` when no hypothesis exists.
        """
        try:
            # NOTE(review): broad except — hyp() returns None when there is
            # no hypothesis, so .hypstr raises AttributeError here.
            data = self.decoder.hyp().hypstr
            conf = self.decoder.hyp().best_score
            if data == "":
                data = None
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        return data, conf

    def process(self, raw_data) -> Generator:
        """Feed one audio chunk to the decoder.

        Generator protocol: first yields True when an utterance just ended
        (transcription available) or False otherwise, then yields the
        transcription data (or None) as its final value.
        """
        self.decoder.process_raw(raw_data, False, False)
        cur_buf_is_speech = self.decoder.get_in_speech()
        data = None
        self.logger.info(
            f"prev: {self.prev_buf_is_speech}, current: {cur_buf_is_speech}")

        # An all-zero chunk of this exact size is the caller's sentinel to
        # force the current utterance to end immediately.
        force_speech = False
        if raw_data == bytes([0] * self.CHUNK * 16):
            force_speech = True
            self.logger.info("RECEIVED FORCE STOP")

        if force_speech or (self.prev_buf_is_speech and not cur_buf_is_speech):
            # No longer in speech -> stop listening and process
            self.logger.info("No longer in speech, yielding True.")
            yield True
            self.decoder.end_utt()
            if (self.sr == "googlespeech"):
                data, conf = self.query_gs()
            elif (self.sr == "pocketsphinx"):
                data, conf = self.query_ps()
            self.logger.info(
                f"{self.name}>> Heard DATA: '{data}' with confidence: {conf}.")
            # Re-open an utterance so the next chunk starts a fresh decode.
            self.decoder.start_utt()
            self.prev_buf_is_speech = cur_buf_is_speech
        elif not self.prev_buf_is_speech and cur_buf_is_speech:
            # Now in speech -> Start listening
            self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False

        elif self.prev_buf_is_speech and cur_buf_is_speech:
            # Still in speech -> Keep on listening
            self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False

        else:
            # Silence -> silence: nothing buffered.
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False

        yield data
        return

    def pass_msg(self, msg: str) -> None:
        # NOTE(review): self.e is not defined in this class — presumably an
        # Event provided by the Interpreter base class; verify.
        if msg == "RESUME":
            self.e.set()

    def dump_history(self, filename: str, data: List[Any]) -> None:
        # History persistence intentionally not implemented.
        pass
Example #34
0
def main():
    """Stream microphone audio through PocketSphinx and publish each
    completed utterance's transcription on the Redis channel
    ``subsystem.listener.recording``.
    """
    environment: str = os.getenv("ENVIRONMENT", "dev")
    config: Dict = load_config(environment)
    initialize_logger(level=config["logging"]["level"],
                      filename=config["logging"]["filename"])
    redis_host = config["redis"]["host"]
    redis_port = config["redis"]["port"]
    logger.debug(f"Connecting to redis at {redis_host}:{redis_port}")
    redis_client: Redis = Redis(host=redis_host, port=redis_port, db=0)

    logger.debug("Initializing PyAudio interface")
    audio = pyaudio.PyAudio()
    microphone_index = get_microphone_index(audio,
                                            config["microphone"]["name"])
    logger.debug(
        f"Using microphone device '{config['microphone']['name']}' (card index {microphone_index})"
    )
    logger.debug(
        f"Intializing pocketsphinx Decoder using model dir {MODELDIR}")
    decoder_config: DecoderConfig = Decoder.default_config()
    decoder_config.set_string("-hmm", os.path.join(MODELDIR, "en-us/en-us"))
    decoder_config.set_string("-lm",
                              os.path.join(MODELDIR, "en-us/en-us.lm.bin"))
    decoder_config.set_string(
        "-dict", os.path.join(MODELDIR, "en-us/cmudict-en-us.dict"))
    decoder = Decoder(decoder_config)

    logger.debug("Opening audio stream")
    stream = audio.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=44100,
                        input=True,
                        frames_per_buffer=2048,
                        input_device_index=microphone_index)
    stream.start_stream()

    # Tracks the decoder's in-speech flag so utterance boundaries (speech ->
    # silence transitions) can be detected.
    in_speech_bf = False
    decoder.start_utt()

    try:
        logger.debug("Starting decoder loop")
        # BUG FIX: the original condition was `while cycle([True])`, which
        # builds a fresh (always-truthy) itertools.cycle iterator on every
        # pass; a plain infinite loop is what was intended.
        while True:
            buf = stream.read(2048)
            if buf:
                logger.debug("Decoding raw audio")
                decoder.process_raw(buf, False, False)
                if decoder.get_in_speech() != in_speech_bf:
                    logger.debug("GOT HERE")
                    in_speech_bf = decoder.get_in_speech()
                    if not in_speech_bf:
                        # Speech just ended: close the utterance and publish
                        # its transcription, then open a new utterance.
                        decoder.end_utt()
                        transcription = decoder.hyp().hypstr
                        logger.debug(f"Result: {transcription}")
                        redis_client.publish("subsystem.listener.recording",
                                             transcription)
                        decoder.start_utt()
            else:
                logger.debug("Buffer closed. Ending")
                break
        decoder.end_utt()
    except Exception:
        logger.exception("Something bad happened")
    finally:
        # BUG FIX: release the audio device as well as the Redis connection
        # (the original leaked the PyAudio stream and interface).
        stream.stop_stream()
        stream.close()
        audio.terminate()
        redis_client.close()
Example #35
0
class InstructionRecogniser(QThread):
    '''
	You should only use keyIn/keyOut, and shutdown after use. The thread starts itself when appropriate.
	Signals are emitted with any recognised instructions.
	'''
    def __init__(self, gui):
        """Create the recogniser thread and build its PocketSphinx decoder."""
        QThread.__init__(self, gui)
        # An empty string in settings means "use the bundled US-English
        # acoustic model"; anything else is treated as a custom model path.
        if settings.sphinx_acoustic_model_dir == '':
            acoustic_model_dir = path.join(get_model_path(), 'en-us')
        else:
            acoustic_model_dir = settings.sphinx_acoustic_model_dir
        decoder_config = Decoder.default_config()
        decoder_config.set_string('-hmm', acoustic_model_dir)  # acoustic model
        # Lexicon pronunciation dictionary
        decoder_config.set_string('-dict', settings.prepared_lexicon_file)
        # Language model derived from the prepared grammar
        decoder_config.set_string('-jsgf', settings.prepared_grammar_file)
        # Route the decoder's verbose output to its own log file
        log_file = settings.outputFileName(sphinx_decoder_log_file_base_name,
                                           ext='log')
        decoder_config.set_string('-logfn', log_file)
        self.listen = False
        self.decoder = Decoder(decoder_config)
        self.audio = None
        self.device = None

    def startup(self):
        """Initialise PyAudio and resolve the configured input device."""
        self.audio = PyAudio()
        configured_index = settings.audio_input_device_index
        device_count = self.audio.get_device_count()
        # An out-of-range index (conventionally -1) selects the system
        # default input device (None for PyAudio).
        if 0 <= configured_index < device_count:
            self.device = configured_index
        else:
            self.device = None

    def shutdown(self):
        """Stop listening, wait for the worker thread to exit, and release
        the PyAudio interface."""
        self.listen = False
        # Block until run() has returned before tearing down audio.
        self.wait()
        self.audio.terminate()
        self.audio = None

    def keyIn(self):
        """Start capturing a voice message; no-op if already recording."""
        if self.isRunning():
            return
        self.listen = True
        self.start()

    def keyOut(self):
        """Stop capturing; run() finishes the current chunk then decodes."""
        self.listen = False

    def run(self):
        """Worker thread body: record audio while the key is held, then
        decode the whole message in a single utterance and emit the result
        through the application signals.

        Recording stops when keyOut() clears self.listen, the stream stops
        returning data, or the message duration limit is reached.
        """
        audio_stream = self.audio.open(input_device_index=self.device,
                                       channels=1,
                                       format=paInt16,
                                       rate=audio_sample_rate,
                                       frames_per_buffer=audio_chunk_size,
                                       input=True)
        chunks = []
        msg_duration = 0
        buff = audio_stream.read(audio_chunk_size)
        while self.listen and len(
                buff) > 0 and msg_duration < message_duration_limit:
            chunks.append(buff)
            buff = audio_stream.read(audio_chunk_size)
            # Track elapsed time in seconds to enforce the duration cap.
            msg_duration += audio_chunk_size / audio_sample_rate
        audio_stream.close()
        audio_message = b''.join(chunks)

        self.decoder.start_utt(
        )  # STYLE catch failures here (e.g. grammar/lex files not found)
        # Decode the entire recording at once (full_utt=True).
        self.decoder.process_raw(audio_message, False, True)
        self.decoder.end_utt()
        hyp = self.decoder.hyp()
        if hyp:
            SR_log('VOICE: "%s"' % hyp.hypstr)
            if settings.show_recognised_voice_strings:
                signals.statusBarMsg.emit('VOICE: "%s"' % hyp.hypstr)
            callsign_tokens, instr_lst = interpret_string(hyp.hypstr)
            signals.voiceMsgRecognised.emit(callsign_tokens, instr_lst)
        else:
            SR_log('VOICE: no hypothesis, message duration was %g s' %
                   msg_duration)
            signals.voiceMsgNotRecognised.emit()