示例#1
0
class SphinxDecoder:
    """Record one phrase from the microphone and decode it with pocketsphinx.

    The decoder is restricted to the ``gr.rule`` rule of the JSGF grammar
    ``gr.gram`` found in the model directory, so only phrases allowed by that
    grammar can be returned.

    Args:
        model_dir: Directory prefix (with trailing slash) that holds the
            ``ru_ru/`` model directory, ``ru.dic`` and ``gr.gram``.
        wav_name: Path of the temporary WAV file written by ``record_audio``.
        raw_name: Path of the temporary raw file read by ``get_from_audio``.
    """

    def __init__(self, model_dir='speech/', wav_name='media/temp.wav',
                 raw_name='media/temp.raw'):
        self.MODELDIR = model_dir
        self.wav_name = wav_name
        self.raw_name = raw_name

        config = Decoder.default_config()
        config.set_string('-hmm', self.MODELDIR + 'ru_ru/')
        config.set_string('-dict', self.MODELDIR + 'ru.dic')
        self.decoder = Decoder(config)

        # Compile the JSGF rule into a finite-state grammar. The third
        # argument (7.5) is presumably the language weight -- confirm against
        # the pocketsphinx documentation before tuning it.
        jsgf = Jsgf(self.MODELDIR + 'gr.gram')
        rule = jsgf.get_rule('gr.rule')
        fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
        # A copy of the compiled grammar is dumped into the current working
        # directory.
        fsg.writefile('gr.fsg')

        # Register the grammar under the name 'gr' and make it the active
        # search.
        self.decoder.set_fsg('gr', fsg)
        self.decoder.set_search('gr')

        self.rec = Recognizer()
        self.mic = Microphone()

    def wav_to_raw(self):
        """Convert ``self.wav_name`` to a 16000 Hz raw file at ``self.raw_name``."""
        audio_file = AudioSegment.from_wav(self.wav_name)
        audio_file = audio_file.set_frame_rate(16000)
        audio_file.export(self.raw_name, format='raw')

    def record_audio(self):
        """Record one phrase from the microphone into the temp WAV and raw files.

        Plays ``media/beep.wav`` through ``aplay`` after ambient-noise
        calibration to signal that listening has started.
        """
        with self.mic as source:
            self.rec.adjust_for_ambient_noise(source)

            system('aplay media/beep.wav')
            audio = self.rec.listen(source)
            with open(self.wav_name, 'wb') as new_audio:
                new_audio.write(audio.get_wav_data())

        self.wav_to_raw()

    def get_from_audio(self):
        """Record a phrase and return the decoded text.

        Returns:
            The hypothesis string produced by the decoder, or ``None`` when
            the decoder has no hypothesis for the recording.
        """
        self.record_audio()

        # Open the file before starting the utterance so that a missing file
        # cannot leave the decoder with an utterance that was never ended.
        with open(self.raw_name, 'rb') as stream:
            self.decoder.start_utt()
            try:
                # read() returns b'' at end of file, which stops the iterator.
                for chunk in iter(lambda: stream.read(1024), b''):
                    self.decoder.process_raw(chunk, False, False)
            finally:
                # Always close the utterance so the decoder stays reusable
                # even if feeding audio failed part-way through.
                self.decoder.end_utt()

        # hyp() yields None when nothing was recognised; test for that
        # explicitly instead of hiding every possible error behind a bare
        # ``except``.
        hypothesis = self.decoder.hyp()
        if hypothesis is None:
            return None
        return hypothesis.hypstr
示例#2
0
def main():
    """Listen on the microphone for an invocation phrase, then match and run a command.

    Configuration is read from ``config.yaml`` in the current working
    directory; every path taken from it is resolved relative to that
    directory. Keys read here: ``interaction_timeout``, ``hmm_path``,
    ``dict_path``, ``feat_params_path``, ``invocation_path``, ``lm_path``,
    ``init_exec`` and ``invocations``.

    The loop works in two modes:

    * No invocation context: the decoder uses the ``'keyword'`` search. When
      the heard text is a key of ``invocations``, that entry's ``enter``
      script is run and the decoder switches to the ``'lm'`` search.
    * Inside an invocation context: the heard text is scored against every
      sentence of every command with ``calc_similarity`` (defined outside
      this function). If the highest score is above 90, the ``ack`` script,
      the command's ``exec`` script and the ``exit`` script are run and the
      decoder returns to the ``'keyword'`` search; otherwise the ``noop``
      script is run and the context is kept.

    An invocation context that stays open longer than
    ``interaction_timeout`` seconds after it was entered is closed by running
    its ``exit`` script. systemd is told ``READY=1`` once set-up is done and
    ``WATCHDOG=1`` on every loop iteration.
    """

    notifier = sdnotify.SystemdNotifier()

    # Load config first
    # NOTE(review): this file handle is never closed in the visible code.
    config_file = open(os.path.join(os.getcwd(), 'config.yaml'), 'r')
    # NOTE(review): yaml.load is called without an explicit Loader. That is
    # unsafe for untrusted files, and newer PyYAML releases require the
    # argument -- consider yaml.safe_load.
    config = yaml.load(config_file)

    interaction_timeout = int(config['interaction_timeout'])

    # Create Decoder config
    pocketsphinx_config = Decoder.default_config()
    pocketsphinx_config.set_string('-hmm', os.path.join(os.getcwd(), config['hmm_path']))
    pocketsphinx_config.set_string('-dict', os.path.join(os.getcwd(), config['dict_path']))
    pocketsphinx_config.set_string('-featparams', os.path.join(os.getcwd(), config['feat_params_path']))
    pocketsphinx_config.set_boolean("-allphone_ci", True)
    # Using decoder.set_kws & decoder.set_lm_file
    # pocketsphinx_config.set_string('-lm', os.path.join(os.getcwd(), config['lm_path']))
    # pocketsphinx_config.set_string('-kws', os.path.join(os.getcwd(), config['keyphrase_path']))

    # Initialize audio
    # Mono, 16-bit, 16 kHz input stream delivering 1024 frames per buffer.
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
    stream.start_stream()

    # Load invocations and commands
    # Indexed below by the heard invocation phrase; each entry supplies
    # 'enter', 'ack', 'exit', 'noop', 'voice_params' and 'commands'.
    invocations = config['invocations']

    # Process audio chunk by chunk. On keyword detected perform action and restart search
    decoder = Decoder(pocketsphinx_config)
    logmath = decoder.get_logmath()
    # Two named searches: 'keyword' for the invocation phrase and 'lm' for
    # the commands that follow it.
    decoder.set_kws('keyword', os.path.join(os.getcwd(), config['invocation_path']))
    decoder.set_lm_file('lm', os.path.join(os.getcwd(), config['lm_path']))

    # Currently active invocation phrase, or None while waiting for one.
    invocation_ctx = None
    # Value of decoder.get_in_speech() seen on the previous iteration.
    in_speech_bf = False

    # Run some initialization scripts for terminal displays
    # communicate() blocks until the script has finished.
    subprocess.Popen([os.path.join(os.getcwd(), config['init_exec'])]).communicate()

    decoder.set_search('keyword')
    decoder.start_utt()
    notifier.notify("READY=1")

    # time.time() at which the current invocation context was entered.
    interaction_time = None

    while True:
        notifier.notify("WATCHDOG=1")
        buf = stream.read(1024, exception_on_overflow = False)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            logging.error("Unable to get audio, exiting")
            break

        hyp = decoder.hyp()
        # seg = decoder.seg()
        # Normalise the hypothesis text; None when there is no hypothesis yet.
        hyp_str = hyp.hypstr.lower().strip() if hyp else None
        now_in_speech = decoder.get_in_speech()

        # Act only when the in-speech flag changes.
        if now_in_speech != in_speech_bf:
            in_speech_bf = now_in_speech
            # Speech has just ended: close the utterance and evaluate what
            # was heard.
            if not in_speech_bf:
                decoder.end_utt()
                if hyp_str:
                    # NOTE(review): the confidence is formatted with %d, so
                    # any fractional part of logmath.exp(hyp.prob) is
                    # dropped -- confirm that is intended.
                    logging.info("Heard: '%s' while being in '%s' context (score: %d, confidence: %d -> in log scale %d)" %
                                 (hyp_str, invocation_ctx, hyp.best_score, logmath.exp(hyp.prob), hyp.prob))

                    if not invocation_ctx:
                        # Waiting for an invocation: the heard text must be
                        # exactly one of the configured invocation keys.
                        if hyp_str in invocations:
                            logging.info("Matched invocation: '%s'" % hyp_str) 
                            invocation_ctx = hyp_str
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['enter']),
                                             invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()
                            # Start the interaction timeout and listen for
                            # commands with the language-model search.
                            interaction_time = time.time()
                            decoder.set_search('lm')
                        else:
                            logging.debug('Unknown invocation or wrongly heard, silently ignoring')
                    else:
                        matched = False
                        # Maps similarity score -> commands that reached it.
                        score_dict = defaultdict(list)

                        commands = invocations[invocation_ctx]['commands']
                        for command in commands:
                            logging.info("- command: '%s':" % command['name'])
                            for sentence in command['sentence']:
                                score = calc_similarity(command, sentence.lower(), hyp_str)
                                score_dict[score].append(command)
                                logging.debug("   - similarity: %d for sentence: %s" % (score, sentence))
                                if score == 1000:
                                    logging.debug("... seems like found perfect match, ignoring the rest")
                                    # Leaves only the sentence loop; the
                                    # remaining commands are still scored.
                                    break

                        # Scores are unique dict keys, so sorting the items
                        # in reverse puts the highest score first.
                        for best in sorted(score_dict.items(), reverse=True):
                            if best[0] > 90:
                                command = best[1][0]  # here might be some randomness
                                logging.info("The best matching command is '%s', executing: %s" % (command['name'], command['exec']))
                                # The 'ack' and 'exec' scripts are waited
                                # for; the 'exit' script is started without
                                # waiting.
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['ack']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), command['exec']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, command['name']]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str])
                                # Leave the context and wait for the next
                                # invocation phrase.
                                invocation_ctx = None
                                decoder.set_search('keyword')
                                matched = True
                            break  # take only the first which should be the best

                        if not matched:
                            # The context stays open and interaction_time is
                            # not refreshed, so the original timeout keeps
                            # running.
                            logging.info("... not matched, ignoring")
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['noop']),
                                              invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()

                # Begin a new utterance with whichever search is now active.
                decoder.start_utt()

        # Close an invocation context that has been open longer than
        # interaction_timeout seconds.
        if invocation_ctx and interaction_time and time.time() > interaction_time + interaction_timeout:
            logging.info("The invocation context has just timed out, returning to listen for invocation word.")
            # The 'exit' script is started without waiting for it to finish.
            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                              invocations[invocation_ctx]['voice_params'], invocation_ctx])
            invocation_ctx = None
            interaction_time = None
            # Restart the utterance under the keyword search.
            decoder.end_utt()
            decoder.set_search('keyword')
            decoder.start_utt()